fixtureProvider() {
16 | return Stream.of(
17 | Arguments.of(new OcrInfo(27, .131f, .527f, .879f, .053f), "p:27,x:13.1,y:52.7,w:87.9,h:5.3"),
18 | Arguments.of(new OcrInfo(.131f, .527f, .879f, .053f), "x:13.1,y:52.7,w:87.9,h:5.3"),
19 | Arguments.of(new OcrInfo(-1, 50, .123f, .456f, .789f, .091f), "l:50,x:12.3,y:45.6,w:78.9,h:9.1"),
20 | Arguments.of(new OcrInfo(123,456,.123f,.456f, .234f, .456f), "p:123,l:456,x:12.3,y:45.6,w:23.4,h:45.6"),
21 | Arguments.of(new OcrInfo( 123, 456, 511, .123f, .234f, .345f, .456f), "p:123,l:456,n:511,x:12.3,y:23.4,w:34.5,h:45.6"),
22 | Arguments.of(new OcrInfo(123, -1, 456, .123f, .234f, .345f, .456f), "p:123,n:456,x:12.3,y:23.4,w:34.5,h:45.6"),
23 | Arguments.of(new OcrInfo( 123, 456, 511, .1234f, .2345f, .3456f, .4567f), "p:123,l:456,n:511,x:12.34,y:23.45,w:34.56,h:45.67"),
24 | Arguments.of(new OcrInfo(123, 456, 511, 768, 1024, 2048, 4095),
25 | "p:123,l:456,n:511,x:768,y:1024,w:2048,h:4095")
26 | );
27 | }
28 |
29 | @ParameterizedTest
30 | @MethodSource("fixtureProvider")
31 | public void parseFromBeginning(OcrInfo info, String payload) {
32 | char[] buf = toChars(payload);
33 | OcrInfo parsed = OcrInfo.parse(
34 | buf, 0, payload.length(),
35 | info.getWordIndex() > 0 ? 9 : 0, info.getLineIndex() > 0 ? 11 : 0, info.getPageIndex() > 0 ? 12 : 0,
36 | 12,
37 | info.getHasAbsoluteCoordinates());
38 | assertThat(parsed).isEqualToComparingFieldByField(info);
39 | }
40 |
41 | @ParameterizedTest
42 | @MethodSource("fixtureProvider")
43 | public void parseFromPosition(OcrInfo info, String payload) {
44 | String padding = "someToken|";
45 | String padded = padding + payload;
46 | char[] buf = toChars(padded);
47 | OcrInfo parsed = OcrInfo.parse(
48 | buf, padding.length(), payload.length(),
49 | info.getWordIndex() > 0 ? 9 : 0, info.getLineIndex() > 0 ? 11 : 0, info.getPageIndex() > 0 ? 12 : 0,
50 | 12,
51 | info.getHasAbsoluteCoordinates());
52 | assertThat(parsed).isEqualToComparingFieldByField(info);
53 | }
54 |
55 | @Test
56 | public void keysMustNotBeUsedMultipleTimes() {
57 | String payload = "p:12,x:34.5,n:56,x:78.9,y:87.6,w:54.3,h:21";
58 | assertThatThrownBy(() -> OcrInfo.parse(toChars(payload), 0, payload.length(), 9, 11, 12, 12, false))
59 | .isInstanceOf(IllegalArgumentException.class)
60 | .hasMessageContaining("Invalid payload p:12,x:34.5,n:56,x:78.9,y:87.6,w:54.3,h:21: duplicate key 'x'");
61 | }
62 |
63 | @Test
64 | public void catchOverFlow() {
65 | String idxOverflow = "p:12,l:34,n:512,x:78.9,y:87.6,w:54.3,h:2.1";
66 | assertThatThrownBy(() -> OcrInfo.parse(toChars(idxOverflow), 0, idxOverflow.length(), 9, 11, 12, 12, false))
67 | .isInstanceOf(IllegalArgumentException.class)
68 | .hasMessageContaining("512 for word needs more than 9 bits (valid values range from 0 to 511). Payload=p:12,l:34,n:512,x:78.9,y:87.6,w:54.3,h:2.1");
69 | String coordOverFlow = "p:1,l:2,n:3,x:4096,y:2048,w:1024,h:512";
70 | assertThatThrownBy(() -> OcrInfo.parse(toChars(coordOverFlow), 0, coordOverFlow.length(), 9, 11, 12, 12, true))
71 | .isInstanceOf(IllegalArgumentException.class)
72 | .hasMessageContaining("4096 for x needs more than 12 bits (valid values range from 0 to 4095). Payload=p:1,l:2,n:3,x:4096,y:2048,w:1024,h:512");
73 | }
74 |
75 | @Test
76 | public void missingParametersAreCaught() {
77 | String missingLine = "p:12,n:56,x:78.9,y:87.6,w:54.3,h:2.1";
78 | assertThatThrownBy(() -> OcrInfo.parse(toChars(missingLine), 0, missingLine.length(), 9, 11, 12, 12, false))
79 | .isInstanceOf(IllegalArgumentException.class)
80 | .hasMessageContaining("fix payload or set the 'lineBits' option to 0.");
81 | String missingWord = "p:12,l:34,x:78.9,y:87.6,w:54.3,h:2.1";
82 | assertThatThrownBy(() -> OcrInfo.parse(toChars(missingWord), 0, missingWord.length(), 9, 11, 12, 12, false))
83 | .isInstanceOf(IllegalArgumentException.class)
84 | .hasMessageContaining("fix payload or set the 'wordBits' option to 0.");
85 | String missingPage = "l:34,n:56,x:78.9,y:87.6,w:54.3,h:2.1";
86 | assertThatThrownBy(() -> OcrInfo.parse(toChars(missingPage), 0, missingPage.length(), 9, 11, 12, 12, false))
87 | .isInstanceOf(IllegalArgumentException.class)
88 | .hasMessageContaining("fix payload or set the 'pageBits' option to 0.");
89 | String missingCoord = "p:12,l:34,n:56,x:78.9,y:87.6,w:54.3";
90 | assertThatThrownBy(() -> OcrInfo.parse(toChars(missingCoord), 0, missingCoord.length(), 9, 11, 12, 12, false))
91 | .isInstanceOf(IllegalArgumentException.class)
92 | .hasMessageContaining("coordinates are missing from payload ");
93 | }
94 | }
--------------------------------------------------------------------------------
/src/test/java/de/digitalcollections/solr/plugin/components/ocrhighlighting/OcrHighlightingTest.java:
--------------------------------------------------------------------------------
1 | package de.digitalcollections.solr.plugin.components.ocrhighlighting;
2 |
3 | import java.nio.file.Files;
4 | import java.nio.file.Paths;
5 | import org.apache.solr.SolrTestCaseJ4;
6 | import org.apache.solr.handler.component.SearchComponent;
7 | import org.junit.BeforeClass;
8 | import org.junit.Test;
9 |
10 | public class OcrHighlightingTest extends SolrTestCaseJ4 {
11 | @BeforeClass
12 | public static void beforeClass() throws Exception {
13 | initCore("solrconfig.xml", "schema.xml", "src/test/resources/solr", "alldata");
14 |
15 | // The highlighting component should be active
16 | SearchComponent highlighter = h.getCore().getSearchComponent("ocr_highlight");
17 | assertTrue("wrong highlighter: " + highlighter.getClass(),
18 | highlighter instanceof OcrHighlighting);
19 |
20 | String ocrText = String.join(" ", Files
21 | .readAllLines(Paths.get(OcrHighlighting.class.getResource("/data/ocrtext_full.txt").toURI())));
22 | assertU(adoc("ocr_text", "two|p:27,l:13,n:24,x:12.3,y:43.2,w:54.3,h:65.4, one|p:28,l:27,n:64,x:65.4,y:54.3,w:43.2,h:32.1", "id", "101"));
23 | assertU(adoc("ocr_text", "three|p:28,l:14,n:25,x:12.7,y:48.2,w:54.9,h:65.4, two|p:29,l:27,n:64,x:65.4,y:54.3,w:43.1,h:34.1, five|p:30,l:17,n:80,x:0,y:0,w:0,h:0, "
24 | + "four|p:31,l:32,n:33,x:11.1,y:11.1,w:11.1,h:11.1", "id", "102"));
25 | assertU(adoc("ocr_text", ocrText, "id", "103"));
26 |
27 | // Test with a dynamic field
28 | assertU(adoc("body_ocr", "one|p:42,l:13,n:55,x:11.1,y:22.2,w:33.3,h:44.4, two|p:42,l:13,n:66,x:55.5,y:66.6,w:77.7,h:88.8", "id", "106"));
29 |
30 | assertU(commit());
31 | }
32 |
33 | @Test
34 | public void testSingleQueryTerm() {
35 | assertQ(
36 | "single query term",
37 | req("q", "two", "sort", "id asc", "ocr_hl", "true","ocr_hl.fields", "ocr_text", "df", "ocr_text"),
38 | "count(//lst[@name='ocr_highlighting']/*)=2",
39 | "//lst[@name='ocr_highlighting']/lst[@name='101']/arr[@name='ocr_text']/lst[1]/int[@name='page']='27'",
40 | "count(//lst[@name='ocr_highlighting']/lst[@name='101']/arr[@name='ocr_text']/lst)=number('1')",
41 | "//lst[@name='ocr_highlighting']/lst[@name='102']/arr[@name='ocr_text']/lst[1]/int[@name='page']='29'",
42 | "count(//lst[@name='ocr_highlighting']/lst[@name='102']/arr[@name='ocr_text']/lst)=number('1')");
43 | }
44 |
45 | @Test
46 | public void testMultipleQueryTerms() {
47 | assertQ(
48 | "multiple query terms",
49 | req("q", "five four", "sort", "id asc", "ocr_hl", "true","ocr_hl.fields", "ocr_text", "df", "ocr_text"),
50 | "count(//lst[@name='ocr_highlighting']/*)=1",
51 | "count(//lst[@name='ocr_highlighting']/lst[@name='102']/arr[@name='ocr_text']/lst)=number('2')",
52 | "//lst[@name='ocr_highlighting']/lst[@name='102']/arr[@name='ocr_text']/lst[1]/int[@name='page']='30'",
53 | "//lst[@name='ocr_highlighting']/lst[@name='102']/arr[@name='ocr_text']/lst[2]/int[@name='page']='31'");
54 |
55 | }
56 |
57 | @Test
58 | public void testMultipleFuzzyQueryTerms() {
59 | assertQ(
60 | "multiple fuzzy query terms",
61 | req("q", "fives fours", "sort", "id asc", "ocr_hl", "true","ocr_hl.fields", "ocr_text", "df", "ocr_text"),
62 | "count(//lst[@name='ocr_highlighting']/*)=1",
63 | "count(//lst[@name='ocr_highlighting']/lst[@name='102']/arr[@name='ocr_text']/lst)=number('2')",
64 | "//lst[@name='ocr_highlighting']/lst[@name='102']/arr[@name='ocr_text']/lst[1]/int[@name='page']='30'",
65 | "//lst[@name='ocr_highlighting']/lst[@name='102']/arr[@name='ocr_text']/lst[1]/str[@name='term']='five'",
66 | "//lst[@name='ocr_highlighting']/lst[@name='102']/arr[@name='ocr_text']/lst[2]/int[@name='page']='31'",
67 | "//lst[@name='ocr_highlighting']/lst[@name='102']/arr[@name='ocr_text']/lst[2]/str[@name='term']='four'");
68 | }
69 |
70 | @Test
71 | public void testLimitHighlightsPerDoc() {
72 | assertQ(
73 | "limit number of highlights per document",
74 | req("q", "und", "sort", "id asc", "ocr_hl", "true","ocr_hl.fields", "ocr_text", "ocr_hl.maxPerDoc", "5", "df",
75 | "ocr_text"),
76 | "count(//lst[@name='ocr_highlighting']/lst[@name='103']/arr[@name='ocr_text']/lst)=number('5')");
77 | }
78 |
79 | @Test
80 | public void testLimitHighlightsPerPage() {
81 | assertQ(
82 | "limit number of highlights per page",
83 | req("q", "und", "sort", "id asc", "ocr_hl", "true","ocr_hl.fields", "ocr_text", "ocr_hl.maxPerPage", "5", "df",
84 | "ocr_text"),
85 | "count(//lst[@name='ocr_highlighting']/lst[@name='103']/arr[@name='ocr_text']/lst[int[@name='page']='183'])=number('5')");
86 | }
87 |
88 | @Test
89 | public void testDynamicField() {
90 | assertQ(
91 | "Dynamic field contains term with page number and word position",
92 | req("q", "one two", "sort", "id asc", "ocr_hl", "true", "ocr_hl.fields", "body_ocr", "df", "body_ocr"),
93 | "count(//lst[@name='ocr_highlighting']/*)=1",
94 | "count(//lst[@name='ocr_highlighting']/lst[@name='106']/arr[@name='body_ocr']/lst)=number('2')",
95 | "(//lst[@name='ocr_highlighting']/lst[@name='106']/arr[@name='body_ocr']/lst)[1]/int[@name='page']='42'",
96 | "(//lst[@name='ocr_highlighting']/lst[@name='106']/arr[@name='body_ocr']/lst)[1]/int[@name='word']='55'",
97 | "(//lst[@name='ocr_highlighting']/lst[@name='106']/arr[@name='body_ocr']/lst)[1]/int[@name='line']='13'",
98 | "(//lst[@name='ocr_highlighting']/lst[@name='106']/arr[@name='body_ocr']/lst)[2]/int[@name='page']='42'",
99 | "(//lst[@name='ocr_highlighting']/lst[@name='106']/arr[@name='body_ocr']/lst)[2]/int[@name='line']='13'",
100 | "(//lst[@name='ocr_highlighting']/lst[@name='106']/arr[@name='body_ocr']/lst)[2]/int[@name='word']='66'"
101 | );
102 | }
103 | }
104 |
--------------------------------------------------------------------------------
/src/main/java/de/digitalcollections/lucene/analysis/payloads/OcrPayloadHelper.java:
--------------------------------------------------------------------------------
1 | package de.digitalcollections.lucene.analysis.payloads;
2 |
3 | import com.google.common.math.IntMath;
4 | import java.math.BigInteger;
5 | import java.util.Arrays;
6 | import org.apache.lucene.util.BytesRef;
7 |
8 | /** Helper class to decode and encode OCR information from/into an efficient binary representation. **/
9 | public class OcrPayloadHelper {
10 |
11 | private OcrPayloadHelper() {
12 | // Cannot be instantiated, is only here for the static methods
13 | }
14 |
15 | /**
16 | * Encode a {@link OcrInfo} object into a byte array.
17 | *
18 | * If the coordinates are set to be stored as relative (i.e. percentage values), we first scale the bounding box
19 | * coordinates according to `precision`. We then pack the complete information into `coordBits * 4` bits. The bit
20 | * packing is done to save as much space as possible, while scaling is used t ostill maintain as much precision as
21 | * possible.
22 | *
23 | * Optionally, we also store word, line and page indices if the corresponding option
24 | * (`wordBits`, `lineBits`, `pageBits`) is non-zero.
25 | *
26 | * Here an example with page, line and word indices, relative coordinates and 10 bits per coordinate value:
27 | *
28 | * **Input:**
29 | * ```
30 | * info = OcrInfo(pageIndex=837, lineIndex=13, wordIndex=20, horizontalOffset=0.136838387,
31 | * verticalOffset=0.477909823, width=0.978231258, height=0.532390081),
32 | * coordBits = 10
33 | * wordBits = 9
34 | * lineBits = 10
35 | * pageBits = 12
36 | * absoluteCoordinates = false
37 | * ```
38 | *
39 | * Since we are using 10 bits for each of the four coordinates, 9 bits for the word index, 10 for the line index and
40 | * 12 for the page index, the resulting binary representation will have 72 bits (`4 * 10 + 9 + 11 + 12`) or 9 bytes.
41 | * This is very space-efficient compared to a string-based encoding, e.g. `x136y478w978h532n20l13p837`, which is
42 | * 36 bytes.
43 | *
44 | *
45 | * **Output:**
46 | * ```
47 | * {@code
48 | * field | width | scaled value | binary representation
49 | * ========================================================================
50 | * pageIndex | 12bit | 837 | 001101000100
51 | * lineIndex | 11bit | 13 | 00000001101
52 | * wordIndex | 9bit | 20 | 000010100
53 | * x | 10bit | 0.136838387 * 2^10 ~> 140 | 0010001100
54 | * y | 10bit | 0.477909823 * 2^10 ~> 489 | 0111101001
55 | * width | 10bit | 0.978231258 * 2^10 ~> 1002 | 1111101010
56 | * height | 10bit | 0.532390081 * 2^10 ~> 545 | 1000100001
57 | * }
58 | * ````
59 | *
60 | * The resulting byte sequence is as follows (bytes are separated by whitespace):
61 | * ```
62 | * pageIndex | lineIndex | wordIndex| x | y | w | h
63 | * 00110100 0100|0000 0001101|0 00010100| 00100011 00|011110 1001|1111 101010|10 00100001
64 | * 0x34 0x40 0x1A 0x14 0x23 0x1E 0x9F 0xAA 0x21
65 | * ```
66 | *
67 | * @param info The {@link OcrInfo} to encode
68 | * @param coordBits The number of bits to encode each OCR coordinate value into
69 | * @param wordBits The number of bits to encode the word index into
70 | * @param lineBits The number of bits to encode the line index into
71 | * @param pageBits The number of bits to encode the page index into
72 | * @return The resulting byte payload
73 | */
74 | public static byte[] encodeOcrInfo(OcrInfo info, int coordBits, int wordBits, int lineBits, int pageBits) {
75 | // To make bit-fiddling easier, we encode all the values into an arbitrary-length BigInteger
76 | int numBitsTotal = getOutputSize(coordBits, wordBits, lineBits, pageBits);
77 | int outSize = (int) Math.ceil((double) numBitsTotal / 8.0);
78 | BigInteger encoded = new BigInteger(new byte[outSize]);
79 |
80 | if (pageBits > 0) {
81 | encoded = encoded.or(BigInteger.valueOf(info.getPageIndex()));
82 | }
83 | if (lineBits > 0) {
84 | encoded = encoded.shiftLeft(lineBits)
85 | .or(BigInteger.valueOf(info.getLineIndex()));
86 | }
87 | if (wordBits > 0) {
88 | encoded = encoded.shiftLeft(wordBits)
89 | .or(BigInteger.valueOf(info.getWordIndex()));
90 | }
91 | if (info.getHasAbsoluteCoordinates()) {
92 | encoded = encoded
93 | .shiftLeft(coordBits)
94 | .or(BigInteger.valueOf(verifyAbsoluteValue((int) info.getHorizontalOffset(), coordBits)))
95 | .shiftLeft(coordBits)
96 | .or(BigInteger.valueOf(verifyAbsoluteValue((int) info.getVerticalOffset(), coordBits)))
97 | .shiftLeft(coordBits)
98 | .or(BigInteger.valueOf(verifyAbsoluteValue((int) info.getWidth(), coordBits)))
99 | .shiftLeft(coordBits)
100 | .or(BigInteger.valueOf(verifyAbsoluteValue((int) info.getHeight(), coordBits)));
101 |
102 | } else {
103 | encoded = encoded
104 | .shiftLeft(coordBits)
105 | .or(BigInteger.valueOf(encodeValue(info.getHorizontalOffset(), coordBits)))
106 | .shiftLeft(coordBits)
107 | .or(BigInteger.valueOf(encodeValue(info.getVerticalOffset(), coordBits)))
108 | .shiftLeft(coordBits)
109 | .or(BigInteger.valueOf(encodeValue(info.getWidth(), coordBits)))
110 | .shiftLeft(coordBits)
111 | .or(BigInteger.valueOf(encodeValue(info.getHeight(), coordBits)));
112 | }
113 |
114 | byte[] out = encoded.toByteArray();
115 |
116 | // FIXME: This should only strip as many leading zeroes as out.length - outSize
117 | // Strip extra leading null-bytes
118 | if (out.length > outSize) {
119 | byte[] trimmed = new byte[outSize];
120 | int trimmedIdx = 0;
121 | boolean prefix = true;
122 | for (byte anOut : out) {
123 | if (anOut != 0 || !prefix) {
124 | prefix = false;
125 | trimmed[trimmedIdx] = anOut;
126 | trimmedIdx += 1;
127 | }
128 | }
129 | out = trimmed;
130 | }
131 | return out;
132 | }
133 |
134 | private static int verifyAbsoluteValue(int value, int coordBits) {
135 | if (value >= IntMath.pow(2, coordBits)) {
136 | throw new IllegalArgumentException(String.format(
137 | "Value %d exceeds legal range of %d bits (0 to %d).", value, coordBits, IntMath.pow(2, coordBits) - 1));
138 | }
139 | return value;
140 | }
141 |
142 | /** Calculate the size of the payload resulting from the parameters **/
143 | private static int getOutputSize(int coordBits, int wordBits, int lineBits, int pageBits) {
144 | int outSize = coordBits * 4;
145 | if (pageBits > 0) {
146 | outSize += pageBits;
147 | }
148 | if (lineBits > 0) {
149 | outSize += lineBits;
150 | }
151 | if (wordBits > 0) {
152 | outSize += wordBits;
153 | }
154 | return outSize;
155 | }
156 |
157 | /**
158 | * Encode a given floating point value (between 0 and 1) to an integer with the given number of bits.
159 | **/
160 | private static int encodeValue(float source, int numBits) {
161 | return (int) Math.round(source * Math.pow(2, numBits));
162 | }
163 |
164 | /**
165 | * Decode a given integer (encoded with a certain number of bits) to a floating point value.
166 | **/
167 | private static float decodeValue(long source, int numBits) {
168 | return (float) (source / Math.pow(2, numBits));
169 | }
170 |
171 | /**
172 | * Create a bit mask to mask out a given number of bits
173 | */
174 | private static BigInteger makeBitMask(int numBits) {
175 | return BigInteger.valueOf(IntMath.pow(2, numBits) - 1);
176 | }
177 |
178 | /**
179 | * Decode an {@link OcrInfo} instance from the encoded byte array.
180 | *
181 | * @param data Buffer with encoded binary OCR information
182 | * @param coordBits Number of bits the OCR information was encoded with
183 | * @param wordBits Number of bits the word index was encoded with
184 | * @param lineBits Number of bits the line index was encoded with
185 | * @param pageBits Number of bits the page index was encoded with
186 | * @param absoluteCoordinates Whether the coordinates are stored absolute or relative (percent-values)
187 | * @return The decoded {@link OcrInfo} instance
188 | */
189 | public static OcrInfo decodeOcrInfo(BytesRef data, int coordBits, int wordBits, int lineBits, int pageBits,
190 | boolean absoluteCoordinates) {
191 | int coordMask = IntMath.pow(2, coordBits) - 1;
192 | OcrInfo info = new OcrInfo();
193 | info.setHasAbsoluteCoordinates(absoluteCoordinates);
194 | BigInteger encoded = new BigInteger(Arrays.copyOfRange(data.bytes, data.offset, data.offset + data.length));
195 |
196 | if (absoluteCoordinates) {
197 | info.setHeight(encoded.and(BigInteger.valueOf(coordMask)).intValue());
198 | info.setWidth(encoded.shiftRight(coordBits)
199 | .and(BigInteger.valueOf(coordMask)).intValue());
200 | info.setVerticalOffset(encoded.shiftRight(coordBits * 2)
201 | .and(BigInteger.valueOf(coordMask)).intValue());
202 | info.setHorizontalOffset(encoded.shiftRight(coordBits * 3)
203 | .and(BigInteger.valueOf(coordMask)).intValue());
204 | } else {
205 | info.setHeight(OcrPayloadHelper.decodeValue(
206 | encoded.and(BigInteger.valueOf(coordMask)).intValue(), coordBits));
207 | info.setWidth(OcrPayloadHelper.decodeValue(
208 | encoded.shiftRight(coordBits)
209 | .and(BigInteger.valueOf(coordMask)).intValue(), coordBits));
210 | info.setVerticalOffset(OcrPayloadHelper.decodeValue(
211 | encoded.shiftRight(coordBits * 2)
212 | .and(BigInteger.valueOf(coordMask)).intValue(), coordBits));
213 | info.setHorizontalOffset(OcrPayloadHelper.decodeValue(
214 | encoded.shiftRight(coordBits * 3)
215 | .and(BigInteger.valueOf(coordMask)).intValue(), coordBits));
216 | }
217 |
218 | int shift = coordBits * 4;
219 | if (wordBits > 0) {
220 | info.setWordIndex(encoded.shiftRight(shift).and(makeBitMask(wordBits)).intValue());
221 | shift += wordBits;
222 | }
223 | if (lineBits > 0) {
224 | info.setLineIndex(encoded.shiftRight(shift).and(makeBitMask(lineBits)).intValue());
225 | shift += lineBits;
226 | }
227 | if (pageBits > 0) {
228 | info.setPageIndex(encoded.shiftRight(shift).intValue());
229 | }
230 |
231 | return info;
232 | }
233 | }
234 |
--------------------------------------------------------------------------------
/pom.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 | 4.0.0
4 |
5 | solr-ocrpayload-plugin
6 | de.digitalcollections.search
7 | 0.2.2-SNAPSHOT
8 | jar
9 | Solr OCR Coordinate Payload Plugin
10 |
11 | Efficient indexing and bounding-box "highlighting" for OCR text
12 |
13 | https://github.com/dbmdz/solr-ocrpayload-plugin
14 |
15 |
16 | MIT License
17 | https://github.com/dbmdz/solr-ocrpayload-plugin/blob/master/LICENSE
18 | repo
19 |
20 |
21 |
22 |
23 |
24 | Johannes Baiter
25 | johannes.baiter@bsb-muenchen.de
26 | jbaiter
27 |
28 |
29 | Christoph Lorenz
30 | christoph.lorenz@bsb-muenchen.de
31 | clorenz
32 |
33 |
34 |
35 |
36 | https://travis-ci.org/dbmdz/solr-ocrpayload-plugin
37 | Travis CI
38 |
39 |
40 |
41 | https://github.com/dbmdz/solr-ocrpayload-plugin/issues
42 | GitHub Issues
43 |
44 |
45 |
46 | https://github.com/dbmdz/solr-ocrpayload-plugin.git
47 | git@github.com:dbmdz/solr-ocrpayload-plugin.git
48 | https://github.com/dbmdz/solr-ocrpayload-plugin
49 |
50 |
51 |
52 | 1.8
53 | 1.8
54 | 1.8
55 | UTF-8
56 |
57 | 3.12.1
58 | 1.2.0
59 | 5.3.2
60 | 2.11.1
61 | 1.7.25
62 | 7.5.0
63 |
64 | 0.8.3
65 | 3.0.0
66 | 3.8.0
67 | 3.1.1
68 | 3.1.0
69 | 3.0.1
70 | 2.22.1
71 | 1.6.8
72 |
73 |
74 |
75 |
76 | com.revinate
77 | assertj-json
78 | ${version.assertj-json}
79 | test
80 |
81 |
82 | org.apache.logging.log4j
83 | log4j-core
84 | ${version.log4j}
85 |
86 |
87 | org.apache.solr
88 | solr-core
89 | ${version.solr}
90 | compile
91 |
92 |
93 | org.apache.solr
94 | solr-test-framework
95 | ${version.solr}
96 | test
97 |
98 |
99 | org.assertj
100 | assertj-core
101 | ${version.assertj}
102 | test
103 |
104 |
105 | org.junit.jupiter
106 | junit-jupiter-api
107 | ${version.junit}
108 | test
109 |
110 |
111 | org.junit.jupiter
112 | junit-jupiter-engine
113 | ${version.junit}
114 | test
115 |
116 |
117 | org.junit.jupiter
118 | junit-jupiter-params
119 | ${version.junit}
120 | test
121 |
122 |
123 | org.junit.vintage
124 | junit-vintage-engine
125 | ${version.junit}
126 |
127 |
128 | org.slf4j
129 | slf4j-api
130 | ${version.slf4j}
131 |
132 |
133 | org.slf4j
134 | slf4j-nop
135 | ${version.slf4j}
136 | test
137 |
138 |
139 |
140 |
141 |
142 |
143 | org.apache.maven.plugins
144 | maven-checkstyle-plugin
145 | ${version.maven-checkstyle-plugin}
146 |
147 |
148 | validate
149 | validate
150 |
151 | https://raw.githubusercontent.com/dbmdz/development/master/code-quality/checkstyle.xml
152 | UTF-8
153 | true
154 | true
155 | false
156 |
157 |
158 | check
159 |
160 |
161 |
162 |
163 |
164 | org.apache.maven.plugins
165 | maven-compiler-plugin
166 | ${version.maven-compiler-plugin}
167 |
168 | 1.8
169 | 1.8
170 |
171 |
172 |
173 | org.apache.maven.plugins
174 | maven-jar-plugin
175 | ${version.maven-jar-plugin}
176 |
177 |
178 |
179 | true
180 | true
181 |
182 |
183 |
184 |
185 |
186 | org.apache.maven.plugins
187 | maven-javadoc-plugin
188 | ${version.maven-javadoc-plugin}
189 |
190 | 8
191 |
192 |
193 |
194 | attach-javadocs
195 |
196 | jar
197 |
198 |
199 |
200 |
201 |
202 | org.apache.maven.plugins
203 | maven-source-plugin
204 | ${version.maven-source-plugin}
205 |
206 |
207 | attach-sources
208 |
209 | jar-no-fork
210 |
211 |
212 |
213 |
214 |
215 | org.apache.maven.plugins
216 | maven-surefire-plugin
217 | ${version.maven-surefire-plugin}
218 |
219 |
220 | file:/dev/./urandom
221 |
222 |
223 |
224 |
225 | org.jacoco
226 | jacoco-maven-plugin
227 | ${version.jacoco-maven-plugin}
228 |
229 |
230 | pre-unit-test
231 |
232 | prepare-agent
233 |
234 |
235 |
236 | test
237 |
238 | report
239 |
240 |
241 |
242 |
243 |
244 | org.sonatype.plugins
245 | nexus-staging-maven-plugin
246 | ${version.nexus-staging-maven-plugin}
247 | true
248 |
249 | ossrh
250 | https://oss.sonatype.org/
251 | true
252 |
253 |
254 |
255 |
256 |
257 |
258 |
259 |
260 | org.jacoco
261 | jacoco-maven-plugin
262 | ${version.jacoco-maven-plugin}
263 |
264 |
265 |
266 |
267 |
268 |
269 | deploy
270 |
271 |
272 |
273 | org.apache.maven.plugins
274 | maven-gpg-plugin
275 | 1.6
276 |
277 |
278 | sign-artifacts
279 | verify
280 |
281 | sign
282 |
283 |
284 |
285 |
286 |
287 |
288 |
289 |
290 |
291 |
292 |
293 | ossrh-snapshots
294 | Sonatype Nexus Snapshots
295 | https://oss.sonatype.org/content/repositories/snapshots
296 |
297 |
298 |
299 |
300 |
301 | ossrh-snapshots
302 | Sonatype Nexus Snapshots
303 | https://oss.sonatype.org/content/repositories/snapshots
304 |
305 | true
306 |
307 |
308 | false
309 |
310 |
311 |
312 |
313 |
--------------------------------------------------------------------------------
/src/main/java/de/digitalcollections/lucene/analysis/payloads/OcrInfo.java:
--------------------------------------------------------------------------------
1 | package de.digitalcollections.lucene.analysis.payloads;
2 |
3 | import com.google.common.math.IntMath;
4 | import java.util.Comparator;
5 | import java.util.HashSet;
6 | import java.util.Set;
7 | import java.util.regex.Matcher;
8 | import java.util.regex.Pattern;
9 |
10 | public class OcrInfo implements Comparable {
11 |
12 | private static final Pattern PAYLOAD_PAT = Pattern.compile("(\\D+):([0-9.]+),?");
13 |
14 | private boolean hasAbsoluteCoordinates = false;
15 | private float horizontalOffset = -1.0f;
16 | private float verticalOffset = -1.0f;
17 | private float width = -1.0f;
18 | private float height = -1.0f;
19 | private int pageIndex = -1;
20 | private int lineIndex = -1;
21 | private int wordIndex = -1;
22 |
23 | private String term; // optional, only when returning search results
24 |
/** Creates an instance whose fields all keep their declared defaults; values are supplied via the setters. */
OcrInfo() {
  // Intentionally empty
}
28 |
/**
 * Creates an instance with absolute (integer) coordinates and no page index (passed on as -1).
 *
 * @param horizontalOffset absolute horizontal offset
 * @param verticalOffset absolute vertical offset
 * @param width absolute width
 * @param height absolute height
 */
public OcrInfo(int horizontalOffset, int verticalOffset, int width, int height) {
  this(-1, horizontalOffset, verticalOffset, width, height);
  setHasAbsoluteCoordinates(true);
}
33 |
/**
 * Creates an instance with absolute (integer) coordinates and a page index; line and word index
 * are passed on as -1.
 *
 * @param pageIndex index of the page
 * @param horizontalOffset absolute horizontal offset
 * @param verticalOffset absolute vertical offset
 * @param width absolute width
 * @param height absolute height
 */
public OcrInfo(int pageIndex, int horizontalOffset, int verticalOffset, int width, int height) {
  this(pageIndex, -1, -1, horizontalOffset, verticalOffset, width, height);
  setHasAbsoluteCoordinates(true);
}
38 |
/**
 * Creates an instance with absolute (integer) coordinates, a page index and a line index; the
 * word index is passed on as -1.
 *
 * @param pageIndex index of the page
 * @param lineIndex index of the line
 * @param horizontalOffset absolute horizontal offset
 * @param verticalOffset absolute vertical offset
 * @param width absolute width
 * @param height absolute height
 */
public OcrInfo(int pageIndex, int lineIndex, int horizontalOffset, int verticalOffset, int width, int height) {
  this(pageIndex, lineIndex, -1, horizontalOffset, verticalOffset, width, height);
  setHasAbsoluteCoordinates(true);
}
43 |
/**
 * Creates an instance with absolute (integer) coordinates and page, line and word indices.
 *
 * @param pageIndex index of the page
 * @param lineIndex index of the line
 * @param wordIndex index of the word
 * @param horizontalOffset absolute horizontal offset
 * @param verticalOffset absolute vertical offset
 * @param width absolute width
 * @param height absolute height
 */
public OcrInfo(int pageIndex, int lineIndex, int wordIndex, int horizontalOffset, int verticalOffset, int width, int height) {
  // Integer coordinates always mean absolute positions
  setHasAbsoluteCoordinates(true);

  // Bounding box
  setHorizontalOffset(horizontalOffset);
  setVerticalOffset(verticalOffset);
  setWidth(width);
  setHeight(height);

  // Position within the document
  setPageIndex(pageIndex);
  setLineIndex(lineIndex);
  setWordIndex(wordIndex);
}
54 |
/**
 * Creates an instance from floating point coordinates and no page index (passed on as -1).
 * The absolute-coordinates flag is not touched and keeps its default of {@code false}.
 *
 * @param horizontalOffset horizontal offset
 * @param verticalOffset vertical offset
 * @param width width
 * @param height height
 */
public OcrInfo(float horizontalOffset, float verticalOffset, float width, float height) {
  this(-1, horizontalOffset, verticalOffset, width, height);
}
58 |
/**
 * Creates an instance from floating point coordinates and a page index.
 * The absolute-coordinates flag is not touched and keeps its default of {@code false}.
 *
 * @param pageIndex index of the page
 * @param horizontalOffset horizontal offset
 * @param verticalOffset vertical offset
 * @param width width
 * @param height height
 */
public OcrInfo(int pageIndex, float horizontalOffset, float verticalOffset, float width, float height) {
  // Bounding box first, then the page the box is located on
  setHorizontalOffset(horizontalOffset);
  setVerticalOffset(verticalOffset);
  setWidth(width);
  setHeight(height);
  setPageIndex(pageIndex);
}
66 |
/**
 * Creates an instance from floating point coordinates, a page index and a line index.
 * The line index is assigned to the field directly, without going through its setter.
 *
 * @param pageIndex index of the page
 * @param lineIndex index of the line
 * @param horizontalOffset horizontal offset
 * @param verticalOffset vertical offset
 * @param width width
 * @param height height
 */
public OcrInfo(int pageIndex, int lineIndex, float horizontalOffset, float verticalOffset, float width, float height) {
  this(pageIndex, horizontalOffset, verticalOffset, width, height);
  this.lineIndex = lineIndex;
}
71 |
/**
 * Creates an instance from floating point coordinates and page, line and word indices.
 * The word index is assigned to the field directly, without going through its setter.
 *
 * @param pageIndex index of the page
 * @param lineIndex index of the line
 * @param wordIndex index of the word
 * @param horizontalOffset horizontal offset
 * @param verticalOffset vertical offset
 * @param width width
 * @param height height
 */
public OcrInfo(int pageIndex, int lineIndex, int wordIndex, float horizontalOffset, float verticalOffset, float width, float height) {
  this(pageIndex, lineIndex, horizontalOffset, verticalOffset, width, height);
  this.wordIndex = wordIndex;
}
76 |
77 | /**
78 | * Parse an {@link OcrInfo} object from a character buffer.
79 | *
80 | * The string contains comma-separated pairs of single-character keys and numerical
81 | * values, e.g. `x:13.37`.
82 | *
83 | * Valid keys are:
84 | * - **p**: Page index, ranging from 0 to 2^pageBits (optional)
85 | * - **l**: Line index, ranging from 0 to 2^lineBits (optional)
86 | * - **n**: Word index, ranging from 0 to 2^wordBits (optional)
87 | * - **x**: Horizontal offset as floating point percentage in range [0...100]
88 | * OR absolute position as unsigned integer in range [0...2^coordBits] (mandatory)
89 | * - **y**: Vertical offset as floating point percentage in range [0...100]
90 | * OR absolute position as unsigned integer in range [0...2^coordBits] (mandatory)
91 | * - **w**: Width as floating point percentage in range [0...100]
92 | * OR absolute position as unsigned integer in range [0...2^coordBits] (mandatory)
93 | * - **h**: Height as floating point percentage in range [0...100]
94 | * OR absolute position as unsigned integer in range [0...2^coordBits] (mandatory)
95 | *
96 | * Here es an example: `p:27,l:50,n:13,x:13.1,y:52.7,w:87.9,h:5.3`
97 | * or, with integral (absolute) coordinate
98 | *
99 | * @param buffer Input character buffer
100 | * @param offset Offset of the encoded character information
101 | * @param length Length of the encoded character information
102 | * @param wordBits Number of bits used for encoding the word index
103 | * @param lineBits Number of bits used for encoding the line index
104 | * @param pageBits Number of bits used for encoding the page index
105 | * @param coordBits Number of bits used for encoding the coordinates
106 | * @param absoluteCoordinates Whether the coordinates are stored absolute or relative (percent-values)
107 | * @return The decoded {@link OcrInfo} instance
108 | */
109 | public static OcrInfo parse(char[] buffer, int offset, int length, int wordBits, int lineBits, int pageBits,
110 | int coordBits, boolean absoluteCoordinates) {
111 | OcrInfo info = new OcrInfo();
112 | info.setHasAbsoluteCoordinates(absoluteCoordinates);
113 |
114 | String payload = new String(buffer, offset, length).toLowerCase();
115 | Matcher m = PAYLOAD_PAT.matcher(payload);
116 | Set seenKeys = new HashSet<>();
117 | while (m.find()) {
118 | char key = m.group(1).charAt(0);
119 | if (seenKeys.contains(key)) {
120 | throw new IllegalArgumentException(String.format("Invalid payload %s: duplicate key '%c'", payload, key));
121 | } else {
122 | seenKeys.add(key);
123 | }
124 | String value = m.group(2);
125 | switch (key) {
126 | case 'p':
127 | info.setPageIndex(parseIntValue(value, pageBits, "page", payload));
128 | break;
129 | case 'l':
130 | info.setLineIndex(parseIntValue(value, lineBits, "line", payload));
131 | break;
132 | case 'n':
133 | info.setWordIndex(parseIntValue(value, wordBits, "word", payload));
134 | break;
135 | case 'x':
136 | if (absoluteCoordinates) {
137 | info.setHorizontalOffset(parseIntValue(value, coordBits, "x", payload));
138 | } else {
139 | info.setHorizontalOffset(Float.parseFloat(value) / 100f);
140 | }
141 | break;
142 | case 'y':
143 | if (absoluteCoordinates) {
144 | info.setVerticalOffset(parseIntValue(value, coordBits, "y", payload));
145 | } else {
146 | info.setVerticalOffset(Float.parseFloat(value) / 100f);
147 | }
148 | break;
149 | case 'w':
150 | if (absoluteCoordinates) {
151 | info.setWidth(parseIntValue(value, coordBits, "w", payload));
152 | } else {
153 | info.setWidth(Float.parseFloat(value) / 100f);
154 | }
155 | break;
156 | case 'h':
157 | if (absoluteCoordinates) {
158 | info.setHeight(parseIntValue(value, coordBits, "h", payload));
159 | } else {
160 | info.setHeight(Float.parseFloat(value) / 100f);
161 | }
162 | break;
163 | default:
164 | throw new IllegalArgumentException(String.format(
165 | "Could not parse OCR bounding box information, string was %s, invalid character was %c",
166 | new String(buffer, offset, length), key));
167 | }
168 | }
169 | if (info.getHorizontalOffset() < 0 || info.getHorizontalOffset() < 0 || info.getWidth() < 0 || info.getHeight() < 0) {
170 | throw new IllegalArgumentException(String.format(
171 | "One or more coordinates are missing from payload (was %s), make sure you have 'x', 'y', 'w' and 'h' set!",
172 | payload));
173 | }
174 | if (pageBits > 0 && info.getPageIndex() < 0) {
175 | throw new IllegalArgumentException(String.format(
176 | "Page index is missing from payload (was: '%s'), fix payload or set the 'pageBits' option to 0.", payload));
177 | }
178 | if (lineBits > 0 && info.getLineIndex() < 0) {
179 | throw new IllegalArgumentException(String.format(
180 | "Line index is missing from payload (was: '%s'), fix payload or set the 'lineBits' option to 0.", payload));
181 | }
182 | if (wordBits > 0 && info.getWordIndex() < 0) {
183 | throw new IllegalArgumentException(String.format(
184 | "Word index is missing from payload (was: '%s'), fix payload or set the 'wordBits' option to 0.", payload));
185 | }
186 | return info;
187 | }
188 |
189 | private static int parseIntValue(String value, int numBits, String type, String payload) {
190 | int index = Integer.parseInt(value);
191 | if (index >= IntMath.pow(2, numBits)) {
192 | throw new IllegalArgumentException(String.format("Value %d for %s needs more than %d bits (valid values range from 0 to %d). Payload=%s",
193 | index, type, numBits, IntMath.pow(2, numBits) - 1, payload));
194 | }
195 | return index;
196 | }
197 |
198 | public float getHorizontalOffset() {
199 | return horizontalOffset;
200 | }
201 |
202 | public void setHorizontalOffset(float horizontalOffset) {
203 | this.horizontalOffset = horizontalOffset;
204 | }
205 |
206 | private void checkCoordinate(float coordinate) {
207 | if (coordinate > 1) {
208 | throw new IllegalArgumentException(String.format("Coordinates can at most be 100, was %1f!", coordinate * 100));
209 | }
210 | }
211 |
212 | public float getVerticalOffset() {
213 | return verticalOffset;
214 | }
215 |
216 | public void setVerticalOffset(float verticalOffset) {
217 | if (!hasAbsoluteCoordinates) {
218 | checkCoordinate(verticalOffset);
219 | }
220 | this.verticalOffset = verticalOffset;
221 | }
222 |
223 | public float getWidth() {
224 | return width;
225 | }
226 |
227 | public void setWidth(float width) {
228 | if (!hasAbsoluteCoordinates) {
229 | checkCoordinate(width);
230 | }
231 | this.width = width;
232 | }
233 |
234 | public float getHeight() {
235 | return height;
236 | }
237 |
238 | public void setHeight(float height) {
239 | if (!hasAbsoluteCoordinates) {
240 | checkCoordinate(height);
241 | }
242 | this.height = height;
243 | }
244 |
245 | public int getPageIndex() {
246 | return pageIndex;
247 | }
248 |
249 | public void setPageIndex(int pageIndex) {
250 | this.pageIndex = pageIndex;
251 | }
252 |
253 | public String getTerm() {
254 | return term;
255 | }
256 |
257 | public void setTerm(String term) {
258 | this.term = term;
259 | }
260 |
261 | public int getLineIndex() {
262 | return lineIndex;
263 | }
264 |
265 | public void setLineIndex(int lineIndex) {
266 | this.lineIndex = lineIndex;
267 | }
268 |
269 | public int getWordIndex() {
270 | return wordIndex;
271 | }
272 |
273 | public void setWordIndex(int wordIndex) {
274 | this.wordIndex = wordIndex;
275 | }
276 |
277 | @Override
278 | public String toString() {
279 | return "OcrInfo{"
280 | + "horizontalOffset=" + horizontalOffset
281 | + ", verticalOffset=" + verticalOffset
282 | + ", width=" + width
283 | + ", height=" + height
284 | + ", pageIndex=" + pageIndex
285 | + ", lineIndex=" + lineIndex
286 | + ", wordIndex=" + wordIndex
287 | + ", term='" + term + '\''
288 | + '}';
289 | }
290 |
291 | @Override
292 | public int compareTo(OcrInfo other) {
293 | return Comparator
294 | .comparing(OcrInfo::getPageIndex)
295 | .thenComparing(OcrInfo::getLineIndex)
296 | .thenComparing(OcrInfo::getWordIndex)
297 | .thenComparing(OcrInfo::getHorizontalOffset)
298 | .thenComparing(OcrInfo::getVerticalOffset)
299 | .compare(this, other);
300 | }
301 |
302 | public boolean getHasAbsoluteCoordinates() {
303 | return hasAbsoluteCoordinates;
304 | }
305 |
306 | public void setHasAbsoluteCoordinates(boolean hasAbsoluteCoordinates) {
307 | this.hasAbsoluteCoordinates = hasAbsoluteCoordinates;
308 | }
309 | }
310 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # :construction: Deprecated in favor of [solr-ocrhighlighting](https://github.com/dbmdz/solr-ocrhighlighting)
2 |
3 | # Solr OCR Coordinate Payload Plugin
4 |
5 | [](https://javadoc.io/doc/de.digitalcollections.search/solr-ocrpayload-plugin)
6 | [](https://travis-ci.org/dbmdz/solr-ocrpayload-plugin)
7 | [](https://codecov.io/gh/dbmdz/solr-ocrpayload-plugin)
8 | [](LICENSE)
9 | [](https://github.com/dbmdz/solr-ocrpayload-plugin/releases)
10 | [](https://search.maven.org/search?q=a:solr-ocrpayload-plugin)
11 |
12 | *Efficient indexing and bounding-box "highlighting" for OCR text*
13 |
14 | ## tl;dr
15 |
16 | - Store OCR bounding box information and token position directly in the Solr index in a space-efficient manner
17 | - Retrieve bounding box and token position directly in your Solr query results, no additional parsing necessary
18 |
19 | **Indexing**:
20 |
21 | The OCR information is appended after each token as a concatenated list of `<key>:<value>` pairs, see further down
22 | for a detailed description of available keys.
23 |
24 | `POST /solr/mycore/update`
25 |
26 | ```json
27 | [{ "id": "test_document",
28 | "ocr_text": "this|p:13,l:5,n:6,x:11.1,y:22.2,w:33.3,h:44.4 is|p:13,l:5,n:7,x:22.2,y:33.3,w:44.4,h:55.5 a|p:13,l:5,n:8,x:33.3,y:33.3,w:44.4,h:55.5 test|p:13,l:5,n:9,x:44.4,y:33.3,w:44.4,h:55.5" }]
29 | ```
30 |
31 | **Querying**:
32 |
33 | The plugin adds a new top-level key (`ocr_highlight` in this case) that contains the OCR information for
34 | each matching token as a structured object.
35 |
36 | `GET /solr/mycore/select?ocr_hl=true&ocr_hl.fields=ocr_text&indent=true&wt=json&q=test`
37 |
38 | ```json
39 | {
40 | "responseHeader": "...",
41 | "response": {
42 | "numFound": 1,
43 | "docs": [{"id": "test_document"}]
44 | },
45 | "ocr_highlight":{
46 | "test_document":{
47 | "ocr_text":[{
48 | "term":"test",
49 | "page":13,
50 | "line": 5,
51 | "word": 9,
52 | "x":0.444,
53 | "y":0.333,
54 | "width":0.444,
55 | "height":0.555}]
56 | }
57 | }
58 | }
59 | ```
60 |
61 | ## Use Case
62 | At the Bavarian State Library, we try to provide full-text search over all of our OCRed content. In addition
63 | to obtaining matching documents, the user should also get a small snippet of the corresponding part of the
64 | page image, with the matching words highlighted, similar to what e.g. Google Books provides.
65 |
66 |
67 | ## Approaches
68 | For this to work, we need some way of mapping matching tokens to their corresponding location in the underlying
69 | OCR text. A common approach used by a number of libraries is to **use a secondary microservice for this** that takes
70 | as input a document identifier and a text snippet and will return all coordinates of matching text snippets on
71 | the page. While this approach generally works okay, it has several drawbacks:
72 |
73 | - **Performance:** Every snippet requires a query to the OCR service, which itself has to do a linear scan
74 | through the OCR document. For e.g. a result set of 100 snippets, this will result in 101 queries (initial
75 | Solr query and 100 snippet queries). Of course this can be optimized by batching and having a good index
76 | structure for the coordinate lookup, but it's still less than ideal.
77 | - **Storage:** To reliably be able to map text matches to the base text, you have to store a copy of the
78 | full text in the index, alongside the regular index. This blows up the index size significantly.
79 | Foregoing storing the text and only using the normalized terms from the index for matching will
80 | break the mapping to OCR, since depending on the analyzer configuration, Lucene will perform stemming, etc.
81 |
82 | Alternatively, you could also **store the coordinates directly as strings in the index**. This works by e.g.
83 | indexing each token as `<token>|<coordinates>` and telling Lucene to ignore everything after the pipe during
84 | analysis. As the full text of the document is stored, you will get back a series of these annotated tokens
85 | as query results and can then parse the coordinates from your highlighting information. This solves the
86 | *Performance* part of the above approach, but worsens the *Storage* problem: For every token, we now not only
87 | have to store the token itself, but an expensive coordinate string as well.
88 |
89 | ## Our Approach
90 |
91 | This plugin uses a similar approach to the above, but solves the *Storage* problem by using an efficient binary
92 | format to store the OCR coordinate information in the index: We use bit-packing to combine a number of OCR
93 | coordinate parameters into a **byte payload**, which is not stored in the field itself, but as an associated
94 | [Lucene Payload](https://lucidworks.com/2017/09/14/solr-payloads/):
95 |
96 | - `x`, `y`, `w`, `h`: Coordinates of the bounding box on the page as either:
97 | - **absolute** unsigned integer offsets between 0 and `2^coordinateBits` (see below)
98 | - **relative** floating point percentages between 0 and 100 (e.g. `x:42.3` for a horizontal offset of 42.3%)
99 | - `pageIndex`: Unsigned integer that stores the page index of a token (optional)
100 | - `lineIndex`: Unsigned integer that stores the line index of a token (optional)
101 | - `wordIndex`: Unsigned integer that stores the word index of a token (optional)
102 |
103 | For each of these values, you can configure the number of bits the plugin should use to store them, or disable
104 | certain parameters entirely. This allows you to fine-tune the settings to your needs. In our case, for example, we
105 | use these values: `4 * 12 bits (coordinates) + 9 bits (word index) + 11 bits (line index) + 12 bits (page index)`,
106 | resulting in an 80 bit or 10 byte payload per token. A comparable string representation `p0l0n0x000y000w000h000`
107 | would have at least 22 bytes, so we save >50% for every token.
108 |
109 | At query time, we then retrieve the payload for each matching token and put the decoded information into the
110 | `ocr_highlight` result key that can be directly used without having to do any additional parsing.
111 |
112 | ## Usage
113 | ### Installation
114 |
115 | Download the [latest release from GitHub](https://github.com/dbmdz/solr-ocrpayload-plugin/releases) and put the JAR into your `$SOLR_HOME/$SOLR_CORE/lib/` directory.
116 |
117 | ### Indexing configuration
118 |
119 | To use it, first add the `DelimitedOcrInfoPayloadTokenFilterFactory`☕ filter to your analyzer chain (e.g. for a `ocr_text` field type):
120 |
121 | ```xml
122 |
123 |
124 |
125 |
127 |
128 |
129 |
130 |
131 |
132 |
133 | ```
134 |
135 | The filter takes the following parameters:
136 |
137 | - `delimiter`: Character used for delimiting the payload from the token in the input document (default: `|`)
138 | - `absoluteCoordinates`: `true` or `false` to configure whether the stored coordinates are absolute
139 | - `coordinateBits`: Number of bits to use for encoding OCR coordinates in the index. (mandatory)
140 | A value of `10` (default) is recommended, resulting in coordinates that are accurate to approximately two decimal places.
141 | - `wordBits`: Number of bits to use for encoding the word index.
142 | Set to 0 (default) to disable storage of the word index.
143 | - `lineBits`: Number of bits to use for encoding the line index.
144 | Set to 0 (default) to disable storage of the line index.
145 | - `pageBits`: Number of bits to use for encoding the page index.
146 | Set to 0 (default) to disable storage of the page index.
147 |
148 | The filter expects an input payload after the configured `delimiter` in the input stream, with the payload being a
149 | pseudo-JSON structure (e.g. `k1:1,k2:3`) with the following keys:
150 |
151 | - `p`: Page index (if `pageBits` > 0)
152 | - `l`: Line index (if `lineBits` > 0)
153 | - `n`: Word index (if `wordBits` > 0)
154 | - `x`, `y`, `w`, `h`: Coordinates of the OCR box as floating point percentages or integers (if `absoluteCoordinates`)
155 |
156 | As an example, consider the token `foobar` with an OCR box of `(0.50712, 0.31432, 0.87148, 0.05089)`
157 | (i.e. with `absoluteCoordinates="false"`), the configured delimiter `☞` and storage of indices for the word (`30`),
158 | line (`12`) and page (`13`):
159 | `foobar☞p:13,l:12,n:30,x:50.7,y:31.4,w:87.1,h:5.1`.
160 |
161 | Alternatively, with `absoluteCoordinates="true"`, an OCR box of `(512, 1024, 3192, 256)` and otherwise the same
162 | settings:
163 | `foobar☞p:13,l:12,n:30,x:512,y:1024,w:3192,h:256`.
164 |
165 | Finally, you just have to configure your schema to use the field type defined above. Storing the content is **not**
166 | recommended, since it significantly increases the index size and is not used at all for querying and highlighting:
167 |
168 | ```xml
169 |
170 | ```
171 |
172 | ### Highlighting configuration
173 |
174 | To enable highlighting using the OCR payloads, add the `OcrHighlighting` component to your Solr
175 | configuration and configure it with the same `absoluteCoordinates`, `coordinateBits`, `wordBits`, `lineBits` and `pageBits`
176 | values that were used for the filter in the analyzer chain:
177 |
178 | ```xml
179 |
180 |
183 |
184 |
185 |
186 | ocr_highlight
187 |
188 |
189 |
190 | ```
191 |
192 | Now at query time, you can just set the `ocr_hl=true` parameter, specify the fields you want highlighted via
193 | `ocr_hl.fields=myfield,myotherfield` and retrieve highlighted matches with their OCR coordinates:
194 |
195 | `GET /solr/mycore/select?ocr_hl=true&ocr_hl.fields=ocr_text&indent=true&q=augsburg&wt=json`
196 |
197 | ```json
198 | {
199 | "responseHeader":{
200 | "status":0,
201 | "QTime":158},
202 | "response":{"numFound":526,"start":0,"docs":[
203 | {
204 | "id":"bsb10502835"},
205 | {
206 | "id":"bsb11032147"},
207 | {
208 | "id":"bsb10485243"},
209 | ...
210 | },
211 | "ocr_highlight":{
212 | "bsb10502835":{
213 | "ocr_text":[{
214 | "page":7,
215 | "position":9,
216 | "term":"augsburg",
217 | "x":0.111,
218 | "y":0.062,
219 | "width":0.075,
220 | "height":0.013},
221 | {
222 | "page":7,
223 | "position":264,
224 | "term":"augsburg",
225 | "x":0.320,
226 | "y":0.670,
227 | "width":0.099,
228 | "height":0.012},
229 | ...]}},
230 | ...
231 | }
232 | }
233 | }
234 | ```
235 |
236 |
237 | ## FAQ
238 |
239 | - **How does highlighting work with phrase queries?**
240 |
241 | You will receive a bounding box object for every individual matching term in the phrase.
242 |
243 | - **What are the performance and storage implications of using this plugin?**
244 |
245 | *Performance*: With an Intel Xeon E5-1620@3.5GHz on a single core, we measured (with JMH):
246 |
247 | - Encoding the Payload: 1,484,443.200 Payloads/Second or ~14.2MiB/s with an 80bit payload
248 | - Decoding the Payload: 1,593,036.372 Payloads/Second or ~15.2MiB/s with an 80bit payload
249 |
250 | *Storage*: This depends on your configuration. With our sample configuration of an 80 bit payload
251 | (see above), the payload overhead is 10 bytes per token. That is, for a corpus size of 10 Million Tokens,
252 | you will need approximately 95MiB to store the payloads.
253 | The actual storage required might be lower, since Lucene compresses the payloads with LZ4.
254 |
255 | - **Does this work with SolrCloud?**
256 |
257 | It does! We're running it with SolrCloud ourselves.
258 |
--------------------------------------------------------------------------------
/src/main/java/de/digitalcollections/solr/plugin/components/ocrhighlighting/OcrHighlighting.java:
--------------------------------------------------------------------------------
1 | package de.digitalcollections.solr.plugin.components.ocrhighlighting;
2 |
3 | import de.digitalcollections.lucene.analysis.payloads.OcrInfo;
4 | import de.digitalcollections.lucene.analysis.payloads.OcrPayloadHelper;
5 | import java.io.IOException;
6 | import java.util.ArrayList;
7 | import java.util.Arrays;
8 | import java.util.Collections;
9 | import java.util.HashMap;
10 | import java.util.List;
11 | import java.util.Map;
12 | import java.util.Set;
13 | import java.util.TreeSet;
14 | import org.apache.lucene.document.Document;
15 | import org.apache.lucene.index.IndexReader;
16 | import org.apache.lucene.index.LeafReader;
17 | import org.apache.lucene.index.LeafReaderContext;
18 | import org.apache.lucene.index.MultiReader;
19 | import org.apache.lucene.index.PostingsEnum;
20 | import org.apache.lucene.index.ReaderUtil;
21 | import org.apache.lucene.index.Term;
22 | import org.apache.lucene.index.Terms;
23 | import org.apache.lucene.index.TermsEnum;
24 | import org.apache.lucene.search.IndexSearcher;
25 | import org.apache.lucene.search.Query;
26 | import org.apache.lucene.util.BytesRef;
27 | import org.apache.solr.common.params.SolrParams;
28 | import org.apache.solr.common.util.NamedList;
29 | import org.apache.solr.common.util.SimpleOrderedMap;
30 | import org.apache.solr.core.PluginInfo;
31 | import org.apache.solr.handler.component.ResponseBuilder;
32 | import org.apache.solr.handler.component.SearchComponent;
33 | import org.apache.solr.handler.component.ShardRequest;
34 | import org.apache.solr.request.SolrQueryRequest;
35 | import org.apache.solr.schema.IndexSchema;
36 | import org.apache.solr.schema.SchemaField;
37 | import org.apache.solr.search.DocIterator;
38 | import org.apache.solr.search.DocList;
39 | import org.apache.solr.search.SolrIndexSearcher;
40 | import org.apache.solr.util.SolrPluginUtils;
41 | import org.apache.solr.util.plugin.PluginInfoInitialized;
42 |
43 | public class OcrHighlighting extends SearchComponent implements PluginInfoInitialized {
44 |
45 | private static final IndexSearcher EMPTY_INDEXSEARCHER;
46 |
// Builds the shared searcher on top of a MultiReader created without any sub-readers
// and switches its query cache off.
static {
  try {
    EMPTY_INDEXSEARCHER = new IndexSearcher(new MultiReader());
    EMPTY_INDEXSEARCHER.setQueryCache(null);
  } catch (IOException unexpected) {
    // Surface the failure as an unchecked exception, since a static initializer cannot declare it
    throw new RuntimeException(unexpected);
  }
}
56 |
57 | private int coordBits;
58 | private int wordBits;
59 | private int lineBits;
60 | private int pageBits;
61 | private boolean absoluteCoordinates;
62 |
63 | @Override
64 | public void prepare(ResponseBuilder rb) {
65 | // NOP
66 | }
67 |
68 | @Override
69 | public void process(ResponseBuilder rb) throws IOException {
70 | if (rb.req.getParams().getBool("ocr_hl", false)) {
71 | NamedList