├── .gitignore
├── lexpredict-tika
    ├── setenv.sh
    ├── src
    │   ├── test
    │   │   ├── resources
    │   │   │   └── test-documents
    │   │   │   │   ├── jbig.pdf
    │   │   │   │   ├── scanned.pdf
    │   │   │   │   ├── sample_ol.pdf
    │   │   │   │   ├── chylde_harold.pdf
    │   │   │   │   ├── sample_table.pdf
    │   │   │   │   ├── text_on_white.pdf
    │   │   │   │   ├── transp_scanned.pdf
    │   │   │   │   ├── double_space_test.pdf
    │   │   │   │   ├── mixed_scanned_text.pdf
    │   │   │   │   └── industrial developing authority.pdf
    │   │   └── java
    │   │   │   └── com
    │   │   │       └── lexpredict
    │   │   │           └── tika
    │   │   │               ├── PdfContentTypeCheckerTest.java
    │   │   │               ├── ShallowCopyTest.java
    │   │   │               ├── PdfContentImagePreprocessorTest.java
    │   │   │               ├── AlterPDFParserTest.java
    │   │   │               └── TikaTest.java
    │   └── main
    │   │   └── java
    │   │       └── com
    │   │           └── lexpredict
    │   │               └── tika
    │   │                   ├── TagData.java
    │   │                   ├── AlterPDFParserConfig.java
    │   │                   ├── ShallowCopy.java
    │   │                   ├── FieldLookup.java
    │   │                   ├── AlterXHTMLContentHandler.java
    │   │                   ├── OCR2XHTML.java
    │   │                   ├── PdfContentImagePreprocessor.java
    │   │                   ├── HttpRequestParamsReader.java
    │   │                   ├── PDFEncodedStringDecoder.java
    │   │                   ├── PdfContentTypeChecker.java
    │   │                   ├── PdfStripperProcessor.java
    │   │                   ├── PDMetadataExtractor.java
    │   │                   ├── XFAExtractor.java
    │   │                   ├── LegacyPDFStreamEngine.java
    │   │                   ├── AlterPDFParser.java
    │   │                   ├── PDF2XHTML.java
    │   │                   └── AbstractPDF2XHTML.java
    ├── .gitignore
    ├── debug.sh
    ├── example_tika.config
    ├── prepare_debug_env.sh
    ├── pom.xml.debug
    ├── pom.xml
    └── readme.txt
├── deployment-example
    ├── deploy.sh
    ├── docker-compose.yml
    └── tika-config.xml
├── tika-config.xml
├── Dockerfile
├── README.md
└── LICENSE


/.gitignore:
--------------------------------------------------------------------------------
1 | /.idea


--------------------------------------------------------------------------------
/lexpredict-tika/setenv.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Shared version variables; sourced by debug.sh and prepare_debug_env.sh.
3 | TIKA_VERSION=1.20  # NOTE(review): pom.xml and the Dockerfile use Tika 1.24 - confirm 1.20 is intended here
4 | LEXPREDICT_TIKA_VERSION=1.0
5 | 


--------------------------------------------------------------------------------
/deployment-example/deploy.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | set -e  # stop at the first failing command
3 | # Deploys docker-compose.yml (resolved relative to the current directory) as the stack "lexpredict-tika-cluster".
4 | sudo -E docker stack deploy --compose-file docker-compose.yml lexpredict-tika-cluster
5 | 


--------------------------------------------------------------------------------
/lexpredict-tika/src/test/resources/test-documents/jbig.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/LexPredict/tika-server/HEAD/lexpredict-tika/src/test/resources/test-documents/jbig.pdf


--------------------------------------------------------------------------------
/lexpredict-tika/src/test/resources/test-documents/scanned.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/LexPredict/tika-server/HEAD/lexpredict-tika/src/test/resources/test-documents/scanned.pdf


--------------------------------------------------------------------------------
/lexpredict-tika/src/test/resources/test-documents/sample_ol.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/LexPredict/tika-server/HEAD/lexpredict-tika/src/test/resources/test-documents/sample_ol.pdf


--------------------------------------------------------------------------------
/lexpredict-tika/src/test/resources/test-documents/chylde_harold.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/LexPredict/tika-server/HEAD/lexpredict-tika/src/test/resources/test-documents/chylde_harold.pdf


--------------------------------------------------------------------------------
/lexpredict-tika/src/test/resources/test-documents/sample_table.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/LexPredict/tika-server/HEAD/lexpredict-tika/src/test/resources/test-documents/sample_table.pdf


--------------------------------------------------------------------------------
/lexpredict-tika/src/test/resources/test-documents/text_on_white.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/LexPredict/tika-server/HEAD/lexpredict-tika/src/test/resources/test-documents/text_on_white.pdf


--------------------------------------------------------------------------------
/lexpredict-tika/src/test/resources/test-documents/transp_scanned.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/LexPredict/tika-server/HEAD/lexpredict-tika/src/test/resources/test-documents/transp_scanned.pdf


--------------------------------------------------------------------------------
/lexpredict-tika/src/test/resources/test-documents/double_space_test.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/LexPredict/tika-server/HEAD/lexpredict-tika/src/test/resources/test-documents/double_space_test.pdf


--------------------------------------------------------------------------------
/lexpredict-tika/src/test/resources/test-documents/mixed_scanned_text.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/LexPredict/tika-server/HEAD/lexpredict-tika/src/test/resources/test-documents/mixed_scanned_text.pdf


--------------------------------------------------------------------------------
/lexpredict-tika/src/test/resources/test-documents/industrial developing authority.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/LexPredict/tika-server/HEAD/lexpredict-tika/src/test/resources/test-documents/industrial developing authority.pdf


--------------------------------------------------------------------------------
/lexpredict-tika/.gitignore:
--------------------------------------------------------------------------------
 1 | # Eclipse
 2 | .classpath
 3 | .project
 4 | .settings/
 5 |  
 6 | # Intellij
 7 | .idea/
 8 | *.iml
 9 | *.iws
10 |  
11 | # Mac
12 | .DS_Store
13 |  
14 | # Maven
15 | log/
16 | target/
17 | 
18 | 
19 | debug/
20 | 
21 | tmp/
22 | 


--------------------------------------------------------------------------------
/lexpredict-tika/debug.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | source setenv.sh
3 | # Starts TikaServerCli with the LexPredict jar on the classpath and a JDWP agent listening on port 8001 (suspend=n); all paths are relative to the current directory.
4 | java -agentlib:jdwp=transport=dt_socket,server=y,address=8001,suspend=n -cp "./debug/tika-server-${TIKA_VERSION}.jar:./target/lexpredict-tika-${LEXPREDICT_TIKA_VERSION}.jar:libs/*" org.apache.tika.server.TikaServerCli --config ../tika-config.xml


--------------------------------------------------------------------------------
/deployment-example/docker-compose.yml:
--------------------------------------------------------------------------------
 1 | version: "3.3"
 2 | services:
 3 | 
 4 |   tika:
 5 |     image: lexpredict/tika-server:latest
 6 |     ports:
 7 |       - 9999:9998
 8 |     configs:
 9 |     - source: tika_config_3
10 |       target: /tika-config.xml
11 |     networks:
12 |       - net
13 |     deploy:
14 |       replicas: 3
15 | 
16 | networks:
17 |   net:
18 | 
19 | configs:
20 |   tika_config_3:
21 |     file: ./tika-config.xml
22 | 


--------------------------------------------------------------------------------
/tika-config.xml:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0" encoding="UTF-8"?>
 2 | <properties>
 3 |   <parsers>
 4 |     <!-- Default Parser for most things, except for PDF, and never
 5 |          use the Executable Parser -->
 6 |     <parser class="org.apache.tika.parser.DefaultParser">
 7 |       <mime-exclude>application/pdf</mime-exclude>
 8 |       <parser-exclude class="org.apache.tika.parser.executable.ExecutableParser"/>
 9 |     </parser>
10 |     <!-- Use a different parser for PDF -->
11 |     <parser class="com.lexpredict.tika.AlterPDFParser">
12 |       <mime>application/pdf</mime>
13 |     </parser>
14 |   </parsers>
15 | </properties>


--------------------------------------------------------------------------------
/lexpredict-tika/src/test/java/com/lexpredict/tika/PdfContentTypeCheckerTest.java:
--------------------------------------------------------------------------------
 1 | package com.lexpredict.tika;
 2 | 
 3 | import org.junit.Test;
 4 | import java.io.InputStream;
 5 | import static org.junit.Assert.assertEquals;
 6 | 
 7 | /**
 8 |  * Checks that {@link PdfContentTypeChecker} reports the test document
 9 |  * {@code /test-documents/scanned.pdf} as {@code PdfContent.IMAGES}.
10 |  */
11 | public class PdfContentTypeCheckerTest extends TikaTest {
12 |     @Test
13 |     public void testPdfTypeChecker() throws Exception {
14 |         // try-with-resources closes the classpath stream even when the assertion fails
15 |         try (InputStream stream = AlterPDFParserTest.class.getResourceAsStream("/test-documents/scanned.pdf")) {
16 |             PdfContentTypeChecker checker = new PdfContentTypeChecker();
17 |             PdfContentTypeChecker.PdfContent docType = checker.determineDocContentType(stream);
18 |             assertEquals(PdfContentTypeChecker.PdfContent.IMAGES, docType);
19 |         }
20 |     }
21 | }
22 | 


--------------------------------------------------------------------------------
/lexpredict-tika/src/test/java/com/lexpredict/tika/ShallowCopyTest.java:
--------------------------------------------------------------------------------
 1 | package com.lexpredict.tika;
 2 | 
 3 | import org.apache.tika.parser.pdf.PDFParserConfig;
 4 | import org.junit.Test;
 5 | import static org.junit.Assert.assertTrue;
 6 | 
 7 | public class ShallowCopyTest {
 8 |     @Test
 9 |     public void testPdfAlphaImageReplacing() throws Exception {
10 |         PDFParserConfig cfg = new PDFParserConfig();
11 |         cfg.setExtractUniqueInlineImagesOnly(false);
12 |         cfg.setOcrStrategy("OCR_ONLY");
13 | 
14 |         PDFParserConfig cpy = new PDFParserConfig();
15 |         ShallowCopy.copyFields(cfg, cpy);
16 |         assertTrue(cfg.getExtractUniqueInlineImagesOnly() ==
17 |                 cpy.getExtractUniqueInlineImagesOnly());
18 |         assertTrue(cfg.getOcrStrategy() ==
19 |                 cpy.getOcrStrategy());
20 |     }
21 | }
22 | 


--------------------------------------------------------------------------------
/lexpredict-tika/src/main/java/com/lexpredict/tika/TagData.java:
--------------------------------------------------------------------------------
 1 | package com.lexpredict.tika;
 2 | 
 3 | /**
 4 |  * Mutable holder describing one tag: its name, a CDATA flag, its attribute
 5 |  * string, and a text buffer that starts out empty.
 6 |  */
 7 | public class TagData {
 8 |     public String tagName;
 9 |     public Boolean isCdata;
10 |     public String attributeString;
11 |     public StringBuilder data = new StringBuilder();
12 | 
13 |     /**
14 |      * @param tagName         name of the tag
15 |      * @param isCdata         CDATA flag stored as given
16 |      * @param attributeString attribute text stored as given
17 |      */
18 |     public TagData(String tagName, Boolean isCdata, String attributeString) {
19 |         this.attributeString = attributeString;
20 |         this.isCdata = isCdata;
21 |         this.tagName = tagName;
22 |     }
23 | 
24 |     /** Renders all four fields in {@code TagData{...}} form. */
25 |     @Override
26 |     public String toString() {
27 |         return String.format(
28 |                 "TagData{tagName='%s', isCdata=%s, attributeString='%s', data=%s}",
29 |                 tagName, isCdata, attributeString, data);
30 |     }
31 | }
32 | 


--------------------------------------------------------------------------------
/lexpredict-tika/src/main/java/com/lexpredict/tika/AlterPDFParserConfig.java:
--------------------------------------------------------------------------------
 1 | package com.lexpredict.tika;
 2 | 
 3 | import org.apache.tika.parser.pdf.PDFParserConfig;
 4 | 
 5 | public class AlterPDFParserConfig{
 6 |     public static void configureAlterPtf2Xhtml(PDFParserConfig config, PDF2XHTML pdf2XHTML) {
 7 |         pdf2XHTML.setSortByPosition(config.getSortByPosition());
 8 |         if (config.getEnableAutoSpace()) {
 9 |             pdf2XHTML.setWordSeparator(" ");
10 |         } else {
11 |             pdf2XHTML.setWordSeparator("");
12 |         }
13 |         if (config.getAverageCharTolerance() != null) {
14 |             pdf2XHTML.setAverageCharTolerance(config.getAverageCharTolerance());
15 |         }
16 |         if (config.getSpacingTolerance() != null) {
17 |             pdf2XHTML.setSpacingTolerance(config.getSpacingTolerance());
18 |         }
19 |         pdf2XHTML.setSuppressDuplicateOverlappingText(config.getSuppressDuplicateOverlappingText());
20 |     }
21 | }
22 | 


--------------------------------------------------------------------------------
/lexpredict-tika/example_tika.config:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0" encoding="UTF-8"?>
 2 | <properties>
 3 |   <service-loader initializableProblemHandler="ignore"/>
 4 |   <parsers>
 5 |     <!-- Default Parser for most things, except for 2 mime types, and never
 6 |          use the Executable Parser -->
 7 |     <parser class="org.apache.tika.parser.DefaultParser">
 8 |       <mime-exclude>image/jpeg</mime-exclude>
 9 |       <mime-exclude>application/pdf</mime-exclude>
10 |       <parser-exclude class="org.apache.tika.parser.executable.ExecutableParser"/>
11 |     </parser>
12 |     <!-- Use a different parser for PDF -->
13 |     <parser class="com.lexpredict.tika.AlterPDFParser">
14 |       <mime>application/pdf</mime>
15 |       <param name="extractInlineImages" type="bool">true</param>
16 |       <param name="extractUniqueInlineImagesOnly" type="bool">false</param>
17 |       <param name="ocrStrategy" type="String">ocr_and_text_extraction</param>
18 |       <param name="enableImageProcessing" type="int">1</param>
19 |     </parser>
20 |   </parsers>
21 | </properties>


--------------------------------------------------------------------------------
/deployment-example/tika-config.xml:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0" encoding="UTF-8"?>
 2 | <!--
 3 | This file is distributed among all Tika Server node machines in the cluster via Docker Swarm config routines.
 4 | 
 5 | Docker Swarm currently does not support updating a config in place and will throw an error
 6 | if one tries to re-deploy a service after changing this file.
 7 | 
 8 | For changes to this file to take effect, also change the config name in docker-compose.yml
 9 | (for example, increment the version number):
10 | 
11 | ...
12 | configs:
13 |     - source: tika_config_1
14 | ...
15 | ...
16 | configs:
17 |   tika_config_1:
18 | ...
19 | 
20 | 
21 | -->
22 | <properties>
23 |     <!-- Uncomment the following to disable Tesseract OCR in Tika -
24 |          to speed up text extraction if OCR is not required. -->
25 | 
26 |     <!--<parsers>
27 |         <parser class="org.apache.tika.parser.DefaultParser">
28 |             <parser-exclude class="org.apache.tika.parser.ocr.TesseractOCRParser"/>
29 |         </parser>
30 |     </parsers>-->
31 | </properties>


--------------------------------------------------------------------------------
/lexpredict-tika/prepare_debug_env.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | # Prepares ./debug for debug.sh: installs Java and Tesseract, then downloads and verifies the Tika server jar.
 3 | source setenv.sh
 4 | TIKA_SERVER_URL=https://www.apache.org/dist/tika/tika-server-$TIKA_VERSION.jar
 5 | 
 6 | 
 7 | mkdir -p ./debug
 8 | pushd debug || exit 1
 9 | 
10 | sudo apt-get install -y gpg curl gdal-bin openjdk-8-jre-headless
11 | 
12 | sudo apt-get -y install \
13 |         tesseract-ocr \
14 |         tesseract-ocr-eng tesseract-ocr-ita tesseract-ocr-fra tesseract-ocr-spa tesseract-ocr-deu tesseract-ocr-rus \
15 |  && tesseract -v
16 | # Import the Tika signing keys, fetch the signature, resolve the preferred mirror, download with curl (installed above) and verify.
17 | curl -sSL https://people.apache.org/keys/group/tika.asc -o /tmp/tika.asc \
18 |  && gpg --import /tmp/tika.asc \
19 |  && curl -sSL "$TIKA_SERVER_URL.asc" -o /tmp/tika-server-${TIKA_VERSION}.jar.asc \
20 |  && NEAREST_TIKA_SERVER_URL=$(curl -sSL http://www.apache.org/dyn/closer.cgi/${TIKA_SERVER_URL#https://www.apache.org/dist/}\?asjson\=1 \
21 |  | awk '/"path_info": / { pi=$2; }; /"preferred":/ { pref=$2; }; END { print pref " " pi; };' \
22 |  | sed -r -e 's/^"//; s/",$//; s/" "//') \
23 |  && echo "Nearest mirror: $NEAREST_TIKA_SERVER_URL" \
24 |  && curl -sSL "$NEAREST_TIKA_SERVER_URL" -o tika-server-${TIKA_VERSION}.jar \
25 |  && gpg --verify /tmp/tika-server-${TIKA_VERSION}.jar.asc tika-server-${TIKA_VERSION}.jar
26 | 
27 | 
28 | 
29 | popd
30 | 


--------------------------------------------------------------------------------
/lexpredict-tika/src/main/java/com/lexpredict/tika/ShallowCopy.java:
--------------------------------------------------------------------------------
 1 | package com.lexpredict.tika;
 2 | 
 3 | import java.lang.reflect.Field;
 4 | 
 5 | /** Reflection helper that copies field values from one object to another. */
 6 | public class ShallowCopy {
 7 |     /**
 8 |      * Copies every non-static field declared directly on {@code from}'s class into
 9 |      * the same-named field declared on {@code to}'s class. Fields inherited from
10 |      * superclasses are not visited. Values are assigned as-is (references are shared).
11 |      * A field that is missing on the target or cannot be accessed is reported with
12 |      * {@code printStackTrace()} and skipped; the remaining fields are still copied.
13 |      *
14 |      * @param from object whose declared fields are read
15 |      * @param to   object whose same-named declared fields are overwritten
16 |      */
17 |     public static void copyFields(Object from, Object to) {
18 |         for (Field fieldFrom : from.getClass().getDeclaredFields()) {
19 |             if (java.lang.reflect.Modifier.isStatic(fieldFrom.getModifiers()))
20 |                 continue;
21 | 
22 |             try {
23 |                 // Read the source value, restoring the source field's own accessibility flag.
24 |                 boolean fromWasAccessible = fieldFrom.isAccessible();
25 |                 fieldFrom.setAccessible(true);
26 |                 Object value;
27 |                 try {
28 |                     value = fieldFrom.get(from);
29 |                 } finally {
30 |                     fieldFrom.setAccessible(fromWasAccessible);
31 |                 }
32 | 
33 |                 // Write to the target, restoring the target field's own accessibility flag.
34 |                 Field fieldTo = to.getClass().getDeclaredField(fieldFrom.getName());
35 |                 boolean toWasAccessible = fieldTo.isAccessible();
36 |                 fieldTo.setAccessible(true);
37 |                 try {
38 |                     fieldTo.set(to, value);
39 |                 } finally {
40 |                     fieldTo.setAccessible(toWasAccessible);
41 |                 }
42 |             } catch (IllegalAccessException | NoSuchFieldException e) {
43 |                 // best effort: report and continue with the next field
44 |                 e.printStackTrace();
45 |             }
46 |         }
47 |     }
48 | }
49 | 


--------------------------------------------------------------------------------
/lexpredict-tika/src/test/java/com/lexpredict/tika/PdfContentImagePreprocessorTest.java:
--------------------------------------------------------------------------------
 1 | package com.lexpredict.tika;
 2 | 
 3 | import org.apache.pdfbox.pdmodel.PDDocument;
 4 | import org.junit.Test;
 5 | import java.io.InputStream;
 6 | import static org.junit.Assert.assertFalse;
 7 | import static org.junit.Assert.assertTrue;
 8 | 
 9 | public class PdfContentImagePreprocessorTest extends TikaTest {
10 |     @Test
11 |     public void testPdfNonAlphaImageReplacing() throws Exception {
12 |         InputStream stream = AlterPDFParserTest.class.getResourceAsStream("/test-documents/scanned.pdf");
13 |         PDDocument doc = PDDocument.load(stream);
14 | 
15 |         PdfContentImagePreprocessor preproc = new PdfContentImagePreprocessor();
16 |         boolean hasReplaced = preproc.removeImagesAlphaChannel(doc);
17 |         assertFalse(hasReplaced);
18 |     }
19 | 
20 |     @Test
21 |     public void testPdfAlphaImageReplacing() throws Exception {
22 |         InputStream stream = AlterPDFParserTest.class.getResourceAsStream("/test-documents/transp_scanned.pdf");
23 |         PDDocument doc = PDDocument.load(stream);
24 | 
25 |         PdfContentImagePreprocessor preproc = new PdfContentImagePreprocessor();
26 |         boolean hasReplaced = preproc.removeImagesAlphaChannel(doc);
27 |         assertTrue(hasReplaced);
28 |     }
29 | }
30 | 


--------------------------------------------------------------------------------
/lexpredict-tika/src/main/java/com/lexpredict/tika/FieldLookup.java:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Licensed to the Apache Software Foundation (ASF) under one or more
 3 |  * contributor license agreements.  See the NOTICE file distributed with
 4 |  * this work for additional information regarding copyright ownership.
 5 |  * The ASF licenses this file to You under the Apache License, Version 2.0
 6 |  * (the "License"); you may not use this file except in compliance with
 7 |  * the License.  You may obtain a copy of the License at
 8 |  *
 9 |  *     http://www.apache.org/licenses/LICENSE-2.0
10 |  *
11 |  * Unless required by applicable law or agreed to in writing, software
12 |  * distributed under the License is distributed on an "AS IS" BASIS,
13 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 |  * See the License for the specific language governing permissions and
15 |  * limitations under the License.
16 |  *
17 |  * Modifications copyright (C) 2020 ContraxSuite, LLC
18 |  */
19 | 
20 | package com.lexpredict.tika;
21 | 
22 | import java.lang.reflect.Field;
23 | 
24 | public class FieldLookup {
25 | 
26 |     // find field in passed class or one of his ancestors
27 |     public static Object getFieldValue(Object obj, String fieldName) {
28 |         Field f = findField(obj.getClass(), fieldName);
29 |         if (f == null)
30 |             return null;
31 |         try {
32 |             f.setAccessible(true);
33 |             return f.get(obj);
34 |         } catch (IllegalAccessException e) {
35 |             return null;
36 |         }
37 |     }
38 | 
39 |     public static Field findField(Class<?> cls, String fieldName) {
40 |         while (true) {
41 |             try {
42 |                 return cls.getDeclaredField(fieldName);
43 |             } catch (NoSuchFieldException e) {
44 |                 // pass
45 |             }
46 |             cls = cls.getSuperclass();
47 |             if (cls == null) break;
48 |         }
49 |         return null;
50 |     }
51 | }
52 | 


--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
 1 | FROM ubuntu:18.04
 2 | 
 3 | ENV TIKA_VERSION 1.24
 4 | ENV TIKA_SERVER_URL https://www.apache.org/dist/tika/tika-server-$TIKA_VERSION.jar
 5 | 
 6 | 
 7 | # Base tooling: GnuPG, curl, GDAL and a headless Java 8 runtime.
 8 | RUN apt-get -y --fix-missing update
 9 | 
10 | RUN apt-get install -y gpg curl gdal-bin openjdk-8-jre-headless
11 | # Tesseract OCR with English, Italian, French, Spanish, German and Russian language data.
12 | RUN \
13 |     apt-get -y install \
14 |         tesseract-ocr \
15 |         tesseract-ocr-eng tesseract-ocr-ita tesseract-ocr-fra tesseract-ocr-spa tesseract-ocr-deu tesseract-ocr-rus \
16 |  && tesseract -v
17 | # Import the Tika signing keys, fetch the jar's signature, download the jar from the preferred mirror, then verify it.
18 | RUN \
19 |     curl -sSL https://people.apache.org/keys/group/tika.asc -o /tmp/tika.asc \
20 |  && gpg --import /tmp/tika.asc \
21 |  && curl -sSL "$TIKA_SERVER_URL.asc" -o /tmp/tika-server-${TIKA_VERSION}.jar.asc \
22 |  && NEAREST_TIKA_SERVER_URL=$(curl -sSL http://www.apache.org/dyn/closer.cgi/${TIKA_SERVER_URL#https://www.apache.org/dist/}\?asjson\=1 \
23 |         | awk '/"path_info": / { pi=$2; }; /"preferred":/ { pref=$2; }; END { print pref " " pi; };' \
24 |         | sed -r -e 's/^"//; s/",$//; s/" "//') \
25 |  && echo "Nearest mirror: $NEAREST_TIKA_SERVER_URL" \
26 |  && curl -sSL "$NEAREST_TIKA_SERVER_URL" -o /tika-server-${TIKA_VERSION}.jar \
27 |  && gpg --verify /tmp/tika-server-${TIKA_VERSION}.jar.asc /tika-server-${TIKA_VERSION}.jar
28 | # RUN executes under /bin/sh, which does not expand {a,b} brace lists, so the apt lists path is spelled out.
29 | RUN apt-get -y clean autoclean \
30 |     && apt-get -y autoremove \
31 |     && rm -rf /var/lib/apt/lists/*
32 | 
33 | # default Tika config - may be overridden by Docker Swarm config mounting
34 | COPY ./tika-config.xml /tika-config.xml
35 | COPY ./lexpredict-tika/target/lexpredict-tika-1.0.jar /
36 | RUN echo $(date) > /build.date
37 | 
38 | EXPOSE 9998
39 | ENTRYPOINT  echo "Tika Server Docker Image built $(cat /build.date)" \
40 |             && echo "Java Version:" \
41 |             && java -version \
42 |             && echo "Tesseract:" \
43 |             && tesseract -v \
44 |             && echo "Tika: ${TIKA_VERSION}" \
45 |             && echo "Config:" \
46 |             && cat /tika-config.xml \
47 |             && java -cp "tika-server-${TIKA_VERSION}.jar:lexpredict-tika-1.0.jar:libs/*" org.apache.tika.server.TikaServerCli --h 0.0.0.0 --port 9998 --config /tika-config.xml
48 | 


--------------------------------------------------------------------------------
/lexpredict-tika/pom.xml.debug:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0" encoding="UTF-8"?>
 2 | <project xmlns="http://maven.apache.org/POM/4.0.0"
 3 |          xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
 4 |          xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
 5 | 
 6 |     <artifactId>lexpredict-tika</artifactId>
 7 |     <groupId>com.lexpredict.tika</groupId>
 8 |     <version>1.0</version>
 9 |     <modelVersion>4.0.0</modelVersion>
10 | 
11 |     <dependencies>
12 |         <dependency>
13 |             <groupId>org.apache.tika</groupId>
14 |             <artifactId>tika-app</artifactId>
15 |             <version>1.23</version>
16 |         </dependency>
17 |         <dependency>
18 |             <groupId>org.apache.tika</groupId>
19 |             <artifactId>tika-core</artifactId>
20 |             <version>1.23</version>
21 |         </dependency>
22 |         <dependency>
23 |             <groupId>commons-io</groupId>
24 |             <artifactId>commons-io</artifactId>
25 |             <version>2.6</version>
26 |         </dependency>
27 |         <dependency>
28 |             <groupId>org.apache.pdfbox</groupId>
29 |             <artifactId>pdfbox</artifactId>
30 |             <version>2.0.13</version>
31 |         </dependency>
32 |         <dependency>
33 |             <groupId>org.apache.tika</groupId>
34 |             <artifactId>tika-parsers</artifactId>
35 |             <version>1.23</version>
36 |         </dependency>
37 |         <dependency>
38 |             <groupId>org.apache.tika</groupId>
39 |             <artifactId>tika-server</artifactId>
40 |             <version>1.23</version>
41 |         </dependency>
42 |         <dependency>
43 |             <groupId>junit</groupId>
44 |             <artifactId>junit</artifactId>
45 |             <version>4.12</version>
46 |             <scope>test</scope>
47 |         </dependency>
48 |     </dependencies>
49 | 
50 |     <build>
51 |         <plugins>
52 |             <plugin>
53 |                 <groupId>org.apache.maven.plugins</groupId>
54 |                 <artifactId>maven-compiler-plugin</artifactId>
55 |                 <version>3.6.0</version>
56 |                 <configuration>
57 |                     <source>1.8</source>
58 |                     <target>1.8</target>
59 |                 </configuration>
60 |             </plugin>
61 |         </plugins>
62 |     </build>
63 | </project>
64 | 


--------------------------------------------------------------------------------
/lexpredict-tika/pom.xml:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0" encoding="UTF-8"?>
 2 | <project xmlns="http://maven.apache.org/POM/4.0.0"
 3 |          xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
 4 |          xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
 5 | 
 6 |     <artifactId>lexpredict-tika</artifactId>
 7 |     <groupId>com.lexpredict.tika</groupId>
 8 |     <version>1.0</version>
 9 |     <modelVersion>4.0.0</modelVersion>
10 | 
11 |     <dependencies>
12 |         <dependency>
13 |             <groupId>org.apache.tika</groupId>
14 |             <artifactId>tika-core</artifactId>
15 |             <version>1.24</version>
16 |         </dependency>
17 |         <dependency>
18 |             <groupId>org.apache.tika</groupId>
19 |             <artifactId>tika-app</artifactId>
20 |             <version>1.24</version>
21 |             <scope>compile</scope>
22 |         </dependency>
23 |         <dependency>
24 |             <groupId>commons-io</groupId>
25 |             <artifactId>commons-io</artifactId>
26 |             <version>2.6</version>
27 |         </dependency>
28 |         <dependency>
29 |             <groupId>org.apache.pdfbox</groupId>
30 |             <artifactId>pdfbox</artifactId>
31 |             <version>2.0.13</version>
32 |         </dependency>
33 |         <dependency>
34 |             <groupId>org.apache.tika</groupId>
35 |             <artifactId>tika-parsers</artifactId>
36 |             <version>1.24</version>
37 |         </dependency>
38 |         <dependency>
39 |             <groupId>org.apache.tika</groupId>
40 |             <artifactId>tika-server</artifactId>
41 |             <version>1.24</version>
42 |         </dependency>
43 |         <dependency>
44 |             <groupId>junit</groupId>
45 |             <artifactId>junit</artifactId>
46 |             <version>4.12</version>
47 |             <scope>test</scope>
48 |         </dependency>
49 |     </dependencies>
50 | 
51 |     <build>
52 |         <plugins>
53 |             <plugin>
54 |                 <groupId>org.apache.maven.plugins</groupId>
55 |                 <artifactId>maven-compiler-plugin</artifactId>
56 |                 <version>3.6.0</version>
57 |                 <configuration>
58 |                     <source>1.8</source>
59 |                     <target>1.8</target>
60 |                 </configuration>
61 |             </plugin>
62 |         </plugins>
63 |     </build>
64 | </project>
65 | 


--------------------------------------------------------------------------------
/lexpredict-tika/src/main/java/com/lexpredict/tika/AlterXHTMLContentHandler.java:
--------------------------------------------------------------------------------
 1 | package com.lexpredict.tika;
 2 | 
 3 | import org.apache.tika.metadata.Metadata;
 4 | import org.apache.tika.sax.ToXMLContentHandler;
 5 | import org.apache.tika.sax.XHTMLContentHandler;
 6 | import org.xml.sax.ContentHandler;
 7 | import org.xml.sax.SAXException;
 8 | 
 9 | import java.lang.reflect.Field;
10 | import java.lang.reflect.InvocationTargetException;
11 | import java.lang.reflect.Method;
12 | 
13 | class AlterXHTMLContentHandler extends XHTMLContentHandler {
14 |     protected static final char[] emptyChar = new char[0];
15 | 
16 |     protected ToXMLContentHandler decoratedHandler;
17 | 
18 |     protected Method charactersRawMethod = null;
19 | 
20 |     public AlterXHTMLContentHandler(ContentHandler handler, Metadata metadata) {
21 |         super(handler, metadata);
22 |         try {
23 |             Class c = Class.forName("org.apache.tika.sax.ContentHandlerDecorator");
24 |             Field field = c.getDeclaredField("handler");
25 |             field.setAccessible(true);
26 |             Object decoratedHandlerObj = field.get(this);
27 | 
28 |             if (decoratedHandlerObj instanceof ToXMLContentHandler) {
29 |                 // handlerClassName can also be TaggedContentHandler
30 |                 this.decoratedHandler = (ToXMLContentHandler)field.get(this);
31 |                 c = Class.forName("org.apache.tika.sax.ToXMLContentHandler");
32 |                 this.charactersRawMethod = c.getDeclaredMethod("write",
33 |                         String.class);
34 |                 this.charactersRawMethod.setAccessible(true);
35 |             }
36 |         } catch (ClassNotFoundException | NoSuchMethodException | NoSuchFieldException | IllegalAccessException e) {
37 |             e.printStackTrace();
38 |         }
39 |     }
40 | 
41 |     public boolean isCharacterInvalid(char c) {
42 |         return this.isInvalid(c);
43 |     }
44 | 
45 |     public void charactersRaw(String data) throws SAXException {
46 |         if (this.charactersRawMethod == null) {
47 |             super.characters(data);
48 |             return;
49 |         }
50 | 
51 |         super.characters(emptyChar, 0, 0);
52 |         try {
53 |             this.charactersRawMethod.invoke(this.decoratedHandler, data);
54 |         } catch (IllegalAccessException | InvocationTargetException e) {
55 |             e.printStackTrace();
56 |         }
57 |     }
58 | }
59 | 


--------------------------------------------------------------------------------
/lexpredict-tika/src/main/java/com/lexpredict/tika/OCR2XHTML.java:
--------------------------------------------------------------------------------
 1 | package com.lexpredict.tika;
 2 | import java.io.IOException;
 3 | import java.io.Writer;
 4 | import org.apache.commons.io.IOExceptionWithCause;
 5 | import org.apache.pdfbox.pdmodel.PDDocument;
 6 | import org.apache.pdfbox.pdmodel.PDPage;
 7 | import org.apache.pdfbox.text.TextPosition;
 8 | import org.apache.tika.exception.TikaException;
 9 | import org.apache.tika.metadata.Metadata;
10 | import org.apache.tika.parser.ParseContext;
11 | import org.apache.tika.parser.pdf.PDFParserConfig;
12 | import org.xml.sax.ContentHandler;
13 | import org.xml.sax.SAXException;
14 | 
15 | class OCR2XHTML extends AbstractPDF2XHTML {
16 |     private OCR2XHTML(PDDocument document, ContentHandler handler, ParseContext context, 
17 |                       Metadata metadata, PDFParserConfig config) throws IOException {
18 |         super(document, handler, context, metadata, config);
19 |     }
20 | 
21 |     public static void process(PDDocument document, ContentHandler handler, 
22 |                                ParseContext context, Metadata metadata, PDFParserConfig config) throws SAXException, TikaException {
23 |         OCR2XHTML ocr2XHTML = null;
24 | 
25 |         try {
26 |             ocr2XHTML = new OCR2XHTML(document, handler, context, metadata, config);
27 |             ocr2XHTML.writeText(document, new Writer() {
28 |                 public void write(char[] cbuf, int off, int len) {
29 |                 }
30 | 
31 |                 public void flush() {
32 |                 }
33 | 
34 |                 public void close() {
35 |                 }
36 |             });
37 |         } catch (IOException var7) {
38 |             if (var7.getCause() instanceof SAXException) {
39 |                 throw (SAXException)var7.getCause();
40 |             }
41 | 
42 |             throw new TikaException("Unable to extract PDF content", var7);
43 |         }
44 | 
45 |         if (ocr2XHTML.exceptions.size() > 0) {
46 |             throw new TikaException("Unable to extract all PDF content", (Throwable)ocr2XHTML.exceptions.get(0));
47 |         }
48 |     }
49 | 
50 |     public void processPage(PDPage pdPage) throws IOException {
51 |         try {
52 |             this.startPage(pdPage);
53 |             this.doOCROnCurrentPage();
54 |             this.endPage(pdPage);
55 |         } catch (SAXException | TikaException var3) {
56 |             throw new IOExceptionWithCause(var3);
57 |         } catch (IOException var4) {
58 |             this.handleCatchableIOE(var4);
59 |         }
60 | 
61 |     }
62 | 
63 |     protected void writeString(String text) throws IOException {
64 |     }
65 | 
66 |     protected void writeCharacters(TextPosition text) throws IOException {
67 |     }
68 | 
69 |     protected void writeWordSeparator() throws IOException {
70 |     }
71 | 
72 |     protected void writeLineSeparator() throws IOException {
73 |     }
74 | }
75 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # tika-server
 2 | 
 3 | Configurable Apache Tika Server Docker Image with Tesseract 4.
 4 | 
 5 | Contains additional PDF parser improvements that work around the problem of superfluous empty lines in text extracted from PDF files with corrupted embedded fonts.
 6 | 
 7 | ## Contents
 8 | - Apache Tika 1.20
 9 | - Tesseract OCR 4
10 | - Tesseract Language Packs: English, Italian, French, Spanish, German, Russian
11 | 
12 | Allows providing external configuration file for Tika Server - for disabling OCR or any other needs.
13 | 
14 | ## Building
15 | 
16 | ```
17 | cd build
18 | ./build.sh script.
19 | ```
20 | 
21 | ## Running
22 | 
23 | **Pulling lexpredict/tika-server:**
24 | ```
25 | docker pull lexpredict/tika-server
26 | ```
27 | 
28 | 
29 | 
30 | **Simply running Tika Server with default config and publishing Tika port on the host machine:**
31 | ```
32 | docker run -p 9998:9998 -it lexpredict/tika-server
33 | ``` 
34 | 
35 | **Running Tika Server with external configuration:**
36 | 1. Create tika-config.xml file.
37 | The following example tika-config.xml can be used for disabling OCR:
38 | ```
39 | <?xml version="1.0" encoding="UTF-8"?>
40 | <properties>
41 |   <parsers>
42 |       <parser class="org.apache.tika.parser.DefaultParser">
43 |           <parser-exclude class="org.apache.tika.parser.ocr.TesseractOCRParser"/>
44 |       </parser>
45 |   </parsers>
46 | </properties>
47 | ```
48 | 2. Run Tika server with this config file:
49 | ```
50 | docker run -it -p 9998:9998 -v /home/user/tika-config.xml:/tika-config.xml lexpredict/tika-server
51 | ```
52 | If running via sudo ensure you provide full path to the file on the host machine – otherwise it will throw an error.
53 | 
54 | 
55 | **Running Tika Server cluster in Docker Swarm:**
56 | 1. Assuming you already have a Docker Swarm cluster configured (docker swarm init) and some worker machines are connected to it.
57 | 2. To deploy Tika we need docker-compose.yml file (see /deployment-example dir):
58 | ```
59 | version: "3.3"
60 | services:
61 |   tika:
62 |     image: lexpredict/tika-server:latest
63 |     ports:
64 |       - 9998:9998
65 |     configs:
66 |       - source: tika_config_3
67 |         target: /tika-config.xml
68 |     networks:
69 |       - net
70 |     deploy:
71 |       replicas: 3
72 | 
73 | networks:
74 |   net:
75 | 
76 | configs:
77 |   tika_config_3:
78 |     file: ./tika-config.xml
79 | 
80 | ```
81 | The configuration file (tika-config.xml) should be in the same directory as docker-compose.yml.
82 | 3. Deploying Tika to Docker Swarm: 
83 | ```
84 | docker stack deploy --compose-file docker-compose.yml tika-cluster
85 | ```
86 | 
87 | ## Workaround for fixing obsolete empty lines in PDF documents having corrupted embedded fonts
88 | 
89 | In some cases the current PDF text extraction routines from TIKA work incorrectly with PDF documents containing corrupted embedded fonts. The extracted text contains an obsolete blank line after almost every line of normal text.
90 | 
91 | It can be fixed by using PDFTextStripper class from PDFBox which probably was used in previous versions of TIKA.
92 | This workaround is not suitable for all cases because it provides worse results than TIKA's normal text extraction on good uncorrupted PDF documents.
93 | 
94 | Normally, TIKA as configured in this Docker image processes PDFs as usual, without using the old-style PDFTextStripper.
95 | To trigger processing the document with PDFTextStripper add a header to the request: "pdf-parse:strip".
96 | 


--------------------------------------------------------------------------------
/lexpredict-tika/src/main/java/com/lexpredict/tika/PdfContentImagePreprocessor.java:
--------------------------------------------------------------------------------
 1 | package com.lexpredict.tika;
 2 | 
 3 | import org.apache.pdfbox.cos.COSName;
 4 | import org.apache.pdfbox.pdmodel.PDDocument;
 5 | import org.apache.pdfbox.pdmodel.PDPage;
 6 | import org.apache.pdfbox.pdmodel.PDPageTree;
 7 | import org.apache.pdfbox.pdmodel.PDResources;
 8 | import org.apache.pdfbox.pdmodel.graphics.PDXObject;
 9 | import org.apache.pdfbox.pdmodel.graphics.form.PDFormXObject;
10 | import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject;
11 | import javax.imageio.ImageIO;
12 | import java.awt.*;
13 | import java.awt.image.BufferedImage;
14 | import java.io.ByteArrayOutputStream;
15 | import java.io.IOException;
16 | 
17 | // TODO: somehow we should determine image type from COSName
18 | // or PDImageXObject before saving it back to ByteArrayOutputStream in getImageBytes()
19 | 
20 | // TODO: determine contrast background color in flattenImage()
21 | 
22 | public class PdfContentImagePreprocessor {
23 |     private boolean imagesWereChanged;
24 | 
25 |     private PDDocument document;
26 | 
27 |     public boolean removeImagesAlphaChannel(PDDocument document) {
28 |         this.document = document;
29 |         imagesWereChanged = false;
30 |         try {
31 |             removeImagesAlphaChannelUnsafe();
32 |             return imagesWereChanged;
33 |         } catch (Exception e) {
34 |             return false;
35 |         }
36 |     }
37 | 
38 |     private void removeImagesAlphaChannelUnsafe() {
39 |         try {
40 |             PDPageTree allPages = document.getDocumentCatalog().getPages();
41 |             for (int i = 0; i < allPages.getCount(); i++) {
42 |                 PDPage page = allPages.get(i);
43 |                 processImagesFromResources(page.getResources());
44 |             }
45 |         } catch (Exception e) {
46 |             e.printStackTrace();
47 |         }
48 |     }
49 | 
50 |     // search for images in document's resources
51 |     private void processImagesFromResources(PDResources resources) throws IOException {
52 |         for (COSName xObjectName : resources.getXObjectNames()) {
53 |             PDXObject xObject = resources.getXObject(xObjectName);
54 | 
55 |             if (xObject instanceof PDFormXObject) {
56 |                 processImagesFromResources(((PDFormXObject) xObject).getResources());
57 |             } else if (xObject instanceof PDImageXObject) {
58 |                 PDImageXObject img = (PDImageXObject) xObject;
59 |                 if (!img.getImage().getColorModel().hasAlpha())
60 |                     return;
61 | 
62 |                 PDImageXObject cpy = makeImageObjectCopy(img);
63 |                 resources.put(xObjectName, cpy);
64 |                 imagesWereChanged = true;
65 |             }
66 |         }
67 |     }
68 | 
69 |     // load the image, "flatten" it and store it into bytes
70 |     // then return new PDImageXObject from image's bytes
71 |     private PDImageXObject makeImageObjectCopy(PDImageXObject img) throws IOException {
72 |         BufferedImage flatImg = flattenImage(img.getImage());
73 |         byte[] bytes = getImageBytes(flatImg);
74 |         return PDImageXObject.createFromByteArray(document, bytes, "image");
75 |     }
76 | 
77 |     // make a new BufferedImage drawn on a solid background
78 |     private BufferedImage flattenImage(BufferedImage img) {
79 |         BufferedImage copy = new BufferedImage(img.getWidth(), img.getHeight(), BufferedImage.TYPE_INT_RGB);
80 |         Graphics2D g2d = copy.createGraphics();
81 |         g2d.setColor(Color.WHITE);
82 |         g2d.fillRect(0, 0, copy.getWidth(), copy.getHeight());
83 |         g2d.drawImage(img, 0, 0, null);
84 |         g2d.dispose();
85 |         return copy;
86 |     }
87 | 
88 |     // serialize image as bytes
89 |     private byte[] getImageBytes(BufferedImage img) throws IOException {
90 |         ByteArrayOutputStream baos = new ByteArrayOutputStream();
91 |         ImageIO.write(img, "png", baos );
92 |         baos.flush();
93 |         byte[] imageInByte = baos.toByteArray();
94 |         baos.close();
95 |         return imageInByte;
96 |     }
97 | }
98 | 


--------------------------------------------------------------------------------
/lexpredict-tika/readme.txt:
--------------------------------------------------------------------------------
  1 | Release notes:
  2 | 
  3 | - current version: 1.0
  4 | 
  5 | 
  6 | 
  7 | 
  8 | 
  9 | 
 10 | - 1 - how to build
 11 | 
 12 | Just run "mvn install" or "mvn install -DskipTests" command in the project directory (lexpredict-tika). Output file lexpredict-tika-<version>.jar would be in lexpredict-tika/target/ folder.
 13 | 
 14 | 
 15 | - 2 - how to use
 16 | 
 17 | The resulted artifact (JAR) is a part of docker container. There, in docker, the jar-file is used as a parameter while starting Tika server like this:
 18 | 
 19 | java -cp "tika-server-${TIKA_VERSION}.jar:lexpredict-tika-<version>.jar:libs/*" org.apache.tika.server.TikaServerCli --config tika.config
 20 | 
 21 | 
 22 | - 3 - how to test locally:
 23 | 
 24 | Suppose we have a following folders and files:
 25 |   documents/
 26 |       source_doc.pdf
 27 |   parsed/
 28 |   scripts/
 29 |       tika-server-1.20.jar
 30 |       lexpredict-tika-1.0.jar
 31 |       tika.config
 32 | first thing to do is to
 33 | 
 34 | 
 35 | To debug processing a single file in IDE:
 36 | Main class:        org.apache.tika.cli.TikaCLI
 37 | Program arguments: --config=example_tika.config -J -t -eutf-8 tmp/your_file.pdf
 38 | 
 39 | 
 40 | - 3.1 - run Tika server
 41 | 
 42 | Currently we are in "parsed" directory. Our tika server has version 1.20, our plugin has version 1.0.
 43 | Run the following command:
 44 | java -cp 'tika-server-1.20.jar:lexpredict-tika-1.0.jar:libs/*' org.apache.tika.server.TikaServerCli --port 9999 --config tika.config
 45 | 
 46 | We should see a number of lines in output, like:
 47 | INFO  Starting Apache Tika 1.20 server
 48 | ...
 49 | INFO  Using custom config: tika.config
 50 | ...
 51 | Started Apache Tika server at http://localhost:9999/
 52 | 
 53 | 
 54 | - 3.2 - parse a document
 55 | 
 56 | Run command:
 57 | curl -T documents/source_doc.pdf http://localhost:9999 -H pdf-parse:pdf_ocr > parsed/parsed_doc.zip
 58 | Note the -H pdf-parse:pdf_ocr parameter.
 59 | This parameter comes from the plugin. It could have one of the three values
 60 | 1) "pdf_ocr" means that plugin will decide what internal parser to use, PDF-2-TEXT or OCR,
 61 | 2) "strip" means the same, but the "printed" text will be obtained by PDFBox PDFTextStripper class
 62 | 3) "default" means that the plugin will work as a standard PDFParser plugin
 63 | 
 64 | Alternatively, you can set the LEXNLP_TIKA_PARSER_MODE environment variable instead of
 65 | passing the "pdf-parse" request header.
 66 | 
 67 | 
 68 | - 4.2 - source files
 69 | 
 70 | 1) Directory lexpredict-tika/src/main/java/com/lexpredict/tika, files:
 71 | 1.1) AlterPDFParser.java
 72 | here is the plugin itself. A class derived from standard PDFParser.
 73 | 
 74 | 1.2) FieldLookup.java
 75 | finds a field in the passed class or one of its ancestors.
 76 | 
 77 | 1.3) HttpRequestParamsReader.java
 78 | a class that captures HTTP context for the command passed to the Tika server. Searches for "pdf-parse" request parameter.
 79 | 
 80 | 1.4) PdfContentImagePreprocessor.java
 81 | this class "removes" alpha channel from all embedded in PDDocument images by drawing them on a solid color background. Thus preventing issue with parsing transparent images.
 82 | 
 83 | 1.5) PdfContentTypeChecker.java
 84 | this class determines the content of the PDDocument passed. The content is one of "EMPTY", "TEXT", "IMAGES", "MIXED" (text + images) or "UNKNOWN" (the document could not be analyzed). When the content is "IMAGES" and "pdf-parse" is set to "pdf_ocr" the parser uses OCR document processing.
 85 | 
 86 | 1.6) PdfStripperProcessor.java
 87 | the class encapsulates PDFTextStripper functionality for setting text in ContentHandler parameter.
 88 | 
 89 | 1.7) ShallowCopy.java
 90 | this class makes a shallow copy, setting the "to" object's properties equal to the ones of object "from".
 91 | 
 92 | 
 93 | 2) Directory lexpredict-tika/src/test/java/com/lexpredict/tika
 94 | 2.1) AlterPDFParserTest.java
 95 | contains unit tests for AlterPDFParser: tests getting text from vector and scanned PDF.
 96 | 
 97 | 2.2, 2.3, 2.4) PdfContentImagePreprocessorTest.java, PdfContentTypeCheckerTest.java, ShallowCopyTest.java
 98 | unit tests for PdfContentImagePreprocessor, PdfContentTypeChecker and ShallowCopy classes
 99 | 
100 | File pom.xml.debug is a copy of pom.xml plus extra dependency for debugging parser with tika-app.
101 | 
102 | 


--------------------------------------------------------------------------------
/lexpredict-tika/src/main/java/com/lexpredict/tika/HttpRequestParamsReader.java:
--------------------------------------------------------------------------------
  1 | package com.lexpredict.tika;
  2 | 
  3 | import org.eclipse.jetty.http.HttpField;
  4 | import org.eclipse.jetty.http.HttpFields;
  5 | import org.eclipse.jetty.http.MetaData;
  6 | import javax.servlet.http.HttpServletRequest;
  7 | import java.io.InputStream;
  8 | import java.lang.reflect.Field;
  9 | import java.util.HashMap;
 10 | 
 11 | enum CommonParseFlag
 12 | {
 13 |     VERBOSE, PDF_PARSE_METHOD;
 14 | }
 15 | 
 16 | // class reads HttpRequest params from InputStream
 17 | // if InputStream is from HttpRequest
 18 | public class HttpRequestParamsReader {
 19 |     public static final String PDF_PARSE_METHOD_STRIP = "strip";
 20 |     public static final String PDF_PARSE_METHOD_PDF_OCR = "pdf_ocr";
 21 |     public static final String PDF_PARSE_METHOD_PDF_PREFER_TEXT = "pdf_prefer_text";
 22 |     public static final String PDF_PARSE_METHOD_PDF_ONLY = "pdf_only";
 23 |     public static final String PDF_PARSE_METHOD_OCR_ONLY = "ocr_only";
 24 | 
 25 |     public HashMap<String, String> rawParams = new HashMap<String, String>();
 26 |     public HashMap<CommonParseFlag, String> typedParams = new HashMap<>();
 27 | 
 28 |     private static HashMap<String, CommonParseFlag> flagByName = new HashMap<String, CommonParseFlag>() {
 29 |         {
 30 |             put("v", CommonParseFlag.VERBOSE);
 31 |             put("-verbose", CommonParseFlag.VERBOSE);
 32 |             put("pdf-parse", CommonParseFlag.PDF_PARSE_METHOD);
 33 |         }
 34 |     };
 35 | 
 36 |     private static HttpRequestParamsReader single_instance = null;
 37 | 
 38 |     private boolean initialized = false;
 39 | 
 40 |     private HttpRequestParamsReader()
 41 |     {
 42 |     }
 43 | 
 44 |     // static method to create instance of Singleton class
 45 |     public static HttpRequestParamsReader getInstance()
 46 |     {
 47 |         if (single_instance == null)
 48 |             single_instance = new HttpRequestParamsReader();
 49 |         return single_instance;
 50 |     }
 51 | 
 52 |     public void initialize(InputStream stream) {
 53 |         if (initialized)
 54 |             return;
 55 |         initialized = true;
 56 |         MetaData metaDict = getMetaDataField(stream);
 57 |         if (metaDict == null)
 58 |             return;
 59 | 
 60 |         HttpFields fields = metaDict.getFields();
 61 |         for (HttpField field : fields)
 62 |             rawParams.put(field.getName(), field.getValue());
 63 |         GetCommonFlags();
 64 |     }
 65 | 
 66 |     public boolean IsVerbose() {
 67 |         return typedParams.containsKey(CommonParseFlag.VERBOSE);
 68 |     }
 69 | 
 70 |     public void outIfVerbose(String s) {
 71 |         if (!IsVerbose()) return;
 72 |         System.out.println(s);
 73 |     }
 74 | 
 75 |     // just check the value specified in the dictionary passed
 76 |     public boolean checkParamValue(CommonParseFlag ptrName, String expectedValue) {
 77 |         return typedParams.containsKey(ptrName) &&
 78 |                 typedParams.get(ptrName).equalsIgnoreCase(
 79 |                         expectedValue);
 80 |     }
 81 | 
 82 |     private void GetCommonFlags() {
 83 |         rawParams.entrySet().forEach(entry -> {
 84 |             flagByName.entrySet().forEach(fl -> {
 85 |                 if (fl.getKey().equals(entry.getKey()))
 86 |                     typedParams.put(fl.getValue(), entry.getValue());
 87 |             });
 88 |         });
 89 |     }
 90 | 
 91 |     // read metadata from HttpRequest
 92 |     private static MetaData getMetaDataField(Object stream) {
 93 |         while (true) {
 94 |             try {
 95 |                 Field field = FieldLookup.findField(stream.getClass(), "val$req");
 96 |                 if (field != null) {
 97 |                     field.setAccessible(true);
 98 |                     HttpServletRequest req = (HttpServletRequest) field.get(stream);
 99 |                     field = FieldLookup.findField(req.getClass(), "_metaData");
100 |                     if (field == null)
101 |                         return null;
102 | 
103 |                     field.setAccessible(true);
104 |                     return (MetaData) field.get(req);
105 |                 }
106 |             } catch (IllegalAccessException ex) {
107 |                 return null;
108 |             }
109 | 
110 |             Field inField = FieldLookup.findField(stream.getClass(), "in");
111 |             if (inField == null)
112 |                 return null;
113 | 
114 |             inField.setAccessible(true);
115 |             try {
116 |                 stream = inField.get(stream);
117 |             } catch (IllegalAccessException e) {
118 |                 return null;
119 |             }
120 |         }
121 |     }
122 | }
123 | 


--------------------------------------------------------------------------------
/lexpredict-tika/src/main/java/com/lexpredict/tika/PDFEncodedStringDecoder.java:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * Licensed to the Apache Software Foundation (ASF) under one or more
  3 |  * contributor license agreements.  See the NOTICE file distributed with
  4 |  * this work for additional information regarding copyright ownership.
  5 |  * The ASF licenses this file to You under the Apache License, Version 2.0
  6 |  * (the "License"); you may not use this file except in compliance with
  7 |  * the License.  You may obtain a copy of the License at
  8 |  *
  9 |  *     http://www.apache.org/licenses/LICENSE-2.0
 10 |  *
 11 |  * Unless required by applicable law or agreed to in writing, software
 12 |  * distributed under the License is distributed on an "AS IS" BASIS,
 13 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 14 |  * See the License for the specific language governing permissions and
 15 |  * limitations under the License.
 16 |  *
 17 |  * Modifications copyright (C) 2020 ContraxSuite, LLC
 18 |  */
 19 | 
 20 | package com.lexpredict.tika;
 21 | 
 22 | import static java.nio.charset.StandardCharsets.ISO_8859_1;
 23 | 
 24 | import java.io.ByteArrayInputStream;
 25 | import java.io.IOException;
 26 | import java.io.InputStream;
 27 | 
 28 | import org.apache.pdfbox.cos.COSString;
 29 | import org.apache.pdfbox.io.RandomAccessBuffer;
 30 | import org.apache.pdfbox.io.RandomAccessRead;
 31 | import org.apache.pdfbox.pdfparser.COSParser;
 32 | 
 33 | /**
 34 |  * In fairly rare cases, a PDF's XMP will contain a string that
 35 |  * has incorrectly been encoded with PDFEncoding: an octal for non-ascii and
 36 |  * ascii for ascii, e.g. "\376\377\000M\000i\000c\000r\000o\000s\000o\000f\000t\000"
 37 |  * <p>
 38 |  * This class can be used to decode those strings.
 39 |  * <p>
 40 |  * See TIKA-1678.  Many thanks to Andrew Jackson for raising this issue
 41 |  * and Tilman Hausherr for the solution.
 42 |  * <p>
 43 |  * As of this writing, we are only handling strings that start with
 44 |  * an encoded BOM.  Andrew Jackson found a handful of other examples (e.g.
 45 |  * this ISO-8859-7 string:
 46 |  * "Microsoft Word - \\323\\365\\354\\354\\345\\364\\357\\367\\336
 47 |  * \\364\\347\\362 PRAKSIS \\363\\364\\357")
 48 |  * that we aren't currently handling.
 49 |  */
 50 | class PDFEncodedStringDecoder {
 51 | 
 52 |     private static final String[] PDF_ENCODING_BOMS = {
 53 |             "\\376\\377", //UTF-16BE
 54 |             "\\377\\376", //UTF-16LE
 55 |             "\\357\\273\\277"//UTF-8
 56 |     };
 57 | 
 58 |     /**
 59 |      * Does this string contain an octal-encoded UTF BOM?
 60 |      * Call this statically to determine if you should bother creating a new parser to parse it.
 61 |      * @param s
 62 |      * @return
 63 |      */
 64 |     static boolean shouldDecode(String s) {
 65 |         if (s == null || s.length() < 8) {
 66 |             return false;
 67 |         }
 68 |         for (String BOM : PDF_ENCODING_BOMS) {
 69 |             if (s.startsWith(BOM)) {
 70 |                 return true;
 71 |             }
 72 |         }
 73 |         return false;
 74 |     }
 75 | 
 76 |     /**
 77 |      * This assumes that {@link #shouldDecode(String)} has been called
 78 |      * and has returned true.  If you run this on a non-octal encoded string,
 79 |      * disaster will happen!
 80 |      *
 81 |      * @param value
 82 |      * @return
 83 |      */
 84 |     String decode(String value) {
 85 |         try {
 86 |             byte[] bytes = new String("(" + value + ")").getBytes(ISO_8859_1);
 87 |             InputStream is = new ByteArrayInputStream(bytes);
 88 |             PDFEncodedStringDecoder.COSStringParser p = new PDFEncodedStringDecoder.COSStringParser(new RandomAccessBuffer(is));
 89 |             String parsed = p.myParseCOSString();
 90 |             if (parsed != null) {
 91 |                 return parsed;
 92 |             }
 93 |         } catch (IOException e) {
 94 |             //oh well, we tried.
 95 |         }
 96 |         //just return value if something went wrong
 97 |         return value;
 98 |     }
 99 | 
100 |     class COSStringParser extends COSParser {
101 | 
102 |         COSStringParser(RandomAccessRead buffer) throws IOException {
103 |             super(buffer);
104 |         }
105 | 
106 |         /**
107 |          *
108 |          * @return parsed string or null if something went wrong.
109 |          */
110 |         String myParseCOSString() {
111 |             try {
112 |                 COSString cosString = parseCOSString();
113 |                 if (cosString != null) {
114 |                     return cosString.getString();
115 |                 }
116 |             } catch (IOException e) {
117 |             }
118 |             return null;
119 |         }
120 |     }
121 | }
122 | 


--------------------------------------------------------------------------------
/lexpredict-tika/src/main/java/com/lexpredict/tika/PdfContentTypeChecker.java:
--------------------------------------------------------------------------------
  1 | package com.lexpredict.tika;
  2 | 
  3 | import org.apache.pdfbox.contentstream.operator.Operator;
  4 | import org.apache.pdfbox.cos.COSName;
  5 | import org.apache.pdfbox.pdfparser.PDFStreamParser;
  6 | import org.apache.pdfbox.pdmodel.PDDocument;
  7 | import org.apache.pdfbox.pdmodel.PDPage;
  8 | import org.apache.pdfbox.pdmodel.PDPageTree;
  9 | import org.apache.pdfbox.pdmodel.PDResources;
 10 | import org.apache.pdfbox.pdmodel.graphics.PDXObject;
 11 | import org.apache.pdfbox.pdmodel.graphics.form.PDFormXObject;
 12 | import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject;
 13 | import org.xml.sax.SAXException;
 14 | 
 15 | import java.io.IOException;
 16 | import java.io.InputStream;
 17 | import java.util.List;
 18 | 
 19 | 
 20 | // import org.apache.tika.parser.pdf.PDFParser;
 21 | // class MyPDF2XHTML extends PDF2XHTML {
 22 | 
 23 | 
 24 | 
 25 | // determine content of the PDDocument passed:
 26 | // whether it contains text, images, text + images or just nothing
 27 | public class PdfContentTypeChecker {
 28 |     public enum PdfContent {
 29 |         EMPTY, TEXT, IMAGES, MIXED, UNKNOWN
 30 |     }
 31 | 
 32 |     private PdfContent docContent = PdfContent.EMPTY;
 33 | 
 34 |     private int pageCount = 0;
 35 | 
 36 |     private int imagesCount = 0;
 37 | 
 38 |     private int textBlocks = 0;
 39 | 
 40 |     private int fullTextLength = 0;
 41 | 
 42 |     private PDFTextStripper pdfTextStripper;
 43 | 
 44 |     public int getImagesCount() {
 45 |         return imagesCount;
 46 |     }
 47 | 
 48 |     public int getTextBlocks() {
 49 |         return textBlocks;
 50 |     }
 51 | 
 52 |     // reads PDDocument from the stream and calls determineDocContentType
 53 |     public PdfContent determineDocContentType(InputStream stream) {
 54 |         try {
 55 |             PDDocument document = PDDocument.load(stream);
 56 |             return determineDocContentType(document);
 57 |         } catch (Exception e) {
 58 |             return PdfContent.UNKNOWN;
 59 |         }
 60 |     }
 61 | 
 62 |     public PdfContent determineDocContentType(PDDocument document) throws IOException {
 63 |         try {
 64 |             calculateObjectsInDocument(document);
 65 |         } catch (Exception e) {
 66 |             return PdfContent.UNKNOWN;
 67 |         }
 68 |         int totalCount = imagesCount + textBlocks;
 69 |         docContent = totalCount == 0 ? PdfContent.EMPTY
 70 |                 : imagesCount > 0 && textBlocks > 0 ? PdfContent.MIXED
 71 |                 : imagesCount > 0 ? PdfContent.IMAGES
 72 |                 : PdfContent.TEXT;
 73 |         return docContent;
 74 |     }
 75 | 
 76 |     // calculate count of text blocks (textBlocks member) and
 77 |     // images (imagesCount) in the document
 78 |     private void calculateObjectsInDocument(PDDocument document) throws IOException {
 79 |         this.pdfTextStripper = new PDFTextStripper();
 80 | 
 81 |         try {
 82 |             PDPageTree allPages = document.getDocumentCatalog().getPages();
 83 |             this.pageCount = allPages.getCount();
 84 |             for (int i = 0; i < allPages.getCount(); i++) {
 85 |                 PDPage page = allPages.get(i);
 86 |                 readObjectsOnPage(page);
 87 |                 calculateTextLengthOnPage(document, i + 1);
 88 |             }
 89 |         } catch (Exception e) {
 90 |             e.printStackTrace();
 91 |         }
 92 |     }
 93 | 
 94 |     // calculate objects' count for the page passed
 95 |     private void readObjectsOnPage(PDPage page) throws IOException {
 96 |         getImagesFromResources(page.getResources());
 97 |         calculateTextObjectsOnPage(page);
 98 |     }
 99 | 
100 | 
101 |     private void calculateTextLengthOnPage(PDDocument doc, int pageNum1Based) throws IOException, SAXException {
102 |         this.pdfTextStripper.setStartPage(pageNum1Based);
103 |         this.pdfTextStripper.setEndPage(pageNum1Based);
104 |         String text = this.pdfTextStripper.getText(doc);
105 |         if (text != null) {
106 |             text = text.trim().replaceAll("\\s+", " ");
107 |             this.fullTextLength += text.length();
108 |         }
109 |     }
110 | 
111 |     private void calculateTextObjectsOnPage(PDPage page) throws IOException {
112 |         PDFStreamParser parser = new PDFStreamParser(page);
113 |         parser.parse();
114 |         List<Object> pageTokens = parser.getTokens();
115 |         for (Object token : pageTokens) {
116 |             if (token instanceof Operator) {
117 |                 String opName = ((Operator) token).getName();
118 |                 if (opName.equals("BT")) // Begin Text
119 |                     textBlocks++;
120 |             }
121 |         }
122 |     }
123 | 
124 |     private void getImagesFromResources(PDResources resources) throws IOException {
125 |         for (COSName xObjectName : resources.getXObjectNames()) {
126 |             PDXObject xObject = resources.getXObject(xObjectName);
127 | 
128 |             if (xObject instanceof PDFormXObject) {
129 |                 getImagesFromResources(((PDFormXObject) xObject).getResources());
130 |             } else if (xObject instanceof PDImageXObject) {
131 |                 //((PDImageXObject) xObject).getImage();
132 |                 imagesCount++;
133 |             }
134 |         }
135 |     }
136 | }
137 | 


--------------------------------------------------------------------------------
/lexpredict-tika/src/main/java/com/lexpredict/tika/PdfStripperProcessor.java:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * Licensed to the Apache Software Foundation (ASF) under one or more
  3 |  * contributor license agreements.  See the NOTICE file distributed with
  4 |  * this work for additional information regarding copyright ownership.
  5 |  * The ASF licenses this file to You under the Apache License, Version 2.0
  6 |  * (the "License"); you may not use this file except in compliance with
  7 |  * the License.  You may obtain a copy of the License at
  8 |  *
  9 |  *     http://www.apache.org/licenses/LICENSE-2.0
 10 |  *
 11 |  * Unless required by applicable law or agreed to in writing, software
 12 |  * distributed under the License is distributed on an "AS IS" BASIS,
 13 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 14 |  * See the License for the specific language governing permissions and
 15 |  * limitations under the License.
 16 |  *
 17 |  * Modifications copyright (C) 2020 ContraxSuite, LLC
 18 |  */
 19 | 
 20 | package com.lexpredict.tika;
 21 | 
 22 | import org.apache.pdfbox.pdmodel.PDDocument;
 23 | import org.apache.pdfbox.text.PDFTextStripper;
 24 | import org.apache.tika.sax.SecureContentHandler;
 25 | import org.apache.tika.sax.ToTextContentHandler;
 26 | import org.apache.tika.sax.xpath.Matcher;
 27 | import org.apache.tika.sax.xpath.MatchingContentHandler;
 28 | import org.xml.sax.ContentHandler;
 29 | import org.xml.sax.SAXException;
 30 | import java.io.IOException;
 31 | import java.io.Writer;
 32 | import java.lang.reflect.Field;
 33 | import java.lang.reflect.InvocationTargetException;
 34 | import java.lang.reflect.Method;
 35 | 
 36 | // Class uses PDFBox text "stripping" methods
 37 | // instead of Tika's ones
 38 | // Sometimes PDFBox method format extracted text better than Tika
 39 | public class PdfStripperProcessor {
 40 |     public static void setTextUsingPDFTextStripper(ContentHandler handler, PDDocument pdfDocument)
 41 |             throws IOException, SAXException, NoSuchMethodException, InvocationTargetException,
 42 |             IllegalAccessException, NoSuchFieldException {
 43 |         PDFTextStripper stripper = new PDFTextStripper();
 44 |         String text = stripper.getText(pdfDocument);
 45 |         char[] chars = text.toCharArray();
 46 |         setContentHandlerCharacters(handler, chars);
 47 |     }
 48 | 
 49 |     private static void setContentHandlerCharacters(ContentHandler handler, char[] chars)
 50 |             throws SAXException, NoSuchMethodException, InvocationTargetException, IllegalAccessException,
 51 |             NoSuchFieldException, IOException {
 52 | 
 53 |         advanceSecureContentHandler(handler, chars.length);
 54 | 
 55 |         ContentHandler textHandler = getUnderlyingHandler(handler, ToTextContentHandler.class);
 56 |         if (textHandler instanceof ToTextContentHandler) {
 57 |             writeCharsToTextHandler((ToTextContentHandler)textHandler, chars);
 58 |             return;
 59 |         }
 60 | 
 61 |         ContentHandler matchHandler = getUnderlyingHandler(handler, MatchingContentHandler.class);
 62 |         if (matchHandler instanceof MatchingContentHandler) {
 63 |             setCharsBypassingMatching(handler, (MatchingContentHandler)matchHandler, chars);
 64 |             return;
 65 |         }
 66 | 
 67 |         handler.characters(chars, 0, chars.length);
 68 |     }
 69 | 
 70 |     private static void advanceSecureContentHandler(ContentHandler handler, int bytesCount)
 71 |             throws IllegalAccessException, NoSuchMethodException, InvocationTargetException {
 72 |         ContentHandler secHandler = getUnderlyingHandler(handler, SecureContentHandler.class);
 73 |         if (!(secHandler instanceof SecureContentHandler))
 74 |             return;
 75 |         Method adMethod = SecureContentHandler.class.getDeclaredMethod("advance",
 76 |                 int.class);
 77 |         adMethod.setAccessible(true);
 78 |         adMethod.invoke(secHandler, bytesCount);
 79 |     }
 80 | 
 81 |     private static void writeCharsToTextHandler(ToTextContentHandler handler, char[] chars)
 82 |             throws IllegalAccessException, NoSuchFieldException, IOException {
 83 |         Field writerField = FieldLookup.findField(handler.getClass(), "writer");
 84 |         if (writerField == null)
 85 |             throw new NoSuchFieldException("writer");
 86 |         writerField.setAccessible(true);
 87 |         Writer writer = (Writer)writerField.get(handler);
 88 |         writer.write(chars);
 89 |         writer.close();
 90 |     }
 91 | 
 92 |     private static void directlySetCharacters(ContentHandler handler, char[] chars)
 93 |             throws NoSuchMethodException, IllegalAccessException, InvocationTargetException {
 94 |         Method setter;
 95 |         while (true) {
 96 |             try {
 97 |                 setter = handler.getClass().getDeclaredMethod("characters",
 98 |                         char[].class, int.class, int.class);
 99 |                 break;
100 |             } catch (NoSuchMethodException e) {
101 |                 // pass
102 |             }
103 |             Field handField = FieldLookup.findField(handler.getClass(), "handler");
104 |             if (handField == null)
105 |                 throw new NoSuchMethodException("characters");
106 |             handField.setAccessible(true);
107 |             handler = (ContentHandler)handField.get(handler);
108 |         }
109 |         setter.invoke(handler, chars, 0, chars.length);
110 |     }
111 | 
112 |     private static void setCharsBypassingMatching(ContentHandler handler, MatchingContentHandler matchHandler, char[] chars)
113 |             throws IllegalAccessException, NoSuchFieldException,
114 |             InvocationTargetException, NoSuchMethodException {
115 |         Field matchField = MatchingContentHandler.class.getDeclaredField("matcher");
116 |         matchField.setAccessible(true);
117 |         Object oldMatcher = matchField.get(matchHandler);
118 |         matchField.set(matchHandler, new TrueMatcher());
119 | 
120 |         directlySetCharacters(handler, chars);
121 | 
122 |         matchField.set(matchHandler, oldMatcher);
123 |     }
124 | 
125 |     private static ContentHandler getUnderlyingHandler(ContentHandler handler, Class<?> desiredClass)
126 |             throws IllegalAccessException {
127 |         while (true) {
128 |             Class<?> handlerClass = handler.getClass();
129 |             if (handlerClass == desiredClass) break;
130 |             Field handlerField = FieldLookup.findField(handlerClass,"handler");
131 |             if (handlerField == null)
132 |                 return handler;
133 | 
134 |             handlerField.setAccessible(true);
135 |             handler = (ContentHandler) handlerField.get(handler);
136 |         }
137 |         return handler;
138 |     }
139 | }
140 | 
/**
 * Matcher whose {@link #matchesText()} always answers {@code true}, telling
 * the calling code that the passed text should be included in the output.
 * {@code PdfStripperProcessor} installs it temporarily on a
 * {@code MatchingContentHandler} while it writes the extracted text.
 */
class TrueMatcher extends Matcher {
    @Override
    public boolean matchesText() {
        return true;
    }
}


--------------------------------------------------------------------------------
/lexpredict-tika/src/test/java/com/lexpredict/tika/AlterPDFParserTest.java:
--------------------------------------------------------------------------------
  1 | package com.lexpredict.tika;
  2 | 
  3 | import org.apache.tika.parser.ParseContext;
  4 | import org.apache.tika.parser.pdf.PDFParser;
  5 | import org.apache.tika.parser.pdf.PDFParserConfig;
  6 | import org.junit.Test;
  7 | import java.io.InputStream;
  8 | import java.lang.reflect.Field;
  9 | import java.util.Collections;
 10 | import java.util.HashMap;
 11 | import java.util.Map;
 12 | 
 13 | import static org.junit.Assert.*;
 14 | 
 15 | public class AlterPDFParserTest extends TikaTest {
 16 |     @Test
 17 |     public void testDoubleSpacedText() throws Exception {
 18 |         PDFParser pdfParser = new AlterPDFParser();
 19 |         ParseContext context = new ParseContext();
 20 |         PDFParserConfig config = new PDFParserConfig();
 21 |         context.set(PDFParserConfig.class, config);
 22 | 
 23 |         InputStream stream = AlterPDFParserTest.class.getResourceAsStream("/test-documents/double_space_test.pdf");
 24 |         String text = getText(stream, pdfParser, context);
 25 |         stream.close();
 26 | 
 27 |         assertTrue(text.length() > 100);
 28 |     }
 29 | 
 30 |     @Test
 31 |     public void testParseSimpleScannedText() throws Exception {
 32 |         String text = getTextFromDoc("/test-documents/text_on_white.pdf",
 33 |                 AlterPDFParser.ParsePdfMode.PDF_OCR);
 34 |         assertTrue(text.length() > 50);
 35 |     }
 36 | 
 37 |     @Test
 38 |     public void testParseTransparentScannedText() throws Exception {
 39 |         String text = getTextFromDoc("/test-documents/transp_scanned.pdf",
 40 |                 AlterPDFParser.ParsePdfMode.PDF_OCR);
 41 |         assertTrue(text.length() > 50);
 42 |     }
 43 | 
 44 |     private String getTextFromDoc(String docPath,
 45 |                                   AlterPDFParser.ParsePdfMode parseMode) throws Exception {
 46 |         return getTextFromDoc(docPath, parseMode, "text");
 47 |     }
 48 | 
 49 |     private String getTextFromDoc(String docPath,
 50 |                                   AlterPDFParser.ParsePdfMode parseMode,
 51 |                                   String outputFormat) throws Exception {
 52 |         AlterPDFParser pdfParser = new AlterPDFParser();
 53 |         pdfParser.defaultParseMode = parseMode;
 54 |         ParseContext context = new ParseContext();
 55 |         PDFParserConfig config = new PDFParserConfig();
 56 |         context.set(PDFParserConfig.class, config);
 57 | 
 58 |         InputStream stream = AlterPDFParserTest.class.getResourceAsStream(docPath);
 59 |         if (outputFormat.equals("text")) {
 60 |             String txt = getText(stream, pdfParser, context);
 61 |             stream.close();
 62 |             return txt;
 63 |         }
 64 |         XMLResult rst = getXML(stream, pdfParser, context);
 65 |         stream.close();
 66 |         return rst.xml;
 67 |     }
 68 | 
 69 |     @Test
 70 |     public void testParseXhtmlNoDetail() throws Exception {
 71 |         String text = getTextFromDoc("/test-documents/sample_table.pdf",
 72 |                 AlterPDFParser.ParsePdfMode.PDF_ONLY, "xml");
 73 |         assertTrue(text.length() > 50);
 74 |     }
 75 | 
 76 |     @Test
 77 |     public void testParseJBig() throws Exception {
 78 |         String oldSysEnv = setEnvVar("LEXNLP_TIKA_PARSER_MODE", "ocr_only");
 79 |         String text = getTextFromDoc("/test-documents/jbig.pdf",
 80 |                 AlterPDFParser.ParsePdfMode.OCR_ONLY, "xml");
 81 |         setEnvVar("LEXNLP_TIKA_PARSER_MODE", oldSysEnv);
 82 |         assertTrue(text.length() > 50);
 83 |     }
 84 | 
 85 |     @Test
 86 |     public void testParseXhtmlCoordsEmbedded() throws Exception {
 87 |         String oldSysEnv = setEnvVar("LEXNLP_TIKA_XML_DETAIL", "coords_embedded");
 88 |         String text = getTextFromDoc("/test-documents/industrial developing authority.pdf",
 89 |                 AlterPDFParser.ParsePdfMode.PDF_ONLY, "xml");
 90 |         setEnvVar("LEXNLP_TIKA_XML_DETAIL", oldSysEnv);
 91 |         assertTrue(text.length() > 50);
 92 |     }
 93 | 
 94 |     @Test
 95 |     public void testParseXhtmlCoordsFlat() throws Exception {
 96 |         String oldSysEnv = setEnvVar("LEXNLP_TIKA_XML_DETAIL", "coords_flat");
 97 |         String text = getTextFromDoc("/test-documents/industrial developing authority.pdf",
 98 |                 AlterPDFParser.ParsePdfMode.PDF_ONLY, "xml");
 99 |         setEnvVar("LEXNLP_TIKA_XML_DETAIL", oldSysEnv);
100 |         assertTrue(text.length() > 50);
101 |     }
102 | 
103 |     @Test
104 |     public void testParseXhtmlCsTextFlat() throws Exception {
105 |         String oldSysEnv = setEnvVar("LEXNLP_TIKA_XML_DETAIL", "coords_text_flat");
106 |         String text = getTextFromDoc("/test-documents/double_space_test.pdf",
107 |                 AlterPDFParser.ParsePdfMode.PDF_ONLY, "xml");
108 |         setEnvVar("LEXNLP_TIKA_XML_DETAIL", oldSysEnv);
109 |         assertTrue(text.length() > 50);
110 |     }
111 | 
112 |     @Test
113 |     public void testParseToBraces() throws Exception {
114 |         String oldSysEnv = setEnvVar("LEXNLP_TIKA_XML_DETAIL", "coords_text_flat");
115 |         String text = getTextFromDoc("/test-documents/chylde_harold.pdf",
116 |                 AlterPDFParser.ParsePdfMode.PDF_ONLY, "xml");
117 |         setEnvVar("LEXNLP_TIKA_XML_DETAIL", oldSysEnv);
118 |         assertTrue(text.length() > 50);
119 |         assertTrue(text.indexOf("] ]") > 0);
120 |     }
121 | 
122 |     @Test
123 |     public void testParseNoDuplicates() throws Exception {
124 |         String oldSysEnv = setEnvVar("LEXNLP_TIKA_PARSER_MODE", "pdf_prefer_text");
125 |         String text = getTextFromDoc("/test-documents/mixed_scanned_text.pdf",
126 |                 AlterPDFParser.ParsePdfMode.PDF_ONLY, "xml");
127 |         setEnvVar("LEXNLP_TIKA_PARSER_MODE", oldSysEnv);
128 |         assertTrue(text.length() > 50);
129 |     }
130 | 
131 |     protected static String setEnvVar(String varName, String varValue) throws Exception {
132 |         String oldSysEnv = System.getenv(varName);
133 |         oldSysEnv = oldSysEnv == null ? "" : oldSysEnv;
134 |         setEnv(new HashMap<String, String>() {{
135 |             put(varName, varValue);
136 |         }});
137 |         return oldSysEnv;
138 |     }
139 | 
140 |     protected static void setEnv(Map<String, String> newenv) throws Exception {
141 |         try {
142 |             Class<?> processEnvironmentClass = Class.forName("java.lang.ProcessEnvironment");
143 |             Field theEnvironmentField = processEnvironmentClass.getDeclaredField("theEnvironment");
144 |             theEnvironmentField.setAccessible(true);
145 |             Map<String, String> env = (Map<String, String>) theEnvironmentField.get(null);
146 |             env.putAll(newenv);
147 |             Field theCaseInsensitiveEnvironmentField = processEnvironmentClass.getDeclaredField("theCaseInsensitiveEnvironment");
148 |             theCaseInsensitiveEnvironmentField.setAccessible(true);
149 |             Map<String, String> cienv = (Map<String, String>)     theCaseInsensitiveEnvironmentField.get(null);
150 |             cienv.putAll(newenv);
151 |         } catch (NoSuchFieldException e) {
152 |             Class[] classes = Collections.class.getDeclaredClasses();
153 |             Map<String, String> env = System.getenv();
154 |             for(Class cl : classes) {
155 |                 if("java.util.Collections$UnmodifiableMap".equals(cl.getName())) {
156 |                     Field field = cl.getDeclaredField("m");
157 |                     field.setAccessible(true);
158 |                     Object obj = field.get(env);
159 |                     Map<String, String> map = (Map<String, String>) obj;
160 |                     map.clear();
161 |                     map.putAll(newenv);
162 |                 }
163 |             }
164 |         }
165 |     }
166 | }
167 | 


--------------------------------------------------------------------------------
/lexpredict-tika/src/main/java/com/lexpredict/tika/PDMetadataExtractor.java:
--------------------------------------------------------------------------------
  1 | package com.lexpredict.tika;
  2 | 
  3 | import java.io.IOException;
  4 | import java.io.InputStream;
  5 | import java.util.Calendar;
  6 | import java.util.List;
  7 | import java.util.Locale;
  8 | 
  9 | import org.apache.jempbox.xmp.XMPMetadata;
 10 | import org.apache.jempbox.xmp.XMPSchema;
 11 | import org.apache.jempbox.xmp.XMPSchemaDublinCore;
 12 | import org.apache.jempbox.xmp.pdfa.XMPSchemaPDFAId;
 13 | import org.apache.pdfbox.cos.COSArray;
 14 | import org.apache.pdfbox.cos.COSBase;
 15 | import org.apache.pdfbox.cos.COSDictionary;
 16 | import org.apache.pdfbox.cos.COSString;
 17 | import org.apache.pdfbox.pdmodel.common.PDMetadata;
 18 | import org.apache.poi.util.IOUtils;
 19 | import org.apache.tika.exception.TikaException;
 20 | import org.apache.tika.extractor.EmbeddedDocumentUtil;
 21 | import org.apache.tika.metadata.Metadata;
 22 | import org.apache.tika.metadata.PDF;
 23 | import org.apache.tika.metadata.Property;
 24 | import org.apache.tika.metadata.TikaCoreProperties;
 25 | import org.apache.tika.mime.MediaType;
 26 | import org.apache.tika.parser.ParseContext;
 27 | import org.apache.tika.parser.image.xmp.JempboxExtractor;
 28 | import org.apache.tika.utils.XMLReaderUtils;
 29 | import org.w3c.dom.Document;
 30 | import org.xml.sax.SAXException;
 31 | 
 32 | class PDMetadataExtractor {
 33 | 
 34 |     private static final MediaType MEDIA_TYPE = MediaType.application("pdf");
 35 | 
 36 | 
 37 |     static void extract(PDMetadata pdMetadata, Metadata metadata, ParseContext context) {
 38 |         if (pdMetadata == null) {
 39 |             metadata.set(PDF.HAS_XMP, "false");
 40 |             return;
 41 |         }
 42 |         //this file has XMP...
 43 |         //whether or not it is readable or throws an exception is another story...
 44 |         metadata.set(PDF.HAS_XMP, "true");
 45 |         //now go for the XMP
 46 |         Document dom = loadDOM(pdMetadata, metadata, context);
 47 | 
 48 |         XMPMetadata xmp = null;
 49 |         if (dom != null) {
 50 |             xmp = new XMPMetadata(dom);
 51 |         }
 52 |         XMPSchemaDublinCore dcSchema = null;
 53 | 
 54 |         if (xmp != null) {
 55 |             try {
 56 |                 dcSchema = xmp.getDublinCoreSchema();
 57 |             } catch (IOException e) {
 58 |             }
 59 | 
 60 |             JempboxExtractor.extractXMPMM(xmp, metadata);
 61 |         }
 62 | 
 63 |         extractMultilingualItems(metadata, TikaCoreProperties.DESCRIPTION, null, dcSchema);
 64 |         extractDublinCoreListItems(metadata, TikaCoreProperties.CONTRIBUTOR, dcSchema);
 65 |         extractDublinCoreListItems(metadata, TikaCoreProperties.CREATOR, dcSchema);
 66 |         extractMultilingualItems(metadata, TikaCoreProperties.TITLE, null, dcSchema);
 67 | 
 68 |         try {
 69 |             if (xmp != null) {
 70 |                 xmp.addXMLNSMapping(XMPSchemaPDFAId.NAMESPACE, XMPSchemaPDFAId.class);
 71 |                 XMPSchemaPDFAId pdfaxmp = (XMPSchemaPDFAId) xmp.getSchemaByClass(XMPSchemaPDFAId.class);
 72 |                 if (pdfaxmp != null) {
 73 |                     if (pdfaxmp.getPart() != null) {
 74 |                         metadata.set(PDF.PDFAID_PART, Integer.toString(pdfaxmp.getPart()));
 75 |                     }
 76 |                     if (pdfaxmp.getConformance() != null) {
 77 |                         metadata.set(PDF.PDFAID_CONFORMANCE, pdfaxmp.getConformance());
 78 |                         String version = "A-" + pdfaxmp.getPart() + pdfaxmp.getConformance().toLowerCase(Locale.ROOT);
 79 |                         metadata.set(PDF.PDFA_VERSION, version);
 80 |                         metadata.add(TikaCoreProperties.FORMAT.getName(),
 81 |                                 MEDIA_TYPE.toString() + "; version=\"" + version + "\"");
 82 |                     }
 83 |                 }
 84 |                 // TODO WARN if this XMP version is inconsistent with document header version?
 85 |             }
 86 |         } catch (IOException e) {
 87 |             metadata.set(TikaCoreProperties.TIKA_META_PREFIX + "pdf:metadata-xmp-parse-failed", "" + e);
 88 |         }
 89 |     }
 90 | 
 91 |     /**
 92 |      * As of this writing, XMPSchema can contain bags or sequence lists
 93 |      * for some attributes...despite standards documentation.
 94 |      * JempBox expects one or the other for specific attributes.
 95 |      * Until more flexibility is added to JempBox, Tika will have to handle both.
 96 |      *
 97 |      * @param schema
 98 |      * @param name
 99 |      * @return list of values or null
100 |      */
101 |     static List<String> getXMPBagOrSeqList(XMPSchema schema, String name) {
102 |         List<String> ret = schema.getBagList(name);
103 |         if (ret == null) {
104 |             ret = schema.getSequenceList(name);
105 |         }
106 |         return ret;
107 |     }
108 | 
109 |     /**
110 |      * Try to extract all multilingual items from the XMPSchema
111 |      * <p/>
112 |      * This relies on the property having a valid xmp getName()
113 |      * <p/>
114 |      * For now, this only extracts the first language if the property does not allow multiple values (see TIKA-1295)
115 |      *
116 |      * @param metadata
117 |      * @param property
118 |      * @param pdfBoxBaseline
119 |      * @param schema
120 |      */
121 |     private static void extractMultilingualItems(Metadata metadata, Property property,
122 |                                                  String pdfBoxBaseline, XMPSchema schema) {
123 |         //if schema is null, just go with pdfBoxBaseline
124 |         if (schema == null) {
125 |             if (pdfBoxBaseline != null && pdfBoxBaseline.length() > 0) {
126 |                 addMetadata(metadata, property, pdfBoxBaseline);
127 |             }
128 |             return;
129 |         }
130 | 
131 |         for (String lang : schema.getLanguagePropertyLanguages(property.getName())) {
132 |             String value = schema.getLanguageProperty(property.getName(), lang);
133 | 
134 |             if (value != null && value.length() > 0) {
135 |                 //if you're going to add it below in the baseline addition, don't add it now
136 |                 if (pdfBoxBaseline != null && value.equals(pdfBoxBaseline)) {
137 |                     continue;
138 |                 }
139 |                 addMetadata(metadata, property, value);
140 |                 if (!property.isMultiValuePermitted()) {
141 |                     return;
142 |                 }
143 |             }
144 |         }
145 | 
146 |         if (pdfBoxBaseline != null && pdfBoxBaseline.length() > 0) {
147 |             //if we've already added something above and multivalue is not permitted
148 |             //return.
149 |             if (!property.isMultiValuePermitted()) {
150 |                 if (metadata.get(property) != null) {
151 |                     return;
152 |                 }
153 |             }
154 |             addMetadata(metadata, property, pdfBoxBaseline);
155 |         }
156 |     }
157 | 
158 | 
159 |     /**
160 |      * This tries to read a list from a particular property in
161 |      * XMPSchemaDublinCore.
162 |      * <p/>
163 |      * Until PDFBOX-1803/TIKA-1233 are fixed, do not call this
164 |      * on dates!
165 |      * <p/>
166 |      * This relies on the property having a DublinCore compliant getName()
167 |      *
168 |      * @param property
169 |      * @param dc
170 |      * @param metadata
171 |      */
172 |     private static void extractDublinCoreListItems(Metadata metadata, Property property, XMPSchemaDublinCore dc) {
173 |         //if no dc, add baseline and return
174 |         if (dc == null) {
175 |             return;
176 |         }
177 |         List<String> items = getXMPBagOrSeqList(dc, property.getName());
178 |         if (items == null) {
179 |             return;
180 |         }
181 |         for (String item : items) {
182 |             addMetadata(metadata, property, item);
183 |         }
184 |     }
185 | 
186 | 
187 |     static void addMetadata(Metadata metadata, Property property, String value) {
188 |         if (value != null) {
189 |             String decoded = decode(value);
190 |             if (property.isMultiValuePermitted() || metadata.get(property) == null) {
191 |                 metadata.add(property, decoded);
192 |             }
193 |             //silently skip adding property that already exists if multiple values are not permitted
194 |         }
195 |     }
196 | 
197 |     static void addMetadata(Metadata metadata, String name, String value) {
198 |         if (value != null) {
199 |             metadata.add(name, decode(value));
200 |         }
201 |     }
202 | 
203 |     static String decode(String value) {
204 |         if (PDFEncodedStringDecoder.shouldDecode(value)) {
205 |             PDFEncodedStringDecoder d = new PDFEncodedStringDecoder();
206 |             return d.decode(value);
207 |         }
208 |         return value;
209 |     }
210 | 
211 |     //can return null!
212 |     private static Document loadDOM(PDMetadata pdMetadata, Metadata metadata, ParseContext context) {
213 |         if (pdMetadata == null) {
214 |             return null;
215 |         }
216 | 
217 |         InputStream is = null;
218 |         try {
219 |             try {
220 |                 is = pdMetadata.exportXMPMetadata();
221 |             } catch (IOException e) {
222 |                 EmbeddedDocumentUtil.recordEmbeddedStreamException(e, metadata);
223 |                 return null;
224 |             }
225 |             return XMLReaderUtils.buildDOM(is, context);
226 |         } catch (IOException| SAXException | TikaException e) {
227 |             EmbeddedDocumentUtil.recordException(e, metadata);
228 |         } finally {
229 |             IOUtils.closeQuietly(is);
230 |         }
231 |         return null;
232 | 
233 |     }
234 | 
235 |     static void addMetadata(Metadata metadata, Property property, Calendar value) {
236 |         if (value != null) {
237 |             metadata.set(property, value);
238 |         }
239 |     }
240 | 
241 |     /**
242 |      * Used when processing custom metadata entries, as PDFBox won't do
243 |      * the conversion for us in the way it does for the standard ones
244 |      */
245 |     static void addMetadata(Metadata metadata, String name, COSBase value) {
246 |         if (value instanceof COSArray) {
247 |             for (Object v : ((COSArray) value).toList()) {
248 |                 addMetadata(metadata, name, ((COSBase) v));
249 |             }
250 |         } else if (value instanceof COSString) {
251 |             addMetadata(metadata, name, ((COSString) value).getString());
252 |         }
253 |         // Avoid calling COSDictionary#toString, since it can lead to infinite
254 |         // recursion. See TIKA-1038 and PDFBOX-1835.
255 |         else if (value != null && !(value instanceof COSDictionary)) {
256 |             addMetadata(metadata, name, value.toString());
257 |         }
258 |     }
259 | }
260 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
  1 |                                  Apache License
  2 |                            Version 2.0, January 2004
  3 |                         http://www.apache.org/licenses/
  4 | 
  5 |    TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
  6 | 
  7 |    1. Definitions.
  8 | 
  9 |       "License" shall mean the terms and conditions for use, reproduction,
 10 |       and distribution as defined by Sections 1 through 9 of this document.
 11 | 
 12 |       "Licensor" shall mean the copyright owner or entity authorized by
 13 |       the copyright owner that is granting the License.
 14 | 
 15 |       "Legal Entity" shall mean the union of the acting entity and all
 16 |       other entities that control, are controlled by, or are under common
 17 |       control with that entity. For the purposes of this definition,
 18 |       "control" means (i) the power, direct or indirect, to cause the
 19 |       direction or management of such entity, whether by contract or
 20 |       otherwise, or (ii) ownership of fifty percent (50%) or more of the
 21 |       outstanding shares, or (iii) beneficial ownership of such entity.
 22 | 
 23 |       "You" (or "Your") shall mean an individual or Legal Entity
 24 |       exercising permissions granted by this License.
 25 | 
 26 |       "Source" form shall mean the preferred form for making modifications,
 27 |       including but not limited to software source code, documentation
 28 |       source, and configuration files.
 29 | 
 30 |       "Object" form shall mean any form resulting from mechanical
 31 |       transformation or translation of a Source form, including but
 32 |       not limited to compiled object code, generated documentation,
 33 |       and conversions to other media types.
 34 | 
 35 |       "Work" shall mean the work of authorship, whether in Source or
 36 |       Object form, made available under the License, as indicated by a
 37 |       copyright notice that is included in or attached to the work
 38 |       (an example is provided in the Appendix below).
 39 | 
 40 |       "Derivative Works" shall mean any work, whether in Source or Object
 41 |       form, that is based on (or derived from) the Work and for which the
 42 |       editorial revisions, annotations, elaborations, or other modifications
 43 |       represent, as a whole, an original work of authorship. For the purposes
 44 |       of this License, Derivative Works shall not include works that remain
 45 |       separable from, or merely link (or bind by name) to the interfaces of,
 46 |       the Work and Derivative Works thereof.
 47 | 
 48 |       "Contribution" shall mean any work of authorship, including
 49 |       the original version of the Work and any modifications or additions
 50 |       to that Work or Derivative Works thereof, that is intentionally
 51 |       submitted to Licensor for inclusion in the Work by the copyright owner
 52 |       or by an individual or Legal Entity authorized to submit on behalf of
 53 |       the copyright owner. For the purposes of this definition, "submitted"
 54 |       means any form of electronic, verbal, or written communication sent
 55 |       to the Licensor or its representatives, including but not limited to
 56 |       communication on electronic mailing lists, source code control systems,
 57 |       and issue tracking systems that are managed by, or on behalf of, the
 58 |       Licensor for the purpose of discussing and improving the Work, but
 59 |       excluding communication that is conspicuously marked or otherwise
 60 |       designated in writing by the copyright owner as "Not a Contribution."
 61 | 
 62 |       "Contributor" shall mean Licensor and any individual or Legal Entity
 63 |       on behalf of whom a Contribution has been received by Licensor and
 64 |       subsequently incorporated within the Work.
 65 | 
 66 |    2. Grant of Copyright License. Subject to the terms and conditions of
 67 |       this License, each Contributor hereby grants to You a perpetual,
 68 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 69 |       copyright license to reproduce, prepare Derivative Works of,
 70 |       publicly display, publicly perform, sublicense, and distribute the
 71 |       Work and such Derivative Works in Source or Object form.
 72 | 
 73 |    3. Grant of Patent License. Subject to the terms and conditions of
 74 |       this License, each Contributor hereby grants to You a perpetual,
 75 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 76 |       (except as stated in this section) patent license to make, have made,
 77 |       use, offer to sell, sell, import, and otherwise transfer the Work,
 78 |       where such license applies only to those patent claims licensable
 79 |       by such Contributor that are necessarily infringed by their
 80 |       Contribution(s) alone or by combination of their Contribution(s)
 81 |       with the Work to which such Contribution(s) was submitted. If You
 82 |       institute patent litigation against any entity (including a
 83 |       cross-claim or counterclaim in a lawsuit) alleging that the Work
 84 |       or a Contribution incorporated within the Work constitutes direct
 85 |       or contributory patent infringement, then any patent licenses
 86 |       granted to You under this License for that Work shall terminate
 87 |       as of the date such litigation is filed.
 88 | 
 89 |    4. Redistribution. You may reproduce and distribute copies of the
 90 |       Work or Derivative Works thereof in any medium, with or without
 91 |       modifications, and in Source or Object form, provided that You
 92 |       meet the following conditions:
 93 | 
 94 |       (a) You must give any other recipients of the Work or
 95 |           Derivative Works a copy of this License; and
 96 | 
 97 |       (b) You must cause any modified files to carry prominent notices
 98 |           stating that You changed the files; and
 99 | 
100 |       (c) You must retain, in the Source form of any Derivative Works
101 |           that You distribute, all copyright, patent, trademark, and
102 |           attribution notices from the Source form of the Work,
103 |           excluding those notices that do not pertain to any part of
104 |           the Derivative Works; and
105 | 
106 |       (d) If the Work includes a "NOTICE" text file as part of its
107 |           distribution, then any Derivative Works that You distribute must
108 |           include a readable copy of the attribution notices contained
109 |           within such NOTICE file, excluding those notices that do not
110 |           pertain to any part of the Derivative Works, in at least one
111 |           of the following places: within a NOTICE text file distributed
112 |           as part of the Derivative Works; within the Source form or
113 |           documentation, if provided along with the Derivative Works; or,
114 |           within a display generated by the Derivative Works, if and
115 |           wherever such third-party notices normally appear. The contents
116 |           of the NOTICE file are for informational purposes only and
117 |           do not modify the License. You may add Your own attribution
118 |           notices within Derivative Works that You distribute, alongside
119 |           or as an addendum to the NOTICE text from the Work, provided
120 |           that such additional attribution notices cannot be construed
121 |           as modifying the License.
122 | 
123 |       You may add Your own copyright statement to Your modifications and
124 |       may provide additional or different license terms and conditions
125 |       for use, reproduction, or distribution of Your modifications, or
126 |       for any such Derivative Works as a whole, provided Your use,
127 |       reproduction, and distribution of the Work otherwise complies with
128 |       the conditions stated in this License.
129 | 
130 |    5. Submission of Contributions. Unless You explicitly state otherwise,
131 |       any Contribution intentionally submitted for inclusion in the Work
132 |       by You to the Licensor shall be under the terms and conditions of
133 |       this License, without any additional terms or conditions.
134 |       Notwithstanding the above, nothing herein shall supersede or modify
135 |       the terms of any separate license agreement you may have executed
136 |       with Licensor regarding such Contributions.
137 | 
138 |    6. Trademarks. This License does not grant permission to use the trade
139 |       names, trademarks, service marks, or product names of the Licensor,
140 |       except as required for reasonable and customary use in describing the
141 |       origin of the Work and reproducing the content of the NOTICE file.
142 | 
143 |    7. Disclaimer of Warranty. Unless required by applicable law or
144 |       agreed to in writing, Licensor provides the Work (and each
145 |       Contributor provides its Contributions) on an "AS IS" BASIS,
146 |       WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 |       implied, including, without limitation, any warranties or conditions
148 |       of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 |       PARTICULAR PURPOSE. You are solely responsible for determining the
150 |       appropriateness of using or redistributing the Work and assume any
151 |       risks associated with Your exercise of permissions under this License.
152 | 
153 |    8. Limitation of Liability. In no event and under no legal theory,
154 |       whether in tort (including negligence), contract, or otherwise,
155 |       unless required by applicable law (such as deliberate and grossly
156 |       negligent acts) or agreed to in writing, shall any Contributor be
157 |       liable to You for damages, including any direct, indirect, special,
158 |       incidental, or consequential damages of any character arising as a
159 |       result of this License or out of the use or inability to use the
160 |       Work (including but not limited to damages for loss of goodwill,
161 |       work stoppage, computer failure or malfunction, or any and all
162 |       other commercial damages or losses), even if such Contributor
163 |       has been advised of the possibility of such damages.
164 | 
165 |    9. Accepting Warranty or Additional Liability. While redistributing
166 |       the Work or Derivative Works thereof, You may choose to offer,
167 |       and charge a fee for, acceptance of support, warranty, indemnity,
168 |       or other liability obligations and/or rights consistent with this
169 |       License. However, in accepting such obligations, You may act only
170 |       on Your own behalf and on Your sole responsibility, not on behalf
171 |       of any other Contributor, and only if You agree to indemnify,
172 |       defend, and hold each Contributor harmless for any liability
173 |       incurred by, or claims asserted against, such Contributor by reason
174 |       of your accepting any such warranty or additional liability.
175 | 
176 |    END OF TERMS AND CONDITIONS
177 | 
178 |    APPENDIX: How to apply the Apache License to your work.
179 | 
180 |       To apply the Apache License to your work, attach the following
181 |       boilerplate notice, with the fields enclosed by brackets "[]"
182 |       replaced with your own identifying information. (Don't include
183 |       the brackets!)  The text should be enclosed in the appropriate
184 |       comment syntax for the file format. We also recommend that a
185 |       file or class name and description of purpose be included on the
186 |       same "printed page" as the copyright notice for easier
187 |       identification within third-party archives.
188 | 
189 |    Copyright [yyyy] [name of copyright owner]
190 | 
191 |    Licensed under the Apache License, Version 2.0 (the "License");
192 |    you may not use this file except in compliance with the License.
193 |    You may obtain a copy of the License at
194 | 
195 |        http://www.apache.org/licenses/LICENSE-2.0
196 | 
197 |    Unless required by applicable law or agreed to in writing, software
198 |    distributed under the License is distributed on an "AS IS" BASIS,
199 |    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 |    See the License for the specific language governing permissions and
201 |    limitations under the License.
202 | 


--------------------------------------------------------------------------------
/lexpredict-tika/src/main/java/com/lexpredict/tika/XFAExtractor.java:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * Licensed to the Apache Software Foundation (ASF) under one or more
  3 |  * contributor license agreements.  See the NOTICE file distributed with
  4 |  * this work for additional information regarding copyright ownership.
  5 |  * The ASF licenses this file to You under the Apache License, Version 2.0
  6 |  * (the "License"); you may not use this file except in compliance with
  7 |  * the License.  You may obtain a copy of the License at
  8 |  *
  9 |  *     http://www.apache.org/licenses/LICENSE-2.0
 10 |  *
 11 |  * Unless required by applicable law or agreed to in writing, software
 12 |  * distributed under the License is distributed on an "AS IS" BASIS,
 13 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 14 |  * See the License for the specific language governing permissions and
 15 |  * limitations under the License.
 16 |  *
 17 |  * Modifications copyright (C) 2020 ContraxSuite, LLC
 18 |  */
 19 | 
 20 | package com.lexpredict.tika;
 21 | 
 22 | import javax.xml.namespace.QName;
 23 | import javax.xml.stream.XMLStreamConstants;
 24 | import javax.xml.stream.XMLStreamException;
 25 | import javax.xml.stream.XMLStreamReader;
 26 | import java.io.InputStream;
 27 | import java.util.HashMap;
 28 | import java.util.LinkedHashMap;
 29 | import java.util.Map;
 30 | import java.util.regex.Matcher;
 31 | import java.util.regex.Pattern;
 32 | 
 33 | import org.apache.tika.metadata.Metadata;
 34 | import org.apache.tika.parser.ParseContext;
 35 | import org.apache.tika.sax.XHTMLContentHandler;
 36 | import org.xml.sax.SAXException;
 37 | import org.xml.sax.helpers.AttributesImpl;
 38 | 
 39 | 
 40 | class XFAExtractor {
 41 | 
 42 |     private static final Pattern XFA_TEMPLATE_ANY_VERSION = Pattern.compile("^http://www.xfa.org/schema/xfa-template");
 43 |     private static final Pattern TEXT_PATTERN =
 44 |             Pattern.compile("^(speak|text|contents-richtext|toolTip|exData)$");
 45 | 
 46 |     private static final String XFA_DATA_NS = "http://www.xfa.org/schema/xfa-data/1.0/";
 47 | 
 48 |     private static final String FIELD_LN = "field";
 49 |     private static final QName XFA_DATA = new QName(XFA_DATA_NS, "data");
 50 | 
 51 |     private final Matcher xfaTemplateMatcher;//namespace any version
 52 |     private final Matcher textMatcher;
 53 | 
 54 |     XFAExtractor() {
 55 |         xfaTemplateMatcher = XFA_TEMPLATE_ANY_VERSION.matcher("");
 56 |         textMatcher = TEXT_PATTERN.matcher("");
 57 |     }
 58 | 
 59 |     void extract(InputStream xfaIs, XHTMLContentHandler xhtml, Metadata m, ParseContext context)
 60 |             throws XMLStreamException, SAXException {
 61 |         xhtml.startElement("div", "class", "xfa_content");
 62 | 
 63 |         Map<String, String> pdfObjRToValues = new HashMap<>();
 64 | 
 65 |         //for now, store and dump the fields in insertion order
 66 |         Map<String, XFAExtractor.XFAField> namedFields = new LinkedHashMap<>();
 67 | 
 68 |         //The strategy is to cache the fields in fields
 69 |         //and cache the values in pdfObjRToValues while
 70 |         //handling the text etc along the way.
 71 |         //
 72 |         //As a final step, dump the merged fields and the values.
 73 | 
 74 |         XMLStreamReader reader = context.getXMLInputFactory().createXMLStreamReader(xfaIs);
 75 |         while (reader.hasNext()) {
 76 |             switch (reader.next()) {
 77 |                 case XMLStreamConstants.START_ELEMENT :
 78 |                     QName name = reader.getName();
 79 |                     String localName = name.getLocalPart();
 80 |                     if (xfaTemplateMatcher.reset(name.getNamespaceURI()).find() &&
 81 |                             FIELD_LN.equals(name.getLocalPart())) {
 82 |                         handleField(reader, namedFields);
 83 |                     } else if (XFA_DATA.equals(name)) {//full qname match is important!
 84 |                         loadData(reader, pdfObjRToValues);
 85 |                     } else if (textMatcher.reset(localName).find()) {
 86 |                         scrapeTextUntil(reader, xhtml, name);
 87 |                     }
 88 |                     break;
 89 |                 case XMLStreamConstants.END_ELEMENT :
 90 |                     break;
 91 |             }
 92 |         }
 93 | 
 94 |         if (namedFields.size() == 0) {
 95 |             xhtml.endElement("xfa_content");
 96 |             return;
 97 |         }
 98 |         //now dump fields and values
 99 |         xhtml.startElement("div", "class", "xfa_form");
100 |         xhtml.startElement("ol");
101 |         StringBuilder sb = new StringBuilder();
102 |         for (Map.Entry<String, XFAExtractor.XFAField> e : namedFields.entrySet()) {
103 |             String fieldName = e.getKey();
104 |             XFAExtractor.XFAField field = e.getValue();
105 |             String fieldValue = pdfObjRToValues.get(fieldName);
106 |             AttributesImpl attrs = new AttributesImpl();
107 |             attrs.addAttribute("", "fieldName", "fieldName", "CDATA", fieldName);
108 | 
109 |             String displayFieldName = (field.toolTip == null ||
110 |                     field.toolTip.trim().length() == 0) ? fieldName : field.toolTip;
111 | 
112 |             sb.append(displayFieldName).append(": ");
113 |             if (fieldValue != null) {
114 |                 sb.append(fieldValue);
115 |             }
116 | 
117 |             xhtml.startElement("li", attrs);
118 |             xhtml.characters(sb.toString());
119 |             xhtml.endElement("li");
120 |             sb.setLength(0);
121 |         }
122 |         xhtml.endElement("ol");
123 |         xhtml.endElement("div");
124 |         xhtml.endElement("xfa_content");
125 |     }
126 | 
127 |     //try to scrape the text until the endElement
128 |     private void scrapeTextUntil(XMLStreamReader reader, XHTMLContentHandler xhtml,
129 |                                  QName endElement) throws XMLStreamException, SAXException {
130 |         StringBuilder buffer = new StringBuilder();
131 |         boolean keepGoing = true;
132 |         while (reader.hasNext() && keepGoing) {
133 |             switch (reader.next()) {
134 |                 case XMLStreamConstants.START_ELEMENT:
135 |                     break;
136 |                 case XMLStreamConstants.CHARACTERS:
137 |                     int start = reader.getTextStart();
138 |                     int length = reader.getTextLength();
139 |                     buffer.append(reader.getTextCharacters(),
140 |                             start,
141 |                             length);
142 |                     break;
143 | 
144 |                 case XMLStreamConstants.CDATA:
145 |                     start = reader.getTextStart();
146 |                     length = reader.getTextLength();
147 |                     buffer.append(reader.getTextCharacters(),
148 |                             start,
149 |                             length);
150 |                     break;
151 | 
152 |                 case (XMLStreamConstants.END_ELEMENT):
153 |                     if (reader.getName().equals(endElement)) {
154 |                         keepGoing = false;
155 |                     } else if ("p".equals(reader.getName().getLocalPart())) {
156 |                         xhtml.element("p", buffer.toString());
157 |                         buffer.setLength(0);
158 |                     }
159 |                     break;
160 |             }
161 |         }
162 |         String remainder = buffer.toString();
163 |         if (remainder.trim().length() > 0) {
164 |             xhtml.element("p", remainder);
165 |         }
166 |     }
167 | 
168 | 
169 |     private String scrapeTextUntil(XMLStreamReader reader, QName endElement) throws XMLStreamException {
170 |         StringBuilder buffer = new StringBuilder();
171 |         boolean keepGoing = true;
172 |         while (reader.hasNext() && keepGoing) {
173 |             switch (reader.next()) {
174 |                 case XMLStreamConstants.START_ELEMENT:
175 |                     break;
176 |                 case XMLStreamConstants.CHARACTERS:
177 |                     int start = reader.getTextStart();
178 |                     int length = reader.getTextLength();
179 |                     buffer.append(reader.getTextCharacters(),
180 |                             start,
181 |                             length);
182 |                     break;
183 | 
184 |                 case XMLStreamConstants.CDATA:
185 |                     start = reader.getTextStart();
186 |                     length = reader.getTextLength();
187 |                     buffer.append(reader.getTextCharacters(),
188 |                             start,
189 |                             length);
190 |                     break;
191 | 
192 |                 case (XMLStreamConstants.END_ELEMENT):
193 |                     if (reader.getName().equals(endElement)) {
194 |                         keepGoing = false;
195 |                     } else if ("p".equals(reader.getName().getLocalPart())) {
196 |                         buffer.append("\n");
197 |                     }
198 |                     break;
199 |             }
200 |         }
201 |         return buffer.toString();
202 |     }
203 | 
204 |     private void loadData(XMLStreamReader reader, Map<String, String> pdfObjRToValues)
205 |             throws XMLStreamException {
206 |         //reader is at the "xfa:data" element
207 |         //scrape the contents from the text containing nodes
208 |         StringBuilder buffer = new StringBuilder();
209 |         while (reader.hasNext()) {
210 |             switch (reader.next()) {
211 |                 case (XMLStreamConstants.START_ELEMENT) :
212 |                     break;
213 |                 case XMLStreamConstants.CHARACTERS:
214 |                     int start = reader.getTextStart();
215 |                     int length = reader.getTextLength();
216 |                     buffer.append(reader.getTextCharacters(),
217 |                             start,
218 |                             length);
219 |                     break;
220 | 
221 |                 case XMLStreamConstants.CDATA:
222 |                     start = reader.getTextStart();
223 |                     length = reader.getTextLength();
224 |                     buffer.append(reader.getTextCharacters(),
225 |                             start,
226 |                             length);
227 |                     break;
228 | 
229 |                 case (XMLStreamConstants.END_ELEMENT) :
230 |                     if (buffer.length() > 0) {
231 |                         String localName = reader.getLocalName();
232 |                         pdfObjRToValues.put(localName, buffer.toString());
233 |                         buffer.setLength(0);
234 |                     }
235 |                     if (XFA_DATA.equals(reader.getName())) {
236 |                         return;
237 |                     }
238 |                     break;
239 | 
240 |             }
241 |         }
242 |     }
243 | 
244 |     private void handleField(XMLStreamReader reader, Map<String, XFAExtractor.XFAField> fields) throws XMLStreamException {
245 |         //reader is set to the field element
246 |         String fieldName = findFirstAttributeValue(reader, "name");
247 |         String pdfObjRef = "";
248 |         String toolTip = "";
249 |         while (reader.hasNext()) {
250 |             switch (reader.next()) {
251 |                 case XMLStreamConstants.START_ELEMENT :
252 |                     if ("toolTip".equals(reader.getName().getLocalPart())) {
253 |                         toolTip = scrapeTextUntil(reader, reader.getName());
254 |                     }
255 |                     // add checkbutton, etcif (reader.getName().equals())
256 |                     break;
257 |                 case XMLStreamConstants.END_ELEMENT :
258 |                     if (xfaTemplateMatcher.reset(reader.getName().getNamespaceURI()).find() &&
259 |                             FIELD_LN.equals(reader.getName().getLocalPart())) {
260 |                         if (fieldName != null) {
261 |                             fields.put(fieldName, new XFAExtractor.XFAField(fieldName, toolTip, pdfObjRef));
262 |                         }
263 |                         return;
264 |                     }
265 |                     break;
266 |                 case XMLStreamConstants.PROCESSING_INSTRUCTION:
267 |                     if ("PDF_OBJR".equals(reader.getPITarget())) {
268 |                         pdfObjRef = reader.getPIData();
269 |                     }
270 |                     break;
271 | 
272 |             }
273 |         }
274 |     }
275 | 
276 |     private String findFirstAttributeValue(XMLStreamReader reader, String name) {
277 |         for (int i = 0; i < reader.getAttributeCount(); i++) {
278 |             String n = reader.getAttributeLocalName(i);
279 |             if (name.equals(n)) {
280 |                 return reader.getAttributeValue(i);
281 |             }
282 |         }
283 |         return "";
284 |     }
285 | 
286 |     class XFAField {
287 |         String fieldName;
288 |         String toolTip;
289 |         String pdfObjRef;
290 |         String value;
291 | 
292 |         public XFAField(String fieldName, String toolTip, String pdfObjRef) {
293 |             this.fieldName = fieldName;
294 |             this.toolTip = toolTip;
295 |             this.pdfObjRef = pdfObjRef;
296 |         }
297 | 
298 |         @Override
299 |         public String toString() {
300 |             return "XFAField{" +
301 |                     "fieldName='" + fieldName + '\'' +
302 |                     ", toolTip='" + toolTip + '\'' +
303 |                     ", pdfObjRef='" + pdfObjRef + '\'' +
304 |                     ", value='" + value + '\'' +
305 |                     '}';
306 |         }
307 |     }
308 | }
309 | 
310 | 


--------------------------------------------------------------------------------
/lexpredict-tika/src/main/java/com/lexpredict/tika/LegacyPDFStreamEngine.java:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * Licensed to the Apache Software Foundation (ASF) under one or more
  3 |  * contributor license agreements.  See the NOTICE file distributed with
  4 |  * this work for additional information regarding copyright ownership.
  5 |  * The ASF licenses this file to You under the Apache License, Version 2.0
  6 |  * (the "License"); you may not use this file except in compliance with
  7 |  * the License.  You may obtain a copy of the License at
  8 |  *
  9 |  *     http://www.apache.org/licenses/LICENSE-2.0
 10 |  *
 11 |  * Unless required by applicable law or agreed to in writing, software
 12 |  * distributed under the License is distributed on an "AS IS" BASIS,
 13 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 14 |  * See the License for the specific language governing permissions and
 15 |  * limitations under the License.
 16 |  *
 17 |  * Modifications copyright (C) 2020 ContraxSuite, LLC
 18 |  */
 19 | 
 20 | package com.lexpredict.tika;
 21 | 
 22 | import org.apache.commons.logging.Log;
 23 | import org.apache.commons.logging.LogFactory;
 24 | import org.apache.fontbox.ttf.TrueTypeFont;
 25 | import org.apache.fontbox.util.BoundingBox;
 26 | import org.apache.pdfbox.contentstream.PDFStreamEngine;
 27 | import org.apache.pdfbox.contentstream.operator.DrawObject;
 28 | import org.apache.pdfbox.pdmodel.PDPage;
 29 | import org.apache.pdfbox.pdmodel.common.PDRectangle;
 30 | import org.apache.pdfbox.pdmodel.font.encoding.GlyphList;
 31 | import org.apache.pdfbox.pdmodel.graphics.state.PDGraphicsState;
 32 | import org.apache.pdfbox.text.TextPosition;
 33 | import org.apache.pdfbox.util.Matrix;
 34 | import org.apache.pdfbox.util.Vector;
 35 | import java.io.IOException;
 36 | import java.io.InputStream;
 37 | import org.apache.pdfbox.pdmodel.font.PDCIDFont;
 38 | import org.apache.pdfbox.pdmodel.font.PDCIDFontType2;
 39 | import org.apache.pdfbox.pdmodel.font.PDFont;
 40 | import org.apache.pdfbox.pdmodel.font.PDSimpleFont;
 41 | import org.apache.pdfbox.pdmodel.font.PDTrueTypeFont;
 42 | import org.apache.pdfbox.pdmodel.font.PDType0Font;
 43 | import org.apache.pdfbox.pdmodel.font.PDType3Font;
 44 | import org.apache.pdfbox.contentstream.operator.state.Concatenate;
 45 | import org.apache.pdfbox.contentstream.operator.state.Restore;
 46 | import org.apache.pdfbox.contentstream.operator.state.Save;
 47 | import org.apache.pdfbox.contentstream.operator.state.SetGraphicsStateParameters;
 48 | import org.apache.pdfbox.contentstream.operator.state.SetMatrix;
 49 | import org.apache.pdfbox.contentstream.operator.text.BeginText;
 50 | import org.apache.pdfbox.contentstream.operator.text.EndText;
 51 | import org.apache.pdfbox.contentstream.operator.text.SetFontAndSize;
 52 | import org.apache.pdfbox.contentstream.operator.text.SetTextHorizontalScaling;
 53 | import org.apache.pdfbox.contentstream.operator.text.ShowTextAdjusted;
 54 | import org.apache.pdfbox.contentstream.operator.text.ShowTextLine;
 55 | import org.apache.pdfbox.contentstream.operator.text.ShowTextLineAndSpace;
 56 | import org.apache.pdfbox.contentstream.operator.text.MoveText;
 57 | import org.apache.pdfbox.contentstream.operator.text.MoveTextSetLeading;
 58 | import org.apache.pdfbox.contentstream.operator.text.NextLine;
 59 | import org.apache.pdfbox.contentstream.operator.text.SetCharSpacing;
 60 | import org.apache.pdfbox.contentstream.operator.text.SetTextLeading;
 61 | import org.apache.pdfbox.contentstream.operator.text.SetTextRenderingMode;
 62 | import org.apache.pdfbox.contentstream.operator.text.SetTextRise;
 63 | import org.apache.pdfbox.contentstream.operator.text.SetWordSpacing;
 64 | import org.apache.pdfbox.contentstream.operator.text.ShowText;
 65 | import org.apache.pdfbox.pdmodel.font.PDFontDescriptor;
 66 | 
 67 | class LegacyPDFStreamEngine extends PDFStreamEngine
 68 | {
 69 |     private static final Log LOG = LogFactory.getLog(com.lexpredict.tika.LegacyPDFStreamEngine.class);
 70 | 
 71 |     private int pageRotation;
 72 |     private PDRectangle pageSize;
 73 |     private Matrix translateMatrix;
 74 |     private final GlyphList glyphList;
 75 | 
 76 |     /**
 77 |      * Constructor.
 78 |      */
 79 |     LegacyPDFStreamEngine() throws IOException
 80 |     {
 81 |         addOperator(new BeginText());
 82 |         addOperator(new Concatenate());
 83 |         addOperator(new DrawObject()); // special text version
 84 |         addOperator(new EndText());
 85 |         addOperator(new SetGraphicsStateParameters());
 86 |         addOperator(new Save());
 87 |         addOperator(new Restore());
 88 |         addOperator(new NextLine());
 89 |         addOperator(new SetCharSpacing());
 90 |         addOperator(new MoveText());
 91 |         addOperator(new MoveTextSetLeading());
 92 |         addOperator(new SetFontAndSize());
 93 |         addOperator(new ShowText());
 94 |         addOperator(new ShowTextAdjusted());
 95 |         addOperator(new SetTextLeading());
 96 |         addOperator(new SetMatrix());
 97 |         addOperator(new SetTextRenderingMode());
 98 |         addOperator(new SetTextRise());
 99 |         addOperator(new SetWordSpacing());
100 |         addOperator(new SetTextHorizontalScaling());
101 |         addOperator(new ShowTextLine());
102 |         addOperator(new ShowTextLineAndSpace());
103 | 
104 |         // load additional glyph list for Unicode mapping
105 |         String path = "org/apache/pdfbox/resources/glyphlist/additional.txt";
106 |         InputStream input = GlyphList.class.getClassLoader().getResourceAsStream(path);
107 |         glyphList = new GlyphList(GlyphList.getAdobeGlyphList(), input);
108 |     }
109 | 
110 |     /**
111 |      * This will initialize and process the contents of the stream.
112 |      *
113 |      * @param page the page to process
114 |      * @throws java.io.IOException if there is an error accessing the stream.
115 |      */
116 |     @Override
117 |     public void processPage(PDPage page) throws IOException
118 |     {
119 |         this.pageRotation = page.getRotation();
120 |         this.pageSize = page.getCropBox();
121 | 
122 |         if (pageSize.getLowerLeftX() == 0 && pageSize.getLowerLeftY() == 0)
123 |         {
124 |             translateMatrix = null;
125 |         }
126 |         else
127 |         {
128 |             // translation matrix for cropbox
129 |             translateMatrix = Matrix.getTranslateInstance(-pageSize.getLowerLeftX(), -pageSize.getLowerLeftY());
130 |         }
131 |         super.processPage(page);
132 |     }
133 | 
134 |     /**
135 |      * This method was originally written by Ben Litchfield for PDFStreamEngine.
136 |      */
137 |     @Override
138 |     protected void showGlyph(Matrix textRenderingMatrix, PDFont font, int code, String unicode,
139 |                              Vector displacement) throws IOException
140 |     {
141 |         //
142 |         // legacy calculations which were previously in PDFStreamEngine
143 |         //
144 |         //  DO NOT USE THIS CODE UNLESS YOU ARE WORKING WITH PDFTextStripper.
145 |         //  THIS CODE IS DELIBERATELY INCORRECT
146 |         //
147 | 
148 |         PDGraphicsState state = getGraphicsState();
149 |         Matrix ctm = state.getCurrentTransformationMatrix();
150 |         float fontSize = state.getTextState().getFontSize();
151 |         float horizontalScaling = state.getTextState().getHorizontalScaling() / 100f;
152 |         Matrix textMatrix = getTextMatrix();
153 | 
154 |         BoundingBox bbox = font.getBoundingBox();
155 |         if (bbox.getLowerLeftY() < Short.MIN_VALUE)
156 |         {
157 |             // PDFBOX-2158 and PDFBOX-3130
158 |             // files by Salmat eSolutions / ClibPDF Library
159 |             bbox.setLowerLeftY(- (bbox.getLowerLeftY() + 65536));
160 |         }
161 |         // 1/2 the bbox is used as the height todo: why?
162 |         float glyphHeight = bbox.getHeight() / 2;
163 | 
164 |         // sometimes the bbox has very high values, but CapHeight is OK
165 |         PDFontDescriptor fontDescriptor = font.getFontDescriptor();
166 |         if (fontDescriptor != null)
167 |         {
168 |             float capHeight = fontDescriptor.getCapHeight();
169 |             if (capHeight != 0 && (capHeight < glyphHeight || glyphHeight == 0))
170 |             {
171 |                 glyphHeight = capHeight;
172 |             }
173 |         }
174 | 
175 |         // transformPoint from glyph space -> text space
176 |         float height;
177 |         if (font instanceof PDType3Font)
178 |         {
179 |             height = font.getFontMatrix().transformPoint(0, glyphHeight).y;
180 |         }
181 |         else
182 |         {
183 |             height = glyphHeight / 1000;
184 |         }
185 | 
186 |         float displacementX = displacement.getX();
187 |         // the sorting algorithm is based on the width of the character. As the displacement
188 |         // for vertical characters doesn't provide any suitable value for it, we have to
189 |         // calculate our own
190 |         if (font.isVertical())
191 |         {
192 |             displacementX = font.getWidth(code) / 1000;
193 |             // there may be an additional scaling factor for true type fonts
194 |             TrueTypeFont ttf = null;
195 |             if (font instanceof PDTrueTypeFont)
196 |             {
197 |                 ttf = ((PDTrueTypeFont)font).getTrueTypeFont();
198 |             }
199 |             else if (font instanceof PDType0Font)
200 |             {
201 |                 PDCIDFont cidFont = ((PDType0Font)font).getDescendantFont();
202 |                 if (cidFont instanceof PDCIDFontType2)
203 |                 {
204 |                     ttf = ((PDCIDFontType2)cidFont).getTrueTypeFont();
205 |                 }
206 |             }
207 |             if (ttf != null && ttf.getUnitsPerEm() != 1000)
208 |             {
209 |                 displacementX *= 1000f / ttf.getUnitsPerEm();
210 |             }
211 |         }
212 | 
213 |         //
214 |         // legacy calculations which were previously in PDFStreamEngine
215 |         //
216 |         //  DO NOT USE THIS CODE UNLESS YOU ARE WORKING WITH PDFTextStripper.
217 |         //  THIS CODE IS DELIBERATELY INCORRECT
218 |         //
219 | 
220 |         // (modified) combined displacement, this is calculated *without* taking the character
221 |         // spacing and word spacing into account, due to legacy code in TextStripper
222 |         float tx = displacementX * fontSize * horizontalScaling;
223 |         float ty = displacement.getY() * fontSize;
224 | 
225 |         // (modified) combined displacement matrix
226 |         Matrix td = Matrix.getTranslateInstance(tx, ty);
227 | 
228 |         // (modified) text rendering matrix
229 |         Matrix nextTextRenderingMatrix = td.multiply(textMatrix).multiply(ctm); // text space -> device space
230 |         float nextX = nextTextRenderingMatrix.getTranslateX();
231 |         float nextY = nextTextRenderingMatrix.getTranslateY();
232 | 
233 |         // (modified) width and height calculations
234 |         float dxDisplay = nextX - textRenderingMatrix.getTranslateX();
235 |         float dyDisplay = height * textRenderingMatrix.getScalingFactorY();
236 | 
237 |         //
238 |         // start of the original method
239 |         //
240 | 
241 |         // Note on variable names. There are three different units being used in this code.
242 |         // Character sizes are given in glyph units, text locations are initially given in text
243 |         // units, and we want to save the data in display units. The variable names should end with
244 |         // Text or Disp to represent if the values are in text or disp units (no glyph units are
245 |         // saved).
246 | 
247 |         float glyphSpaceToTextSpaceFactor = 1 / 1000f;
248 |         if (font instanceof PDType3Font)
249 |         {
250 |             glyphSpaceToTextSpaceFactor = font.getFontMatrix().getScaleX();
251 |         }
252 | 
253 |         float spaceWidthText = 0;
254 |         try
255 |         {
256 |             // to avoid crash as described in PDFBOX-614, see what the space displacement should be
257 |             spaceWidthText = font.getSpaceWidth() * glyphSpaceToTextSpaceFactor;
258 |         }
259 |         catch (Throwable exception)
260 |         {
261 |             LOG.warn(exception, exception);
262 |         }
263 | 
264 |         if (spaceWidthText == 0)
265 |         {
266 |             spaceWidthText = font.getAverageFontWidth() * glyphSpaceToTextSpaceFactor;
267 |             // the average space width appears to be higher than necessary so make it smaller
268 |             spaceWidthText *= .80f;
269 |         }
270 |         if (spaceWidthText == 0)
271 |         {
272 |             spaceWidthText = 1.0f; // if could not find font, use a generic value
273 |         }
274 | 
275 |         // the space width has to be transformed into display units
276 |         float spaceWidthDisplay = spaceWidthText * textRenderingMatrix.getScalingFactorX();
277 | 
278 |         // use our additional glyph list for Unicode mapping
279 |         unicode = font.toUnicode(code, glyphList);
280 | 
281 |         // when there is no Unicode mapping available, Acrobat simply coerces the character code
282 |         // into Unicode, so we do the same. Subclasses of PDFStreamEngine don't necessarily want
283 |         // this, which is why we leave it until this point in PDFTextStreamEngine.
284 |         if (unicode == null)
285 |         {
286 |             if (font instanceof PDSimpleFont)
287 |             {
288 |                 char c = (char) code;
289 |                 unicode = new String(new char[] { c });
290 |             }
291 |             else
292 |             {
293 |                 // Acrobat doesn't seem to coerce composite font's character codes, instead it
294 |                 // skips them. See the "allah2.pdf" TestTextStripper file.
295 |                 return;
296 |             }
297 |         }
298 | 
299 |         // adjust for cropbox if needed
300 |         Matrix translatedTextRenderingMatrix;
301 |         if (translateMatrix == null)
302 |         {
303 |             translatedTextRenderingMatrix = textRenderingMatrix;
304 |         }
305 |         else
306 |         {
307 |             translatedTextRenderingMatrix = Matrix.concatenate(translateMatrix, textRenderingMatrix);
308 |             nextX -= pageSize.getLowerLeftX();
309 |             nextY -= pageSize.getLowerLeftY();
310 |         }
311 | 
312 |         processTextPosition(new TextPosition(pageRotation, pageSize.getWidth(),
313 |                 pageSize.getHeight(), translatedTextRenderingMatrix, nextX, nextY,
314 |                 Math.abs(dyDisplay), dxDisplay,
315 |                 Math.abs(spaceWidthDisplay), unicode, new int[] { code } , font, fontSize,
316 |                 (int)(fontSize * textMatrix.getScalingFactorX())));
317 |     }
318 | 
319 |     /**
320 |      * Extension hook that receives the TextPosition assembled for a character code; the
321 |      * implementation in this class deliberately does nothing with it.
322 |      *
323 |      * @param text The text to be processed.
324 |      */
325 |     protected void processTextPosition(TextPosition text)
326 |     {
327 |         // intentionally empty: subclasses override this to collect or emit the text
328 |     }
329 | }
330 | 
331 | 


--------------------------------------------------------------------------------
/lexpredict-tika/src/test/java/com/lexpredict/tika/TikaTest.java:
--------------------------------------------------------------------------------
  1 | package com.lexpredict.tika;/*
  2 |  * Licensed to the Apache Software Foundation (ASF) under one or more
  3 |  * contributor license agreements.  See the NOTICE file distributed with
  4 |  * this work for additional information regarding copyright ownership.
  5 |  * The ASF licenses this file to You under the Apache License, Version 2.0
  6 |  * (the "License"); you may not use this file except in compliance with
  7 |  * the License.  You may obtain a copy of the License at
  8 |  *
  9 |  *     http://www.apache.org/licenses/LICENSE-2.0
 10 |  *
 11 |  * Unless required by applicable law or agreed to in writing, software
 12 |  * distributed under the License is distributed on an "AS IS" BASIS,
 13 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 14 |  * See the License for the specific language governing permissions and
 15 |  * limitations under the License.
 16 |  */
 17 | 
 18 | import static org.junit.Assert.assertEquals;
 19 | import static org.junit.Assert.assertFalse;
 20 | import static org.junit.Assert.assertTrue;
 21 | import static org.junit.Assert.fail;
 22 | 
 23 | import java.io.ByteArrayOutputStream;
 24 | import java.io.File;
 25 | import java.io.IOException;
 26 | import java.io.InputStream;
 27 | import java.net.URISyntaxException;
 28 | import java.net.URL;
 29 | import java.util.ArrayList;
 30 | import java.util.Collection;
 31 | import java.util.HashSet;
 32 | import java.util.List;
 33 | import java.util.Set;
 34 | 
 35 | import org.apache.tika.extractor.EmbeddedResourceHandler;
 36 | import org.apache.tika.io.IOUtils;
 37 | import org.apache.tika.io.TikaInputStream;
 38 | import org.apache.tika.metadata.Metadata;
 39 | import org.apache.tika.mime.MediaType;
 40 | import org.apache.tika.parser.AutoDetectParser;
 41 | import org.apache.tika.parser.ParseContext;
 42 | import org.apache.tika.parser.Parser;
 43 | import org.apache.tika.parser.RecursiveParserWrapper;
 44 | import org.apache.tika.sax.BasicContentHandlerFactory;
 45 | import org.apache.tika.sax.BodyContentHandler;
 46 | import org.apache.tika.sax.RecursiveParserWrapperHandler;
 47 | import org.apache.tika.sax.ToXMLContentHandler;
 48 | import org.xml.sax.ContentHandler;
 49 | import org.xml.sax.SAXException;
 50 | 
 51 | /**
 52 |  * Parent class of Tika tests
 53 |  */
 54 | public abstract class TikaTest {
 55 |     /**
 56 |      * Resolves a classpath resource to a {@link File}. When the resource does not
 57 |      * exist, a file of that name below the directory of this class is returned
 58 |      * instead (it will not exist on disk), so callers still receive a full path.
 59 |      *
 60 |      * @param name
 61 |      *            The named resource to search for.
 62 |      * @return the resource as a file, or the path where it would have been; the
 63 |      *         test fails if not even the directory of this class can be located.
 64 |      */
 65 |     public File getResourceAsFile(String name) throws URISyntaxException {
 66 |         URL url = this.getClass().getResource(name);
 67 |         if (url != null) {
 68 |             return new File(url.toURI());
 69 |         }
 70 |         // The resource is missing: fall back to a path below the directory that
 71 |         // holds this class so the caller still receives a usable File.
 72 |         URL classDir = this.getClass().getResource(".");
 73 |         if (classDir == null) {
 74 |             // "new File(...)" can never be null, so looking up the directory is
 75 |             // the only step that can fail (e.g. classes packaged in a jar).
 76 |             fail("Unable to find requested file " + name);
 77 |         }
 78 |         return new File(new File(classDir.toURI()), name);
 79 |     }
 80 | 
 81 |     public InputStream getResourceAsStream(String name) { // classpath lookup through this.getClass()
 82 |         InputStream stream = this.getClass().getResourceAsStream(name);
 83 |         if (stream == null) {
 84 |             fail("Unable to find requested resource " + name); // a missing fixture fails the test
 85 |         }
 86 |         return stream;
 87 |     }
 88 | 
 89 |     public static void assertContainsCount(String needle, String haystack, int targetCount) {
 90 |         // Counts occurrences of needle (overlapping ones included) and asserts the total.
 91 |         assertFalse("needle must not be empty", needle.isEmpty()); // indexOf("", n) is never -1: endless loop
 92 |         int count = 0;
 93 |         for (int i = haystack.indexOf(needle); i > -1; i = haystack.indexOf(needle, i + 1)) {
 94 |             count++;
 95 |         }
 96 |         assertEquals("found "+count +" but should have found: "+targetCount,
 97 |                 targetCount, count);
 98 |     }
 99 | 
100 |     // Fails the test unless needle occurs in haystack (String.contains).
101 |     public static void assertContains(String needle, String haystack) {
102 |         assertTrue(needle + " not found in:\n" + haystack, haystack.contains(needle));
103 |     }
104 |     // Collection variant: fails the test unless haystack.contains(needle) is true.
105 |     public static <T> void assertContains(T needle, Collection<? extends T> haystack) {
106 |         assertTrue(needle + " not found in:\n" + haystack, haystack.contains(needle));
107 |     }
108 |     // Inverse checks: fail when haystack.contains(needle) is true (String and Collection variants).
109 |     public static void assertNotContained(String needle, String haystack) {
110 |         assertFalse(needle + " unexpectedly found in:\n" + haystack, haystack.contains(needle));
111 |     }
112 |     public static <T> void assertNotContained(T needle, Collection<? extends T> haystack) {
113 |         assertFalse(needle + " unexpectedly found in:\n" + haystack, haystack.contains(needle));
114 |     }
115 | 
116 |     /**
117 |      * Test that in at least one item in metadataList, all keys and values
118 |      * in minExpected are contained.
119 |      * <p>
120 |      * The values in minExpected are tested for whether they are contained
121 |      * within a value in the target.  If minExpected=&quot;text/vbasic&quot;  and
122 |      * what was actually found in the target within metadatalist is
123 |      * &quot;text/vbasic; charset=windows-1252&quot;,
124 |      * that is counted as a hit.
125 |      *
126 |      * @param minExpected  names and (sub)values that must all be present in one item
127 |      * @param metadataList candidate items; the first one matching everything ends the check
128 |      */
129 |     public static void assertContainsAtLeast(Metadata minExpected, List<Metadata> metadataList) {
130 |         // the expected names do not depend on the candidate, so read them once
131 |         String[] expectedNames = minExpected.names();
132 |         for (Metadata m : metadataList) {
133 |             boolean allFound = true;
134 |             for (String n : expectedNames) {
135 |                 String[] foundVals = m.getValues(n);
136 |                 // Each expected value needs its own hit. Counting (found, expected) pairs
137 |                 // instead would reject duplicate hits and let them mask a missing value.
138 |                 for (String expectedVal : minExpected.getValues(n)) {
139 |                     boolean hit = false;
140 |                     for (String foundVal : foundVals) {
141 |                         hit |= foundVal.contains(expectedVal);
142 |                     }
143 |                     allFound &= hit;
144 |                 }
145 |             }
146 |             if (allFound) {
147 |                 //found everything!
148 |                 return;
149 |             }
150 |         }
151 |         //TODO: figure out how to have more informative error message
152 |         fail("Couldn't find everything within a single metadata item");
153 |     }
154 |     protected static class XMLResult { // result holder returned by the getXML helpers
155 |         public final String xml; // the content handler's output as a string
156 |         public final Metadata metadata; // the Metadata object that was handed to the parser
157 | 
158 |         public XMLResult(String xml, Metadata metadata) {
159 |             this.xml = xml;
160 |             this.metadata = metadata;
161 |         }
162 |     }
163 |     // Convenience overloads of getXML; a filePath is opened from the classpath under /test-documents/.
164 |     protected XMLResult getXML(String filePath, Parser parser, ParseContext context) throws Exception {
165 |         return getXML(getResourceAsStream("/test-documents/" + filePath), parser, new Metadata(), context);
166 |     }
167 | 
168 |     protected XMLResult getXML(String filePath, Parser parser, Metadata metadata) throws Exception {
169 |         return getXML(getResourceAsStream("/test-documents/" + filePath), parser, metadata, null); // null context -> new ParseContext
170 |     }
171 | 
172 |     protected XMLResult getXML(String filePath, ParseContext parseContext) throws Exception {
173 |         return getXML(filePath, new AutoDetectParser(), parseContext);
174 |     }
175 | 
176 |     protected XMLResult getXML(String filePath, Metadata metadata, ParseContext parseContext) throws Exception {
177 |         return getXML(getResourceAsStream("/test-documents/"+filePath), new AutoDetectParser(), metadata, parseContext);
178 |     }
179 | 
180 |     protected XMLResult getXML(String filePath, Metadata metadata) throws Exception {
181 |         return getXML(getResourceAsStream("/test-documents/" + filePath), new AutoDetectParser(), metadata, null);
182 |     }
183 | 
184 |     protected XMLResult getXML(String filePath, Parser parser) throws Exception {
185 |         Metadata metadata = new Metadata();
186 |         metadata.set(Metadata.RESOURCE_NAME_KEY, filePath); // only this overload records the file name
187 |         return getXML(filePath, parser, metadata);
188 |     }
189 | 
190 |     protected XMLResult getXML(String filePath) throws Exception {
191 |         return getXML(getResourceAsStream("/test-documents/" + filePath), new AutoDetectParser(), new Metadata(), null);
192 |     }
193 | 
194 |     protected XMLResult getXML(InputStream input, Parser parser, Metadata metadata) throws Exception {
195 |         return getXML(input, parser, metadata, null);
196 |     }
197 | 
198 |     protected XMLResult getXML(InputStream input, Parser parser, Metadata metadata, ParseContext context) throws Exception {
199 |         // Runs the parser into a ToXMLContentHandler; the stream is closed on every path.
200 |         final ParseContext effectiveContext = (context != null) ? context : new ParseContext();
201 |         final String xml;
202 |         try {
203 |             final ContentHandler xmlSink = new ToXMLContentHandler();
204 |             parser.parse(input, xmlSink, metadata, effectiveContext);
205 |             xml = xmlSink.toString();
206 |         } finally {
207 |             input.close();
208 |         }
209 |         return new XMLResult(xml, metadata);
210 |     }
211 | 
212 |     public XMLResult getXML(InputStream is, Parser parser, ParseContext context) throws Exception{
213 |         return getXML(is, parser, new Metadata(), context); // fresh Metadata, caller-supplied context
214 |     }
215 | 
216 |     protected List<Metadata> getRecursiveMetadata(String filePath) throws Exception {
217 |         return getRecursiveMetadata(filePath, new ParseContext());
218 |     }
219 | 
220 |     protected List<Metadata> getRecursiveMetadata(String filePath, Metadata metadata) throws Exception {
221 |         return getRecursiveMetadata(filePath, new ParseContext(), metadata);
222 |     }
223 | 
224 |     protected List<Metadata> getRecursiveMetadata(String filePath, ParseContext context, Metadata metadata) throws Exception {
225 |         return parseRecursively(filePath, new AutoDetectParser(),
226 |                 BasicContentHandlerFactory.HANDLER_TYPE.XML, metadata, context);
227 |     }
228 | 
229 |     protected List<Metadata> getRecursiveMetadata(String filePath, ParseContext context) throws Exception {
230 |         return parseRecursively(filePath, new AutoDetectParser(),
231 |                 BasicContentHandlerFactory.HANDLER_TYPE.XML, new Metadata(), context);
232 |     }
233 | 
234 |     protected List<Metadata> getRecursiveMetadata(String filePath, Parser parserToWrap) throws Exception {
235 |         return getRecursiveMetadata(filePath, parserToWrap, BasicContentHandlerFactory.HANDLER_TYPE.XML);
236 |     }
237 | 
238 |     protected List<Metadata> getRecursiveMetadata(String filePath, Parser parserToWrap, BasicContentHandlerFactory.HANDLER_TYPE handlerType) throws Exception {
239 |         return parseRecursively(filePath, parserToWrap, handlerType, new Metadata(), new ParseContext());
240 |     }
241 | 
242 |     protected List<Metadata> getRecursiveMetadata(String filePath, Parser parserToWrap, ParseContext parseContext) throws Exception {
243 |         return parseRecursively(filePath, parserToWrap,
244 |                 BasicContentHandlerFactory.HANDLER_TYPE.XML, new Metadata(), parseContext);
245 |     }
246 | 
247 |     /**
248 |      * Common implementation behind the getRecursiveMetadata overloads: wraps the
249 |      * parser in a RecursiveParserWrapper, parses the classpath document found under
250 |      * /test-documents/ and returns the metadata list gathered by the handler.
251 |      *
252 |      * @param filePath    document name relative to /test-documents/
253 |      * @param parser      parser to wrap
254 |      * @param handlerType handler type passed to the BasicContentHandlerFactory
255 |      * @param metadata    metadata object handed to the wrapper
256 |      * @param context     parse context handed to the wrapper
257 |      * @return the handler's metadata list
258 |      * @throws Exception anything raised while opening or parsing the document
259 |      */
260 |     private List<Metadata> parseRecursively(String filePath, Parser parser,
261 |             BasicContentHandlerFactory.HANDLER_TYPE handlerType,
262 |             Metadata metadata, ParseContext context) throws Exception {
263 |         RecursiveParserWrapper recursive = new RecursiveParserWrapper(parser);
264 |         // NOTE(review): -1 is presumably "no write limit" for the factory; confirm against the Tika docs
265 |         RecursiveParserWrapperHandler collector = new RecursiveParserWrapperHandler(
266 |                 new BasicContentHandlerFactory(handlerType, -1));
267 |         try (InputStream in = getResourceAsStream("/test-documents/" + filePath)) {
268 |             recursive.parse(in, collector, metadata, context);
269 |         }
270 |         return collector.getMetadataList();
271 |     }
272 | 
273 | 
274 |     /**
275 |      * Basic text extraction: parses the stream into a BodyContentHandler(1000000)
276 |      * and returns the handler's string form.
277 |      * The input stream is closed afterwards, whether or not parsing succeeded.
278 |      */
279 |     public String getText(InputStream is, Parser parser, ParseContext context, Metadata metadata) throws Exception{
280 |         ContentHandler handler = new BodyContentHandler(1000000);
281 |         try {
282 |             parser.parse(is, handler, metadata, context);
283 |         } finally {
284 |             is.close();
285 |         }
286 |         return handler.toString();
287 |     }
288 |     // Like getText, but uses OriginalBodyContentHandler, whose ignorableWhitespace() writes nothing.
289 |     public String getTextWoDoublebreaks(InputStream is, Parser parser,
290 |                                         ParseContext context, Metadata metadata) throws Exception{
291 |         ContentHandler handler = new OriginalBodyContentHandler();
292 |         try {
293 |             parser.parse(is, handler, metadata, context);
294 |         } finally {
295 |             is.close(); // closed on success and on failure
296 |         }
297 |         return handler.toString();
298 |     }
299 |     // Shorthand overloads: each fills in a new ParseContext and/or a new Metadata and delegates.
300 |     public String getTextWoDoublebreaks(InputStream is, Parser parser, Metadata metadata) throws Exception{
301 |         return getTextWoDoublebreaks(is, parser, new ParseContext(), metadata);
302 |     }
303 | 
304 |     public String getText(InputStream is, Parser parser, Metadata metadata) throws Exception{
305 |         return getText(is, parser, new ParseContext(), metadata);
306 |     }
307 | 
308 |     public String getText(InputStream is, Parser parser, ParseContext context) throws Exception{
309 |         return getText(is, parser, context, new Metadata());
310 |     }
311 | 
312 |     public String getText(InputStream is, Parser parser) throws Exception{
313 |         return getText(is, parser, new ParseContext(), new Metadata());
314 |     }
315 | 
316 |     /**
317 |      * Embedded-resource handler that records, in call order, the file name and
318 |      * media type of every resource whose type is not in the skip set.
319 |      */
320 |     public static class TrackingHandler implements EmbeddedResourceHandler {
321 |         public List<String> filenames = new ArrayList<>();
322 |         public List<MediaType> mediaTypes = new ArrayList<>();
323 | 
324 |         private final Set<MediaType> skipTypes;
325 | 
326 |         public TrackingHandler() {
327 |             this(new HashSet<MediaType>());
328 |         }
329 | 
330 |         public TrackingHandler(Set<MediaType> skipTypes) {
331 |             this.skipTypes = skipTypes;
332 |         }
333 | 
334 |         @Override
335 |         public void handle(String filename, MediaType mediaType,
336 |                            InputStream stream) {
337 |             if (!skipTypes.contains(mediaType)) {
338 |                 // both lists grow together, so index i describes the same resource
339 |                 mediaTypes.add(mediaType);
340 |                 filenames.add(filename);
341 |             }
342 |         }
343 |     }
344 | 
345 |     /**
346 |      * Embedded-resource handler that appends the full content of each resource, as a byte[], to {@link #bytes}.
347 |      */
348 |     public static class ByteCopyingHandler implements EmbeddedResourceHandler {
349 | 
350 |         public List<byte[]> bytes = new ArrayList<byte[]>();
351 | 
352 |         @Override
353 |         public void handle(String filename, MediaType mediaType,
354 |                            InputStream stream) {
355 |             ByteArrayOutputStream os = new ByteArrayOutputStream();
356 |             if (! stream.markSupported()) { // wrap so that mark()/reset() can be used below
357 |                 stream = TikaInputStream.get(stream);
358 |             }
359 |             stream.mark(0); // NOTE(review): read limit 0, so reset() after a full copy may fail on buffered streams; confirm
360 |             try {
361 |                 IOUtils.copy(stream, os);
362 |                 bytes.add(os.toByteArray());
363 |                 stream.reset();
364 |             } catch (IOException e) {
365 |                 //swallow: a failed copy records nothing; a failed reset is ignored after the bytes were recorded
366 |             }
367 |         }
368 |     }
369 | 
370 |     public class OriginalBodyContentHandler extends BodyContentHandler { // used by getTextWoDoublebreaks
371 |         @Override
372 |         public void ignorableWhitespace(char[] ch, int start, int length)
373 |                 throws SAXException {
374 |             // Intentionally empty: the extra new lines generated by XHTMLContentHandler are not written.
375 |         }
376 |     }
377 | 
378 |     public static void debug(List<Metadata> list) {
379 |         int position = 0;
380 |         for (Metadata entry : list) {
381 |             final String prefix = position++ + ": "; // index of the entry within the list
382 |             for (String name : entry.names()) {
383 |                 for (String value : entry.getValues(name)) {
384 |                     System.out.println(prefix + name + " : " + value);
385 |                 }
386 |             }
387 |         }
388 |     }
389 | 
390 |     public static void debug(Metadata metadata) {
391 |         for (String name : metadata.names()) { // one "name : value" line per value
392 |             for (String value : metadata.getValues(name)) {
393 |                 System.out.println(name + " : " + value);
394 |             }
395 |         }
396 |     }
397 | }
398 | 


--------------------------------------------------------------------------------
/lexpredict-tika/src/main/java/com/lexpredict/tika/AlterPDFParser.java:
--------------------------------------------------------------------------------
  1 | package com.lexpredict.tika;
  2 | 
  3 | import org.apache.commons.io.input.CloseShieldInputStream;
  4 | import org.apache.pdfbox.io.MemoryUsageSetting;
  5 | import org.apache.pdfbox.pdmodel.PDDocument;
  6 | import org.apache.pdfbox.pdmodel.encryption.InvalidPasswordException;
  7 | import org.apache.tika.exception.AccessPermissionException;
  8 | import org.apache.tika.exception.EncryptedDocumentException;
  9 | import org.apache.tika.exception.TikaException;
 10 | import org.apache.tika.io.TikaInputStream;
 11 | import org.apache.tika.metadata.Metadata;
 12 | import org.apache.tika.metadata.PDF;
 13 | import org.apache.tika.mime.MediaType;
 14 | import org.apache.tika.parser.ParseContext;
 15 | import org.apache.tika.parser.ocr.TesseractOCRConfig;
 16 | import org.apache.tika.parser.ocr.TesseractOCRParser;
 17 | import org.apache.tika.parser.pdf.*;
 18 | import org.xml.sax.ContentHandler;
 19 | import org.xml.sax.SAXException;
 20 | 
 21 | import java.io.*;
 22 | import java.lang.reflect.InvocationTargetException;
 23 | import java.lang.reflect.Method;
 24 | 
 25 | 
 26 | public class AlterPDFParser extends PDFParser {
 27 |     public enum ParsePdfMode { // strategy names resolved by getParseMode() and dispatched in parse()
 28 |         DEFAULT, PDF_OCR, PDF_ONLY, TEXT_STRIP, PREFER_TEXT, OCR_ONLY
 29 |     }
 30 | 
 31 |     // uses this value if it is not set in HttpRequest
 32 |     ParsePdfMode defaultParseMode = ParsePdfMode.PDF_OCR;
 33 | 
 34 |     // Media type (application/pdf) that is written to the Content-Type metadata entry
 35 |     private static final MediaType MEDIA_TYPE = MediaType.application("pdf");
 36 | 
 37 |     // Serial version UID
 38 |     private static final long serialVersionUID = -752276948656079347L;
 39 | 
 40 |     private PDFParserConfig defaultConfig = new PDFParserConfig();
 41 | 
 42 |     @Override
 43 |     public void parse(InputStream stream, ContentHandler handler,
 44 |                       Metadata metadata, ParseContext context)
 45 |             throws IOException, SAXException, TikaException {
 46 |         // Loads the PDF, records its metadata and extracts content with the strategy picked by
 47 |         // getParseMode(). A null handler stops after the metadata step. Errors reach the caller.
 48 |         HttpRequestParamsReader.getInstance().initialize(stream);
 49 |         HttpRequestParamsReader.getInstance().outIfVerbose("AlterPDFParser.parse()");
 50 |         ParsePdfMode pdfParseMode = getParseMode();
 51 | 
 52 |         PDFParserConfig sourceConfig = context.get(PDFParserConfig.class, defaultConfig);
 53 |         PDFParserConfig localConfig = makeConfigLocalCopy(sourceConfig);
 54 | 
 55 |         if (localConfig.getSetKCMS())
 56 |             System.setProperty("sun.java2d.cmm", "sun.java2d.cmm.kcms.KcmsServiceProvider");
 57 | 
 58 |         PDDocument pdfDocument = null;
 59 |         try {
 60 |             TikaInputStream tstream = TikaInputStream.cast(stream);
 61 |             String password = callGetPassword(metadata, context);
 62 |             MemoryUsageSetting memoryUsageSetting = MemoryUsageSetting.setupMainMemoryOnly();
 63 |             if (localConfig.getMaxMainMemoryBytes() >= 0) {
 64 |                 memoryUsageSetting = MemoryUsageSetting.setupMixed(localConfig.getMaxMainMemoryBytes());
 65 |             }
 66 | 
 67 |             if (tstream != null && tstream.hasFile()) {
 68 |                 // File based -- send file directly to PDFBox
 69 |                 pdfDocument = PDDocument.load(tstream.getPath().toFile(), password, memoryUsageSetting);
 70 |             } else
 71 |                 pdfDocument = PDDocument.load(new CloseShieldInputStream(stream), password, memoryUsageSetting);
 72 | 
 73 |             extractAndCheckMetadata(metadata, context, localConfig, pdfDocument);
 74 | 
 75 |             if (handler == null)
 76 |                 return;
 77 | 
 78 |             // optional preprocessing (PdfContentImagePreprocessor.removeImagesAlphaChannel) is currently disabled
 79 |             if (callShouldHandleXFAOnly(pdfDocument, localConfig)) {
 80 |                 HttpRequestParamsReader.getInstance().outIfVerbose("AlterPDFParser.parse(callShouldHandleXFAOnly)");
 81 |                 callHandleXFAOnly(pdfDocument, handler, metadata, context);
 82 |             } else if (localConfig.getOcrStrategy().equals(PDFParserConfig.OCR_STRATEGY.OCR_ONLY)) {
 83 |                 HttpRequestParamsReader.getInstance().outIfVerbose("AlterPDFParser.parse(OCR_ONLY)");
 84 |                 metadata.add("X-Parsed-By", TesseractOCRParser.class.toString());
 85 |                 callOCR2XHTMLProcess(pdfDocument, handler, context, metadata, localConfig);
 86 |             } else {
 87 |                 // parse document by using PDFStripper
 88 |                 if (pdfParseMode == ParsePdfMode.TEXT_STRIP) {
 89 |                     HttpRequestParamsReader.getInstance().outIfVerbose("AlterPDFParser.parse(TEXT_STRIP)");
 90 |                     PdfStripperProcessor.setTextUsingPDFTextStripper(handler, pdfDocument);
 91 |                 }
 92 |                 // just PDF parsing
 93 |                 else if (pdfParseMode == ParsePdfMode.PDF_ONLY) {
 94 |                     HttpRequestParamsReader.getInstance().outIfVerbose("AlterPDFParser.parse(PDF_ONLY)");
 95 |                     callPDF2XHTMLProcess(pdfDocument, handler, context, metadata, localConfig, true);
 96 |                 }
 97 |                 // smart parsing: PDF or OCR
 98 |                 else if (pdfParseMode == ParsePdfMode.PDF_OCR ||
 99 |                          pdfParseMode == ParsePdfMode.PREFER_TEXT) {
100 |                     HttpRequestParamsReader.getInstance().outIfVerbose("AlterPDFParser.parse(PDF_OCR)");
101 |                     PdfContentTypeChecker checker = new PdfContentTypeChecker();
102 |                     PdfContentTypeChecker.PdfContent docType = checker.determineDocContentType(pdfDocument);
103 |                     HttpRequestParamsReader.getInstance().outIfVerbose("detected doc type: " + docType.toString());
104 | 
105 |                     if (docType == PdfContentTypeChecker.PdfContent.TEXT ||
106 |                             (docType != PdfContentTypeChecker.PdfContent.IMAGES && pdfParseMode == ParsePdfMode.PREFER_TEXT))
107 |                         callPDF2XHTMLProcess(pdfDocument, handler, context, metadata, localConfig, false);
108 |                     else {
109 |                         metadata.add("X-Parsed-By", TesseractOCRParser.class.toString());
110 |                         callOCR2XHTMLProcess(pdfDocument, handler, context, metadata, localConfig);
111 |                     }
112 |                 }
113 |                 else if (pdfParseMode == ParsePdfMode.OCR_ONLY) {
114 |                     metadata.add("X-Parsed-By", TesseractOCRParser.class.toString());
115 |                     callOCR2XHTMLProcess(pdfDocument, handler, context, metadata, localConfig);
116 |                 } else { // ... or parse it default Tika-way
117 |                     HttpRequestParamsReader.getInstance().outIfVerbose("AlterPDFParser.parse(callPDF2XHTMLProcess)");
118 |                     callPDF2XHTMLProcess(pdfDocument, handler, context, metadata, localConfig, false);
119 |                 }
120 |             }
121 |         } catch (InvalidPasswordException e) {
122 |             metadata.set(PDF.IS_ENCRYPTED, "true");
123 |             throw new EncryptedDocumentException(e);
124 |         } catch (InvocationTargetException e) { // a reflective call failed: surface the real cause, typed
125 |             Throwable cause = e.getCause() != null ? e.getCause() : e;
126 |             if (cause instanceof IOException) throw (IOException) cause;
127 |             if (cause instanceof SAXException) throw (SAXException) cause;
128 |             if (cause instanceof TikaException) throw (TikaException) cause;
129 |             throw new TikaException("PDF parsing failed: " + cause.getMessage(), cause);
130 |         } catch (NoSuchMethodException | IllegalAccessException | NoSuchFieldException | ClassNotFoundException e) {
131 |             throw new TikaException("Unable to access PDFParser internals via reflection", e);
132 |         } finally {
133 |             if (pdfDocument != null) {
134 |                 pdfDocument.close();
135 |             }
136 |         }
137 |     }
138 | 
139 |     // Resolves the parsing strategy: the request parameter first, then the
140 |     // LEXNLP_TIKA_PARSER_MODE environment variable, otherwise defaultParseMode.
141 |     private ParsePdfMode getParseMode() {
142 |         String requested = HttpRequestParamsReader.getInstance().typedParams.get(CommonParseFlag.PDF_PARSE_METHOD);
143 |         if (requested == null || requested.isEmpty())
144 |             requested = System.getenv("LEXNLP_TIKA_PARSER_MODE");
145 |         if (requested == null || requested.isEmpty())
146 |             return defaultParseMode;
147 |         ParsePdfMode resolved = defaultParseMode; // unrecognised names fall back to the default
148 |         if (requested.equals(HttpRequestParamsReader.PDF_PARSE_METHOD_STRIP)) {
149 |             resolved = ParsePdfMode.TEXT_STRIP;
150 |         } else if (requested.equals(HttpRequestParamsReader.PDF_PARSE_METHOD_PDF_OCR)) {
151 |             resolved = ParsePdfMode.PDF_OCR;
152 |         } else if (requested.equals(HttpRequestParamsReader.PDF_PARSE_METHOD_PDF_ONLY)) {
153 |             resolved = ParsePdfMode.PDF_ONLY;
154 |         } else if (requested.equals(HttpRequestParamsReader.PDF_PARSE_METHOD_OCR_ONLY)) {
155 |             resolved = ParsePdfMode.OCR_ONLY;
156 |         } else if (requested.equals(HttpRequestParamsReader.PDF_PARSE_METHOD_PDF_PREFER_TEXT)) {
157 |             resolved = ParsePdfMode.PREFER_TEXT;
158 |         }
159 |         return resolved;
160 |     }
161 | 
162 |     // extract doc's metadata and check whether it is accessible
163 |     private void extractAndCheckMetadata(Metadata metadata, ParseContext context, PDFParserConfig localConfig, PDDocument pdfDocument)
164 |             throws NoSuchMethodException, IllegalAccessException, InvocationTargetException, AccessPermissionException {
165 |         metadata.set(PDF.IS_ENCRYPTED, Boolean.toString(pdfDocument.isEncrypted()));
166 |         metadata.set(Metadata.CONTENT_TYPE, MEDIA_TYPE.toString());
167 |         callExtractMetadata(pdfDocument, metadata, context);
168 | 
169 |         AccessChecker checker = localConfig.getAccessChecker();
170 |         checker.check(metadata);
171 |     }
172 | 
173 |     // process PDF as a printed (vector) document
174 |     // uses standard Tika's PDF2XHTML class by reflection
175 |     // because this class is private (package restricted) and I don't
176 |     // want to copy the class's code and a bunch of dependent modules into plugin
177 |     private void callPDF2XHTMLProcess(PDDocument document, ContentHandler handler,
178 |                                         ParseContext context, Metadata metadata,
179 |                                         PDFParserConfig config,
180 |                                         boolean noOcr) throws
181 |             TikaException, SAXException {
182 |         // noOcr ptr is ignored in current implementation
183 |         PDFParserConfig.OCR_STRATEGY oldOcrStrategy = config.getOcrStrategy();
184 |         config.setOcrStrategy(PDFParserConfig.OCR_STRATEGY.NO_OCR);
185 |         PDF2XHTML.process(document, handler, context, metadata, config);
186 |         config.setOcrStrategy(oldOcrStrategy);
187 |     }
188 | 
189 |     // process PDF as a scanned image set
190 |     // again uses reflection
191 |     private void callOCR2XHTMLProcess(PDDocument document, ContentHandler handler,
192 |                                       ParseContext context, Metadata metadata,
193 |                                       PDFParserConfig config) throws
194 |             ClassNotFoundException, NoSuchMethodException, InvocationTargetException, IllegalAccessException,
195 |             TikaException, SAXException {
196 |         TesseractOCRConfig cfg = buildTesseractOCRConfig(config);
197 |         context.set(TesseractOCRConfig.class, cfg);
198 | 
199 |         PDFParserConfig.OCR_STRATEGY oldOcrStrategy = config.getOcrStrategy();
200 |         boolean oldExtractInlineImages = config.getExtractInlineImages();
201 |         boolean oldExtractUniqueInlineImagesOnly = config.getExtractUniqueInlineImagesOnly();
202 | 
203 |         // explicitly tells Tika to use OCR
204 |         config.setOcrStrategy(PDFParserConfig.OCR_STRATEGY.OCR_ONLY);
205 |         config.setExtractInlineImages(true);
206 |         config.setExtractUniqueInlineImagesOnly(false);
207 | 
208 |         Class c = Class.forName("org.apache.tika.parser.pdf.OCR2XHTML");
209 |         Method m = c.getDeclaredMethod("process",
210 |                 PDDocument.class, ContentHandler.class, ParseContext.class, Metadata.class,
211 |                 PDFParserConfig.class);
212 |         m.setAccessible(true);
213 |         m.invoke(null, document, handler, context, metadata, config);
214 | 
215 |         config.setOcrStrategy(oldOcrStrategy);
216 |         config.setExtractInlineImages(oldExtractInlineImages);
217 |         config.setExtractUniqueInlineImagesOnly(oldExtractUniqueInlineImagesOnly);
218 |     }
219 | 
220 |     private TesseractOCRConfig buildTesseractOCRConfig(PDFParserConfig config)
221 |     {
222 |         TesseractOCRConfig cfg = new TesseractOCRConfig();
223 |         // here I set default timeout of 2 hours
224 |         // The calling process should check parsing process and terminate it by timeout
225 |         cfg.setTimeout(60 * 60 * 2);
226 |         return cfg;
227 |     }
228 | 
229 |     // check whether the method should read XFA (forms) only
230 |     private boolean callShouldHandleXFAOnly(PDDocument pdDocument, PDFParserConfig config)
231 |             throws InvocationTargetException, IllegalAccessException {
232 |         boolean xfa = this.checkDocHasXFA(pdDocument);
233 |         Method m;
234 |         try {
235 |             m = getClass().getSuperclass().getDeclaredMethod("shouldHandleXFAOnly",
236 |                     boolean.class, PDFParserConfig.class);
237 |         } catch (NoSuchMethodException e) {
238 |             return false;
239 |         }
240 |         m.setAccessible(true);
241 |         return (boolean) m.invoke(this, xfa, config);
242 |     }
243 | 
244 |     private boolean checkDocHasXFA(PDDocument pdDocument)
245 |             throws InvocationTargetException, IllegalAccessException {
246 |         Method m;
247 |         try {
248 |             m = getClass().getSuperclass().getDeclaredMethod("hasXFA",
249 |                     PDDocument.class);
250 |         }
251 |         catch (NoSuchMethodException e) {
252 |             return false;
253 |         }
254 |         m.setAccessible(true);
255 |         return (boolean) m.invoke(this, pdDocument);
256 |     }
257 | 
258 |     // read XFA forms' content
259 |     private void callHandleXFAOnly(PDDocument pdDocument, ContentHandler handler,
260 |                                    Metadata metadata, ParseContext context)
261 |             throws NoSuchMethodException, InvocationTargetException, IllegalAccessException {
262 |         Method m = getClass().getSuperclass().getDeclaredMethod("handleXFAOnly",
263 |                 PDDocument.class, ContentHandler.class, Metadata.class, ParseContext.class);
264 |         m.setAccessible(true);
265 |         m.invoke(this, pdDocument, handler, metadata, context);
266 |     }
267 | 
268 |     // uses reflection, again, for obtaining PDF's metadata
269 |     private void callExtractMetadata(PDDocument document, Metadata metadata, ParseContext context)
270 |             throws NoSuchMethodException, IllegalAccessException, InvocationTargetException {
271 |         Method m = getClass().getSuperclass().getDeclaredMethod("extractMetadata",
272 |                 PDDocument.class, Metadata.class, ParseContext.class);
273 |         m.setAccessible(true);
274 |         m.invoke(this, document, metadata, context);
275 |     }
276 | 
277 |     // read password from metadata
278 |     private String callGetPassword(Metadata metadata, ParseContext context)
279 |             throws NoSuchMethodException, IllegalAccessException, InvocationTargetException {
280 |         Method m = getClass().getSuperclass().getDeclaredMethod("getPassword",
281 |                 Metadata.class, ParseContext.class);
282 |         m.setAccessible(true);
283 |         Object retVal = m.invoke(this, metadata, context);
284 |         return (String) retVal;
285 |     }
286 | 
287 |     // make a copy because I don't want to modify original config params
288 |     private PDFParserConfig makeConfigLocalCopy(PDFParserConfig srcConfig) {
289 |         PDFParserConfig cpy = new PDFParserConfig();
290 |         ShallowCopy.copyFields(srcConfig, cpy);
291 |         return cpy;
292 |     }
293 | 
294 |     public static void main(String args[]) throws IOException {
295 |         if (args.length < 2) return;
296 | 
297 |         if (args[0].equals("--flatten")) {
298 |             // flatten document's images by redrawing them on a white background
299 |             String srcPath = args[1], dstPath = srcPath + ".processed";
300 |             if (args.length > 2)
301 |                 dstPath = args[2];
302 | 
303 |             File inputFile = new File(srcPath);
304 |             FileInputStream fis = new FileInputStream(inputFile);
305 | 
306 |             try {
307 |                 PDDocument doc = PDDocument.load(fis);
308 | 
309 |                 PdfContentImagePreprocessor preproc = new PdfContentImagePreprocessor();
310 |                 boolean hasReplaced = preproc.removeImagesAlphaChannel(doc);
311 |                 if (hasReplaced) {
312 |                     System.out.println("PDF file images were updated");
313 |                     doc.save(dstPath);
314 |                 } else {
315 |                     System.out.println("PDF file was not changed");
316 |                 }
317 |             } catch (Exception e) {
318 |                 System.out.println("Error occurred:");
319 |                 System.out.println(e.toString());
320 |                 fis.close();
321 |             }
322 |         }
323 | 
324 |         if (args[0].equals("--explore")) {
325 |             PdfContentTypeChecker checker = new PdfContentTypeChecker();
326 |             String srcPath = args[1];
327 |             File inputFile = new File(srcPath);
328 |             FileInputStream fis = new FileInputStream(inputFile);
329 | 
330 |             try {
331 |                 PDDocument doc = PDDocument.load(fis);
332 |                 checker.determineDocContentType(doc);
333 | 
334 |                 System.out.printf("images:%d,text_blocks:%d%n",
335 |                         checker.getImagesCount(),
336 |                         checker.getTextBlocks());
337 | 
338 |             } catch (Exception e) {
339 |                 System.out.println("Error occurred:");
340 |                 System.out.println(e.toString());
341 |                 fis.close();
342 |             }
343 |         }
344 |     }
345 | }
346 | 


--------------------------------------------------------------------------------
/lexpredict-tika/src/main/java/com/lexpredict/tika/PDF2XHTML.java:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * Licensed to the Apache Software Foundation (ASF) under one or more
  3 |  * contributor license agreements.  See the NOTICE file distributed with
  4 |  * this work for additional information regarding copyright ownership.
  5 |  * The ASF licenses this file to You under the Apache License, Version 2.0
  6 |  * (the "License"); you may not use this file except in compliance with
  7 |  * the License.  You may obtain a copy of the License at
  8 |  *
  9 |  *     http://www.apache.org/licenses/LICENSE-2.0
 10 |  *
 11 |  * Unless required by applicable law or agreed to in writing, software
 12 |  * distributed under the License is distributed on an "AS IS" BASIS,
 13 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 14 |  * See the License for the specific language governing permissions and
 15 |  * limitations under the License.
 16 |  *
 17 |  * Modifications copyright (C) 2020 ContraxSuite, LLC
 18 |  */
 19 | /*
 20 |  * Licensed to the Apache Software Foundation (ASF) under one or more
 21 |  * contributor license agreements.  See the NOTICE file distributed with
 22 |  * this work for additional information regarding copyright ownership.
 23 |  * The ASF licenses this file to You under the Apache License, Version 2.0
 24 |  * (the "License"); you may not use this file except in compliance with
 25 |  * the License.  You may obtain a copy of the License at
 26 |  *
 27 |  *     http://www.apache.org/licenses/LICENSE-2.0
 28 |  *
 29 |  * Unless required by applicable law or agreed to in writing, software
 30 |  * distributed under the License is distributed on an "AS IS" BASIS,
 31 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 32 |  * See the License for the specific language governing permissions and
 33 |  * limitations under the License.
 34 |  *
 35 |  * Modifications copyright (C) 2020 ContraxSuite, LLC
 36 |  */
 37 | package com.lexpredict.tika;
 38 | 
 39 | import org.apache.pdfbox.cos.COSArray;
 40 | import org.apache.pdfbox.cos.COSBase;
 41 | import org.apache.pdfbox.cos.COSName;
 42 | import org.apache.pdfbox.cos.COSStream;
 43 | import org.apache.pdfbox.filter.MissingImageReaderException;
 44 | import org.apache.pdfbox.pdmodel.PDDocument;
 45 | import org.apache.pdfbox.pdmodel.PDPage;
 46 | import org.apache.pdfbox.pdmodel.PDPageContentStream;
 47 | import org.apache.pdfbox.pdmodel.PDResources;
 48 | import org.apache.pdfbox.pdmodel.font.PDFont;
 49 | import org.apache.pdfbox.pdmodel.graphics.PDXObject;
 50 | import org.apache.pdfbox.pdmodel.graphics.color.PDDeviceGray;
 51 | import org.apache.pdfbox.pdmodel.graphics.color.PDDeviceRGB;
 52 | import org.apache.pdfbox.pdmodel.graphics.form.PDFormXObject;
 53 | import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject;
 54 | import org.apache.pdfbox.text.PDFTextStripper;
 55 | import org.apache.pdfbox.text.TextPosition;
 56 | import org.apache.pdfbox.tools.imageio.ImageIOUtil;
 57 | import org.apache.pdfbox.util.Matrix;
 58 | import org.apache.tika.exception.TikaException;
 59 | import org.apache.tika.extractor.EmbeddedDocumentUtil;
 60 | import org.apache.tika.io.TikaInputStream;
 61 | import org.apache.tika.metadata.Metadata;
 62 | import org.apache.tika.metadata.TikaCoreProperties;
 63 | import org.apache.tika.parser.ParseContext;
 64 | import org.apache.tika.parser.pdf.PDFParserConfig;
 65 | import org.apache.tika.sax.EmbeddedContentHandler;
 66 | import org.xml.sax.ContentHandler;
 67 | import org.xml.sax.SAXException;
 68 | import org.xml.sax.helpers.AttributesImpl;
 69 | 
 70 | import java.awt.image.BufferedImage;
 71 | import java.io.ByteArrayOutputStream;
 72 | import java.io.IOException;
 73 | import java.io.InputStream;
 74 | import java.io.OutputStream;
 75 | import java.io.Writer;
 76 | import java.util.*;
 77 | 
 78 | class PDF2XHTML extends AbstractPDF2XHTML {
    // Filter names (full and abbreviated DCT decode) passed to
    // createInputStream(...) in writeToBuffer when the raw data of a "jpg"
    // image is copied.
    private static final List<String> JPEG = Arrays.asList(
            COSName.DCT_DECODE.getName(),
            COSName.DCT_DECODE_ABBREVIATION.getName());

    // Filter name (JPX decode) passed to createInputStream(...) in
    // writeToBuffer for "jp2"/"jpx" images.
    private static final List<String> JP2 =
            Arrays.asList(COSName.JPX_DECODE.getName());

    // Filter name (JBIG2 decode) passed to createInputStream(...) in
    // writeToBuffer for "jb2" images.
    private static final List<String> JB2 = Arrays.asList(
            COSName.JBIG2_DECODE.getName());
 88 | 
    /**
     * This keeps track of the pdf object ids for inline
     * images that have been processed.
     * If {@link PDFParserConfig#getExtractUniqueInlineImagesOnly()}
     * is true, this will be checked before extracting an embedded image.
     * The integer keeps track of the inlineImageCounter for that image.
     * This integer is used to identify images in the markup.
     *
     * This is used across the document.  To avoid infinite recursion
     * TIKA-1742, we're limiting the export to one image per page.
     */
    private Map<COSStream, Integer> processedInlineImages = new HashMap<>();
    // Running number handed out to each newly seen inline image; it becomes
    // part of the generated "image<N>.<extension>" file name.
    private int inlineImageCounter = 0;
102 |     private PDF2XHTML(PDDocument document, ContentHandler handler, ParseContext context, Metadata metadata,
103 |                       PDFParserConfig config)
104 |             throws IOException {
105 |         super(document, handler, context, metadata, config);
106 |     }
107 | 
108 |     /**
109 |      * Converts the given PDF document (and related metadata) to a stream
110 |      * of XHTML SAX events sent to the given content handler.
111 |      *
112 |      * @param document PDF document
113 |      * @param handler  SAX content handler
114 |      * @param metadata PDF metadata
115 |      * @throws SAXException  if the content handler fails to process SAX events
116 |      * @throws TikaException if there was an exception outside of per page processing
117 |      */
118 |     public static void process(
119 |             PDDocument document,
120 |             ContentHandler handler,
121 |             ParseContext context,
122 |             Metadata metadata,
123 |             PDFParserConfig config)
124 |             throws SAXException, TikaException {
125 |         PDF2XHTML pdf2XHTML = null;
126 |         try {
127 |             // Extract text using a dummy Writer as we override the
128 |             // key methods to output to the given content
129 |             // handler.
130 |             if (config.getDetectAngles()) {
131 |                 pdf2XHTML = new PDF2XHTML.AngleDetectingPDF2XHTML(document, handler, context, metadata, config);
132 |             } else {
133 |                 pdf2XHTML = new PDF2XHTML(document, handler, context, metadata, config);
134 |             }
135 |             AlterPDFParserConfig.configureAlterPtf2Xhtml(config, pdf2XHTML);
136 | 
137 |             pdf2XHTML.writeText(document, new Writer() {
138 |                 @Override
139 |                 public void write(char[] cbuf, int off, int len) {
140 |                 }
141 | 
142 |                 @Override
143 |                 public void flush() {
144 |                 }
145 | 
146 |                 @Override
147 |                 public void close() {
148 |                 }
149 |             });
150 |         } catch (IOException e) {
151 |             if (e.getCause() instanceof SAXException) {
152 |                 throw (SAXException) e.getCause();
153 |             } else {
154 |                 throw new TikaException("Unable to extract PDF content", e);
155 |             }
156 |         }
157 |         if (pdf2XHTML.exceptions.size() > 0) {
158 |             //throw the first
159 |             throw new TikaException("Unable to extract PDF content", pdf2XHTML.exceptions.get(0));
160 |         }
161 |     }
162 | 
163 |     @Override
164 |     public void processPage(PDPage page) throws IOException {
165 |         try {
166 |             super.processPage(page);
167 |         } catch (IOException e) {
168 |             handleCatchableIOE(e);
169 |             endPage(page);
170 |         }
171 |     }
172 | 
173 |     @Override
174 |     protected void endPage(PDPage page) throws IOException {
175 |         try {
176 |             writeParagraphEnd();
177 |             try {
178 |                 extractImages(page.getResources(), new HashSet<COSBase>());
179 |             } catch (IOException e) {
180 |                 handleCatchableIOE(e);
181 |             }
182 |             super.endPage(page);
183 |         } catch (SAXException e) {
184 |             throw new IOException("Unable to end a page", e);
185 |         } catch (IOException e) {
186 |             handleCatchableIOE(e);
187 |         }
188 |     }
189 | 
190 |     private void extractImages(PDResources resources, Set<COSBase> seenThisPage) throws SAXException, IOException {
191 |         if (resources == null || config.getExtractInlineImages() == false) {
192 |             return;
193 |         }
194 | 
195 |         for (COSName name : resources.getXObjectNames()) {
196 | 
197 |             PDXObject object = null;
198 |             try {
199 |                 object = resources.getXObject(name);
200 |             } catch (MissingImageReaderException e) {
201 |                 EmbeddedDocumentUtil.recordException(e, metadata);
202 |                 continue;
203 |             } catch (IOException e) {
204 |                 EmbeddedDocumentUtil.recordEmbeddedStreamException(e, metadata);
205 |                 continue;
206 |             }
207 |             processImageObject(object, seenThisPage);
208 |         }
209 |     }
210 | 
211 |     private void processImageObject(PDXObject object, Set<COSBase> seenThisPage) throws SAXException, IOException {
212 |         if (object == null) {
213 |             return;
214 |         }
215 |         COSStream cosStream = object.getCOSObject();
216 |         if (seenThisPage.contains(cosStream)) {
217 |             //avoid infinite recursion TIKA-1742
218 |             return;
219 |         }
220 |         seenThisPage.add(cosStream);
221 | 
222 |         if (object instanceof PDFormXObject) {
223 |             extractImages(((PDFormXObject) object).getResources(), seenThisPage);
224 |         } else if (object instanceof PDImageXObject) {
225 | 
226 |             PDImageXObject image = (PDImageXObject) object;
227 | 
228 |             Metadata embeddedMetadata = new Metadata();
229 |             String extension = image.getSuffix();
230 | 
231 |             if (extension == null || extension.equals("png")) {
232 |                 embeddedMetadata.set(Metadata.CONTENT_TYPE, "image/png");
233 |                 extension = "png";
234 |             } else if (extension.equals("jpg")) {
235 |                 embeddedMetadata.set(Metadata.CONTENT_TYPE, "image/jpeg");
236 |             } else if (extension.equals("tiff")) {
237 |                 embeddedMetadata.set(Metadata.CONTENT_TYPE, "image/tiff");
238 |                 extension = "tif";
239 |             } else if (extension.equals("jpx")) {
240 |                 embeddedMetadata.set(Metadata.CONTENT_TYPE, "image/jp2");
241 |             } else if (extension.equals("jb2")) {
242 |                 embeddedMetadata.set(
243 |                         Metadata.CONTENT_TYPE, "image/x-jbig2");
244 |             } else {
245 |                 //TODO: determine if we need to add more image types
246 | //                    throw new RuntimeException("EXTEN:" + extension);
247 |             }
248 |             Integer imageNumber = processedInlineImages.get(cosStream);
249 |             if (imageNumber == null) {
250 |                 imageNumber = inlineImageCounter++;
251 |             }
252 |             String fileName = "image" + imageNumber + "."+extension;
253 |             embeddedMetadata.set(Metadata.RESOURCE_NAME_KEY, fileName);
254 | 
255 |             // Output the img tag
256 |             AttributesImpl attr = new AttributesImpl();
257 |             attr.addAttribute("", "src", "src", "CDATA", "embedded:" + fileName);
258 |             attr.addAttribute("", "alt", "alt", "CDATA", fileName);
259 |             xhtml.startElement("img", attr);
260 |             xhtml.endElement("img");
261 | 
262 |             //Do we only want to process unique COSObject ids?
263 |             //If so, have we already processed this one?
264 |             if (config.getExtractUniqueInlineImagesOnly() == true) {
265 |                 if (processedInlineImages.containsKey(cosStream)) {
266 |                     return;
267 |                 }
268 |                 processedInlineImages.put(cosStream, imageNumber);
269 |             }
270 | 
271 |             embeddedMetadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE,
272 |                     TikaCoreProperties.EmbeddedResourceType.INLINE.toString());
273 | 
274 |             if (embeddedDocumentExtractor.shouldParseEmbedded(embeddedMetadata)) {
275 |                 ByteArrayOutputStream buffer = new ByteArrayOutputStream();
276 |                 try {
277 |                     //extract the metadata contained outside of the image
278 |                     // TODO: fix IT
279 |                     //PDMetadataExtractor.extract(image.getMetadata(),
280 |                     //        embeddedMetadata, context);
281 |                     try {
282 |                         writeToBuffer(image, extension, buffer);
283 |                     } catch (IOException e) {
284 |                         EmbeddedDocumentUtil.recordEmbeddedStreamException(e, metadata);
285 |                         return;
286 |                     }
287 |                     try (InputStream embeddedIs = TikaInputStream.get(buffer.toByteArray())) {
288 |                         embeddedDocumentExtractor.parseEmbedded(
289 |                                 embeddedIs,
290 |                                 new EmbeddedContentHandler(xhtml),
291 |                                 embeddedMetadata, false);
292 |                     }
293 |                 } catch (IOException e) {
294 |                     handleCatchableIOE(e);
295 |                 }
296 |             }
297 |         }
298 |     }
299 | 
300 |     //nearly directly copied from PDFBox ExtractImages
301 |     private void writeToBuffer(PDImageXObject pdImage, String suffix, OutputStream out)
302 |             throws IOException {
303 | 
304 |         BufferedImage image = pdImage.getImage();
305 |         if (image != null) {
306 |             if ("jpg".equals(suffix)) {
307 |                 String colorSpaceName = pdImage.getColorSpace().getName();
308 |                 //TODO: figure out if we want directJPEG as a configuration
309 |                 //previously: if (directJPeg || PDDeviceGray....
310 |                 if (PDDeviceGray.INSTANCE.getName().equals(colorSpaceName) ||
311 |                         PDDeviceRGB.INSTANCE.getName().equals(colorSpaceName)) {
312 |                     // RGB or Gray colorspace: get and write the unmodifiedJPEG stream
313 |                     InputStream data = pdImage.getStream().createInputStream(JPEG);
314 |                     org.apache.pdfbox.io.IOUtils.copy(data, out);
315 |                     org.apache.pdfbox.io.IOUtils.closeQuietly(data);
316 |                 } else {
317 |                     // for CMYK and other "unusual" colorspaces, the JPEG will be converted
318 |                     ImageIOUtil.writeImage(image, suffix, out);
319 |                 }
320 |             } else if ("jp2".equals(suffix) || "jpx".equals(suffix)) {
321 |                 InputStream data = pdImage.createInputStream(JP2);
322 |                 org.apache.pdfbox.io.IOUtils.copy(data, out);
323 |                 org.apache.pdfbox.io.IOUtils.closeQuietly(data);
324 |             } else if ("jb2".equals(suffix)) {
325 |                 InputStream data = pdImage.createInputStream(JB2);
326 |                 org.apache.pdfbox.io.IOUtils.copy(data, out);
327 |                 org.apache.pdfbox.io.IOUtils.closeQuietly(data);
328 |             } else{
329 |                 ImageIOUtil.writeImage(image, suffix, out);
330 |             }
331 |         }
332 |         out.flush();
333 |     }
334 | 
335 |     @Override
336 |     protected void writeParagraphStart() throws IOException {
337 |         super.writeParagraphStart();
338 |         try {
339 |             xhtml.startElement("p");
340 |         } catch (SAXException e) {
341 |             throw new IOException("Unable to start a paragraph", e);
342 |         }
343 |     }
344 | 
345 |     @Override
346 |     protected void writeParagraphEnd() throws IOException {
347 |         super.writeParagraphEnd();
348 |         try {
349 |             xhtml.endElement("p");
350 |         } catch (SAXException e) {
351 |             throw new IOException("Unable to end a paragraph", e);
352 |         }
353 |     }
354 | 
355 |     @Override
356 |     protected void writeString(String text) throws IOException {
357 |         try {
358 |             xhtml.characters(text);
359 |         } catch (SAXException e) {
360 |             throw new IOException(
361 |                     "Unable to write a string: " + text, e);
362 |         }
363 |     }
364 | 
365 |     @Override
366 |     protected void writeString(String text, List<TextPosition> textPositions) throws IOException
367 |     {
368 |         try {
369 |             AttributesImpl atr = new AttributesImpl();
370 |             if (textPositions.size() > 0) {
371 |                 StringBuilder posStr = new StringBuilder();
372 |                 for (TextPosition pos : textPositions)
373 |                     posStr.append(
374 |                             formatFloatNumbers(";",
375 |                             pos.getX(), pos.getY(),
376 |                             pos.getWidth(), pos.getHeight()));
377 |                 atr.addAttribute("", "pos", "pos", "string", posStr.toString());
378 |             }
379 |             xhtml.startElement("span", atr);
380 |             xhtml.characters(text);
381 |             xhtml.endElement("span");
382 |         } catch (SAXException e) {
383 |             throw new IOException(
384 |                     "Unable to write a string: " + text, e);
385 |         }
386 |     }
387 | 
388 |     @Override
389 |     protected String normalizeString(
390 |             String text,
391 |             List<TextPosition> textPositions,
392 |             boolean tryFontMapping) throws IOException {
393 |         if (text == null || text.length() == 0)
394 |             return "";
395 | 
396 |         if (tryFontMapping) {
397 |             // replace \r\n with \n
398 |             while (true) {
399 |                 int rnIndex = text.indexOf("\r\n");
400 |                 if (rnIndex < 0) break;
401 |                 text = text.substring(0, rnIndex) + text.substring(rnIndex + 1);
402 |                 if (rnIndex < textPositions.size())
403 |                     textPositions.remove(rnIndex);
404 |             }
405 |         }
406 |         // replace \r (\f) with \n
407 |         text = text.replace('\r', '\n');
408 |         text = text.replace('\f', '\n');
409 | 
410 |         StringBuilder mappedString = new StringBuilder();
411 |         char[] charArray = text.toCharArray();
412 |         for (int i = 0; i < charArray.length; i++) {
413 |             if (!xhtml.isCharacterInvalid(charArray[i])) {
414 |                 mappedString.append(charArray[i]);
415 |                 continue;
416 |             }
417 |             if (tryFontMapping) {
418 |                 PDFont txtFont = textPositions.get(i).getFont();
419 |                 if (txtFont == null) {
420 |                     mappedString.append('?');
421 |                     continue;
422 |                 }
423 |                 String uniStr = txtFont.toUnicode(charArray[i]);
424 |                 uniStr = normalizeString(uniStr, textPositions, false);
425 |                 mappedString.append(uniStr == null || uniStr.length() == 0 ? "?" : uniStr);
426 |             } else
427 |                 mappedString.append("?");
428 |         }
429 |         return mappedString.toString();
430 |     }
431 | 
432 |     @Override
433 |     protected void dumpCDATA() throws SAXException {
434 |         for (Map.Entry<String, TagData> entry : cdataContent.entrySet()) {
435 |             TagData tag = entry.getValue();
436 |             String containerName = tag.tagName;
437 |             if (tag.attributeString.length() > 0)
438 |                 containerName += " " + tag.attributeString;
439 |             xhtml.startElement("", containerName, containerName, new AttributesImpl());
440 |             if (tag.isCdata)
441 |                 //xhtml.charactersRaw("<![CDATA[");
442 |                 xhtml.charactersRaw("<![CDATA[");
443 |             xhtml.characters(tag.data.toString());
444 |             if (tag.isCdata)
445 |                 xhtml.charactersRaw("]]>");
446 |             xhtml.endElement(tag.tagName);
447 |         }
448 |     }
449 | 
450 |     @Override
451 |     protected void writeString(String text, TextPosition boundingPosition) throws IOException
452 |     {
453 |         writeString(text, new ArrayList<TextPosition>() {{add(boundingPosition);}});
454 |     }
455 | 
456 |     @Override
457 |     protected void writeCharacters(TextPosition text) throws IOException {
458 |         try {
459 |             xhtml.characters(text.getUnicode());
460 |         } catch (SAXException e) {
461 |             throw new IOException(
462 |                     "Unable to write a character: " + text.getUnicode(), e);
463 |         }
464 |     }
465 | 
466 |     @Override
467 |     protected void writeWordSeparator() throws IOException {
468 |         try {
469 |             xhtml.characters(getWordSeparator());
470 |         } catch (SAXException e) {
471 |             throw new IOException(
472 |                     "Unable to write a space character", e);
473 |         }
474 |     }
475 | 
476 |     @Override
477 |     protected void writeLineSeparator() throws IOException {
478 |         try {
479 |             xhtml.newline();
480 |         } catch (SAXException e) {
481 |             throw new IOException(
482 |                     "Unable to write a newline character", e);
483 |         }
484 |     }
485 | 
486 |     class AngleCollector extends PDFTextStripper {
487 |         Set<Integer> angles = new HashSet<>();
488 | 
489 |         public Set<Integer> getAngles() {
490 |             return angles;
491 |         }
492 | 
493 |         /**
494 |          * Instantiate a new PDFTextStripper object.
495 |          *
496 |          * @throws IOException If there is an error loading the properties.
497 |          */
498 |         AngleCollector() throws IOException {
499 |         }
500 | 
501 |         @Override
502 |         protected void processTextPosition(TextPosition text) {
503 |             Matrix m = text.getTextMatrix();
504 |             m.concatenate(text.getFont().getFontMatrix());
505 |             int angle = (int) Math.round(Math.toDegrees(Math.atan2(m.getShearY(), m.getScaleY())));
506 |             angle = (angle + 360) % 360;
507 |             angles.add(angle);
508 |         }
509 |     }
510 | 
511 |     private static class AngleDetectingPDF2XHTML extends PDF2XHTML {
512 | 
513 |         private AngleDetectingPDF2XHTML(PDDocument document, ContentHandler handler, ParseContext context, Metadata metadata, PDFParserConfig config) throws IOException {
514 |             super(document, handler, context, metadata, config);
515 |         }
516 | 
517 |         @Override
518 |         protected void startPage(PDPage page) throws IOException {
519 |             //no-op
520 |         }
521 | 
522 |         @Override
523 |         protected void endPage(PDPage page) throws IOException {
524 |             //no-op
525 |         }
526 | 
527 |         @Override
528 |         public void processPage(PDPage page) throws IOException {
529 |             try {
530 |                 super.startPage(page);
531 |                 detectAnglesAndProcessPage(page);
532 |             } catch (IOException e) {
533 |                 handleCatchableIOE(e);
534 |             } finally {
535 |                 super.endPage(page);
536 |             }
537 |         }
538 | 
539 |         private void detectAnglesAndProcessPage(PDPage page) throws IOException {
540 |             //copied and pasted from https://issues.apache.org/jira/secure/attachment/12947452/ExtractAngledText.java
541 |             //PDFBOX-4371
542 |             PDF2XHTML.AngleCollector angleCollector = new PDF2XHTML.AngleCollector(); // alternatively, reset angles
543 |             angleCollector.setStartPage(getCurrentPageNo());
544 |             angleCollector.setEndPage(getCurrentPageNo());
545 |             angleCollector.getText(document);
546 | 
547 |             int rotation = page.getRotation();
548 |             page.setRotation(0);
549 | 
550 |             for (Integer angle : angleCollector.getAngles()) {
551 |                 if (angle == 0) {
552 |                     try {
553 |                         super.processPage(page);
554 |                     } catch (IOException e) {
555 |                         handleCatchableIOE(e);
556 |                     }
557 |                 } else {
558 |                     // prepend a transformation
559 |                     try (PDPageContentStream cs = new PDPageContentStream(document, page, PDPageContentStream.AppendMode.PREPEND, false)) {
560 |                         cs.transform(Matrix.getRotateInstance(-Math.toRadians(angle), 0, 0));
561 |                     }
562 | 
563 |                     try {
564 |                         super.processPage(page);
565 |                     } catch (IOException e) {
566 |                         handleCatchableIOE(e);
567 |                     }
568 | 
569 |                     // remove transformation
570 |                     COSArray contents = (COSArray) page.getCOSObject().getItem(COSName.CONTENTS);
571 |                     contents.remove(0);
572 |                 }
573 |             }
574 |             page.setRotation(rotation);
575 |         }
576 | 
577 |         @Override
578 |         protected void processTextPosition(TextPosition text) {
579 |             Matrix m = text.getTextMatrix();
580 |             m.concatenate(text.getFont().getFontMatrix());
581 |             int angle = (int) Math.round(Math.toDegrees(Math.atan2(m.getShearY(), m.getScaleY())));
582 |             if (angle == 0) {
583 |                 super.processTextPosition(text);
584 |             }
585 |         }
586 |     }
587 | }
588 | 
589 | 


--------------------------------------------------------------------------------
/lexpredict-tika/src/main/java/com/lexpredict/tika/AbstractPDF2XHTML.java:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * Licensed to the Apache Software Foundation (ASF) under one or more
  3 |  * contributor license agreements.  See the NOTICE file distributed with
  4 |  * this work for additional information regarding copyright ownership.
  5 |  * The ASF licenses this file to You under the Apache License, Version 2.0
  6 |  * (the "License"); you may not use this file except in compliance with
  7 |  * the License.  You may obtain a copy of the License at
  8 |  *
  9 |  *     http://www.apache.org/licenses/LICENSE-2.0
 10 |  *
 11 |  * Unless required by applicable law or agreed to in writing, software
 12 |  * distributed under the License is distributed on an "AS IS" BASIS,
 13 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 14 |  * See the License for the specific language governing permissions and
 15 |  * limitations under the License.
 16 |  *
 17 |  * Modifications copyright (C) 2020 ContraxSuite, LLC
 18 |  */
 19 | 
 20 | package com.lexpredict.tika;
 21 | 
 22 | import static org.apache.tika.parser.pdf.PDFParserConfig.OCR_STRATEGY.NO_OCR;
 23 | 
 24 | import javax.xml.stream.XMLStreamException;
 25 | import java.awt.image.BufferedImage;
 26 | import java.io.BufferedInputStream;
 27 | import java.io.ByteArrayInputStream;
 28 | import java.io.IOException;
 29 | import java.io.InputStream;
 30 | import java.io.OutputStream;
 31 | import java.lang.reflect.InvocationTargetException;
 32 | import java.lang.reflect.Method;
 33 | import java.nio.charset.StandardCharsets;
 34 | import java.nio.file.Files;
 35 | import java.nio.file.Path;
 36 | import java.text.SimpleDateFormat;
 37 | import java.util.ArrayList;
 38 | import java.util.Calendar;
 39 | import java.util.HashSet;
 40 | import java.util.List;
 41 | import java.util.ListIterator;
 42 | import java.util.Locale;
 43 | import java.util.Map;
 44 | import java.util.Set;
 45 | import java.util.TreeMap;
 46 | 
 47 | import org.apache.commons.io.IOExceptionWithCause;
 48 | import org.apache.commons.io.IOUtils;
 49 | import org.apache.pdfbox.cos.COSName;
 50 | import org.apache.pdfbox.pdmodel.PDDocument;
 51 | import org.apache.pdfbox.pdmodel.PDDocumentCatalog;
 52 | import org.apache.pdfbox.pdmodel.PDDocumentNameDictionary;
 53 | import org.apache.pdfbox.pdmodel.PDEmbeddedFilesNameTreeNode;
 54 | import org.apache.pdfbox.pdmodel.PDPage;
 55 | import org.apache.pdfbox.pdmodel.PDPageTree;
 56 | import org.apache.pdfbox.pdmodel.common.PDDestinationOrAction;
 57 | import org.apache.pdfbox.pdmodel.common.PDNameTreeNode;
 58 | import org.apache.pdfbox.pdmodel.common.PDRectangle;
 59 | import org.apache.pdfbox.pdmodel.common.filespecification.PDComplexFileSpecification;
 60 | import org.apache.pdfbox.pdmodel.common.filespecification.PDEmbeddedFile;
 61 | import org.apache.pdfbox.pdmodel.common.filespecification.PDFileSpecification;
 62 | import org.apache.pdfbox.pdmodel.common.filespecification.PDSimpleFileSpecification;
 63 | import org.apache.pdfbox.pdmodel.font.PDFont;
 64 | import org.apache.pdfbox.pdmodel.interactive.action.PDAction;
 65 | import org.apache.pdfbox.pdmodel.interactive.action.PDActionImportData;
 66 | import org.apache.pdfbox.pdmodel.interactive.action.PDActionJavaScript;
 67 | import org.apache.pdfbox.pdmodel.interactive.action.PDActionLaunch;
 68 | import org.apache.pdfbox.pdmodel.interactive.action.PDActionRemoteGoTo;
 69 | import org.apache.pdfbox.pdmodel.interactive.action.PDActionURI;
 70 | import org.apache.pdfbox.pdmodel.interactive.action.PDAnnotationAdditionalActions;
 71 | import org.apache.pdfbox.pdmodel.interactive.action.PDDocumentCatalogAdditionalActions;
 72 | import org.apache.pdfbox.pdmodel.interactive.action.PDFormFieldAdditionalActions;
 73 | import org.apache.pdfbox.pdmodel.interactive.action.PDPageAdditionalActions;
 74 | import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotation;
 75 | import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationFileAttachment;
 76 | import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationMarkup;
 77 | import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationWidget;
 78 | import org.apache.pdfbox.pdmodel.interactive.digitalsignature.PDSignature;
 79 | import org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDDocumentOutline;
 80 | import org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineItem;
 81 | import org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineNode;
 82 | import org.apache.pdfbox.pdmodel.interactive.form.PDAcroForm;
 83 | import org.apache.pdfbox.pdmodel.interactive.form.PDField;
 84 | import org.apache.pdfbox.pdmodel.interactive.form.PDNonTerminalField;
 85 | import org.apache.pdfbox.pdmodel.interactive.form.PDSignatureField;
 86 | import org.apache.pdfbox.pdmodel.interactive.form.PDXFAResource;
 87 | import org.apache.pdfbox.rendering.PDFRenderer;
 88 | import org.apache.pdfbox.tools.imageio.ImageIOUtil;
 89 | import org.apache.pdfbox.util.Matrix;
 90 | import org.apache.pdfbox.util.Vector;
 91 | import org.apache.tika.exception.TikaException;
 92 | import org.apache.tika.extractor.EmbeddedDocumentExtractor;
 93 | import org.apache.tika.extractor.EmbeddedDocumentUtil;
 94 | import org.apache.tika.io.TemporaryResources;
 95 | import org.apache.tika.io.TikaInputStream;
 96 | import org.apache.tika.metadata.Font;
 97 | import org.apache.tika.metadata.Metadata;
 98 | import org.apache.tika.metadata.PDF;
 99 | import org.apache.tika.metadata.TikaCoreProperties;
100 | import org.apache.tika.parser.ParseContext;
101 | import org.apache.tika.parser.ocr.TesseractOCRConfig;
102 | import org.apache.tika.parser.ocr.TesseractOCRParser;
103 | import org.apache.tika.parser.pdf.PDFParserConfig;
104 | import org.apache.tika.sax.EmbeddedContentHandler;
105 | import org.xml.sax.ContentHandler;
106 | import org.xml.sax.SAXException;
107 | import org.xml.sax.helpers.AttributesImpl;
108 | 
109 | class AbstractPDF2XHTML extends PDFTextStripper {
110 | 
111 |     enum ActionTrigger {
112 |         AFTER_DOCUMENT_PRINT,
113 |         AFTER_DOCUMENT_SAVE,
114 |         ANNOTATION_CURSOR_ENTERS,
115 |         ANNOTATION_CURSOR_EXIT,
116 |         ANNOTATION_LOSE_INPUT_FOCUS,
117 |         ANNOTATION_MOUSE_CLICK,
118 |         ANNOTATION_MOUSE_RELEASED,
119 |         ANNOTATION_PAGE_CLOSED,
120 |         ANNOTATION_PAGE_NO_LONGER_VISIBLE,
121 |         ANNOTATION_PAGE_OPENED,
122 |         ANNOTATION_PAGE_VISIBLE,
123 |         ANNOTATION_RECEIVES_FOCUS,
124 |         ANNOTATION_WIDGET,
125 |         BEFORE_DOCUMENT_CLOSE,
126 |         BEFORE_DOCUMENT_PRINT,
127 |         BEFORE_DOCUMENT_SAVE,
128 |         DOCUMENT_OPEN,
129 |         FORM_FIELD,
130 |         FORM_FIELD_FORMATTED,
131 |         FORM_FIELD_KEYSTROKE,
132 |         FORM_FIELD_RECALCULATE,
133 |         FORM_FIELD_VALUE_CHANGE,
134 |         PAGE_CLOSE,
135 |         PAGE_OPEN, BOOKMARK,
136 |     };
137 | 
    /**
     * Maximum recursive depth during AcroForm processing.
     * Prevents theoretical AcroForm recursion bomb.
     */
    private final static int MAX_ACROFORM_RECURSIONS = 10;

    /** Shared Tesseract configuration created with its no-argument constructor. */
    private final static TesseractOCRConfig DEFAULT_TESSERACT_CONFIG = new TesseractOCRConfig();

    /**
     * Format used for signature dates.
     * TODO Make this thread-safe ({@link SimpleDateFormat} instances are not synchronized;
     * this one is held per instance of this class).
     */
    private final SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ssZ", Locale.ROOT);


    // IOExceptions that handleCatchableIOE() recorded instead of rethrowing
    final List<IOException> exceptions = new ArrayList<>();
    final PDDocument pdDocument;
    // handler wrapper that all XHTML output of this class is written to
    final AlterXHTMLContentHandler xhtml;
    final ParseContext context;
    final Metadata metadata;
    final EmbeddedDocumentExtractor embeddedDocumentExtractor;
    final PDFParserConfig config;
    // set to null by the constructor when the OCR strategy is NO_OCR
    // NOTE(review): the lookup used for other strategies presumably can return null as well - confirm
    final TesseractOCRParser tesseractOCRParser;//can be null!

    //zero-based pageIndex; doOCROnCurrentPage() renders the page at this index
    int pageIndex = 0;
    int startPage = -1;//private in PDFTextStripper...must have own copy because we override processpages
    // per-page counters: endPage() reports them to the metadata and resets them to 0
    int unmappedUnicodeCharsPerPage = 0;
    int totalCharsPerPage = 0;

    // font names gathered in endPage() when config.getExtractFontNames() is true
    private final Set<String> fontNames = new HashSet<>();
169 | 
170 |     AbstractPDF2XHTML(PDDocument pdDocument, ContentHandler handler, ParseContext context, Metadata metadata,
171 |                       PDFParserConfig config) throws IOException {
172 |         this.pdDocument = pdDocument;
173 |         this.xhtml = new AlterXHTMLContentHandler(handler, metadata);
174 |         this.context = context;
175 |         this.metadata = metadata;
176 |         this.config = config;
177 |         embeddedDocumentExtractor = EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context);
178 |         if (config.getOcrStrategy() == NO_OCR) {
179 |             tesseractOCRParser = null;
180 |         } else {
181 |             tesseractOCRParser = (TesseractOCRParser)EmbeddedDocumentUtil.tryToFindExistingLeafParser(TesseractOCRParser.class, context);
182 |         }
183 |     }
184 | 
185 |     @Override
186 |     protected void startPage(PDPage page) throws IOException {
187 |         try {
188 |             AttributesImpl attrs = new AttributesImpl();
189 |             if (this.detalization != OutputDetalization.NO_EXTRA_DETAIL) {
190 |                 StringBuilder sb = new StringBuilder();
191 |                 PDRectangle area = page.getMediaBox();
192 |                 sb.append(area.getLowerLeftX());
193 |                 sb.append(",");
194 |                 sb.append(area.getLowerLeftY());
195 |                 sb.append(",");
196 |                 sb.append(area.getWidth());
197 |                 sb.append(",");
198 |                 sb.append(area.getHeight());
199 |                 attrs.addAttribute("", "area", "area", "string", sb.toString());
200 |             }
201 |             attrs.addAttribute("", "class", "class", "string", "page");
202 |             xhtml.startElement("div", attrs);
203 |         } catch (SAXException e) {
204 |             throw new IOExceptionWithCause("Unable to start a page", e);
205 |         }
206 |         writeParagraphStart();
207 |     }
208 | 
209 |     private void extractEmbeddedDocuments(PDDocument document)
210 |             throws IOException, SAXException, TikaException {
211 |         PDDocumentNameDictionary namesDictionary =
212 |                 new PDDocumentNameDictionary(document.getDocumentCatalog());
213 |         PDEmbeddedFilesNameTreeNode efTree = namesDictionary.getEmbeddedFiles();
214 |         if (efTree == null) {
215 |             return;
216 |         }
217 | 
218 |         Map<String, PDComplexFileSpecification> embeddedFileNames = efTree.getNames();
219 |         //For now, try to get the embeddedFileNames out of embeddedFiles or its kids.
220 |         //This code follows: pdfbox/examples/pdmodel/ExtractEmbeddedFiles.java
221 |         //If there is a need we could add a fully recursive search to find a non-null
222 |         //Map<String, COSObjectable> that contains the doc info.
223 |         if (embeddedFileNames != null) {
224 |             processEmbeddedDocNames(embeddedFileNames);
225 |         } else {
226 |             List<PDNameTreeNode<PDComplexFileSpecification>> kids = efTree.getKids();
227 |             if (kids == null) {
228 |                 return;
229 |             }
230 |             for (PDNameTreeNode<PDComplexFileSpecification> node : kids) {
231 |                 embeddedFileNames = node.getNames();
232 |                 if (embeddedFileNames != null) {
233 |                     processEmbeddedDocNames(embeddedFileNames);
234 |                 }
235 |             }
236 |         }
237 |     }
238 | 
239 |     private void processDoc(String name, PDFileSpecification spec, AttributesImpl attributes) throws TikaException, SAXException, IOException {
240 |         if (spec instanceof PDSimpleFileSpecification) {
241 |             attributes.addAttribute("", "class", "class", "CDATA", "linked");
242 |             attributes.addAttribute("", "id", "id", "CDATA", spec.getFile());
243 |             xhtml.startElement("div", attributes);
244 |             xhtml.endElement("div");
245 |         } else if (spec instanceof  PDComplexFileSpecification){
246 |             if (attributes.getIndex("source") < 0) {
247 |                 attributes.addAttribute("", "source", "source", "CDATA", "attachment");
248 |             }
249 |             extractMultiOSPDEmbeddedFiles(name, (PDComplexFileSpecification)spec, attributes);
250 |         }
251 |     }
252 | 
253 |     private void processEmbeddedDocNames(Map<String, PDComplexFileSpecification> embeddedFileNames)
254 |             throws IOException, SAXException, TikaException {
255 |         if (embeddedFileNames == null || embeddedFileNames.isEmpty()) {
256 |             return;
257 |         }
258 | 
259 |         for (Map.Entry<String, PDComplexFileSpecification> ent : embeddedFileNames.entrySet()) {
260 |             processDoc(ent.getKey(), ent.getValue(), new AttributesImpl());
261 |         }
262 |     }
263 | 
264 |     private void extractMultiOSPDEmbeddedFiles(String displayName,
265 |                                                PDComplexFileSpecification spec, AttributesImpl attributes) throws IOException,
266 |             SAXException, TikaException {
267 | 
268 |         if (spec == null) {
269 |             return;
270 |         }
271 |         //current strategy is to pull all, not just first non-null
272 |         extractPDEmbeddedFile(displayName, spec.getFileUnicode(),
273 |                 spec.getFile(), spec.getEmbeddedFile(), attributes);
274 |         extractPDEmbeddedFile(displayName, spec.getFileUnicode(),
275 |                 spec.getFileMac(), spec.getEmbeddedFileMac(), attributes);
276 |         extractPDEmbeddedFile(displayName, spec.getFileUnicode(),
277 |                 spec.getFileDos(), spec.getEmbeddedFileDos(), attributes);
278 |         extractPDEmbeddedFile(displayName, spec.getFileUnicode(),
279 |                 spec.getFileUnix(), spec.getEmbeddedFileUnix(), attributes);
280 |     }
281 | 
282 |     private void extractPDEmbeddedFile(String displayName, String unicodeFileName,
283 |                                        String fileName, PDEmbeddedFile file, AttributesImpl attributes)
284 |             throws SAXException, IOException, TikaException {
285 | 
286 |         if (file == null) {
287 |             //skip silently
288 |             return;
289 |         }
290 | 
291 |         fileName = (fileName == null || "".equals(fileName.trim())) ? unicodeFileName : fileName;
292 |         fileName = (fileName == null || "".equals(fileName.trim())) ? displayName : fileName;
293 | 
294 |         // TODO: other metadata?
295 |         Metadata embeddedMetadata = new Metadata();
296 |         embeddedMetadata.set(Metadata.RESOURCE_NAME_KEY, fileName);
297 |         embeddedMetadata.set(Metadata.CONTENT_TYPE, file.getSubtype());
298 |         embeddedMetadata.set(Metadata.CONTENT_LENGTH, Long.toString(file.getSize()));
299 |         embeddedMetadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE,
300 |                 TikaCoreProperties.EmbeddedResourceType.ATTACHMENT.toString());
301 |         embeddedMetadata.set(TikaCoreProperties.ORIGINAL_RESOURCE_NAME, fileName);
302 |         if (!embeddedDocumentExtractor.shouldParseEmbedded(embeddedMetadata)) {
303 |             return;
304 |         }
305 |         TikaInputStream stream = null;
306 |         try {
307 |             stream = TikaInputStream.get(file.createInputStream());
308 |         } catch (IOException e) {
309 |             //store this exception in the parent's metadata
310 |             EmbeddedDocumentUtil.recordEmbeddedStreamException(e, metadata);
311 |             return;
312 |         }
313 |         try {
314 |             embeddedDocumentExtractor.parseEmbedded(
315 |                     stream,
316 |                     new EmbeddedContentHandler(xhtml),
317 |                     embeddedMetadata, false);
318 | 
319 |             attributes.addAttribute("", "class", "class", "CDATA", "embedded");
320 |             attributes.addAttribute("", "id", "id", "CDATA", fileName);
321 |             xhtml.startElement("div", attributes);
322 |             xhtml.endElement("div");
323 |         } finally {
324 |             IOUtils.closeQuietly(stream);
325 |         }
326 | 
327 |     }
328 | 
329 |     void handleCatchableIOE(IOException e) throws IOException {
330 |         if (config.isCatchIntermediateIOExceptions()) {
331 |             if (e.getCause() instanceof SAXException && e.getCause().getMessage() != null &&
332 |                     e.getCause().getMessage().contains("Your document contained more than")) {
333 |                 //TODO -- is there a cleaner way of checking for:
334 |                 // WriteOutContentHandler.WriteLimitReachedException?
335 |                 throw e;
336 |             }
337 | 
338 |             String msg = e.getMessage();
339 |             if (msg == null) {
340 |                 msg = "IOException, no message";
341 |             }
342 |             metadata.add(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING, msg);
343 |             exceptions.add(e);
344 |         } else {
345 |             throw e;
346 |         }
347 |     }
348 | 
349 |     void doOCROnCurrentPage() throws IOException, TikaException, SAXException {
350 |         if (config.getOcrStrategy().equals(NO_OCR)) {
351 |             return;
352 |         }
353 |         TesseractOCRConfig tesseractConfig =
354 |                 context.get(TesseractOCRConfig.class, tesseractOCRParser.getDefaultConfig());
355 | 
356 |         if (! tesseractOCRParser.hasTesseract(tesseractConfig)) {
357 |             throw new TikaException("Tesseract is not available. "+
358 |                     "Please set the OCR_STRATEGY to NO_OCR or configure Tesseract correctly");
359 |         }
360 | 
361 |         PDFRenderer renderer = new PDFRenderer(pdDocument);
362 |         TemporaryResources tmp = new TemporaryResources();
363 |         try {
364 | 
365 |             int dpi = config.getOcrDPI();
366 |             BufferedImage image = renderer.renderImageWithDPI(pageIndex, dpi, config.getOcrImageType());
367 |             Path tmpFile = tmp.createTempFile();
368 |             try (OutputStream os = Files.newOutputStream(tmpFile)) {
369 |                 //TODO: get output format from TesseractConfig
370 |                 ImageIOUtil.writeImage(image, config.getOcrImageFormatName(),
371 |                         os, dpi, config.getOcrImageQuality());
372 |             }
373 |             try (InputStream is = TikaInputStream.get(tmpFile)) {
374 |                 tesseractOCRParser.parseInline(is, xhtml, tesseractConfig);
375 |             }
376 |         } catch (IOException e) {
377 |             handleCatchableIOE(e);
378 |         } catch (SAXException e) {
379 |             throw new IOExceptionWithCause("error writing OCR content from PDF", e);
380 |         } finally {
381 |             tmp.dispose();
382 |         }
383 |     }
384 | 
385 |     @Override
386 |     protected void endPage(PDPage page) throws IOException {
387 |         metadata.add(PDF.CHARACTERS_PER_PAGE, totalCharsPerPage);
388 |         metadata.add(PDF.UNMAPPED_UNICODE_CHARS_PER_PAGE,
389 |                 unmappedUnicodeCharsPerPage);
390 | 
391 |         try {
392 |             for (PDAnnotation annotation : page.getAnnotations()) {
393 | 
394 |                 if (annotation instanceof PDAnnotationFileAttachment) {
395 |                     PDAnnotationFileAttachment fann = (PDAnnotationFileAttachment) annotation;
396 |                     PDComplexFileSpecification fileSpec = (PDComplexFileSpecification) fann.getFile();
397 |                     try {
398 |                         AttributesImpl attributes = new AttributesImpl();
399 |                         attributes.addAttribute("", "source", "source", "CDATA", "annotation");
400 |                         extractMultiOSPDEmbeddedFiles(fann.getAttachmentName(), fileSpec, attributes);
401 |                     } catch (SAXException e) {
402 |                         throw new IOExceptionWithCause("file embedded in annotation sax exception", e);
403 |                     } catch (TikaException e) {
404 |                         throw new IOExceptionWithCause("file embedded in annotation tika exception", e);
405 |                     } catch (IOException e) {
406 |                         handleCatchableIOE(e);
407 |                     }
408 |                 } else if (annotation instanceof PDAnnotationWidget) {
409 |                     handleWidget((PDAnnotationWidget)annotation);
410 |                 }
411 |                 // TODO: remove once PDFBOX-1143 is fixed:
412 |                 if (config.getExtractAnnotationText()) {
413 |                     PDActionURI uri = getActionURI(annotation);
414 |                     if (uri != null) {
415 |                         String link = uri.getURI();
416 |                         if (link != null && link.trim().length() > 0) {
417 |                             xhtml.startElement("div", "class", "annotation");
418 |                             xhtml.startElement("a", "href", link);
419 |                             xhtml.characters(link);
420 |                             xhtml.endElement("a");
421 |                             xhtml.endElement("div");
422 |                         }
423 |                     }
424 | 
425 |                     if (annotation instanceof PDAnnotationMarkup) {
426 |                         PDAnnotationMarkup annotationMarkup = (PDAnnotationMarkup) annotation;
427 |                         String title = annotationMarkup.getTitlePopup();
428 |                         String subject = annotationMarkup.getSubject();
429 |                         String contents = annotationMarkup.getContents();
430 |                         // TODO: maybe also annotationMarkup.getRichContents()?
431 |                         if (title != null || subject != null || contents != null) {
432 |                             xhtml.startElement("div", "class", "annotation");
433 | 
434 |                             if (title != null) {
435 |                                 xhtml.startElement("div", "class", "annotationTitle");
436 |                                 xhtml.characters(title);
437 |                                 xhtml.endElement("div");
438 |                             }
439 | 
440 |                             if (subject != null) {
441 |                                 xhtml.startElement("div", "class", "annotationSubject");
442 |                                 xhtml.characters(subject);
443 |                                 xhtml.endElement("div");
444 |                             }
445 | 
446 |                             if (contents != null) {
447 |                                 xhtml.startElement("div", "class", "annotationContents");
448 |                                 xhtml.characters(contents);
449 |                                 xhtml.endElement("div");
450 |                             }
451 | 
452 |                             xhtml.endElement("div");
453 |                         }
454 |                     }
455 |                 }
456 |             }
457 |             if (config.getOcrStrategy().equals(PDFParserConfig.OCR_STRATEGY.OCR_AND_TEXT_EXTRACTION)) {
458 |                 doOCROnCurrentPage();
459 |             } else if (config.getOcrStrategy().equals(PDFParserConfig.OCR_STRATEGY.AUTO)) {
460 |                 //TODO add more sophistication
461 |                 if (totalCharsPerPage < 10 || unmappedUnicodeCharsPerPage > 10) {
462 |                     doOCROnCurrentPage();
463 |                 }
464 |             }
465 | 
466 |             PDPageAdditionalActions pageActions = page.getActions();
467 |             if (pageActions != null) {
468 |                 handleDestinationOrAction(pageActions.getC(), AbstractPDF2XHTML.ActionTrigger.PAGE_CLOSE);
469 |                 handleDestinationOrAction(pageActions.getO(), AbstractPDF2XHTML.ActionTrigger.PAGE_OPEN);
470 |             }
471 |             xhtml.endElement("div");
472 |             super.endPage(page);
473 |         } catch (SAXException|TikaException e) {
474 |             throw new IOExceptionWithCause("Unable to end a page", e);
475 |         } catch (IOException e) {
476 |             handleCatchableIOE(e);
477 |         } finally {
478 |             totalCharsPerPage = 0;
479 |             unmappedUnicodeCharsPerPage = 0;
480 |         }
481 | 
482 |         if (config.getExtractFontNames()) {
483 | 
484 |             for (COSName n : page.getResources().getFontNames()) {
485 |                 PDFont font = page.getResources().getFont(n);
486 |                 if (font != null && font.getFontDescriptor() != null) {
487 |                     String fontName = font.getFontDescriptor().getFontName();
488 |                     if (fontName != null) {
489 |                         fontNames.add(fontName);
490 |                     }
491 |                 }
492 |             }
493 |         }
494 |     }
495 | 
496 |     private void handleWidget(PDAnnotationWidget widget) throws TikaException, SAXException, IOException {
497 |         if (widget == null) {
498 |             return;
499 |         }
500 |         handleDestinationOrAction(widget.getAction(), AbstractPDF2XHTML.ActionTrigger.ANNOTATION_WIDGET);
501 |         PDAnnotationAdditionalActions annotationActions = widget.getActions();
502 |         if (annotationActions != null) {
503 |             handleDestinationOrAction(annotationActions.getBl(), AbstractPDF2XHTML.ActionTrigger.ANNOTATION_LOSE_INPUT_FOCUS);
504 |             handleDestinationOrAction(annotationActions.getD(), AbstractPDF2XHTML.ActionTrigger.ANNOTATION_MOUSE_CLICK);
505 |             handleDestinationOrAction(annotationActions.getE(), AbstractPDF2XHTML.ActionTrigger.ANNOTATION_CURSOR_ENTERS);
506 |             handleDestinationOrAction(annotationActions.getFo(), AbstractPDF2XHTML.ActionTrigger.ANNOTATION_RECEIVES_FOCUS);
507 |             handleDestinationOrAction(annotationActions.getPC(), AbstractPDF2XHTML.ActionTrigger.ANNOTATION_PAGE_CLOSED);
508 |             handleDestinationOrAction(annotationActions.getPI(), AbstractPDF2XHTML.ActionTrigger.ANNOTATION_PAGE_NO_LONGER_VISIBLE);
509 |             handleDestinationOrAction(annotationActions.getPO(), AbstractPDF2XHTML.ActionTrigger.ANNOTATION_PAGE_OPENED);
510 |             handleDestinationOrAction(annotationActions.getPV(), AbstractPDF2XHTML.ActionTrigger.ANNOTATION_PAGE_VISIBLE);
511 |             handleDestinationOrAction(annotationActions.getU(), AbstractPDF2XHTML.ActionTrigger.ANNOTATION_MOUSE_RELEASED);
512 |             handleDestinationOrAction(annotationActions.getX(), AbstractPDF2XHTML.ActionTrigger.ANNOTATION_CURSOR_EXIT);
513 |         }
514 | 
515 |     }
516 | 
517 |     @Override
518 |     protected void startDocument(PDDocument pdf) throws IOException {
519 |         try {
520 |             xhtml.startDocument();
521 |             try {
522 |                 handleDestinationOrAction(pdf.getDocumentCatalog().getOpenAction(), AbstractPDF2XHTML.ActionTrigger.DOCUMENT_OPEN);
523 |             } catch (IOException e) {
524 |                 //See PDFBOX-3773
525 |                 //swallow -- no need to report this
526 |             }
527 |         } catch (TikaException|SAXException e) {
528 |             throw new IOExceptionWithCause("Unable to start a document", e);
529 |         }
530 |     }
531 | 
532 |     private void handleDestinationOrAction(PDDestinationOrAction action,
533 |                                            AbstractPDF2XHTML.ActionTrigger actionTrigger) throws IOException, SAXException, TikaException {
534 |         if (action == null || ! config.getExtractActions()) {
535 |             return;
536 |         }
537 |         AttributesImpl attributes = new AttributesImpl();
538 |         String actionOrDestString = (action instanceof PDAction) ? "action" : "destination";
539 | 
540 |         addNonNullAttribute("class",  actionOrDestString, attributes);
541 |         addNonNullAttribute("type", action.getClass().getSimpleName(), attributes);
542 |         addNonNullAttribute("trigger", actionTrigger.name(), attributes);
543 | 
544 |         if (action instanceof PDActionImportData) {
545 |             processDoc("", ((PDActionImportData)action).getFile(), attributes);
546 |         } else if (action instanceof PDActionLaunch) {
547 |             PDActionLaunch pdActionLaunch = (PDActionLaunch)action;
548 |             addNonNullAttribute("id", pdActionLaunch.getF(), attributes);
549 |             addNonNullAttribute("defaultDirectory", pdActionLaunch.getD(), attributes);
550 |             addNonNullAttribute("operation", pdActionLaunch.getO(), attributes);
551 |             addNonNullAttribute("parameters", pdActionLaunch.getP(), attributes);
552 |             processDoc(pdActionLaunch.getF(), pdActionLaunch.getFile(), attributes);
553 |         } else if (action instanceof PDActionRemoteGoTo) {
554 |             PDActionRemoteGoTo remoteGoTo = (PDActionRemoteGoTo)action;
555 |             processDoc("", remoteGoTo.getFile(), attributes);
556 |         } else if (action instanceof PDActionJavaScript) {
557 |             PDActionJavaScript jsAction = (PDActionJavaScript)action;
558 |             Metadata m = new Metadata();
559 |             m.set(Metadata.CONTENT_TYPE, "application/javascript");
560 |             m.set(Metadata.CONTENT_ENCODING, StandardCharsets.UTF_8.toString());
561 |             m.set(PDF.ACTION_TRIGGER, actionTrigger.toString());
562 |             m.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE, TikaCoreProperties.EmbeddedResourceType.MACRO.name());
563 |             String js = jsAction.getAction();
564 |             js = (js == null) ? "" : js;
565 |             if (embeddedDocumentExtractor.shouldParseEmbedded(m)) {
566 |                 try (InputStream is = TikaInputStream.get(js.getBytes(StandardCharsets.UTF_8))) {
567 |                     embeddedDocumentExtractor.parseEmbedded(is, xhtml, m, false);
568 |                 }
569 |             }
570 |             addNonNullAttribute("class", "javascript", attributes);
571 |             addNonNullAttribute("type", jsAction.getType(), attributes);
572 |             addNonNullAttribute("subtype", jsAction.getSubType(), attributes);
573 |             xhtml.startElement("div", attributes);
574 |             xhtml.endElement("div");
575 |         } else {
576 |             xhtml.startElement("div", attributes);
577 |             xhtml.endElement("div");
578 |         }
579 |     }
580 | 
581 |     private static void addNonNullAttribute(String name, String value, AttributesImpl attributes) {
582 |         if (name == null || value == null) {
583 |             return;
584 |         }
585 |         attributes.addAttribute("", name, name, "CDATA", value);
586 |     }
587 | 
588 |     @Override
589 |     protected void endDocument(PDDocument pdf) throws IOException {
590 |         try {
591 |             // Extract text for any bookmarks:
592 |             if(config.getExtractBookmarksText()) {
593 |                 extractBookmarkText();
594 |             }
595 | 
596 |             try {
597 |                 extractEmbeddedDocuments(pdf);
598 |             } catch (IOException e) {
599 |                 handleCatchableIOE(e);
600 |             }
601 | 
602 |             //extract acroform data at end of doc
603 |             if (config.getExtractAcroFormContent() == true) {
604 |                 try {
605 |                     extractAcroForm(pdf);
606 |                 } catch (IOException e) {
607 |                     handleCatchableIOE(e);
608 |                 }
609 |             }
610 |             PDDocumentCatalogAdditionalActions additionalActions = pdf.getDocumentCatalog().getActions();
611 |             handleDestinationOrAction(additionalActions.getDP(), AbstractPDF2XHTML.ActionTrigger.AFTER_DOCUMENT_PRINT);
612 |             handleDestinationOrAction(additionalActions.getDS(), AbstractPDF2XHTML.ActionTrigger.AFTER_DOCUMENT_SAVE);
613 |             handleDestinationOrAction(additionalActions.getWC(), AbstractPDF2XHTML.ActionTrigger.BEFORE_DOCUMENT_CLOSE);
614 |             handleDestinationOrAction(additionalActions.getWP(), AbstractPDF2XHTML.ActionTrigger.BEFORE_DOCUMENT_PRINT);
615 |             handleDestinationOrAction(additionalActions.getWS(), AbstractPDF2XHTML.ActionTrigger.BEFORE_DOCUMENT_SAVE);
616 | 
617 |             if (cdataContent.size() > 0)
618 |                 dumpCDATA();
619 | 
620 |             xhtml.endDocument();
621 |         } catch (TikaException e) {
622 |             throw new IOExceptionWithCause("Unable to end a document", e);
623 |         } catch (SAXException e) {
624 |             throw new IOExceptionWithCause("Unable to end a document", e);
625 |         }
626 |         if (fontNames.size() > 0) {
627 |             for (String fontName : fontNames) {
628 |                 metadata.add(Font.FONT_NAME, fontName);
629 |             }
630 |         }
631 |     }
632 | 
633 |     void extractBookmarkText() throws SAXException, IOException, TikaException {
634 |         PDDocumentOutline outline = document.getDocumentCatalog().getDocumentOutline();
635 |         if (outline != null) {
636 |             extractBookmarkText(outline);
637 |         }
638 |     }
639 | 
640 |     void extractBookmarkText(PDOutlineNode bookmark) throws SAXException, IOException, TikaException {
641 |         PDOutlineItem current = bookmark.getFirstChild();
642 | 
643 |         if (current != null) {
644 |             xhtml.startElement("ul");
645 |             while (current != null) {
646 |                 xhtml.startElement("li");
647 |                 xhtml.characters(current.getTitle());
648 |                 xhtml.endElement("li");
649 |                 handleDestinationOrAction(current.getAction(), AbstractPDF2XHTML.ActionTrigger.BOOKMARK);
650 |                 // Recurse:
651 |                 extractBookmarkText(current);
652 |                 current = current.getNextSibling();
653 |             }
654 |             xhtml.endElement("ul");
655 |         }
656 |     }
657 | 
658 |     void extractAcroForm(PDDocument pdf) throws IOException,
659 |             SAXException, TikaException {
660 |         //Thank you, Ben Litchfield, for org.apache.pdfbox.examples.fdf.PrintFields
661 |         //this code derives from Ben's code
662 |         PDDocumentCatalog catalog = pdf.getDocumentCatalog();
663 | 
664 |         if (catalog == null)
665 |             return;
666 | 
667 |         PDAcroForm form = catalog.getAcroForm();
668 |         if (form == null)
669 |             return;
670 | 
671 |         //if it has xfa, try that.
672 |         //if it doesn't exist or there's an exception,
673 |         //go with traditional AcroForm
674 |         PDXFAResource pdxfa = form.getXFA();
675 | 
676 |         if (pdxfa != null) {
677 |             //if successful, return
678 |             XFAExtractor xfaExtractor = new XFAExtractor();
679 |             InputStream is = null;
680 |             try {
681 |                 is = new BufferedInputStream(
682 |                         new ByteArrayInputStream(pdxfa.getBytes()));
683 |             } catch (IOException e) {
684 |                 EmbeddedDocumentUtil.recordEmbeddedStreamException(e, metadata);
685 |             }
686 |             if (is != null) {
687 |                 try {
688 |                     xfaExtractor.extract(is, xhtml, metadata, context);
689 |                     return;
690 |                 } catch (XMLStreamException e) {
691 |                     //if there was an xml parse exception in xfa, try the AcroForm
692 |                     EmbeddedDocumentUtil.recordException(e, metadata);
693 |                 } finally {
694 |                     IOUtils.closeQuietly(is);
695 |                 }
696 |             }
697 |         }
698 | 
699 |         @SuppressWarnings("rawtypes")
700 |         List fields = form.getFields();
701 | 
702 |         if (fields == null)
703 |             return;
704 | 
705 |         @SuppressWarnings("rawtypes")
706 |         ListIterator itr = fields.listIterator();
707 | 
708 |         if (itr == null)
709 |             return;
710 | 
711 |         xhtml.startElement("div", "class", "acroform");
712 |         xhtml.startElement("ol");
713 | 
714 |         while (itr.hasNext()) {
715 |             Object obj = itr.next();
716 |             if (obj != null && obj instanceof PDField) {
717 |                 processAcroField((PDField) obj, 0);
718 |             }
719 |         }
720 |         xhtml.endElement("ol");
721 |         xhtml.endElement("div");
722 |     }
723 | 
724 |     private void processAcroField(PDField field, final int currentRecursiveDepth)
725 |             throws SAXException, IOException, TikaException {
726 | 
727 |         if (currentRecursiveDepth >= MAX_ACROFORM_RECURSIONS) {
728 |             return;
729 |         }
730 | 
731 |         PDFormFieldAdditionalActions pdFormFieldAdditionalActions = field.getActions();
732 |         if (pdFormFieldAdditionalActions != null) {
733 |             handleDestinationOrAction(pdFormFieldAdditionalActions.getC(), AbstractPDF2XHTML.ActionTrigger.FORM_FIELD_RECALCULATE);
734 |             handleDestinationOrAction(pdFormFieldAdditionalActions.getF(), AbstractPDF2XHTML.ActionTrigger.FORM_FIELD_FORMATTED);
735 |             handleDestinationOrAction(pdFormFieldAdditionalActions.getK(), AbstractPDF2XHTML.ActionTrigger.FORM_FIELD_KEYSTROKE);
736 |             handleDestinationOrAction(pdFormFieldAdditionalActions.getV(), AbstractPDF2XHTML.ActionTrigger.FORM_FIELD_VALUE_CHANGE);
737 |         }
738 |         if (field.getWidgets() != null) {
739 |             for (PDAnnotationWidget widget : field.getWidgets()) {
740 |                 handleWidget(widget);
741 |             }
742 |         }
743 | 
744 | 
745 |         addFieldString(field);
746 |         if (field instanceof PDNonTerminalField) {
747 |             int r = currentRecursiveDepth + 1;
748 |             xhtml.startElement("ol");
749 |             for (PDField child : ((PDNonTerminalField)field).getChildren()) {
750 |                 processAcroField(child, r);
751 |             }
752 |             xhtml.endElement("ol");
753 |         }
754 |     }
755 | 
756 |     private void addFieldString(PDField field) throws SAXException {
757 |         //Pick partial name to present in content and altName for attribute
758 |         //Ignoring FullyQualifiedName for now
759 |         String partName = field.getPartialName();
760 |         String altName = field.getAlternateFieldName();
761 | 
762 |         StringBuilder sb = new StringBuilder();
763 |         AttributesImpl attrs = new AttributesImpl();
764 | 
765 |         if (partName != null) {
766 |             sb.append(partName).append(": ");
767 |         }
768 |         if (altName != null) {
769 |             attrs.addAttribute("", "altName", "altName", "CDATA", altName);
770 |         }
771 |         //return early if PDSignature field
772 |         if (field instanceof PDSignatureField) {
773 |             handleSignature(attrs, (PDSignatureField) field);
774 |             return;
775 |         }
776 |         String value = field.getValueAsString();
777 |         if (value != null && !value.equals("null")) {
778 |             sb.append(value);
779 |         }
780 | 
781 |         if (attrs.getLength() > 0 || sb.length() > 0) {
782 |             xhtml.startElement("li", attrs);
783 |             xhtml.characters(sb.toString());
784 |             xhtml.endElement("li");
785 |         }
786 |     }
787 | 
788 |     private void handleSignature(AttributesImpl parentAttributes, PDSignatureField sigField)
789 |             throws SAXException {
790 | 
791 |         PDSignature sig = sigField.getSignature();
792 |         if (sig == null) {
793 |             return;
794 |         }
795 |         Map<String, String> vals = new TreeMap<>();
796 |         vals.put("name", sig.getName());
797 |         vals.put("contactInfo", sig.getContactInfo());
798 |         vals.put("location", sig.getLocation());
799 |         vals.put("reason", sig.getReason());
800 | 
801 |         Calendar cal = sig.getSignDate();
802 |         if (cal != null) {
803 |             dateFormat.setTimeZone(cal.getTimeZone());
804 |             vals.put("date", dateFormat.format(cal.getTime()));
805 |         }
806 |         //see if there is any data
807 |         int nonNull = 0;
808 |         for (String val : vals.keySet()) {
809 |             if (val != null && !val.equals("")) {
810 |                 nonNull++;
811 |             }
812 |         }
813 |         //if there is, process it
814 |         if (nonNull > 0) {
815 |             metadata.set(TikaCoreProperties.HAS_SIGNATURE, "true");
816 |             xhtml.startElement("li", parentAttributes);
817 | 
818 |             AttributesImpl attrs = new AttributesImpl();
819 |             attrs.addAttribute("", "type", "type", "CDATA", "signaturedata");
820 | 
821 |             xhtml.startElement("ol", attrs);
822 |             for (Map.Entry<String, String> e : vals.entrySet()) {
823 |                 if (e.getValue() == null || e.getValue().equals("")) {
824 |                     continue;
825 |                 }
826 |                 attrs = new AttributesImpl();
827 |                 attrs.addAttribute("", "signdata", "signdata", "CDATA", e.getKey());
828 |                 xhtml.startElement("li", attrs);
829 |                 xhtml.characters(e.getValue());
830 |                 xhtml.endElement("li");
831 |             }
832 |             xhtml.endElement("ol");
833 |             xhtml.endElement("li");
834 |         }
835 |     }
836 | 
837 | 
838 |     private static PDActionURI getActionURI(PDAnnotation annot) {
839 |         //copied and pasted from PDFBox's PrintURLs
840 | 
841 |         // use reflection to catch all annotation types that have getAction()
842 |         // If you can't use reflection, then check for classes
843 |         // PDAnnotationLink and PDAnnotationWidget, and call getAction() and check for a
844 |         // PDActionURI result type
845 |         try {
846 |             Method actionMethod = annot.getClass().getDeclaredMethod("getAction");
847 |             if (actionMethod.getReturnType().equals(PDAction.class)) {
848 |                 PDAction action = (PDAction) actionMethod.invoke(annot);
849 |                 if (action instanceof PDActionURI) {
850 |                     return (PDActionURI) action;
851 |                 }
852 |             }
853 |         }
854 |         catch (NoSuchMethodException|IllegalAccessException|InvocationTargetException e) {
855 |         }
856 |         return null;
857 |     }
858 | 
859 |     /**
860 |      * we need to override this because we are overriding {@link #processPages(PDPageTree)}
861 |      * @return
862 |      */
863 |     @Override
864 |     public int getCurrentPageNo() {
865 |         return pageIndex+1;
866 |     }
867 | 
868 |     /**
869 |      * See TIKA-2845 for why we need to override this.
870 |      *
871 |      * @param pages
872 |      * @throws IOException
873 |      */
874 |     @Override
875 |     protected void processPages(PDPageTree pages) throws IOException, SAXException {
876 |         //we currently need this hack because we aren't able to increment
877 |         //the private currentPageNo in PDFTextStripper,
878 |         //and PDFTextStripper's processPage relies on that variable
879 |         //being >= startPage when deciding whether or not to process a page
880 |         // See:
881 |         // if (currentPageNo >= startPage && currentPageNo <= endPage
882 |         //                && (startBookmarkPageNumber == -1 || currentPageNo >= startBookmarkPageNumber)
883 |         //                && (endBookmarkPageNumber == -1 || currentPageNo <= endBookmarkPageNumber))
884 |         //        {
885 |         super.setStartPage(-1);
886 |         for (PDPage page : pages) {
887 |             if (getCurrentPageNo() >= getStartPage()
888 |                     && getCurrentPageNo() <= getEndPage()) {
889 |                 processPage(page);
890 |             }
891 |             pageIndex++;
892 |         }
893 |     }
894 | 
895 |     @Override
896 |     public void setStartBookmark(PDOutlineItem pdOutlineItem) {
897 |         throw new UnsupportedOperationException("We don't currently support this -- See PDFTextStripper's processPages() for how to implement this.");
898 |     }
899 | 
900 |     @Override
901 |     public void setEndBookmark(PDOutlineItem pdOutlineItem) {
902 |         throw new UnsupportedOperationException("We don't currently support this -- See PDFTextStripper's processPages() for how to implement this.");
903 |     }
904 | 
905 |     @Override
906 |     public void setStartPage(int startPage) {
907 |         this.startPage = startPage;
908 |     }
909 | 
910 |     @Override
911 |     public int getStartPage() {
912 |         return startPage;
913 |     }
914 | 
915 |     @Override
916 |     protected void showGlyph(Matrix textRenderingMatrix, PDFont font, int code, String unicode, Vector displacement) throws IOException
917 |     {
918 |         super.showGlyph(textRenderingMatrix, font, code, unicode, displacement);
919 |         if (unicode == null || unicode.isEmpty()) {
920 |             unmappedUnicodeCharsPerPage++;
921 |         }
922 |         totalCharsPerPage++;
923 |     }
924 | }
925 | 


--------------------------------------------------------------------------------