├── .gitignore ├── lexpredict-tika ├── setenv.sh ├── src │ ├── test │ │ ├── resources │ │ │ └── test-documents │ │ │ │ ├── jbig.pdf │ │ │ │ ├── scanned.pdf │ │ │ │ ├── sample_ol.pdf │ │ │ │ ├── chylde_harold.pdf │ │ │ │ ├── sample_table.pdf │ │ │ │ ├── text_on_white.pdf │ │ │ │ ├── transp_scanned.pdf │ │ │ │ ├── double_space_test.pdf │ │ │ │ ├── mixed_scanned_text.pdf │ │ │ │ └── industrial developing authority.pdf │ │ └── java │ │ │ └── com │ │ │ └── lexpredict │ │ │ └── tika │ │ │ ├── PdfContentTypeCheckerTest.java │ │ │ ├── ShallowCopyTest.java │ │ │ ├── PdfContentImagePreprocessorTest.java │ │ │ ├── AlterPDFParserTest.java │ │ │ └── TikaTest.java │ └── main │ │ └── java │ │ └── com │ │ └── lexpredict │ │ └── tika │ │ ├── TagData.java │ │ ├── AlterPDFParserConfig.java │ │ ├── ShallowCopy.java │ │ ├── FieldLookup.java │ │ ├── AlterXHTMLContentHandler.java │ │ ├── OCR2XHTML.java │ │ ├── PdfContentImagePreprocessor.java │ │ ├── HttpRequestParamsReader.java │ │ ├── PDFEncodedStringDecoder.java │ │ ├── PdfContentTypeChecker.java │ │ ├── PdfStripperProcessor.java │ │ ├── PDMetadataExtractor.java │ │ ├── XFAExtractor.java │ │ ├── LegacyPDFStreamEngine.java │ │ ├── AlterPDFParser.java │ │ ├── PDF2XHTML.java │ │ └── AbstractPDF2XHTML.java ├── .gitignore ├── debug.sh ├── example_tika.config ├── prepare_debug_env.sh ├── pom.xml.debug ├── pom.xml └── readme.txt ├── deployment-example ├── deploy.sh ├── docker-compose.yml └── tika-config.xml ├── tika-config.xml ├── Dockerfile ├── README.md └── LICENSE /.gitignore: -------------------------------------------------------------------------------- 1 | /.idea -------------------------------------------------------------------------------- /lexpredict-tika/setenv.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | TIKA_VERSION=1.20 4 | LEXPREDICT_TIKA_VERSION=1.0 5 | -------------------------------------------------------------------------------- 
/deployment-example/deploy.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | sudo -E docker stack deploy --compose-file docker-compose.yml lexpredict-tika-cluster 5 | -------------------------------------------------------------------------------- /lexpredict-tika/src/test/resources/test-documents/jbig.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LexPredict/tika-server/HEAD/lexpredict-tika/src/test/resources/test-documents/jbig.pdf -------------------------------------------------------------------------------- /lexpredict-tika/src/test/resources/test-documents/scanned.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LexPredict/tika-server/HEAD/lexpredict-tika/src/test/resources/test-documents/scanned.pdf -------------------------------------------------------------------------------- /lexpredict-tika/src/test/resources/test-documents/sample_ol.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LexPredict/tika-server/HEAD/lexpredict-tika/src/test/resources/test-documents/sample_ol.pdf -------------------------------------------------------------------------------- /lexpredict-tika/src/test/resources/test-documents/chylde_harold.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LexPredict/tika-server/HEAD/lexpredict-tika/src/test/resources/test-documents/chylde_harold.pdf -------------------------------------------------------------------------------- /lexpredict-tika/src/test/resources/test-documents/sample_table.pdf: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/LexPredict/tika-server/HEAD/lexpredict-tika/src/test/resources/test-documents/sample_table.pdf -------------------------------------------------------------------------------- /lexpredict-tika/src/test/resources/test-documents/text_on_white.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LexPredict/tika-server/HEAD/lexpredict-tika/src/test/resources/test-documents/text_on_white.pdf -------------------------------------------------------------------------------- /lexpredict-tika/src/test/resources/test-documents/transp_scanned.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LexPredict/tika-server/HEAD/lexpredict-tika/src/test/resources/test-documents/transp_scanned.pdf -------------------------------------------------------------------------------- /lexpredict-tika/src/test/resources/test-documents/double_space_test.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LexPredict/tika-server/HEAD/lexpredict-tika/src/test/resources/test-documents/double_space_test.pdf -------------------------------------------------------------------------------- /lexpredict-tika/src/test/resources/test-documents/mixed_scanned_text.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LexPredict/tika-server/HEAD/lexpredict-tika/src/test/resources/test-documents/mixed_scanned_text.pdf -------------------------------------------------------------------------------- /lexpredict-tika/src/test/resources/test-documents/industrial developing authority.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LexPredict/tika-server/HEAD/lexpredict-tika/src/test/resources/test-documents/industrial developing authority.pdf 
-------------------------------------------------------------------------------- /lexpredict-tika/.gitignore: -------------------------------------------------------------------------------- 1 | # Eclipse 2 | .classpath 3 | .project 4 | .settings/ 5 | 6 | # Intellij 7 | .idea/ 8 | *.iml 9 | *.iws 10 | 11 | # Mac 12 | .DS_Store 13 | 14 | # Maven 15 | log/ 16 | target/ 17 | 18 | 19 | debug/ 20 | 21 | tmp/ 22 | -------------------------------------------------------------------------------- /lexpredict-tika/debug.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | source setenv.sh 3 | 4 | java -agentlib:jdwp=transport=dt_socket,server=y,address=8001,suspend=n -cp "./debug/tika-server-${TIKA_VERSION}.jar:./target/lexpredict-tika-${LEXPREDICT_TIKA_VERSION}.jar:libs/*" org.apache.tika.server.TikaServerCli --config ../tika-config.xml -------------------------------------------------------------------------------- /deployment-example/docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: "3.3" 2 | services: 3 | 4 | tika: 5 | image: lexpredict/tika-server:latest 6 | ports: 7 | - 9999:9998 8 | configs: 9 | - source: tika_config_3 10 | target: /tika-config.xml 11 | networks: 12 | - net 13 | deploy: 14 | replicas: 3 15 | 16 | networks: 17 | net: 18 | 19 | configs: 20 | tika_config_3: 21 | file: ./tika-config.xml 22 | -------------------------------------------------------------------------------- /tika-config.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 6 | 7 | application/pdf 8 | 9 | 10 | 11 | 12 | application/pdf 13 | 14 | 15 | -------------------------------------------------------------------------------- /lexpredict-tika/src/test/java/com/lexpredict/tika/PdfContentTypeCheckerTest.java: -------------------------------------------------------------------------------- 1 | package com.lexpredict.tika; 2 | 3 | 
import org.junit.Test; 4 | import java.io.InputStream; 5 | import static org.junit.Assert.assertEquals; 6 | 7 | public class PdfContentTypeCheckerTest extends TikaTest { 8 | @Test 9 | public void testPdfTypeChecker() throws Exception { 10 | InputStream stream = AlterPDFParserTest.class.getResourceAsStream("/test-documents/scanned.pdf"); 11 | PdfContentTypeChecker checker = new PdfContentTypeChecker(); 12 | PdfContentTypeChecker.PdfContent docType = checker.determineDocContentType(stream); 13 | assertEquals(PdfContentTypeChecker.PdfContent.IMAGES, docType); 14 | } 15 | } 16 | -------------------------------------------------------------------------------- /lexpredict-tika/src/test/java/com/lexpredict/tika/ShallowCopyTest.java: -------------------------------------------------------------------------------- 1 | package com.lexpredict.tika; 2 | 3 | import org.apache.tika.parser.pdf.PDFParserConfig; 4 | import org.junit.Test; 5 | import static org.junit.Assert.assertTrue; 6 | 7 | public class ShallowCopyTest { 8 | @Test 9 | public void testPdfAlphaImageReplacing() throws Exception { 10 | PDFParserConfig cfg = new PDFParserConfig(); 11 | cfg.setExtractUniqueInlineImagesOnly(false); 12 | cfg.setOcrStrategy("OCR_ONLY"); 13 | 14 | PDFParserConfig cpy = new PDFParserConfig(); 15 | ShallowCopy.copyFields(cfg, cpy); 16 | assertTrue(cfg.getExtractUniqueInlineImagesOnly() == 17 | cpy.getExtractUniqueInlineImagesOnly()); 18 | assertTrue(cfg.getOcrStrategy() == 19 | cpy.getOcrStrategy()); 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /lexpredict-tika/src/main/java/com/lexpredict/tika/TagData.java: -------------------------------------------------------------------------------- 1 | package com.lexpredict.tika; 2 | 3 | public class TagData { 4 | public String tagName; 5 | public Boolean isCdata; 6 | public String attributeString; 7 | public StringBuilder data = new StringBuilder(); 8 | 9 | public TagData(String tagName, 10 | 
Boolean isCdata, 11 | String attributeString) { 12 | this.tagName = tagName; 13 | this.isCdata = isCdata; 14 | this.attributeString = attributeString; 15 | } 16 | 17 | @Override 18 | public String toString() { 19 | return "TagData{" + 20 | "tagName='" + tagName + '\'' + 21 | ", isCdata=" + isCdata + 22 | ", attributeString='" + attributeString + '\'' + 23 | ", data=" + data + 24 | '}'; 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /lexpredict-tika/src/main/java/com/lexpredict/tika/AlterPDFParserConfig.java: -------------------------------------------------------------------------------- 1 | package com.lexpredict.tika; 2 | 3 | import org.apache.tika.parser.pdf.PDFParserConfig; 4 | 5 | public class AlterPDFParserConfig{ 6 | public static void configureAlterPtf2Xhtml(PDFParserConfig config, PDF2XHTML pdf2XHTML) { 7 | pdf2XHTML.setSortByPosition(config.getSortByPosition()); 8 | if (config.getEnableAutoSpace()) { 9 | pdf2XHTML.setWordSeparator(" "); 10 | } else { 11 | pdf2XHTML.setWordSeparator(""); 12 | } 13 | if (config.getAverageCharTolerance() != null) { 14 | pdf2XHTML.setAverageCharTolerance(config.getAverageCharTolerance()); 15 | } 16 | if (config.getSpacingTolerance() != null) { 17 | pdf2XHTML.setSpacingTolerance(config.getSpacingTolerance()); 18 | } 19 | pdf2XHTML.setSuppressDuplicateOverlappingText(config.getSuppressDuplicateOverlappingText()); 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /lexpredict-tika/example_tika.config: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 7 | 8 | image/jpeg 9 | application/pdf 10 | 11 | 12 | 13 | 14 | application/pdf 15 | true 16 | false 17 | ocr_and_text_extraction 18 | 1 19 | 20 | 21 | -------------------------------------------------------------------------------- /deployment-example/tika-config.xml: 
-------------------------------------------------------------------------------- 1 | 2 | 22 | 23 | 25 | 26 | 31 | -------------------------------------------------------------------------------- /lexpredict-tika/prepare_debug_env.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | source setenv.sh 4 | TIKA_SERVER_URL=https://www.apache.org/dist/tika/tika-server-$TIKA_VERSION.jar 5 | 6 | 7 | mkdir -p ./debug 8 | pushd debug 9 | 10 | sudo apt-get install -y gpg curl gdal-bin openjdk-8-jre-headless 11 | 12 | sudo apt-get -y install \ 13 | tesseract-ocr \ 14 | tesseract-ocr-eng tesseract-ocr-ita tesseract-ocr-fra tesseract-ocr-spa tesseract-ocr-deu tesseract-ocr-rus \ 15 | && tesseract -v 16 | 17 | curl -sSL https://people.apache.org/keys/group/tika.asc -o /tmp/tika.asc \ 18 | && gpg --import /tmp/tika.asc \ 19 | && curl -sSL "$TIKA_SERVER_URL.asc" -o /tmp/tika-server-${TIKA_VERSION}.jar.asc \ 20 | && NEAREST_TIKA_SERVER_URL=$(curl -sSL http://www.apache.org/dyn/closer.cgi/${TIKA_SERVER_URL#https://www.apache.org/dist/}\?asjson\=1 \ 21 | | awk '/"path_info": / { pi=$2; }; /"preferred":/ { pref=$2; }; END { print pref " " pi; };' \ 22 | | sed -r -e 's/^"//; s/",$//; s/" "//') \ 23 | && echo "Nearest mirror: $NEAREST_TIKA_SERVER_URL" \ 24 | && wget "$NEAREST_TIKA_SERVER_URL" -O tika-server-${TIKA_VERSION}.jar 25 | 26 | 27 | 28 | 29 | popd 30 | -------------------------------------------------------------------------------- /lexpredict-tika/src/main/java/com/lexpredict/tika/ShallowCopy.java: -------------------------------------------------------------------------------- 1 | package com.lexpredict.tika; 2 | 3 | import java.lang.reflect.Field; 4 | 5 | public class ShallowCopy { 6 | public static void copyFields(Object from, Object to) { 7 | Field[] fields = from.getClass().getDeclaredFields(); 8 | for (Field field : fields) { 9 | try { 10 | Field fieldFrom = from.getClass().getDeclaredField(field.getName()); 11 | if 
(java.lang.reflect.Modifier.isStatic(fieldFrom.getModifiers())) 12 | continue; 13 | 14 | boolean wasAccessed = fieldFrom.isAccessible(); 15 | fieldFrom.setAccessible(true); 16 | Object value = fieldFrom.get(from); 17 | fieldFrom.setAccessible(wasAccessed); 18 | 19 | Field fieldTo = to.getClass().getDeclaredField(field.getName()); 20 | fieldTo.setAccessible(true); 21 | fieldTo.set(to, value); 22 | fieldTo.setAccessible(wasAccessed); 23 | 24 | } catch (IllegalAccessException | NoSuchFieldException e) { 25 | e.printStackTrace(); 26 | } 27 | } 28 | } 29 | } 30 | -------------------------------------------------------------------------------- /lexpredict-tika/src/test/java/com/lexpredict/tika/PdfContentImagePreprocessorTest.java: -------------------------------------------------------------------------------- 1 | package com.lexpredict.tika; 2 | 3 | import org.apache.pdfbox.pdmodel.PDDocument; 4 | import org.junit.Test; 5 | import java.io.InputStream; 6 | import static org.junit.Assert.assertFalse; 7 | import static org.junit.Assert.assertTrue; 8 | 9 | public class PdfContentImagePreprocessorTest extends TikaTest { 10 | @Test 11 | public void testPdfNonAlphaImageReplacing() throws Exception { 12 | InputStream stream = AlterPDFParserTest.class.getResourceAsStream("/test-documents/scanned.pdf"); 13 | PDDocument doc = PDDocument.load(stream); 14 | 15 | PdfContentImagePreprocessor preproc = new PdfContentImagePreprocessor(); 16 | boolean hasReplaced = preproc.removeImagesAlphaChannel(doc); 17 | assertFalse(hasReplaced); 18 | } 19 | 20 | @Test 21 | public void testPdfAlphaImageReplacing() throws Exception { 22 | InputStream stream = AlterPDFParserTest.class.getResourceAsStream("/test-documents/transp_scanned.pdf"); 23 | PDDocument doc = PDDocument.load(stream); 24 | 25 | PdfContentImagePreprocessor preproc = new PdfContentImagePreprocessor(); 26 | boolean hasReplaced = preproc.removeImagesAlphaChannel(doc); 27 | assertTrue(hasReplaced); 28 | } 29 | } 30 | 
-------------------------------------------------------------------------------- /lexpredict-tika/src/main/java/com/lexpredict/tika/FieldLookup.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | * 17 | * Modifications copyright (C) 2020 ContraxSuite, LLC 18 | */ 19 | 20 | package com.lexpredict.tika; 21 | 22 | import java.lang.reflect.Field; 23 | 24 | public class FieldLookup { 25 | 26 | // find field in passed class or one of his ancestors 27 | public static Object getFieldValue(Object obj, String fieldName) { 28 | Field f = findField(obj.getClass(), fieldName); 29 | if (f == null) 30 | return null; 31 | try { 32 | f.setAccessible(true); 33 | return f.get(obj); 34 | } catch (IllegalAccessException e) { 35 | return null; 36 | } 37 | } 38 | 39 | public static Field findField(Class cls, String fieldName) { 40 | while (true) { 41 | try { 42 | return cls.getDeclaredField(fieldName); 43 | } catch (NoSuchFieldException e) { 44 | // pass 45 | } 46 | cls = cls.getSuperclass(); 47 | if (cls == null) break; 48 | } 49 | return null; 50 | } 51 | } 52 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM ubuntu:18.04 2 | 3 | ENV TIKA_VERSION 1.24 4 | ENV TIKA_SERVER_URL https://www.apache.org/dist/tika/tika-server-$TIKA_VERSION.jar 5 | 6 | 7 | 8 | RUN apt-get -y --fix-missing update 9 | 10 | RUN apt-get install -y gpg curl gdal-bin openjdk-8-jre-headless 11 | 12 | RUN \ 13 | apt-get -y install \ 14 | tesseract-ocr \ 15 | tesseract-ocr-eng tesseract-ocr-ita tesseract-ocr-fra tesseract-ocr-spa tesseract-ocr-deu tesseract-ocr-rus \ 16 | && tesseract -v 17 | 18 | RUN \ 19 | curl -sSL https://people.apache.org/keys/group/tika.asc -o /tmp/tika.asc \ 20 | && gpg --import /tmp/tika.asc \ 21 | && curl -sSL "$TIKA_SERVER_URL.asc" -o /tmp/tika-server-${TIKA_VERSION}.jar.asc \ 22 | && NEAREST_TIKA_SERVER_URL=$(curl -sSL http://www.apache.org/dyn/closer.cgi/${TIKA_SERVER_URL#https://www.apache.org/dist/}\?asjson\=1 \ 23 | | awk '/"path_info": / { pi=$2; }; /"preferred":/ { pref=$2; }; END { print pref " " pi; };' \ 24 | | sed -r -e 's/^"//; 
s/",$//; s/" "//') \ 25 | && echo "Nearest mirror: $NEAREST_TIKA_SERVER_URL" \ 26 | && curl -sSL "$NEAREST_TIKA_SERVER_URL" -o /tika-server-${TIKA_VERSION}.jar 27 | 28 | 29 | RUN apt-get -y clean autoclean \ 30 | && apt-get -y autoremove \ 31 | && rm -rf /var/lib/{apt,dpkg,cache,log}/ 32 | 33 | # default Tika config - may be overriden by Docker Swarm config mounting 34 | COPY ./tika-config.xml /tika-config.xml 35 | COPY ./lexpredict-tika/target/lexpredict-tika-1.0.jar / 36 | RUN echo $(date) > /build.date 37 | 38 | EXPOSE 9998 39 | ENTRYPOINT echo "Tika Server Docker Image built $(cat /build.date)" \ 40 | && echo "Java Version:" \ 41 | && java -version \ 42 | && echo "Tesseract:" \ 43 | && tesseract -v \ 44 | && echo "Tika: ${TIKA_VERSION}" \ 45 | && echo "Config:" \ 46 | && cat /tika-config.xml \ 47 | && java -cp "tika-server-${TIKA_VERSION}.jar:lexpredict-tika-1.0.jar:libs/*" org.apache.tika.server.TikaServerCli --h 0.0.0.0 --port 9998 --config /tika-config.xml 48 | -------------------------------------------------------------------------------- /lexpredict-tika/pom.xml.debug: -------------------------------------------------------------------------------- 1 | 2 | 5 | 6 | lexpredict-tika 7 | com.lexpredict.tika 8 | 1.0 9 | 4.0.0 10 | 11 | 12 | 13 | org.apache.tika 14 | tika-app 15 | 1.23 16 | 17 | 18 | org.apache.tika 19 | tika-core 20 | 1.23 21 | 22 | 23 | commons-io 24 | commons-io 25 | 2.6 26 | 27 | 28 | org.apache.pdfbox 29 | pdfbox 30 | 2.0.13 31 | 32 | 33 | org.apache.tika 34 | tika-parsers 35 | 1.23 36 | 37 | 38 | org.apache.tika 39 | tika-server 40 | 1.23 41 | 42 | 43 | junit 44 | junit 45 | 4.12 46 | test 47 | 48 | 49 | 50 | 51 | 52 | 53 | org.apache.maven.plugins 54 | maven-compiler-plugin 55 | 3.6.0 56 | 57 | 1.8 58 | 1.8 59 | 60 | 61 | 62 | 63 | 64 | -------------------------------------------------------------------------------- /lexpredict-tika/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 5 | 6 | 
lexpredict-tika 7 | com.lexpredict.tika 8 | 1.0 9 | 4.0.0 10 | 11 | 12 | 13 | org.apache.tika 14 | tika-core 15 | 1.24 16 | 17 | 18 | org.apache.tika 19 | tika-app 20 | 1.24 21 | compile 22 | 23 | 24 | commons-io 25 | commons-io 26 | 2.6 27 | 28 | 29 | org.apache.pdfbox 30 | pdfbox 31 | 2.0.13 32 | 33 | 34 | org.apache.tika 35 | tika-parsers 36 | 1.24 37 | 38 | 39 | org.apache.tika 40 | tika-server 41 | 1.24 42 | 43 | 44 | junit 45 | junit 46 | 4.12 47 | test 48 | 49 | 50 | 51 | 52 | 53 | 54 | org.apache.maven.plugins 55 | maven-compiler-plugin 56 | 3.6.0 57 | 58 | 1.8 59 | 1.8 60 | 61 | 62 | 63 | 64 | 65 | -------------------------------------------------------------------------------- /lexpredict-tika/src/main/java/com/lexpredict/tika/AlterXHTMLContentHandler.java: -------------------------------------------------------------------------------- 1 | package com.lexpredict.tika; 2 | 3 | import org.apache.tika.metadata.Metadata; 4 | import org.apache.tika.sax.ToXMLContentHandler; 5 | import org.apache.tika.sax.XHTMLContentHandler; 6 | import org.xml.sax.ContentHandler; 7 | import org.xml.sax.SAXException; 8 | 9 | import java.lang.reflect.Field; 10 | import java.lang.reflect.InvocationTargetException; 11 | import java.lang.reflect.Method; 12 | 13 | class AlterXHTMLContentHandler extends XHTMLContentHandler { 14 | protected static final char[] emptyChar = new char[0]; 15 | 16 | protected ToXMLContentHandler decoratedHandler; 17 | 18 | protected Method charactersRawMethod = null; 19 | 20 | public AlterXHTMLContentHandler(ContentHandler handler, Metadata metadata) { 21 | super(handler, metadata); 22 | try { 23 | Class c = Class.forName("org.apache.tika.sax.ContentHandlerDecorator"); 24 | Field field = c.getDeclaredField("handler"); 25 | field.setAccessible(true); 26 | Object decoratedHandlerObj = field.get(this); 27 | 28 | if (decoratedHandlerObj instanceof ToXMLContentHandler) { 29 | // handlerClassName can also be TaggedContentHandler 30 | this.decoratedHandler = 
(ToXMLContentHandler)field.get(this); 31 | c = Class.forName("org.apache.tika.sax.ToXMLContentHandler"); 32 | this.charactersRawMethod = c.getDeclaredMethod("write", 33 | String.class); 34 | this.charactersRawMethod.setAccessible(true); 35 | } 36 | } catch (ClassNotFoundException | NoSuchMethodException | NoSuchFieldException | IllegalAccessException e) { 37 | e.printStackTrace(); 38 | } 39 | } 40 | // exposes the inherited isInvalid(char) check to callers 41 | public boolean isCharacterInvalid(char c) { 42 | return this.isInvalid(c); 43 | } 44 | // passes data to the decorated ToXMLContentHandler through its reflectively obtained write(String); falls back to characters(data) when that method is not available 45 | public void charactersRaw(String data) throws SAXException { 46 | if (this.charactersRawMethod == null) { 47 | super.characters(data); 48 | return; 49 | } 50 | // NOTE(review): the empty characters() call presumably flushes pending element output before the direct write - confirm 51 | super.characters(emptyChar, 0, 0); 52 | try { 53 | this.charactersRawMethod.invoke(this.decoratedHandler, data); 54 | } catch (InvocationTargetException e) { 55 | // write() itself failed: surface the handler's own SAXException instead of silently dropping the text 56 | if (e.getCause() instanceof SAXException) 57 | throw (SAXException) e.getCause(); 58 | throw new SAXException("Unable to write raw characters", e); 59 | } catch (IllegalAccessException e) { 60 | throw new SAXException("Unable to write raw characters", e); 61 | } 62 | } 63 | } 64 | -------------------------------------------------------------------------------- /lexpredict-tika/src/main/java/com/lexpredict/tika/OCR2XHTML.java: -------------------------------------------------------------------------------- 1 | package com.lexpredict.tika; 2 | import java.io.IOException; 3 | import java.io.Writer; 4 | import org.apache.commons.io.IOExceptionWithCause; 5 | import org.apache.pdfbox.pdmodel.PDDocument; 6 | import org.apache.pdfbox.pdmodel.PDPage; 7 | import org.apache.pdfbox.text.TextPosition; 8 | import org.apache.tika.exception.TikaException; 9 | import org.apache.tika.metadata.Metadata; 10 | import org.apache.tika.parser.ParseContext; 11 | import org.apache.tika.parser.pdf.PDFParserConfig; 12 | import org.xml.sax.ContentHandler; 13 | import org.xml.sax.SAXException; 14 | 15 | class OCR2XHTML extends AbstractPDF2XHTML { 16 | private OCR2XHTML(PDDocument document, ContentHandler handler, ParseContext context, 17 | Metadata metadata, PDFParserConfig config) throws IOException { 18 | super(document, handler, context, metadata, config); 19 | } 20 | 21 | public
static void process(PDDocument document, ContentHandler handler, 22 | ParseContext context, Metadata metadata, PDFParserConfig config) throws SAXException, TikaException { 23 | OCR2XHTML ocr2XHTML = null; 24 | 25 | try { 26 | ocr2XHTML = new OCR2XHTML(document, handler, context, metadata, config); 27 | ocr2XHTML.writeText(document, new Writer() { 28 | public void write(char[] cbuf, int off, int len) { 29 | } 30 | 31 | public void flush() { 32 | } 33 | 34 | public void close() { 35 | } 36 | }); 37 | } catch (IOException var7) { 38 | if (var7.getCause() instanceof SAXException) { 39 | throw (SAXException)var7.getCause(); 40 | } 41 | 42 | throw new TikaException("Unable to extract PDF content", var7); 43 | } 44 | 45 | if (ocr2XHTML.exceptions.size() > 0) { 46 | throw new TikaException("Unable to extract all PDF content", (Throwable)ocr2XHTML.exceptions.get(0)); 47 | } 48 | } 49 | 50 | public void processPage(PDPage pdPage) throws IOException { 51 | try { 52 | this.startPage(pdPage); 53 | this.doOCROnCurrentPage(); 54 | this.endPage(pdPage); 55 | } catch (SAXException | TikaException var3) { 56 | throw new IOExceptionWithCause(var3); 57 | } catch (IOException var4) { 58 | this.handleCatchableIOE(var4); 59 | } 60 | 61 | } 62 | 63 | protected void writeString(String text) throws IOException { 64 | } 65 | 66 | protected void writeCharacters(TextPosition text) throws IOException { 67 | } 68 | 69 | protected void writeWordSeparator() throws IOException { 70 | } 71 | 72 | protected void writeLineSeparator() throws IOException { 73 | } 74 | } 75 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # tika-server 2 | 3 | Configurable Apache Tika Server Docker Image with Tesseract 4. 4 | 5 | Contains additional PDF parser improvements to workaround problem with obsolete empty lines in PDF files caused by corrupted embedded fonts. 
6 | 7 | ## Contents 8 | - Apache Tika 1.24 9 | - Tesseract OCR 4 10 | - Tesseract Language Packs: English, Italian, French, Spanish, German, Russian 11 | 12 | Allows providing an external configuration file for Tika Server, e.g. for disabling OCR or any other need. 13 | 14 | ## Building 15 | 16 | ``` 17 | cd build 18 | ./build.sh script. 19 | ``` 20 | 21 | ## Running 22 | 23 | **Pulling lexpredict/tika-server:** 24 | ``` 25 | docker pull lexpredict/tika-server 26 | ``` 27 | 28 | 29 | 30 | **Simply running Tika Server with default config and publishing Tika port on the host machine:** 31 | ``` 32 | docker run -p 9998:9998 -it lexpredict/tika-server 33 | ``` 34 | 35 | **Running Tika Server with external configuration:** 36 | 1. Create a tika-config.xml file. 37 | The following example tika-config.xml can be used for disabling OCR: 38 | ``` 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | ``` 48 | 2. Run Tika server with this config file: 49 | ``` 50 | docker run -it -p 9998:9998 -v /home/user/tika-config.xml:/tika-config.xml lexpredict/tika-server 51 | ``` 52 | If running via sudo, make sure you provide the full path to the file on the host machine – otherwise the command fails with an error. 53 | 54 | 55 | **Running Tika Server cluster in Docker Swarm:** 56 | 1. We assume you already have a Docker Swarm cluster configured (docker swarm init) and some worker machines connected to it. 57 | 2. To deploy Tika we need a docker-compose.yml file (see /deployment-example dir): 58 | ``` 59 | version: "3.3" 60 | services: 61 | tika: 62 | image: lexpredict/tika-server:latest 63 | ports: 64 | - 9998:9998 65 | configs: 66 | - source: tika_config_3 67 | target: /tika-config.xml 68 | networks: 69 | - net 70 | deploy: 71 | replicas: 3 72 | 73 | networks: 74 | net: 75 | 76 | configs: 77 | tika_config_3: 78 | file: ./tika-config.xml 79 | 80 | ``` 81 | The configuration file (tika-config.xml) should be in the same directory as docker-compose.yml. 82 | 3. 
Deploying Tika to Docker Swarm: 83 | ``` 84 | docker stack deploy --compose-file docker-compose.yml tika-cluster 85 | ``` 86 | 87 | ## Workaround for fixing spurious empty lines in PDF documents having corrupted embedded fonts 88 | 89 | In some cases the current PDF text extraction routines from TIKA work incorrectly with PDF documents containing corrupted embedded fonts. The extracted text contains a spurious blank line after almost every line of normal text. 90 | 91 | It can be fixed by using the PDFTextStripper class from PDFBox, which was probably used in previous versions of TIKA. 92 | This workaround is not suitable for all cases because it provides worse results than TIKA's normal text extraction on good uncorrupted PDF documents. 93 | 94 | Normally, the TIKA instance configured in this Docker image processes PDFs as usual without using the old-style PDFTextStripper. 95 | To trigger processing the document with PDFTextStripper, add a header to the request: "pdf-parse:strip". 96 | -------------------------------------------------------------------------------- /lexpredict-tika/src/main/java/com/lexpredict/tika/PdfContentImagePreprocessor.java: -------------------------------------------------------------------------------- 1 | package com.lexpredict.tika; 2 | 3 | import org.apache.pdfbox.cos.COSName; 4 | import org.apache.pdfbox.pdmodel.PDDocument; 5 | import org.apache.pdfbox.pdmodel.PDPage; 6 | import org.apache.pdfbox.pdmodel.PDPageTree; 7 | import org.apache.pdfbox.pdmodel.PDResources; 8 | import org.apache.pdfbox.pdmodel.graphics.PDXObject; 9 | import org.apache.pdfbox.pdmodel.graphics.form.PDFormXObject; 10 | import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject; 11 | import javax.imageio.ImageIO; 12 | import java.awt.*; 13 | import java.awt.image.BufferedImage; 14 | import java.io.ByteArrayOutputStream; 15 | import java.io.IOException; 16 | 17 | // TODO: somehow we should determine image type from COSName 18 | // or PDImageXObject before saving it back to 
ByteArrayOutputStream in getImageBytes() 19 | 20 | // TODO: determine contrast background color in flattenImage() 21 | // replaces images that carry an alpha channel with flattened opaque copies 22 | public class PdfContentImagePreprocessor { 23 | private boolean imagesWereChanged; 24 | 25 | private PDDocument document; 26 | // scans the whole document; returns true when at least one image was replaced 27 | public boolean removeImagesAlphaChannel(PDDocument document) { 28 | this.document = document; 29 | imagesWereChanged = false; 30 | try { 31 | removeImagesAlphaChannelUnsafe(); 32 | return imagesWereChanged; 33 | } catch (Exception e) { 34 | return false; 35 | } 36 | } 37 | // walks every page; any exception is printed and ends the scan (best effort) 38 | private void removeImagesAlphaChannelUnsafe() { 39 | try { 40 | PDPageTree allPages = document.getDocumentCatalog().getPages(); 41 | for (int i = 0; i < allPages.getCount(); i++) { 42 | PDPage page = allPages.get(i); 43 | processImagesFromResources(page.getResources()); 44 | } 45 | } catch (Exception e) { 46 | e.printStackTrace(); 47 | } 48 | } 49 | // search for images in the given resources, descending into form XObjects; 50 | // opaque images are skipped, images with an alpha channel are replaced by a flattened copy 51 | private void processImagesFromResources(PDResources resources) throws IOException { 52 | if (resources == null) return; // getResources() may return null when no resource dictionary is present 53 | for (COSName xObjectName : resources.getXObjectNames()) { 54 | PDXObject xObject = resources.getXObject(xObjectName); 55 | if (xObject instanceof PDFormXObject) { 56 | processImagesFromResources(((PDFormXObject) xObject).getResources()); 57 | } else if (xObject instanceof PDImageXObject) { 58 | PDImageXObject img = (PDImageXObject) xObject; 59 | if (!img.getImage().getColorModel().hasAlpha()) 60 | continue; // opaque image: nothing to flatten, keep scanning the remaining XObjects 61 | 62 | PDImageXObject cpy = makeImageObjectCopy(img); 63 | resources.put(xObjectName, cpy); 64 | imagesWereChanged = true; 65 | } 66 | } 67 | } 68 | 69 | // load the image, "flatten" it and store it into bytes 70 | // then return new PDImageXObject from image's bytes 71 | private PDImageXObject makeImageObjectCopy(PDImageXObject img) throws IOException { 72 | BufferedImage flatImg = flattenImage(img.getImage()); 73 | byte[] bytes = getImageBytes(flatImg); 74 | return PDImageXObject.createFromByteArray(document, bytes, "image"); 75 | } 76 | 77 | 
// make a new BufferedImage drawn on a solid background 78 | private BufferedImage flattenImage(BufferedImage img) { 79 | BufferedImage copy = new BufferedImage(img.getWidth(), img.getHeight(), BufferedImage.TYPE_INT_RGB); 80 | Graphics2D g2d = copy.createGraphics(); 81 | g2d.setColor(Color.WHITE); 82 | g2d.fillRect(0, 0, copy.getWidth(), copy.getHeight()); 83 | g2d.drawImage(img, 0, 0, null); 84 | g2d.dispose(); 85 | return copy; 86 | } 87 | 88 | // serialize image as bytes 89 | private byte[] getImageBytes(BufferedImage img) throws IOException { 90 | ByteArrayOutputStream baos = new ByteArrayOutputStream(); 91 | ImageIO.write(img, "png", baos ); 92 | baos.flush(); 93 | byte[] imageInByte = baos.toByteArray(); 94 | baos.close(); 95 | return imageInByte; 96 | } 97 | } 98 | -------------------------------------------------------------------------------- /lexpredict-tika/readme.txt: -------------------------------------------------------------------------------- 1 | Release notes; 2 | 3 | - current version: 1.0 4 | 5 | 6 | 7 | 8 | 9 | 10 | - 1 - how to build 11 | 12 | Just run "mvn install" or "mvn install -DskipTests" command in the project directory (lexpredict-tika). Output file lexpredict-tika-.jar would be in lexpredict-tika/target/ folder. 13 | 14 | 15 | - 2 - how to use 16 | 17 | The resulted artifact (JAR) is a part of docker container. 
There, in docker, the jar file is passed on the classpath when starting the Tika server, like this: 18 | 19 | java -cp "tika-server-${TIKA_VERSION}.jar:lexpredict-tika-<version>.jar:libs/*" org.apache.tika.server.TikaServerCli --config tika.config 20 | 21 | 22 | - 3 - how to test locally: 23 | 24 | Suppose we have the following folders and files: 25 | documents/ 26 | source_doc.pdf 27 | parsed/ 28 | scripts/ 29 | tika-server-1.20.jar 30 | lexpredict-tika-1.0.jar 31 | tika.config 32 | first thing to do is to 33 | 34 | 35 | To debug processing a single file in an IDE: 36 | Main class: org.apache.tika.cli.TikaCLI 37 | Program arguments: --config=example_tika.config -J -t -eutf-8 tmp/your_file.pdf 38 | 39 | 40 | - 3.1 - run Tika server 41 | 42 | Currently we are in the "parsed" directory. Our Tika server has version 1.20, our plugin has version 1.0. 43 | Run the following command: 44 | java -cp 'tika-server-1.20.jar:lexpredict-tika-1.0.jar:libs/*' org.apache.tika.server.TikaServerCli --port 9999 --config tika.config 45 | 46 | We should see a number of lines in the output, like: 47 | INFO Starting Apache Tika 1.20 server 48 | ... 49 | INFO Using custom config: tika.config 50 | ... 51 | Started Apache Tika server at http://localhost:9999/ 52 | 53 | 54 | - 3.2 - parse a document 55 | 56 | Run command: 57 | curl -T documents/source_doc.pdf http://localhost:9999 -H pdf-parse:pdf_ocr > parsed/parsed_doc.zip 58 | Note the -H pdf-parse:pdf_ocr parameter. 59 | This parameter comes from the plugin. It can have one of three values: 60 | 1) "pdf_ocr" means that the plugin will decide which internal parser to use, PDF-2-TEXT or OCR, 61 | 2) "strip" means the same, but the "printed" text will be obtained by the PDFBox PDFTextStripper class 62 | 3) "default" means that the plugin will work as a standard PDFParser plugin 63 | 64 | Alternatively, you can set the LEXNLP_TIKA_PARSER_MODE environment variable instead of 65 | passing the "pdf-parse" request header.
66 | 67 | 68 | - 4.2 - source files 69 | 70 | 1) Directory lexpredict-tika/src/main/java/com/lexpredict/tika, files: 71 | 1.1) AlterPDFParser.java 72 | the plugin itself: a class derived from the standard PDFParser. 73 | 74 | 1.2) FieldLookup.java 75 | finds a field in the passed class or one of its ancestors. 76 | 77 | 1.3) HttpRequestParamsReader.java 78 | a class that captures the HTTP context of the request passed to the Tika server. Searches for the "pdf-parse" request parameter. 79 | 80 | 1.4) PdfContentImagePreprocessor.java 81 | this class "removes" the alpha channel from all images embedded in a PDDocument by drawing them on a solid color background. This prevents issues with parsing transparent images. 82 | 83 | 1.5) PdfContentTypeChecker.java 84 | this class determines the content of the PDDocument passed. The content is either "EMPTY", "TEXT", "IMAGES" or "MIXED" (text + images). When the content is "IMAGES" and "pdf-parse" is set to "pdf_ocr", the parser uses OCR document processing. 85 | 86 | 1.6) PdfStripperProcessor.java 87 | the class encapsulates PDFTextStripper functionality for writing text into the ContentHandler parameter. 88 | 89 | 1.7) ShallowCopy.java 90 | this class makes a shallow copy, setting the "to" object's properties equal to those of the "from" object. 91 | 92 | 93 | 2) Directory lexpredict-tika/src/test/java/com/lexpredict/tika 94 | 2.1) AlterPDFParserTest.java 95 | contains unit tests for AlterPDFParser: tests getting text from vector and scanned PDFs. 96 | 97 | 2.2, 2.3, 2.4) PdfContentImagePreprocessorTest.java, PdfContentTypeCheckerTest.java, ShallowCopyTest.java 98 | unit tests for the PdfContentImagePreprocessor, PdfContentTypeChecker and ShallowCopy classes 99 | 100 | File pom.xml.debug is a copy of pom.xml plus an extra dependency for debugging the parser with tika-app.
101 | 102 | -------------------------------------------------------------------------------- /lexpredict-tika/src/main/java/com/lexpredict/tika/HttpRequestParamsReader.java: --------------------------------------------------------------------------------
package com.lexpredict.tika;

import org.eclipse.jetty.http.HttpField;
import org.eclipse.jetty.http.HttpFields;
import org.eclipse.jetty.http.MetaData;
import javax.servlet.http.HttpServletRequest;
import java.io.InputStream;
import java.lang.reflect.Field;
import java.util.HashMap;

// flags that can be supplied with a request
enum CommonParseFlag
{
    VERBOSE, PDF_PARSE_METHOD;
}

// class reads HttpRequest params from InputStream
// if InputStream is from HttpRequest
public class HttpRequestParamsReader {
    public static final String PDF_PARSE_METHOD_STRIP = "strip";
    public static final String PDF_PARSE_METHOD_PDF_OCR = "pdf_ocr";
    public static final String PDF_PARSE_METHOD_PDF_PREFER_TEXT = "pdf_prefer_text";
    public static final String PDF_PARSE_METHOD_PDF_ONLY = "pdf_only";
    public static final String PDF_PARSE_METHOD_OCR_ONLY = "ocr_only";

    // every header of the request: name -> value
    public HashMap<String, String> rawParams = new HashMap<>();
    // the subset of headers recognised as parse flags: flag -> header value
    public HashMap<CommonParseFlag, String> typedParams = new HashMap<>();

    // header name -> flag it switches on
    private static final HashMap<String, CommonParseFlag> flagByName = new HashMap<>();

    static {
        flagByName.put("v", CommonParseFlag.VERBOSE);
        flagByName.put("-verbose", CommonParseFlag.VERBOSE);
        flagByName.put("pdf-parse", CommonParseFlag.PDF_PARSE_METHOD);
    }

    private static HttpRequestParamsReader single_instance = null;

    private boolean initialized = false;

    private HttpRequestParamsReader()
    {
    }

    // static method to create instance of Singleton class
    public static HttpRequestParamsReader getInstance()
    {
        if (single_instance == null)
            single_instance = new HttpRequestParamsReader();
        return single_instance;
    }

    /**
     * Reads the request headers reachable from the stream into
     * {@link #rawParams} and {@link #typedParams}. Only the first call has
     * any effect; later calls return immediately.
     *
     * @param stream the input stream handed to the parser
     */
    public void initialize(InputStream stream) {
        if (initialized)
            return;
        initialized = true;
        MetaData metaDict = getMetaDataField(stream);
        if (metaDict == null)
            return;

        HttpFields fields = metaDict.getFields();
        if (fields == null)
            return;
        for (HttpField field : fields)
            rawParams.put(field.getName(), field.getValue());
        GetCommonFlags();
    }

    public boolean IsVerbose() {
        return typedParams.containsKey(CommonParseFlag.VERBOSE);
    }

    public void outIfVerbose(String s) {
        if (!IsVerbose()) return;
        System.out.println(s);
    }

    // just check the value specified in the dictionary passed
    public boolean checkParamValue(CommonParseFlag ptrName, String expectedValue) {
        // a header may be present with a null value, so test the value rather than the key
        String actual = typedParams.get(ptrName);
        return actual != null && actual.equalsIgnoreCase(expectedValue);
    }

    // copies the recognised headers from rawParams into typedParams
    private void GetCommonFlags() {
        rawParams.forEach((headerName, headerValue) ->
            flagByName.forEach((flagName, flag) -> {
                // HTTP header names are case-insensitive
                if (flagName.equalsIgnoreCase(headerName))
                    typedParams.put(flag, headerValue);
            }));
    }

    // read metadata from HttpRequest: follow the chain of wrapped streams
    // (field "in") until an object holding the request (field "val$req") is found
    private static MetaData getMetaDataField(Object stream) {
        while (stream != null) {
            Class<?> streamClass = stream.getClass();
            try {
                Field reqField = FieldLookup.findField(streamClass, "val$req");
                if (reqField != null) {
                    reqField.setAccessible(true);
                    Object req = reqField.get(stream);
                    if (!(req instanceof HttpServletRequest))
                        return null;

                    Field metaField = FieldLookup.findField(req.getClass(), "_metaData");
                    if (metaField == null)
                        return null;

                    metaField.setAccessible(true);
                    Object meta = metaField.get(req);
                    return meta instanceof MetaData ? (MetaData) meta : null;
                }

                Field inField = FieldLookup.findField(streamClass, "in");
                if (inField == null)
                    return null;

                inField.setAccessible(true);
                stream = inField.get(stream);
            } catch (IllegalAccessException ex) {
                return null;
            }
        }
        // ran off the end of the wrapper chain without finding a request
        return null;
    }
}
-------------------------------------------------------------------------------- /lexpredict-tika/src/main/java/com/lexpredict/tika/PDFEncodedStringDecoder.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | * 17 | * Modifications copyright (C) 2020 ContraxSuite, LLC 18 | */ 19 | 20 | package com.lexpredict.tika; 21 | 22 | import static java.nio.charset.StandardCharsets.ISO_8859_1; 23 | 24 | import java.io.ByteArrayInputStream; 25 | import java.io.IOException; 26 | import java.io.InputStream; 27 | 28 | import org.apache.pdfbox.cos.COSString; 29 | import org.apache.pdfbox.io.RandomAccessBuffer; 30 | import org.apache.pdfbox.io.RandomAccessRead; 31 | import org.apache.pdfbox.pdfparser.COSParser; 32 | 33 | /** 34 | * In fairly rare cases, a PDF's XMP will contain a string that 35 | * has incorrectly been encoded with PDFEncoding: an octal for non-ascii and 36 | * ascii for ascii, e.g. "\376\377\000M\000i\000c\000r\000o\000s\000o\000f\000t\000" 37 | *

38 | * This class can be used to decode those strings. 39 | *

40 | * See TIKA-1678. Many thanks to Andrew Jackson for raising this issue 41 | * and Tilman Hausherr for the solution. 42 | *

43 | * As of this writing, we are only handling strings that start with 44 | * an encoded BOM. Andrew Jackson found a handful of other examples (e.g. 45 | * this ISO-8859-7 string: 46 | * "Microsoft Word - \\323\\365\\354\\354\\345\\364\\357\\367\\336 47 | * \\364\\347\\362 PRAKSIS \\363\\364\\357") 48 | * that we aren't currently handling. 49 | */ 50 | class PDFEncodedStringDecoder { 51 | 52 | private static final String[] PDF_ENCODING_BOMS = { 53 | "\\376\\377", //UTF-16BE 54 | "\\377\\376", //UTF-16LE 55 | "\\357\\273\\277"//UTF-8 56 | }; 57 | 58 | /** 59 | * Does this string contain an octal-encoded UTF BOM? 60 | * Call this statically to determine if you should bother creating a new parser to parse it. 61 | * @param s 62 | * @return 63 | */ 64 | static boolean shouldDecode(String s) { 65 | if (s == null || s.length() < 8) { 66 | return false; 67 | } 68 | for (String BOM : PDF_ENCODING_BOMS) { 69 | if (s.startsWith(BOM)) { 70 | return true; 71 | } 72 | } 73 | return false; 74 | } 75 | 76 | /** 77 | * This assumes that {@link #shouldDecode(String)} has been called 78 | * and has returned true. If you run this on a non-octal encoded string, 79 | * disaster will happen! 80 | * 81 | * @param value 82 | * @return 83 | */ 84 | String decode(String value) { 85 | try { 86 | byte[] bytes = new String("(" + value + ")").getBytes(ISO_8859_1); 87 | InputStream is = new ByteArrayInputStream(bytes); 88 | PDFEncodedStringDecoder.COSStringParser p = new PDFEncodedStringDecoder.COSStringParser(new RandomAccessBuffer(is)); 89 | String parsed = p.myParseCOSString(); 90 | if (parsed != null) { 91 | return parsed; 92 | } 93 | } catch (IOException e) { 94 | //oh well, we tried. 
95 | } 96 | //just return value if something went wrong 97 | return value; 98 | } 99 | 100 | class COSStringParser extends COSParser { 101 | 102 | COSStringParser(RandomAccessRead buffer) throws IOException { 103 | super(buffer); 104 | } 105 | 106 | /** 107 | * 108 | * @return parsed string or null if something went wrong. 109 | */ 110 | String myParseCOSString() { 111 | try { 112 | COSString cosString = parseCOSString(); 113 | if (cosString != null) { 114 | return cosString.getString(); 115 | } 116 | } catch (IOException e) { 117 | } 118 | return null; 119 | } 120 | } 121 | } 122 | -------------------------------------------------------------------------------- /lexpredict-tika/src/main/java/com/lexpredict/tika/PdfContentTypeChecker.java: -------------------------------------------------------------------------------- 1 | package com.lexpredict.tika; 2 | 3 | import org.apache.pdfbox.contentstream.operator.Operator; 4 | import org.apache.pdfbox.cos.COSName; 5 | import org.apache.pdfbox.pdfparser.PDFStreamParser; 6 | import org.apache.pdfbox.pdmodel.PDDocument; 7 | import org.apache.pdfbox.pdmodel.PDPage; 8 | import org.apache.pdfbox.pdmodel.PDPageTree; 9 | import org.apache.pdfbox.pdmodel.PDResources; 10 | import org.apache.pdfbox.pdmodel.graphics.PDXObject; 11 | import org.apache.pdfbox.pdmodel.graphics.form.PDFormXObject; 12 | import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject; 13 | import org.xml.sax.SAXException; 14 | 15 | import java.io.IOException; 16 | import java.io.InputStream; 17 | import java.util.List; 18 | 19 | 20 | // import org.apache.tika.parser.pdf.PDFParser; 21 | // class MyPDF2XHTML extends PDF2XHTML { 22 | 23 | 24 | 25 | // determine content of the PDDocument passed: 26 | // whether it contains text, images, text + images or just nothing 27 | public class PdfContentTypeChecker { 28 | public enum PdfContent { 29 | EMPTY, TEXT, IMAGES, MIXED, UNKNOWN 30 | } 31 | 32 | private PdfContent docContent = PdfContent.EMPTY; 33 | 34 | private 
int pageCount = 0; 35 | 36 | private int imagesCount = 0; 37 | 38 | private int textBlocks = 0; 39 | 40 | private int fullTextLength = 0; 41 | 42 | private PDFTextStripper pdfTextStripper; 43 | 44 | public int getImagesCount() { 45 | return imagesCount; 46 | } 47 | 48 | public int getTextBlocks() { 49 | return textBlocks; 50 | } 51 | 52 | // reads PDDocument from the stream and calls determineDocContentType 53 | public PdfContent determineDocContentType(InputStream stream) { 54 | try { 55 | PDDocument document = PDDocument.load(stream); 56 | return determineDocContentType(document); 57 | } catch (Exception e) { 58 | return PdfContent.UNKNOWN; 59 | } 60 | } 61 | 62 | public PdfContent determineDocContentType(PDDocument document) throws IOException { 63 | try { 64 | calculateObjectsInDocument(document); 65 | } catch (Exception e) { 66 | return PdfContent.UNKNOWN; 67 | } 68 | int totalCount = imagesCount + textBlocks; 69 | docContent = totalCount == 0 ? PdfContent.EMPTY 70 | : imagesCount > 0 && textBlocks > 0 ? PdfContent.MIXED 71 | : imagesCount > 0 ? 
PdfContent.IMAGES 72 | : PdfContent.TEXT; 73 | return docContent; 74 | } 75 | 76 | // calculate count of text blocks (textBlocks member) and 77 | // images (imagesCount) in the document 78 | private void calculateObjectsInDocument(PDDocument document) throws IOException { 79 | this.pdfTextStripper = new PDFTextStripper(); 80 | 81 | try { 82 | PDPageTree allPages = document.getDocumentCatalog().getPages(); 83 | this.pageCount = allPages.getCount(); 84 | for (int i = 0; i < allPages.getCount(); i++) { 85 | PDPage page = allPages.get(i); 86 | readObjectsOnPage(page); 87 | calculateTextLengthOnPage(document, i + 1); 88 | } 89 | } catch (Exception e) { 90 | e.printStackTrace(); 91 | } 92 | } 93 | 94 | // calculate objects' count for the page passed 95 | private void readObjectsOnPage(PDPage page) throws IOException { 96 | getImagesFromResources(page.getResources()); 97 | calculateTextObjectsOnPage(page); 98 | } 99 | 100 | 101 | private void calculateTextLengthOnPage(PDDocument doc, int pageNum1Based) throws IOException, SAXException { 102 | this.pdfTextStripper.setStartPage(pageNum1Based); 103 | this.pdfTextStripper.setEndPage(pageNum1Based); 104 | String text = this.pdfTextStripper.getText(doc); 105 | if (text != null) { 106 | text = text.trim().replaceAll("\\s+", " "); 107 | this.fullTextLength += text.length(); 108 | } 109 | } 110 | 111 | private void calculateTextObjectsOnPage(PDPage page) throws IOException { 112 | PDFStreamParser parser = new PDFStreamParser(page); 113 | parser.parse(); 114 | List pageTokens = parser.getTokens(); 115 | for (Object token : pageTokens) { 116 | if (token instanceof Operator) { 117 | String opName = ((Operator) token).getName(); 118 | if (opName.equals("BT")) // Begin Text 119 | textBlocks++; 120 | } 121 | } 122 | } 123 | 124 | private void getImagesFromResources(PDResources resources) throws IOException { 125 | for (COSName xObjectName : resources.getXObjectNames()) { 126 | PDXObject xObject = resources.getXObject(xObjectName); 127 | 
128 | if (xObject instanceof PDFormXObject) { 129 | getImagesFromResources(((PDFormXObject) xObject).getResources()); 130 | } else if (xObject instanceof PDImageXObject) { 131 | //((PDImageXObject) xObject).getImage(); 132 | imagesCount++; 133 | } 134 | } 135 | } 136 | } 137 | -------------------------------------------------------------------------------- /lexpredict-tika/src/main/java/com/lexpredict/tika/PdfStripperProcessor.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
*
* Modifications copyright (C) 2020 ContraxSuite, LLC
*/

package com.lexpredict.tika;

import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.tika.sax.SecureContentHandler;
import org.apache.tika.sax.ToTextContentHandler;
import org.apache.tika.sax.xpath.Matcher;
import org.apache.tika.sax.xpath.MatchingContentHandler;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import java.io.IOException;
import java.io.Writer;
import java.lang.reflect.Field;
import java.lang.reflect.InvocationTargetException;
import java.lang.reflect.Method;

// Class uses PDFBox text "stripping" methods
// instead of Tika's ones
// Sometimes PDFBox method format extracted text better than Tika
public class PdfStripperProcessor {
    /**
     * Extracts the document's text with PDFTextStripper and pushes it into
     * the given content handler.
     *
     * @param handler     the handler that receives the text
     * @param pdfDocument the document to read
     */
    public static void setTextUsingPDFTextStripper(ContentHandler handler, PDDocument pdfDocument)
            throws IOException, SAXException, NoSuchMethodException, InvocationTargetException,
            IllegalAccessException, NoSuchFieldException {
        PDFTextStripper stripper = new PDFTextStripper();
        String text = stripper.getText(pdfDocument);
        setContentHandlerCharacters(handler, text.toCharArray());
    }

    // delivers the characters through the most direct route the handler chain offers:
    // a ToTextContentHandler's writer, a MatchingContentHandler with matching bypassed,
    // or a plain characters() call
    private static void setContentHandlerCharacters(ContentHandler handler, char[] chars)
            throws SAXException, NoSuchMethodException, InvocationTargetException, IllegalAccessException,
            NoSuchFieldException, IOException {

        advanceSecureContentHandler(handler, chars.length);

        ContentHandler textHandler = getUnderlyingHandler(handler, ToTextContentHandler.class);
        if (textHandler instanceof ToTextContentHandler) {
            writeCharsToTextHandler((ToTextContentHandler) textHandler, chars);
            return;
        }

        ContentHandler matchHandler = getUnderlyingHandler(handler, MatchingContentHandler.class);
        if (matchHandler instanceof MatchingContentHandler) {
            setCharsBypassingMatching(handler, (MatchingContentHandler) matchHandler, chars);
            return;
        }

        handler.characters(chars, 0, chars.length);
    }

    // calls SecureContentHandler.advance(int) reflectively if such a handler is in the chain
    private static void advanceSecureContentHandler(ContentHandler handler, int bytesCount)
            throws IllegalAccessException, NoSuchMethodException, InvocationTargetException {
        ContentHandler secHandler = getUnderlyingHandler(handler, SecureContentHandler.class);
        if (!(secHandler instanceof SecureContentHandler))
            return;
        Method adMethod = SecureContentHandler.class.getDeclaredMethod("advance",
                int.class);
        adMethod.setAccessible(true);
        adMethod.invoke(secHandler, bytesCount);
    }

    // writes straight into the handler's "writer" field
    private static void writeCharsToTextHandler(ToTextContentHandler handler, char[] chars)
            throws IllegalAccessException, NoSuchFieldException, IOException {
        Field writerField = FieldLookup.findField(handler.getClass(), "writer");
        if (writerField == null)
            throw new NoSuchFieldException("writer");
        writerField.setAccessible(true);
        Writer writer = (Writer) writerField.get(handler);
        if (writer == null)
            throw new NoSuchFieldException("writer");
        writer.write(chars);
        // the writer belongs to the handler: push the text out but leave it open
        writer.flush();
    }

    // walks down the "handler" fields to the first handler whose class itself declares
    // characters(char[], int, int) and invokes that method
    private static void directlySetCharacters(ContentHandler handler, char[] chars)
            throws NoSuchMethodException, IllegalAccessException, InvocationTargetException {
        Method setter;
        while (true) {
            try {
                setter = handler.getClass().getDeclaredMethod("characters",
                        char[].class, int.class, int.class);
                break;
            } catch (NoSuchMethodException e) {
                // pass
            }
            Field handField = FieldLookup.findField(handler.getClass(), "handler");
            if (handField == null)
                throw new NoSuchMethodException("characters");
            handField.setAccessible(true);
            Object wrapped = handField.get(handler);
            if (!(wrapped instanceof ContentHandler))
                throw new NoSuchMethodException("characters");
            handler = (ContentHandler) wrapped;
        }
        setter.invoke(handler, chars, 0, chars.length);
    }

    // temporarily swaps the matcher for one that accepts all text, writes the
    // characters, then puts the original matcher back
    private static void setCharsBypassingMatching(ContentHandler handler, MatchingContentHandler matchHandler, char[] chars)
            throws IllegalAccessException, NoSuchFieldException,
            InvocationTargetException, NoSuchMethodException {
        Field matchField = MatchingContentHandler.class.getDeclaredField("matcher");
        matchField.setAccessible(true);
        Object oldMatcher = matchField.get(matchHandler);
        matchField.set(matchHandler, new TrueMatcher());
        try {
            directlySetCharacters(handler, chars);
        } finally {
            // restore even when the write throws, so the handler is not left matching everything
            matchField.set(matchHandler, oldMatcher);
        }
    }

    // follows the "handler" fields until a handler of exactly desiredClass is met;
    // returns the last handler reached when the chain ends first
    private static ContentHandler getUnderlyingHandler(ContentHandler handler, Class<?> desiredClass)
            throws IllegalAccessException {
        ContentHandler current = handler;
        while (current != null && current.getClass() != desiredClass) {
            Field handlerField = FieldLookup.findField(current.getClass(), "handler");
            if (handlerField == null)
                return current;

            handlerField.setAccessible(true);
            Object wrapped = handlerField.get(current);
            if (!(wrapped instanceof ContentHandler))
                return current;
            current = (ContentHandler) wrapped;
        }
        return current;
    }
}

// the class tells calling code that the passed content should be
// included in the output
class TrueMatcher extends Matcher {
    @Override
    public boolean matchesText() {
        return true;
    }
}
-------------------------------------------------------------------------------- /lexpredict-tika/src/test/java/com/lexpredict/tika/AlterPDFParserTest.java: -------------------------------------------------------------------------------- 1 | package com.lexpredict.tika; 2 | 3 | import org.apache.tika.parser.ParseContext; 4 | import org.apache.tika.parser.pdf.PDFParser; 5 | import org.apache.tika.parser.pdf.PDFParserConfig; 6 | import org.junit.Test; 7 | import java.io.InputStream; 8 | import java.lang.reflect.Field; 9 | import java.util.Collections; 10 | import java.util.HashMap; 11 | import java.util.Map; 12 | 13 | import static org.junit.Assert.*;
public class AlterPDFParserTest extends TikaTest {
    @Test
    public void testDoubleSpacedText() throws Exception {
        PDFParser pdfParser = new AlterPDFParser();
        ParseContext context = new ParseContext();
        PDFParserConfig config = new PDFParserConfig();
        context.set(PDFParserConfig.class, config);

        String text;
        try (InputStream stream = AlterPDFParserTest.class.getResourceAsStream("/test-documents/double_space_test.pdf")) {
            text = getText(stream, pdfParser, context);
        }

        assertTrue(text.length() > 100);
    }

    @Test
    public void testParseSimpleScannedText() throws Exception {
        String text = getTextFromDoc("/test-documents/text_on_white.pdf",
                AlterPDFParser.ParsePdfMode.PDF_OCR);
        assertTrue(text.length() > 50);
    }

    @Test
    public void testParseTransparentScannedText() throws Exception {
        String text = getTextFromDoc("/test-documents/transp_scanned.pdf",
                AlterPDFParser.ParsePdfMode.PDF_OCR);
        assertTrue(text.length() > 50);
    }

    private String getTextFromDoc(String docPath,
                                  AlterPDFParser.ParsePdfMode parseMode) throws Exception {
        return getTextFromDoc(docPath, parseMode, "text");
    }

    // parses the resource at docPath; outputFormat "text" yields plain text, anything else the XML
    private String getTextFromDoc(String docPath,
                                  AlterPDFParser.ParsePdfMode parseMode,
                                  String outputFormat) throws Exception {
        AlterPDFParser pdfParser = new AlterPDFParser();
        pdfParser.defaultParseMode = parseMode;
        ParseContext context = new ParseContext();
        PDFParserConfig config = new PDFParserConfig();
        context.set(PDFParserConfig.class, config);

        // the stream is closed even when parsing throws
        try (InputStream stream = AlterPDFParserTest.class.getResourceAsStream(docPath)) {
            if (outputFormat.equals("text")) {
                return getText(stream, pdfParser, context);
            }
            XMLResult rst = getXML(stream, pdfParser, context);
            return rst.xml;
        }
    }

    // runs getTextFromDoc with one environment variable overridden and puts the
    // previous value back afterwards, also when the parse fails
    private String getTextFromDocWithEnv(String varName, String varValue, String docPath,
                                         AlterPDFParser.ParsePdfMode parseMode,
                                         String outputFormat) throws Exception {
        String oldSysEnv = setEnvVar(varName, varValue);
        try {
            return getTextFromDoc(docPath, parseMode, outputFormat);
        } finally {
            setEnvVar(varName, oldSysEnv);
        }
    }

    @Test
    public void testParseXhtmlNoDetail() throws Exception {
        String text = getTextFromDoc("/test-documents/sample_table.pdf",
                AlterPDFParser.ParsePdfMode.PDF_ONLY, "xml");
        assertTrue(text.length() > 50);
    }

    @Test
    public void testParseJBig() throws Exception {
        String text = getTextFromDocWithEnv("LEXNLP_TIKA_PARSER_MODE", "ocr_only",
                "/test-documents/jbig.pdf",
                AlterPDFParser.ParsePdfMode.OCR_ONLY, "xml");
        assertTrue(text.length() > 50);
    }

    @Test
    public void testParseXhtmlCoordsEmbedded() throws Exception {
        String text = getTextFromDocWithEnv("LEXNLP_TIKA_XML_DETAIL", "coords_embedded",
                "/test-documents/industrial developing authority.pdf",
                AlterPDFParser.ParsePdfMode.PDF_ONLY, "xml");
        assertTrue(text.length() > 50);
    }

    @Test
    public void testParseXhtmlCoordsFlat() throws Exception {
        String text = getTextFromDocWithEnv("LEXNLP_TIKA_XML_DETAIL", "coords_flat",
                "/test-documents/industrial developing authority.pdf",
                AlterPDFParser.ParsePdfMode.PDF_ONLY, "xml");
        assertTrue(text.length() > 50);
    }

    @Test
    public void testParseXhtmlCsTextFlat() throws Exception {
        String text = getTextFromDocWithEnv("LEXNLP_TIKA_XML_DETAIL", "coords_text_flat",
                "/test-documents/double_space_test.pdf",
                AlterPDFParser.ParsePdfMode.PDF_ONLY, "xml");
        assertTrue(text.length() > 50);
    }

    @Test
    public void testParseToBraces() throws Exception {
        String text = getTextFromDocWithEnv("LEXNLP_TIKA_XML_DETAIL", "coords_text_flat",
                "/test-documents/chylde_harold.pdf",
                AlterPDFParser.ParsePdfMode.PDF_ONLY, "xml");
        assertTrue(text.length() > 50);
        assertTrue(text.indexOf("] ]") > 0);
    }

    @Test
    public void testParseNoDuplicates() throws Exception {
        String text = getTextFromDocWithEnv("LEXNLP_TIKA_PARSER_MODE", "pdf_prefer_text",
                "/test-documents/mixed_scanned_text.pdf",
                AlterPDFParser.ParsePdfMode.PDF_ONLY, "xml");
        assertTrue(text.length() > 50);
    }

    // sets one variable in the JVM's view of the environment and
    // returns its previous value ("" when it was not set)
    protected static String setEnvVar(String varName, String varValue) throws Exception {
        String oldSysEnv = System.getenv(varName);
        oldSysEnv = oldSysEnv == null ? "" : oldSysEnv;
        Map<String, String> update = new HashMap<>();
        update.put(varName, varValue);
        setEnv(update);
        return oldSysEnv;
    }

    // patches the JVM's cached environment through reflection.
    // NOTE: the fallback branch clears the backing map before storing newenv, so after
    // it runs System.getenv() shows only the entries of newenv. The statement order in
    // this method is kept exactly as it was.
    @SuppressWarnings("unchecked")
    protected static void setEnv(Map<String, String> newenv) throws Exception {
        try {
            Class<?> processEnvironmentClass = Class.forName("java.lang.ProcessEnvironment");
            Field theEnvironmentField = processEnvironmentClass.getDeclaredField("theEnvironment");
            theEnvironmentField.setAccessible(true);
            Map<String, String> env = (Map<String, String>) theEnvironmentField.get(null);
            env.putAll(newenv);
            Field theCaseInsensitiveEnvironmentField = processEnvironmentClass.getDeclaredField("theCaseInsensitiveEnvironment");
            theCaseInsensitiveEnvironmentField.setAccessible(true);
            Map<String, String> cienv = (Map<String, String>) theCaseInsensitiveEnvironmentField.get(null);
            cienv.putAll(newenv);
        } catch (NoSuchFieldException e) {
            Class<?>[] classes = Collections.class.getDeclaredClasses();
            Map<String, String> env = System.getenv();
            for (Class<?> cl : classes) {
                if ("java.util.Collections$UnmodifiableMap".equals(cl.getName())) {
                    Field field = cl.getDeclaredField("m");
                    field.setAccessible(true);
                    Object obj = field.get(env);
                    Map<String, String> map = (Map<String, String>) obj;
                    map.clear();
                    map.putAll(newenv);
                }
            }
        }
    }
}
-------------------------------------------------------------------------------- /lexpredict-tika/src/main/java/com/lexpredict/tika/PDMetadataExtractor.java: -------------------------------------------------------------------------------- 1 | package com.lexpredict.tika; 2 | 3 | import java.io.IOException; 4 | import java.io.InputStream; 5 | import java.util.Calendar; 6 | import java.util.List; 7 | import java.util.Locale; 8 | 9 | import org.apache.jempbox.xmp.XMPMetadata; 10 | import org.apache.jempbox.xmp.XMPSchema; 11 | import org.apache.jempbox.xmp.XMPSchemaDublinCore; 12 | import org.apache.jempbox.xmp.pdfa.XMPSchemaPDFAId; 13 | import org.apache.pdfbox.cos.COSArray; 14 | import org.apache.pdfbox.cos.COSBase; 15 | import org.apache.pdfbox.cos.COSDictionary; 16 | import org.apache.pdfbox.cos.COSString; 17 | import org.apache.pdfbox.pdmodel.common.PDMetadata; 18 | import org.apache.poi.util.IOUtils; 19 | import org.apache.tika.exception.TikaException; 20 | import org.apache.tika.extractor.EmbeddedDocumentUtil; 21 | import org.apache.tika.metadata.Metadata; 22 | import org.apache.tika.metadata.PDF; 23 | import org.apache.tika.metadata.Property; 24 | import org.apache.tika.metadata.TikaCoreProperties; 25 | import org.apache.tika.mime.MediaType; 26 | import org.apache.tika.parser.ParseContext; 27 | import org.apache.tika.parser.image.xmp.JempboxExtractor; 28 | import org.apache.tika.utils.XMLReaderUtils; 29 | import org.w3c.dom.Document; 30 | import org.xml.sax.SAXException; 31 | 32 | class PDMetadataExtractor { 33 | 34 | private static final MediaType MEDIA_TYPE = MediaType.application("pdf"); 35 | 36 | 37 | static void extract(PDMetadata pdMetadata, Metadata metadata, ParseContext context) { 38 | if (pdMetadata == null) { 39 | metadata.set(PDF.HAS_XMP, "false"); 40 | return; 41 | } 42 | //this file has XMP... 43 | //whether or not it is readable or throws an exception is another story...
44 | metadata.set(PDF.HAS_XMP, "true"); 45 | //now go for the XMP 46 | Document dom = loadDOM(pdMetadata, metadata, context); 47 | 48 | XMPMetadata xmp = null; 49 | if (dom != null) { 50 | xmp = new XMPMetadata(dom); 51 | } 52 | XMPSchemaDublinCore dcSchema = null; 53 | 54 | if (xmp != null) { 55 | try { 56 | dcSchema = xmp.getDublinCoreSchema(); 57 | } catch (IOException e) { 58 | } 59 | 60 | JempboxExtractor.extractXMPMM(xmp, metadata); 61 | } 62 | 63 | extractMultilingualItems(metadata, TikaCoreProperties.DESCRIPTION, null, dcSchema); 64 | extractDublinCoreListItems(metadata, TikaCoreProperties.CONTRIBUTOR, dcSchema); 65 | extractDublinCoreListItems(metadata, TikaCoreProperties.CREATOR, dcSchema); 66 | extractMultilingualItems(metadata, TikaCoreProperties.TITLE, null, dcSchema); 67 | 68 | try { 69 | if (xmp != null) { 70 | xmp.addXMLNSMapping(XMPSchemaPDFAId.NAMESPACE, XMPSchemaPDFAId.class); 71 | XMPSchemaPDFAId pdfaxmp = (XMPSchemaPDFAId) xmp.getSchemaByClass(XMPSchemaPDFAId.class); 72 | if (pdfaxmp != null) { 73 | if (pdfaxmp.getPart() != null) { 74 | metadata.set(PDF.PDFAID_PART, Integer.toString(pdfaxmp.getPart())); 75 | } 76 | if (pdfaxmp.getConformance() != null) { 77 | metadata.set(PDF.PDFAID_CONFORMANCE, pdfaxmp.getConformance()); 78 | String version = "A-" + pdfaxmp.getPart() + pdfaxmp.getConformance().toLowerCase(Locale.ROOT); 79 | metadata.set(PDF.PDFA_VERSION, version); 80 | metadata.add(TikaCoreProperties.FORMAT.getName(), 81 | MEDIA_TYPE.toString() + "; version=\"" + version + "\""); 82 | } 83 | } 84 | // TODO WARN if this XMP version is inconsistent with document header version? 85 | } 86 | } catch (IOException e) { 87 | metadata.set(TikaCoreProperties.TIKA_META_PREFIX + "pdf:metadata-xmp-parse-failed", "" + e); 88 | } 89 | } 90 | 91 | /** 92 | * As of this writing, XMPSchema can contain bags or sequence lists 93 | * for some attributes...despite standards documentation. 94 | * JempBox expects one or the other for specific attributes. 
95 | * Until more flexibility is added to JempBox, Tika will have to handle both. 96 | * 97 | * @param schema 98 | * @param name 99 | * @return list of values or null 100 | */ 101 | static List getXMPBagOrSeqList(XMPSchema schema, String name) { 102 | List ret = schema.getBagList(name); 103 | if (ret == null) { 104 | ret = schema.getSequenceList(name); 105 | } 106 | return ret; 107 | } 108 | 109 | /** 110 | * Try to extract all multilingual items from the XMPSchema 111 | *

112 | * This relies on the property having a valid xmp getName() 113 | *

114 | * For now, this only extracts the first language if the property does not allow multiple values (see TIKA-1295) 115 | * 116 | * @param metadata 117 | * @param property 118 | * @param pdfBoxBaseline 119 | * @param schema 120 | */ 121 | private static void extractMultilingualItems(Metadata metadata, Property property, 122 | String pdfBoxBaseline, XMPSchema schema) { 123 | //if schema is null, just go with pdfBoxBaseline 124 | if (schema == null) { 125 | if (pdfBoxBaseline != null && pdfBoxBaseline.length() > 0) { 126 | addMetadata(metadata, property, pdfBoxBaseline); 127 | } 128 | return; 129 | } 130 | 131 | for (String lang : schema.getLanguagePropertyLanguages(property.getName())) { 132 | String value = schema.getLanguageProperty(property.getName(), lang); 133 | 134 | if (value != null && value.length() > 0) { 135 | //if you're going to add it below in the baseline addition, don't add it now 136 | if (pdfBoxBaseline != null && value.equals(pdfBoxBaseline)) { 137 | continue; 138 | } 139 | addMetadata(metadata, property, value); 140 | if (!property.isMultiValuePermitted()) { 141 | return; 142 | } 143 | } 144 | } 145 | 146 | if (pdfBoxBaseline != null && pdfBoxBaseline.length() > 0) { 147 | //if we've already added something above and multivalue is not permitted 148 | //return. 149 | if (!property.isMultiValuePermitted()) { 150 | if (metadata.get(property) != null) { 151 | return; 152 | } 153 | } 154 | addMetadata(metadata, property, pdfBoxBaseline); 155 | } 156 | } 157 | 158 | 159 | /** 160 | * This tries to read a list from a particular property in 161 | * XMPSchemaDublinCore. 162 | *

163 | * Until PDFBOX-1803/TIKA-1233 are fixed, do not call this 164 | * on dates! 165 | *

166 | * This relies on the property having a DublinCore compliant getName() 167 | * 168 | * @param property 169 | * @param dc 170 | * @param metadata 171 | */ 172 | private static void extractDublinCoreListItems(Metadata metadata, Property property, XMPSchemaDublinCore dc) { 173 | //if no dc, add baseline and return 174 | if (dc == null) { 175 | return; 176 | } 177 | List items = getXMPBagOrSeqList(dc, property.getName()); 178 | if (items == null) { 179 | return; 180 | } 181 | for (String item : items) { 182 | addMetadata(metadata, property, item); 183 | } 184 | } 185 | 186 | 187 | static void addMetadata(Metadata metadata, Property property, String value) { 188 | if (value != null) { 189 | String decoded = decode(value); 190 | if (property.isMultiValuePermitted() || metadata.get(property) == null) { 191 | metadata.add(property, decoded); 192 | } 193 | //silently skip adding property that already exists if multiple values are not permitted 194 | } 195 | } 196 | 197 | static void addMetadata(Metadata metadata, String name, String value) { 198 | if (value != null) { 199 | metadata.add(name, decode(value)); 200 | } 201 | } 202 | 203 | static String decode(String value) { 204 | if (PDFEncodedStringDecoder.shouldDecode(value)) { 205 | PDFEncodedStringDecoder d = new PDFEncodedStringDecoder(); 206 | return d.decode(value); 207 | } 208 | return value; 209 | } 210 | 211 | //can return null! 
212 | private static Document loadDOM(PDMetadata pdMetadata, Metadata metadata, ParseContext context) { 213 | if (pdMetadata == null) { 214 | return null; 215 | } 216 | 217 | InputStream is = null; 218 | try { 219 | try { 220 | is = pdMetadata.exportXMPMetadata(); 221 | } catch (IOException e) { 222 | EmbeddedDocumentUtil.recordEmbeddedStreamException(e, metadata); 223 | return null; 224 | } 225 | return XMLReaderUtils.buildDOM(is, context); 226 | } catch (IOException| SAXException | TikaException e) { 227 | EmbeddedDocumentUtil.recordException(e, metadata); 228 | } finally { 229 | IOUtils.closeQuietly(is); 230 | } 231 | return null; 232 | 233 | } 234 | 235 | static void addMetadata(Metadata metadata, Property property, Calendar value) { 236 | if (value != null) { 237 | metadata.set(property, value); 238 | } 239 | } 240 | 241 | /** 242 | * Used when processing custom metadata entries, as PDFBox won't do 243 | * the conversion for us in the way it does for the standard ones 244 | */ 245 | static void addMetadata(Metadata metadata, String name, COSBase value) { 246 | if (value instanceof COSArray) { 247 | for (Object v : ((COSArray) value).toList()) { 248 | addMetadata(metadata, name, ((COSBase) v)); 249 | } 250 | } else if (value instanceof COSString) { 251 | addMetadata(metadata, name, ((COSString) value).getString()); 252 | } 253 | // Avoid calling COSDictionary#toString, since it can lead to infinite 254 | // recursion. See TIKA-1038 and PDFBOX-1835. 255 | else if (value != null && !(value instanceof COSDictionary)) { 256 | addMetadata(metadata, name, value.toString()); 257 | } 258 | } 259 | } 260 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 
8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. 
For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. 
Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. 
We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /lexpredict-tika/src/main/java/com/lexpredict/tika/XFAExtractor.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | * 17 | * Modifications copyright (C) 2020 ContraxSuite, LLC 18 | */ 19 | 20 |
package com.lexpredict.tika;

import javax.xml.namespace.QName;
import javax.xml.stream.XMLStreamConstants;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.XMLStreamReader;
import java.io.InputStream;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.sax.XHTMLContentHandler;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.AttributesImpl;


/**
 * Streams an XFA XML packet and writes its text plus a list of the form
 * fields and their values to an {@link XHTMLContentHandler}.
 * <p>
 * An instance keeps reusable {@link Matcher}s, so it must not be shared
 * between threads.
 */
class XFAExtractor {

    private static final Pattern XFA_TEMPLATE_ANY_VERSION = Pattern.compile("^http://www.xfa.org/schema/xfa-template");
    private static final Pattern TEXT_PATTERN =
            Pattern.compile("^(speak|text|contents-richtext|toolTip|exData)$");

    private static final String XFA_DATA_NS = "http://www.xfa.org/schema/xfa-data/1.0/";

    private static final String FIELD_LN = "field";
    private static final QName XFA_DATA = new QName(XFA_DATA_NS, "data");

    private final Matcher xfaTemplateMatcher;//namespace any version
    private final Matcher textMatcher;

    XFAExtractor() {
        xfaTemplateMatcher = XFA_TEMPLATE_ANY_VERSION.matcher("");
        textMatcher = TEXT_PATTERN.matcher("");
    }

    /**
     * Writes the content of the XFA stream into {@code xhtml}, wrapped in a
     * {@code <div class="xfa_content">}. Text-bearing elements are emitted as
     * paragraphs while streaming; if any template fields were seen, they are
     * appended as an ordered list inside a {@code <div class="xfa_form">}.
     *
     * @param xfaIs   the XFA XML; not closed by this method
     * @param xhtml   handler that receives the output
     * @param m       not used by this method
     * @param context supplies the XMLInputFactory
     * @throws XMLStreamException if the XFA XML cannot be read
     * @throws SAXException       if the handler rejects an event
     */
    void extract(InputStream xfaIs, XHTMLContentHandler xhtml, Metadata m, ParseContext context)
            throws XMLStreamException, SAXException {
        xhtml.startElement("div", "class", "xfa_content");

        //element local name -> text found under xfa:data
        Map<String, String> pdfObjRToValues = new HashMap<>();

        //for now, store and dump the fields in insertion order
        Map<String, XFAField> namedFields = new LinkedHashMap<>();

        //The strategy is to cache the fields in namedFields
        //and cache the values in pdfObjRToValues while
        //handling the text etc along the way.
        //
        //As a final step, dump the merged fields and the values.

        XMLStreamReader reader = context.getXMLInputFactory().createXMLStreamReader(xfaIs);
        try {
            while (reader.hasNext()) {
                if (reader.next() != XMLStreamConstants.START_ELEMENT) {
                    continue;
                }
                QName name = reader.getName();
                if (isTemplateField(name)) {
                    handleField(reader, namedFields);
                } else if (XFA_DATA.equals(name)) {//full qname match is important!
                    loadData(reader, pdfObjRToValues);
                } else if (textMatcher.reset(name.getLocalPart()).find()) {
                    scrapeTextUntil(reader, xhtml, name);
                }
            }
        } finally {
            closeQuietly(reader);
        }

        if (!namedFields.isEmpty()) {
            dumpFields(xhtml, namedFields, pdfObjRToValues);
        }
        //close the wrapper opened at the top; the element name is "div",
        //"xfa_content" is only its class attribute
        xhtml.endElement("div");
    }

    /** Writes the cached fields as {@code <li fieldName="...">label: value</li>} items. */
    private void dumpFields(XHTMLContentHandler xhtml, Map<String, XFAField> namedFields,
                            Map<String, String> pdfObjRToValues) throws SAXException {
        xhtml.startElement("div", "class", "xfa_form");
        xhtml.startElement("ol");
        for (Map.Entry<String, XFAField> e : namedFields.entrySet()) {
            String fieldName = e.getKey();
            XFAField field = e.getValue();
            String fieldValue = pdfObjRToValues.get(fieldName);

            AttributesImpl attrs = new AttributesImpl();
            attrs.addAttribute("", "fieldName", "fieldName", "CDATA", fieldName);

            //prefer the tool tip as label, fall back to the field name
            boolean hasToolTip = field.toolTip != null && field.toolTip.trim().length() > 0;
            String displayFieldName = hasToolTip ? field.toolTip : fieldName;

            xhtml.startElement("li", attrs);
            xhtml.characters(displayFieldName + ": " + (fieldValue == null ? "" : fieldValue));
            xhtml.endElement("li");
        }
        xhtml.endElement("ol");
        xhtml.endElement("div");
    }

    /** True for a {@code field} element in an xfa-template namespace of any version. */
    private boolean isTemplateField(QName name) {
        return xfaTemplateMatcher.reset(name.getNamespaceURI()).find()
                && FIELD_LN.equals(name.getLocalPart());
    }

    /** Appends the text of the current CHARACTERS/CDATA event to {@code buffer}. */
    private static void appendText(XMLStreamReader reader, StringBuilder buffer) {
        buffer.append(reader.getTextCharacters(), reader.getTextStart(), reader.getTextLength());
    }

    /** Closes the reader; a failure while closing is not worth failing the parse for. */
    private static void closeQuietly(XMLStreamReader reader) {
        try {
            reader.close();
        } catch (XMLStreamException ignored) {
            //all events we wanted have already been consumed (or a more
            //relevant exception is already on its way up)
        }
    }

    //try to scrape the text until the endElement;
    //every closed <p> and any non-blank remainder becomes a <p> in the output
    private void scrapeTextUntil(XMLStreamReader reader, XHTMLContentHandler xhtml,
                                 QName endElement) throws XMLStreamException, SAXException {
        StringBuilder buffer = new StringBuilder();
        boolean keepGoing = true;
        while (reader.hasNext() && keepGoing) {
            switch (reader.next()) {
                case XMLStreamConstants.CHARACTERS:
                case XMLStreamConstants.CDATA:
                    appendText(reader, buffer);
                    break;
                case XMLStreamConstants.END_ELEMENT:
                    if (reader.getName().equals(endElement)) {
                        keepGoing = false;
                    } else if ("p".equals(reader.getName().getLocalPart())) {
                        xhtml.element("p", buffer.toString());
                        buffer.setLength(0);
                    }
                    break;
                default:
                    break;
            }
        }
        String remainder = buffer.toString();
        if (remainder.trim().length() > 0) {
            xhtml.element("p", remainder);
        }
    }


    //same scraping, but returns the text (one "\n" per closed <p>)
    //instead of writing it to a handler
    private String scrapeTextUntil(XMLStreamReader reader, QName endElement) throws XMLStreamException {
        StringBuilder buffer = new StringBuilder();
        boolean keepGoing = true;
        while (reader.hasNext() && keepGoing) {
            switch (reader.next()) {
                case XMLStreamConstants.CHARACTERS:
                case XMLStreamConstants.CDATA:
                    appendText(reader, buffer);
                    break;
                case XMLStreamConstants.END_ELEMENT:
                    if (reader.getName().equals(endElement)) {
                        keepGoing = false;
                    } else if ("p".equals(reader.getName().getLocalPart())) {
                        buffer.append("\n");
                    }
                    break;
                default:
                    break;
            }
        }
        return buffer.toString();
    }

    private void loadData(XMLStreamReader reader, Map<String, String> pdfObjRToValues)
            throws XMLStreamException {
        //reader is at the "xfa:data" element
        //scrape the contents from the text containing nodes
        StringBuilder buffer = new StringBuilder();
        while (reader.hasNext()) {
            switch (reader.next()) {
                case XMLStreamConstants.CHARACTERS:
                case XMLStreamConstants.CDATA:
                    appendText(reader, buffer);
                    break;
                case XMLStreamConstants.END_ELEMENT:
                    //text collected so far is stored under the local name
                    //of the element that just closed
                    if (buffer.length() > 0) {
                        pdfObjRToValues.put(reader.getLocalName(), buffer.toString());
                        buffer.setLength(0);
                    }
                    if (XFA_DATA.equals(reader.getName())) {
                        return;
                    }
                    break;
                default:
                    break;
            }
        }
    }

    private void handleField(XMLStreamReader reader, Map<String, XFAField> fields) throws XMLStreamException {
        //reader is set to the field element
        String fieldName = findFirstAttributeValue(reader, "name");
        String pdfObjRef = "";
        String toolTip = "";
        while (reader.hasNext()) {
            switch (reader.next()) {
                case XMLStreamConstants.START_ELEMENT:
                    if ("toolTip".equals(reader.getName().getLocalPart())) {
                        toolTip = scrapeTextUntil(reader, reader.getName());
                    }
                    // add checkbutton, etc
                    break;
                case XMLStreamConstants.END_ELEMENT:
                    if (isTemplateField(reader.getName())) {
                        if (fieldName != null) {
                            fields.put(fieldName, new XFAExtractor.XFAField(fieldName, toolTip, pdfObjRef));
                        }
                        return;
                    }
                    break;
                case XMLStreamConstants.PROCESSING_INSTRUCTION:
                    if ("PDF_OBJR".equals(reader.getPITarget())) {
                        pdfObjRef = reader.getPIData();
                    }
                    break;
                default:
                    break;
            }
        }
    }

    //value of the first attribute whose local name matches, "" if there is none
    private String findFirstAttributeValue(XMLStreamReader reader, String name) {
        for (int i = 0; i < reader.getAttributeCount(); i++) {
            if (name.equals(reader.getAttributeLocalName(i))) {
                return reader.getAttributeValue(i);
            }
        }
        return "";
    }

    /**
     * A template field as collected by {@code handleField}. {@code value} is
     * never assigned in this class; field values are looked up separately.
     */
    static class XFAField {
        String fieldName;
        String toolTip;
        String pdfObjRef;
        String value;

        public XFAField(String fieldName, String toolTip, String pdfObjRef) {
            this.fieldName = fieldName;
            this.toolTip = toolTip;
            this.pdfObjRef = pdfObjRef;
        }

        @Override
        public String toString() {
            return "XFAField{" +
                    "fieldName='" + fieldName + '\'' +
                    ", toolTip='" + toolTip + '\'' +
                    ", pdfObjRef='" + pdfObjRef + '\'' +
                    ", value='" + value + '\'' +
                    '}';
        }
    }
}

-------------------------------------------------------------------------------- /lexpredict-tika/src/main/java/com/lexpredict/tika/LegacyPDFStreamEngine.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | * 17 | * Modifications copyright (C) 2020 ContraxSuite, LLC 18 | */ 19 | 20 | package com.lexpredict.tika; 21 | 22 | import org.apache.commons.logging.Log; 23 | import org.apache.commons.logging.LogFactory; 24 | import org.apache.fontbox.ttf.TrueTypeFont; 25 | import org.apache.fontbox.util.BoundingBox; 26 | import org.apache.pdfbox.contentstream.PDFStreamEngine; 27 | import org.apache.pdfbox.contentstream.operator.DrawObject; 28 | import org.apache.pdfbox.pdmodel.PDPage; 29 | import org.apache.pdfbox.pdmodel.common.PDRectangle; 30 | import org.apache.pdfbox.pdmodel.font.encoding.GlyphList; 31 | import org.apache.pdfbox.pdmodel.graphics.state.PDGraphicsState; 32 | import org.apache.pdfbox.text.TextPosition; 33 | import org.apache.pdfbox.util.Matrix; 34 | import org.apache.pdfbox.util.Vector; 35 | import java.io.IOException; 36 | import java.io.InputStream; 37 | import org.apache.pdfbox.pdmodel.font.PDCIDFont; 38 | import org.apache.pdfbox.pdmodel.font.PDCIDFontType2; 39 | import org.apache.pdfbox.pdmodel.font.PDFont; 40 | import org.apache.pdfbox.pdmodel.font.PDSimpleFont; 41 | import org.apache.pdfbox.pdmodel.font.PDTrueTypeFont; 42 | import org.apache.pdfbox.pdmodel.font.PDType0Font; 43 | import org.apache.pdfbox.pdmodel.font.PDType3Font; 44 | import org.apache.pdfbox.contentstream.operator.state.Concatenate; 45 | import org.apache.pdfbox.contentstream.operator.state.Restore; 46 | import org.apache.pdfbox.contentstream.operator.state.Save; 47 | import org.apache.pdfbox.contentstream.operator.state.SetGraphicsStateParameters; 48 | import org.apache.pdfbox.contentstream.operator.state.SetMatrix; 49 | import org.apache.pdfbox.contentstream.operator.text.BeginText; 50 | import org.apache.pdfbox.contentstream.operator.text.EndText; 51 | import org.apache.pdfbox.contentstream.operator.text.SetFontAndSize; 52 | import org.apache.pdfbox.contentstream.operator.text.SetTextHorizontalScaling; 53 | import org.apache.pdfbox.contentstream.operator.text.ShowTextAdjusted; 
54 | import org.apache.pdfbox.contentstream.operator.text.ShowTextLine; 55 | import org.apache.pdfbox.contentstream.operator.text.ShowTextLineAndSpace; 56 | import org.apache.pdfbox.contentstream.operator.text.MoveText; 57 | import org.apache.pdfbox.contentstream.operator.text.MoveTextSetLeading; 58 | import org.apache.pdfbox.contentstream.operator.text.NextLine; 59 | import org.apache.pdfbox.contentstream.operator.text.SetCharSpacing; 60 | import org.apache.pdfbox.contentstream.operator.text.SetTextLeading; 61 | import org.apache.pdfbox.contentstream.operator.text.SetTextRenderingMode; 62 | import org.apache.pdfbox.contentstream.operator.text.SetTextRise; 63 | import org.apache.pdfbox.contentstream.operator.text.SetWordSpacing; 64 | import org.apache.pdfbox.contentstream.operator.text.ShowText; 65 | import org.apache.pdfbox.pdmodel.font.PDFontDescriptor; 66 | 67 | class LegacyPDFStreamEngine extends PDFStreamEngine 68 | { 69 | private static final Log LOG = LogFactory.getLog(com.lexpredict.tika.LegacyPDFStreamEngine.class); 70 | 71 | private int pageRotation; 72 | private PDRectangle pageSize; 73 | private Matrix translateMatrix; 74 | private final GlyphList glyphList; 75 | 76 | /** 77 | * Constructor. 
78 | */ 79 | LegacyPDFStreamEngine() throws IOException 80 | { 81 | addOperator(new BeginText()); 82 | addOperator(new Concatenate()); 83 | addOperator(new DrawObject()); // special text version 84 | addOperator(new EndText()); 85 | addOperator(new SetGraphicsStateParameters()); 86 | addOperator(new Save()); 87 | addOperator(new Restore()); 88 | addOperator(new NextLine()); 89 | addOperator(new SetCharSpacing()); 90 | addOperator(new MoveText()); 91 | addOperator(new MoveTextSetLeading()); 92 | addOperator(new SetFontAndSize()); 93 | addOperator(new ShowText()); 94 | addOperator(new ShowTextAdjusted()); 95 | addOperator(new SetTextLeading()); 96 | addOperator(new SetMatrix()); 97 | addOperator(new SetTextRenderingMode()); 98 | addOperator(new SetTextRise()); 99 | addOperator(new SetWordSpacing()); 100 | addOperator(new SetTextHorizontalScaling()); 101 | addOperator(new ShowTextLine()); 102 | addOperator(new ShowTextLineAndSpace()); 103 | 104 | // load additional glyph list for Unicode mapping 105 | String path = "org/apache/pdfbox/resources/glyphlist/additional.txt"; 106 | InputStream input = GlyphList.class.getClassLoader().getResourceAsStream(path); 107 | glyphList = new GlyphList(GlyphList.getAdobeGlyphList(), input); 108 | } 109 | 110 | /** 111 | * This will initialize and process the contents of the stream. 112 | * 113 | * @param page the page to process 114 | * @throws java.io.IOException if there is an error accessing the stream. 
115 | */ 116 | @Override 117 | public void processPage(PDPage page) throws IOException 118 | { 119 | this.pageRotation = page.getRotation(); 120 | this.pageSize = page.getCropBox(); 121 | 122 | if (pageSize.getLowerLeftX() == 0 && pageSize.getLowerLeftY() == 0) 123 | { 124 | translateMatrix = null; 125 | } 126 | else 127 | { 128 | // translation matrix for cropbox 129 | translateMatrix = Matrix.getTranslateInstance(-pageSize.getLowerLeftX(), -pageSize.getLowerLeftY()); 130 | } 131 | super.processPage(page); 132 | } 133 | 134 | /** 135 | * This method was originally written by Ben Litchfield for PDFStreamEngine. 136 | */ 137 | @Override 138 | protected void showGlyph(Matrix textRenderingMatrix, PDFont font, int code, String unicode, 139 | Vector displacement) throws IOException 140 | { 141 | // 142 | // legacy calculations which were previously in PDFStreamEngine 143 | // 144 | // DO NOT USE THIS CODE UNLESS YOU ARE WORKING WITH PDFTextStripper. 145 | // THIS CODE IS DELIBERATELY INCORRECT 146 | // 147 | 148 | PDGraphicsState state = getGraphicsState(); 149 | Matrix ctm = state.getCurrentTransformationMatrix(); 150 | float fontSize = state.getTextState().getFontSize(); 151 | float horizontalScaling = state.getTextState().getHorizontalScaling() / 100f; 152 | Matrix textMatrix = getTextMatrix(); 153 | 154 | BoundingBox bbox = font.getBoundingBox(); 155 | if (bbox.getLowerLeftY() < Short.MIN_VALUE) 156 | { 157 | // PDFBOX-2158 and PDFBOX-3130 158 | // files by Salmat eSolutions / ClibPDF Library 159 | bbox.setLowerLeftY(- (bbox.getLowerLeftY() + 65536)); 160 | } 161 | // 1/2 the bbox is used as the height todo: why? 
162 | float glyphHeight = bbox.getHeight() / 2; 163 | 164 | // sometimes the bbox has very high values, but CapHeight is OK 165 | PDFontDescriptor fontDescriptor = font.getFontDescriptor(); 166 | if (fontDescriptor != null) 167 | { 168 | float capHeight = fontDescriptor.getCapHeight(); 169 | if (capHeight != 0 && (capHeight < glyphHeight || glyphHeight == 0)) 170 | { 171 | glyphHeight = capHeight; 172 | } 173 | } 174 | 175 | // transformPoint from glyph space -> text space 176 | float height; 177 | if (font instanceof PDType3Font) 178 | { 179 | height = font.getFontMatrix().transformPoint(0, glyphHeight).y; 180 | } 181 | else 182 | { 183 | height = glyphHeight / 1000; 184 | } 185 | 186 | float displacementX = displacement.getX(); 187 | // the sorting algorithm is based on the width of the character. As the displacement 188 | // for vertical characters doesn't provide any suitable value for it, we have to 189 | // calculate our own 190 | if (font.isVertical()) 191 | { 192 | displacementX = font.getWidth(code) / 1000; 193 | // there may be an additional scaling factor for true type fonts 194 | TrueTypeFont ttf = null; 195 | if (font instanceof PDTrueTypeFont) 196 | { 197 | ttf = ((PDTrueTypeFont)font).getTrueTypeFont(); 198 | } 199 | else if (font instanceof PDType0Font) 200 | { 201 | PDCIDFont cidFont = ((PDType0Font)font).getDescendantFont(); 202 | if (cidFont instanceof PDCIDFontType2) 203 | { 204 | ttf = ((PDCIDFontType2)cidFont).getTrueTypeFont(); 205 | } 206 | } 207 | if (ttf != null && ttf.getUnitsPerEm() != 1000) 208 | { 209 | displacementX *= 1000f / ttf.getUnitsPerEm(); 210 | } 211 | } 212 | 213 | // 214 | // legacy calculations which were previously in PDFStreamEngine 215 | // 216 | // DO NOT USE THIS CODE UNLESS YOU ARE WORKING WITH PDFTextStripper. 
217 | // THIS CODE IS DELIBERATELY INCORRECT 218 | // 219 | 220 | // (modified) combined displacement, this is calculated *without* taking the character 221 | // spacing and word spacing into account, due to legacy code in TextStripper 222 | float tx = displacementX * fontSize * horizontalScaling; 223 | float ty = displacement.getY() * fontSize; 224 | 225 | // (modified) combined displacement matrix 226 | Matrix td = Matrix.getTranslateInstance(tx, ty); 227 | 228 | // (modified) text rendering matrix 229 | Matrix nextTextRenderingMatrix = td.multiply(textMatrix).multiply(ctm); // text space -> device space 230 | float nextX = nextTextRenderingMatrix.getTranslateX(); 231 | float nextY = nextTextRenderingMatrix.getTranslateY(); 232 | 233 | // (modified) width and height calculations 234 | float dxDisplay = nextX - textRenderingMatrix.getTranslateX(); 235 | float dyDisplay = height * textRenderingMatrix.getScalingFactorY(); 236 | 237 | // 238 | // start of the original method 239 | // 240 | 241 | // Note on variable names. There are three different units being used in this code. 242 | // Character sizes are given in glyph units, text locations are initially given in text 243 | // units, and we want to save the data in display units. The variable names should end with 244 | // Text or Disp to represent if the values are in text or disp units (no glyph units are 245 | // saved). 
246 | 247 | float glyphSpaceToTextSpaceFactor = 1 / 1000f; 248 | if (font instanceof PDType3Font) 249 | { 250 | glyphSpaceToTextSpaceFactor = font.getFontMatrix().getScaleX(); 251 | } 252 | 253 | float spaceWidthText = 0; 254 | try 255 | { 256 | // to avoid crash as described in PDFBOX-614, see what the space displacement should be 257 | spaceWidthText = font.getSpaceWidth() * glyphSpaceToTextSpaceFactor; 258 | } 259 | catch (Throwable exception) 260 | { 261 | LOG.warn(exception, exception); 262 | } 263 | 264 | if (spaceWidthText == 0) 265 | { 266 | spaceWidthText = font.getAverageFontWidth() * glyphSpaceToTextSpaceFactor; 267 | // the average space width appears to be higher than necessary so make it smaller 268 | spaceWidthText *= .80f; 269 | } 270 | if (spaceWidthText == 0) 271 | { 272 | spaceWidthText = 1.0f; // if could not find font, use a generic value 273 | } 274 | 275 | // the space width has to be transformed into display units 276 | float spaceWidthDisplay = spaceWidthText * textRenderingMatrix.getScalingFactorX(); 277 | 278 | // use our additional glyph list for Unicode mapping 279 | unicode = font.toUnicode(code, glyphList); 280 | 281 | // when there is no Unicode mapping available, Acrobat simply coerces the character code 282 | // into Unicode, so we do the same. Subclasses of PDFStreamEngine don't necessarily want 283 | // this, which is why we leave it until this point in PDFTextStreamEngine. 284 | if (unicode == null) 285 | { 286 | if (font instanceof PDSimpleFont) 287 | { 288 | char c = (char) code; 289 | unicode = new String(new char[] { c }); 290 | } 291 | else 292 | { 293 | // Acrobat doesn't seem to coerce composite font's character codes, instead it 294 | // skips them. See the "allah2.pdf" TestTextStripper file. 
295 | return; 296 | } 297 | } 298 | 299 | // adjust for cropbox if needed 300 | Matrix translatedTextRenderingMatrix; 301 | if (translateMatrix == null) 302 | { 303 | translatedTextRenderingMatrix = textRenderingMatrix; 304 | } 305 | else 306 | { 307 | translatedTextRenderingMatrix = Matrix.concatenate(translateMatrix, textRenderingMatrix); 308 | nextX -= pageSize.getLowerLeftX(); 309 | nextY -= pageSize.getLowerLeftY(); 310 | } 311 | 312 | processTextPosition(new TextPosition(pageRotation, pageSize.getWidth(), 313 | pageSize.getHeight(), translatedTextRenderingMatrix, nextX, nextY, 314 | Math.abs(dyDisplay), dxDisplay, 315 | Math.abs(spaceWidthDisplay), unicode, new int[] { code } , font, fontSize, 316 | (int)(fontSize * textMatrix.getScalingFactorX()))); 317 | } 318 | 319 | /** 320 | * A method provided as an event interface to allow a subclass to perform some specific 321 | * functionality when text needs to be processed. 322 | * 323 | * @param text The text to be processed. 324 | */ 325 | protected void processTextPosition(TextPosition text) 326 | { 327 | // subclasses can override to provide specific functionality 328 | } 329 | } 330 | 331 | -------------------------------------------------------------------------------- /lexpredict-tika/src/test/java/com/lexpredict/tika/TikaTest.java: -------------------------------------------------------------------------------- 1 | package com.lexpredict.tika;/* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. 
You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertTrue;
import static org.junit.Assert.fail;

import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.net.URISyntaxException;
import java.net.URL;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

import org.apache.tika.extractor.EmbeddedResourceHandler;
import org.apache.tika.io.IOUtils;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.RecursiveParserWrapper;
import org.apache.tika.sax.BasicContentHandlerFactory;
import org.apache.tika.sax.BodyContentHandler;
import org.apache.tika.sax.RecursiveParserWrapperHandler;
import org.apache.tika.sax.ToXMLContentHandler;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;

/**
 * Parent class of Tika tests
 */
public abstract class TikaTest {
    /**
     * This method will give you back the filename incl. the absolute path name
     * to the resource. If the resource does not exist it will give you back the
     * resource name incl. the path.
     *
     * @param name
     *            The named resource to search for.
     * @return an absolute path incl. the name which is in the same directory as
     *         the class you've called it from.
     * @throws URISyntaxException if the resource URL cannot be converted to a URI
     */
    public File getResourceAsFile(String name) throws URISyntaxException {
        URL url = this.getClass().getResource(name);
        if (url != null) {
            return new File(url.toURI());
        }
        // The resource does not exist: resolve the name against the directory
        // of the calling class instead.
        URL base = this.getClass().getResource(".");
        if (base == null) {
            // without a base directory there is nothing to resolve against
            fail("Unable to find requested file " + name);
        }
        return new File(new File(base.toURI()), name);
    }

    /**
     * Opens a classpath resource relative to the class of this test; fails the
     * test when the resource cannot be found.
     *
     * @param name the resource name
     * @return the opened stream, never {@code null}
     */
    public InputStream getResourceAsStream(String name) {
        InputStream stream = this.getClass().getResourceAsStream(name);
        if (stream == null) {
            fail("Unable to find requested resource " + name);
        }
        return stream;
    }

    /**
     * Asserts that {@code needle} occurs exactly {@code targetCount} times in
     * {@code haystack}. Occurrences are searched from one character after the
     * previous hit, so overlapping occurrences are counted.
     */
    public static void assertContainsCount(String needle, String haystack, int targetCount) {
        int i = haystack.indexOf(needle);
        int count = 0;
        while (i > -1) {
            count++;
            i = haystack.indexOf(needle, i+1);
        }
        assertEquals("found "+count +" but should have found: "+targetCount,
                targetCount, count);
    }


    /** Asserts that {@code haystack} contains {@code needle} as a substring. */
    public static void assertContains(String needle, String haystack) {
        assertTrue(needle + " not found in:\n" + haystack, haystack.contains(needle));
    }

    /** Asserts that the collection {@code haystack} contains {@code needle}. */
    public static <T> void assertContains(T needle, Collection<? extends T> haystack) {
        assertTrue(needle + " not found in:\n" + haystack, haystack.contains(needle));
    }

    /** Asserts that {@code haystack} does not contain {@code needle} as a substring. */
    public static void assertNotContained(String needle, String haystack) {
        assertFalse(needle + " unexpectedly found in:\n" + haystack, haystack.contains(needle));
    }

    /** Asserts that the collection {@code haystack} does not contain {@code needle}. */
    public static <T> void assertNotContained(T needle, Collection<? extends T> haystack) {
        assertFalse(needle + " unexpectedly found in:\n" + haystack, haystack.contains(needle));
    }

    /**
     * Test that in at least one item in metadataList, all keys and values
     * in minExpected are contained.
     * <p>
     * The values in minExpected are tested for whether they are contained
     * within a value in the target. If minExpected=&quot;text/vbasic&quot; and
     * what was actually found in the target within metadatalist is
     * &quot;text/vbasic; charset=windows-1252&quot;,
     * that is counted as a hit.
     *
     * @param minExpected the keys and (partial) values that must be present
     * @param metadataList the metadata items to search
     */
    public static void assertContainsAtLeast(Metadata minExpected, List<Metadata> metadataList) {

        for (Metadata m : metadataList) {
            int foundPropertyCount = 0;
            for (String n : minExpected.names()) {
                // counts (found, expected) pairs where the found value contains the expected one
                int foundValCount = 0;
                for (String foundVal : m.getValues(n)) {
                    for (String expectedVal : minExpected.getValues(n)) {
                        if (foundVal.contains(expectedVal)) {
                            foundValCount++;
                        }
                    }
                }
                if (foundValCount == minExpected.getValues(n).length) {
                    foundPropertyCount++;
                }
            }
            if (foundPropertyCount == minExpected.names().length) {
                //found everything!
                return;
            }
        }
        //TODO: figure out how to have more informative error message
        fail("Couldn't find everything within a single metadata item");
    }

    /** Pairs the XML produced by a parse with the metadata filled in during it. */
    protected static class XMLResult {
        public final String xml;
        public final Metadata metadata;

        public XMLResult(String xml, Metadata metadata) {
            this.xml = xml;
            this.metadata = metadata;
        }
    }

    protected XMLResult getXML(String filePath, Parser parser, ParseContext context) throws Exception {
        return getXML(getResourceAsStream("/test-documents/" + filePath), parser, new Metadata(), context);
    }

    protected XMLResult getXML(String filePath, Parser parser, Metadata metadata) throws Exception {
        return getXML(getResourceAsStream("/test-documents/" + filePath), parser, metadata, null);
    }

    protected XMLResult getXML(String filePath, ParseContext parseContext) throws Exception {
        return getXML(filePath, new AutoDetectParser(), parseContext);
    }

    protected XMLResult getXML(String filePath, Metadata metadata, ParseContext parseContext) throws Exception {
        return getXML(getResourceAsStream("/test-documents/"+filePath), new AutoDetectParser(), metadata, parseContext);
    }

    protected XMLResult getXML(String filePath, Metadata metadata) throws Exception {
        return getXML(getResourceAsStream("/test-documents/" + filePath), new AutoDetectParser(), metadata, null);
    }

    protected XMLResult getXML(String filePath, Parser parser) throws Exception {
        Metadata metadata = new Metadata();
        metadata.set(Metadata.RESOURCE_NAME_KEY, filePath);
        return getXML(filePath, parser, metadata);
    }

    protected XMLResult getXML(String filePath) throws Exception {
        return getXML(getResourceAsStream("/test-documents/" + filePath), new AutoDetectParser(), new Metadata(), null);
    }

    protected XMLResult getXML(InputStream input, Parser parser, Metadata metadata) throws Exception {
        return getXML(input, parser, metadata, null);
    }

    /**
     * Parses {@code input} into XML with {@code parser} and closes the stream
     * afterwards. A {@code null} context is replaced by an empty one.
     */
    protected XMLResult getXML(InputStream input, Parser parser, Metadata metadata, ParseContext context) throws Exception {
        if (context == null) {
            context = new ParseContext();
        }

        try {
            ContentHandler handler = new ToXMLContentHandler();
            parser.parse(input, handler, metadata, context);
            return new XMLResult(handler.toString(), metadata);
        } finally {
            input.close();
        }
    }

    public XMLResult getXML(InputStream is, Parser parser, ParseContext context) throws Exception{
        return getXML(is, parser, new Metadata(), context);
    }

    protected List<Metadata> getRecursiveMetadata(String filePath) throws Exception {
        return getRecursiveMetadata(filePath, new ParseContext());
    }

    protected List<Metadata> getRecursiveMetadata(String filePath, Metadata metadata) throws Exception {
        return getRecursiveMetadata(filePath, new ParseContext(), metadata);
    }

    protected List<Metadata> getRecursiveMetadata(String filePath, ParseContext context, Metadata metadata) throws Exception {
        return parseRecursively(filePath, new AutoDetectParser(),
                BasicContentHandlerFactory.HANDLER_TYPE.XML, metadata, context);
    }

    protected List<Metadata> getRecursiveMetadata(String filePath, ParseContext context) throws Exception {
        return parseRecursively(filePath, new AutoDetectParser(),
                BasicContentHandlerFactory.HANDLER_TYPE.XML, new Metadata(), context);
    }

    protected List<Metadata> getRecursiveMetadata(String filePath, Parser parserToWrap) throws Exception {
        return getRecursiveMetadata(filePath, parserToWrap, BasicContentHandlerFactory.HANDLER_TYPE.XML);
    }

    protected List<Metadata> getRecursiveMetadata(String filePath, Parser parserToWrap, BasicContentHandlerFactory.HANDLER_TYPE handlerType) throws Exception {
        return parseRecursively(filePath, parserToWrap, handlerType, new Metadata(), new ParseContext());
    }

    protected List<Metadata> getRecursiveMetadata(String filePath, Parser parserToWrap, ParseContext parseContext) throws Exception {
        return parseRecursively(filePath, parserToWrap,
                BasicContentHandlerFactory.HANDLER_TYPE.XML, new Metadata(), parseContext);
    }

    /**
     * Shared implementation of the getRecursiveMetadata overloads: wraps the parser
     * in a RecursiveParserWrapper, parses the named test document and returns the
     * metadata list collected by the handler.
     */
    private List<Metadata> parseRecursively(String filePath, Parser parserToWrap,
                                            BasicContentHandlerFactory.HANDLER_TYPE handlerType,
                                            Metadata metadata, ParseContext context) throws Exception {
        RecursiveParserWrapper wrapper = new RecursiveParserWrapper(parserToWrap);
        RecursiveParserWrapperHandler handler = new RecursiveParserWrapperHandler(
                new BasicContentHandlerFactory(handlerType, -1));
        try (InputStream is = getResourceAsStream("/test-documents/" + filePath)) {
            wrapper.parse(is, handler, metadata, context);
        }
        return handler.getMetadataList();
    }


    /**
     * Basic text extraction.
     * <p>
     * Tries to close input stream after processing.
     */
    public String getText(InputStream is, Parser parser, ParseContext context, Metadata metadata) throws Exception{
        ContentHandler handler = new BodyContentHandler(1000000);
        try {
            parser.parse(is, handler, metadata, context);
        } finally {
            is.close();
        }
        return handler.toString();
    }

    /**
     * Text extraction through {@link OriginalBodyContentHandler}, which drops
     * ignorable whitespace. Closes the input stream after processing.
     */
    public String getTextWoDoublebreaks(InputStream is, Parser parser,
                                        ParseContext context, Metadata metadata) throws Exception{
        ContentHandler handler = new OriginalBodyContentHandler();
        try {
            parser.parse(is, handler, metadata, context);
        } finally {
            is.close();
        }
        return handler.toString();
    }

    public String getTextWoDoublebreaks(InputStream is, Parser parser, Metadata metadata) throws Exception{
        return getTextWoDoublebreaks(is, parser, new ParseContext(), metadata);
    }

    public String getText(InputStream is, Parser parser, Metadata metadata) throws Exception{
        return getText(is, parser, new ParseContext(), metadata);
    }

    public String getText(InputStream is, Parser parser, ParseContext context) throws Exception{
        return getText(is, parser, context, new Metadata());
    }

    public String getText(InputStream is, Parser parser) throws Exception{
        return getText(is, parser, new ParseContext(), new Metadata());
    }

    /**
     * Keeps track of media types and file names recursively.
     *
     */
    public static class TrackingHandler implements EmbeddedResourceHandler {
        public List<String> filenames = new ArrayList<>();
        public List<MediaType> mediaTypes = new ArrayList<>();

        // resources of these media types are not recorded
        private final Set<MediaType> skipTypes;

        public TrackingHandler() {
            skipTypes = new HashSet<>();
        }

        public TrackingHandler(Set<MediaType> skipTypes) {
            this.skipTypes = skipTypes;
        }

        @Override
        public void handle(String filename, MediaType mediaType,
                           InputStream stream) {
            if (skipTypes.contains(mediaType)) {
                return;
            }
            mediaTypes.add(mediaType);
            filenames.add(filename);
        }
    }

    /**
     * Copies byte[] of embedded documents into a List.
     */
    public static class ByteCopyingHandler implements EmbeddedResourceHandler {

        public List<byte[]> bytes = new ArrayList<>();

        @Override
        public void handle(String filename, MediaType mediaType,
                           InputStream stream) {
            ByteArrayOutputStream os = new ByteArrayOutputStream();
            if (! stream.markSupported()) {
                stream = TikaInputStream.get(stream);
            }
            stream.mark(0);
            try {
                IOUtils.copy(stream, os);
                bytes.add(os.toByteArray());
                stream.reset();
            } catch (IOException e) {
                //swallow
            }
        }
    }

    /** Body content handler that discards ignorable whitespace events. */
    public class OriginalBodyContentHandler extends BodyContentHandler {
        @Override
        public void ignorableWhitespace(char[] ch, int start, int length)
                throws SAXException {
            // Not writing extra new lines generated by XHTMLContentHandler.
        }
    }

    /** Prints every name/value pair of every metadata item, prefixed by the item index. */
    public static void debug(List<Metadata> list) {
        int i = 0;
        for (Metadata m : list) {
            for (String n : m.names()) {
                for (String v : m.getValues(n)) {
                    System.out.println(i + ": "+n + " : "+v);
                }
            }
            i++;
        }
    }

    /** Prints every name/value pair of the given metadata. */
    public static void debug(Metadata metadata) {
        for (String n : metadata.names()) {
            for (String v : metadata.getValues(n)) {
                System.out.println(n + " : "+v);
            }
        }
    }
}

-------------------------------------------------------------------------------- /lexpredict-tika/src/main/java/com/lexpredict/tika/AlterPDFParser.java: -------------------------------------------------------------------------------- 1 | package com.lexpredict.tika; 2 | 3 | import org.apache.commons.io.input.CloseShieldInputStream; 4 | import org.apache.pdfbox.io.MemoryUsageSetting; 5 | import org.apache.pdfbox.pdmodel.PDDocument; 6 | import org.apache.pdfbox.pdmodel.encryption.InvalidPasswordException; 7 | import org.apache.tika.exception.AccessPermissionException; 8 | import org.apache.tika.exception.EncryptedDocumentException; 9 | import org.apache.tika.exception.TikaException; 10 | import org.apache.tika.io.TikaInputStream; 11 | import org.apache.tika.metadata.Metadata; 12 | import org.apache.tika.metadata.PDF; 13 | import org.apache.tika.mime.MediaType; 14 | import org.apache.tika.parser.ParseContext; 15 | import org.apache.tika.parser.ocr.TesseractOCRConfig; 16 | import org.apache.tika.parser.ocr.TesseractOCRParser; 17 | import org.apache.tika.parser.pdf.*; 18 | import org.xml.sax.ContentHandler; 19 | import org.xml.sax.SAXException; 20 | 21 | import java.io.*; 22 | import java.lang.reflect.InvocationTargetException; 23 | import java.lang.reflect.Method; 24 | 25 | 26 | public class AlterPDFParser extends PDFParser { 27 | public enum ParsePdfMode { 28 | DEFAULT, PDF_OCR, PDF_ONLY, TEXT_STRIP, PREFER_TEXT, OCR_ONLY 29 | } 30 | 31 | // uses this value if it is not 
set in HttpRequest 32 | ParsePdfMode defaultParseMode = ParsePdfMode.PDF_OCR; 33 | 34 | // Metadata key for giving the document password to the parser 35 | private static final MediaType MEDIA_TYPE = MediaType.application("pdf"); 36 | 37 | // Serial version UID 38 | private static final long serialVersionUID = -752276948656079347L; 39 | 40 | private PDFParserConfig defaultConfig = new PDFParserConfig(); 41 | 42 | @Override 43 | public void parse( 44 | InputStream stream, ContentHandler handler, 45 | Metadata metadata, ParseContext context) 46 | throws IOException, SAXException, TikaException { 47 | 48 | HttpRequestParamsReader.getInstance().initialize(stream); 49 | HttpRequestParamsReader.getInstance().outIfVerbose("AlterPDFParser.parse()"); 50 | ParsePdfMode pdfParseMode = getParseMode(); 51 | 52 | PDFParserConfig sourceConfig = context.get(PDFParserConfig.class, defaultConfig); 53 | PDFParserConfig localConfig = makeConfigLocalCopy(sourceConfig); 54 | 55 | if (localConfig.getSetKCMS()) 56 | System.setProperty("sun.java2d.cmm", "sun.java2d.cmm.kcms.KcmsServiceProvider"); 57 | 58 | PDDocument pdfDocument = null; 59 | try { 60 | TikaInputStream tstream = TikaInputStream.cast(stream); 61 | String password = callGetPassword(metadata, context); 62 | MemoryUsageSetting memoryUsageSetting = MemoryUsageSetting.setupMainMemoryOnly(); 63 | if (localConfig.getMaxMainMemoryBytes() >= 0) { 64 | memoryUsageSetting = MemoryUsageSetting.setupMixed(localConfig.getMaxMainMemoryBytes()); 65 | } 66 | 67 | if (tstream != null && tstream.hasFile()) { 68 | // File based -- send file directly to PDFBox 69 | pdfDocument = PDDocument.load(tstream.getPath().toFile(), password, memoryUsageSetting); 70 | } else 71 | pdfDocument = PDDocument.load(new CloseShieldInputStream(stream), password, memoryUsageSetting); 72 | 73 | extractAndCheckMetadata(metadata, context, localConfig, pdfDocument); 74 | 75 | if (handler == null) 76 | return; 77 | 78 | // preprocess document 79 | 
//PdfContentImagePreprocessor preproc = new PdfContentImagePreprocessor(); 80 | //preproc.removeImagesAlphaChannel(pdfDocument); 81 | 82 | if (callShouldHandleXFAOnly(pdfDocument, localConfig)) { 83 | HttpRequestParamsReader.getInstance().outIfVerbose("AlterPDFParser.parse(callShouldHandleXFAOnly)"); 84 | callHandleXFAOnly(pdfDocument, handler, metadata, context); 85 | } else if (localConfig.getOcrStrategy().equals(PDFParserConfig.OCR_STRATEGY.OCR_ONLY)) { 86 | HttpRequestParamsReader.getInstance().outIfVerbose("AlterPDFParser.parse(OCR_ONLY)"); 87 | metadata.add("X-Parsed-By", TesseractOCRParser.class.toString()); 88 | callOCR2XHTMLProcess(pdfDocument, handler, context, metadata, localConfig); 89 | } else { 90 | // parse document by using PDFStripper 91 | if (pdfParseMode == ParsePdfMode.TEXT_STRIP) { 92 | HttpRequestParamsReader.getInstance().outIfVerbose("AlterPDFParser.parse(TEXT_STRIP)"); 93 | PdfStripperProcessor.setTextUsingPDFTextStripper(handler, pdfDocument); 94 | } 95 | // just PDF parsing 96 | else if (pdfParseMode == ParsePdfMode.PDF_ONLY) { 97 | HttpRequestParamsReader.getInstance().outIfVerbose("AlterPDFParser.parse(PDF_ONLY)"); 98 | callPDF2XHTMLProcess(pdfDocument, handler, context, metadata, localConfig, true); 99 | } 100 | // smart parsing: PDF or OCR 101 | else if (pdfParseMode == ParsePdfMode.PDF_OCR || 102 | pdfParseMode == ParsePdfMode.PREFER_TEXT) { 103 | HttpRequestParamsReader.getInstance().outIfVerbose("AlterPDFParser.parse(PDF_OCR)"); 104 | PdfContentTypeChecker checker = new PdfContentTypeChecker(); 105 | PdfContentTypeChecker.PdfContent docType = checker.determineDocContentType(pdfDocument); 106 | HttpRequestParamsReader.getInstance().outIfVerbose("detected doc type: " + docType.toString()); 107 | 108 | if (docType == PdfContentTypeChecker.PdfContent.TEXT || 109 | (docType != PdfContentTypeChecker.PdfContent.IMAGES && pdfParseMode == ParsePdfMode.PREFER_TEXT)) 110 | callPDF2XHTMLProcess(pdfDocument, handler, context, metadata, 
localConfig, false); 111 | else { 112 | metadata.add("X-Parsed-By", TesseractOCRParser.class.toString()); 113 | callOCR2XHTMLProcess(pdfDocument, handler, context, metadata, localConfig); 114 | } 115 | } 116 | else if (pdfParseMode == ParsePdfMode.OCR_ONLY) { 117 | metadata.add("X-Parsed-By", TesseractOCRParser.class.toString()); 118 | callOCR2XHTMLProcess(pdfDocument, handler, context, metadata, localConfig); 119 | } else { // ... or parse it default Tika-way 120 | HttpRequestParamsReader.getInstance().outIfVerbose("AlterPDFParser.parse(callPDF2XHTMLProcess)"); 121 | callPDF2XHTMLProcess(pdfDocument, handler, context, metadata, localConfig, false); 122 | } 123 | } 124 | 125 | } catch (InvalidPasswordException e) { 126 | metadata.set(PDF.IS_ENCRYPTED, "true"); 127 | throw new EncryptedDocumentException(e); 128 | } catch (NoSuchMethodException | IllegalAccessException | InvocationTargetException | 129 | NoSuchFieldException | ClassNotFoundException | IOException e) { 130 | e.printStackTrace(); 131 | } // see e.getCause() 132 | finally { 133 | if (pdfDocument != null) { 134 | pdfDocument.close(); 135 | } 136 | } 137 | } 138 | 139 | // method determines what parsing strategy to use 140 | // from HTTPRequest or the default variable value 141 | private ParsePdfMode getParseMode() { 142 | String parseMode = HttpRequestParamsReader.getInstance().typedParams.get(CommonParseFlag.PDF_PARSE_METHOD); 143 | if (parseMode == null || parseMode.length() == 0) 144 | parseMode = System.getenv("LEXNLP_TIKA_PARSER_MODE"); 145 | if (parseMode == null || parseMode.length() == 0) 146 | return defaultParseMode; 147 | 148 | if (parseMode.equals(HttpRequestParamsReader.PDF_PARSE_METHOD_STRIP)) 149 | return ParsePdfMode.TEXT_STRIP; 150 | if (parseMode.equals(HttpRequestParamsReader.PDF_PARSE_METHOD_PDF_OCR)) 151 | return ParsePdfMode.PDF_OCR; 152 | if (parseMode.equals(HttpRequestParamsReader.PDF_PARSE_METHOD_PDF_ONLY)) 153 | return ParsePdfMode.PDF_ONLY; 154 | if 
(parseMode.equals(HttpRequestParamsReader.PDF_PARSE_METHOD_OCR_ONLY)) 155 | return ParsePdfMode.OCR_ONLY; 156 | if (parseMode.equals(HttpRequestParamsReader.PDF_PARSE_METHOD_PDF_PREFER_TEXT)) 157 | return ParsePdfMode.PREFER_TEXT; 158 | 159 | return defaultParseMode; 160 | } 161 | 162 | // extract doc's metadata and check whether it is accessible 163 | private void extractAndCheckMetadata(Metadata metadata, ParseContext context, PDFParserConfig localConfig, PDDocument pdfDocument) 164 | throws NoSuchMethodException, IllegalAccessException, InvocationTargetException, AccessPermissionException { 165 | metadata.set(PDF.IS_ENCRYPTED, Boolean.toString(pdfDocument.isEncrypted())); 166 | metadata.set(Metadata.CONTENT_TYPE, MEDIA_TYPE.toString()); 167 | callExtractMetadata(pdfDocument, metadata, context); 168 | 169 | AccessChecker checker = localConfig.getAccessChecker(); 170 | checker.check(metadata); 171 | } 172 | 173 | // process PDF as a printed (vector) document 174 | // uses standard Tika's PDF2XHTML class by reflection 175 | // because this class is private (package restricted) and I don't 176 | // want to copy the class's code and a bunch of dependent modules into plugin 177 | private void callPDF2XHTMLProcess(PDDocument document, ContentHandler handler, 178 | ParseContext context, Metadata metadata, 179 | PDFParserConfig config, 180 | boolean noOcr) throws 181 | TikaException, SAXException { 182 | // noOcr ptr is ignored in current implementation 183 | PDFParserConfig.OCR_STRATEGY oldOcrStrategy = config.getOcrStrategy(); 184 | config.setOcrStrategy(PDFParserConfig.OCR_STRATEGY.NO_OCR); 185 | PDF2XHTML.process(document, handler, context, metadata, config); 186 | config.setOcrStrategy(oldOcrStrategy); 187 | } 188 | 189 | // process PDF as a scanned image set 190 | // again uses reflection 191 | private void callOCR2XHTMLProcess(PDDocument document, ContentHandler handler, 192 | ParseContext context, Metadata metadata, 193 | PDFParserConfig config) throws 194 | 
ClassNotFoundException, NoSuchMethodException, InvocationTargetException, IllegalAccessException, 195 | TikaException, SAXException { 196 | TesseractOCRConfig cfg = buildTesseractOCRConfig(config); 197 | context.set(TesseractOCRConfig.class, cfg); 198 | 199 | PDFParserConfig.OCR_STRATEGY oldOcrStrategy = config.getOcrStrategy(); 200 | boolean oldExtractInlineImages = config.getExtractInlineImages(); 201 | boolean oldExtractUniqueInlineImagesOnly = config.getExtractUniqueInlineImagesOnly(); 202 | 203 | // explicitly tells Tika to use OCR 204 | config.setOcrStrategy(PDFParserConfig.OCR_STRATEGY.OCR_ONLY); 205 | config.setExtractInlineImages(true); 206 | config.setExtractUniqueInlineImagesOnly(false); 207 | 208 | Class c = Class.forName("org.apache.tika.parser.pdf.OCR2XHTML"); 209 | Method m = c.getDeclaredMethod("process", 210 | PDDocument.class, ContentHandler.class, ParseContext.class, Metadata.class, 211 | PDFParserConfig.class); 212 | m.setAccessible(true); 213 | m.invoke(null, document, handler, context, metadata, config); 214 | 215 | config.setOcrStrategy(oldOcrStrategy); 216 | config.setExtractInlineImages(oldExtractInlineImages); 217 | config.setExtractUniqueInlineImagesOnly(oldExtractUniqueInlineImagesOnly); 218 | } 219 | 220 | private TesseractOCRConfig buildTesseractOCRConfig(PDFParserConfig config) 221 | { 222 | TesseractOCRConfig cfg = new TesseractOCRConfig(); 223 | // here I set default timeout of 2 hours 224 | // The calling process should check parsing process and terminate it by timeout 225 | cfg.setTimeout(60 * 60 * 2); 226 | return cfg; 227 | } 228 | 229 | // check whether the method should read XFA (forms) only 230 | private boolean callShouldHandleXFAOnly(PDDocument pdDocument, PDFParserConfig config) 231 | throws InvocationTargetException, IllegalAccessException { 232 | boolean xfa = this.checkDocHasXFA(pdDocument); 233 | Method m; 234 | try { 235 | m = getClass().getSuperclass().getDeclaredMethod("shouldHandleXFAOnly", 236 | boolean.class, 
PDFParserConfig.class); 237 | } catch (NoSuchMethodException e) { 238 | return false; 239 | } 240 | m.setAccessible(true); 241 | return (boolean) m.invoke(this, xfa, config); 242 | } 243 | 244 | private boolean checkDocHasXFA(PDDocument pdDocument) 245 | throws InvocationTargetException, IllegalAccessException { 246 | Method m; 247 | try { 248 | m = getClass().getSuperclass().getDeclaredMethod("hasXFA", 249 | PDDocument.class); 250 | } 251 | catch (NoSuchMethodException e) { 252 | return false; 253 | } 254 | m.setAccessible(true); 255 | return (boolean) m.invoke(this, pdDocument); 256 | } 257 | 258 | // read XFA forms' content 259 | private void callHandleXFAOnly(PDDocument pdDocument, ContentHandler handler, 260 | Metadata metadata, ParseContext context) 261 | throws NoSuchMethodException, InvocationTargetException, IllegalAccessException { 262 | Method m = getClass().getSuperclass().getDeclaredMethod("handleXFAOnly", 263 | PDDocument.class, ContentHandler.class, Metadata.class, ParseContext.class); 264 | m.setAccessible(true); 265 | m.invoke(this, pdDocument, handler, metadata, context); 266 | } 267 | 268 | // uses reflection, again, for obtaining PDF's metadata 269 | private void callExtractMetadata(PDDocument document, Metadata metadata, ParseContext context) 270 | throws NoSuchMethodException, IllegalAccessException, InvocationTargetException { 271 | Method m = getClass().getSuperclass().getDeclaredMethod("extractMetadata", 272 | PDDocument.class, Metadata.class, ParseContext.class); 273 | m.setAccessible(true); 274 | m.invoke(this, document, metadata, context); 275 | } 276 | 277 | // read password from metadata 278 | private String callGetPassword(Metadata metadata, ParseContext context) 279 | throws NoSuchMethodException, IllegalAccessException, InvocationTargetException { 280 | Method m = getClass().getSuperclass().getDeclaredMethod("getPassword", 281 | Metadata.class, ParseContext.class); 282 | m.setAccessible(true); 283 | Object retVal = m.invoke(this, 
metadata, context); 284 | return (String) retVal; 285 | } 286 | 287 | // make a copy because I don't want to modify original config params 288 | private PDFParserConfig makeConfigLocalCopy(PDFParserConfig srcConfig) { 289 | PDFParserConfig cpy = new PDFParserConfig(); 290 | ShallowCopy.copyFields(srcConfig, cpy); 291 | return cpy; 292 | } 293 | 294 | public static void main(String args[]) throws IOException { 295 | if (args.length < 2) return; 296 | 297 | if (args[0].equals("--flatten")) { 298 | // flatten document's images by redrawing them on a white background 299 | String srcPath = args[1], dstPath = srcPath + ".processed"; 300 | if (args.length > 2) 301 | dstPath = args[2]; 302 | 303 | File inputFile = new File(srcPath); 304 | FileInputStream fis = new FileInputStream(inputFile); 305 | 306 | try { 307 | PDDocument doc = PDDocument.load(fis); 308 | 309 | PdfContentImagePreprocessor preproc = new PdfContentImagePreprocessor(); 310 | boolean hasReplaced = preproc.removeImagesAlphaChannel(doc); 311 | if (hasReplaced) { 312 | System.out.println("PDF file images were updated"); 313 | doc.save(dstPath); 314 | } else { 315 | System.out.println("PDF file was not changed"); 316 | } 317 | } catch (Exception e) { 318 | System.out.println("Error occurred:"); 319 | System.out.println(e.toString()); 320 | fis.close(); 321 | } 322 | } 323 | 324 | if (args[0].equals("--explore")) { 325 | PdfContentTypeChecker checker = new PdfContentTypeChecker(); 326 | String srcPath = args[1]; 327 | File inputFile = new File(srcPath); 328 | FileInputStream fis = new FileInputStream(inputFile); 329 | 330 | try { 331 | PDDocument doc = PDDocument.load(fis); 332 | checker.determineDocContentType(doc); 333 | 334 | System.out.printf("images:%d,text_blocks:%d%n", 335 | checker.getImagesCount(), 336 | checker.getTextBlocks()); 337 | 338 | } catch (Exception e) { 339 | System.out.println("Error occurred:"); 340 | System.out.println(e.toString()); 341 | fis.close(); 342 | } 343 | } 344 | } 345 | } 
346 | -------------------------------------------------------------------------------- /lexpredict-tika/src/main/java/com/lexpredict/tika/PDF2XHTML.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | * 17 | * Modifications copyright (C) 2020 ContraxSuite, LLC 18 | */ 19 | /* 20 | * Licensed to the Apache Software Foundation (ASF) under one or more 21 | * contributor license agreements. See the NOTICE file distributed with 22 | * this work for additional information regarding copyright ownership. 23 | * The ASF licenses this file to You under the Apache License, Version 2.0 24 | * (the "License"); you may not use this file except in compliance with 25 | * the License. You may obtain a copy of the License at 26 | * 27 | * http://www.apache.org/licenses/LICENSE-2.0 28 | * 29 | * Unless required by applicable law or agreed to in writing, software 30 | * distributed under the License is distributed on an "AS IS" BASIS, 31 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
32 | * See the License for the specific language governing permissions and 33 | * limitations under the License. 34 | * 35 | * Modifications copyright (C) 2020 ContraxSuite, LLC 36 | */ 37 | package com.lexpredict.tika; 38 | 39 | import org.apache.pdfbox.cos.COSArray; 40 | import org.apache.pdfbox.cos.COSBase; 41 | import org.apache.pdfbox.cos.COSName; 42 | import org.apache.pdfbox.cos.COSStream; 43 | import org.apache.pdfbox.filter.MissingImageReaderException; 44 | import org.apache.pdfbox.pdmodel.PDDocument; 45 | import org.apache.pdfbox.pdmodel.PDPage; 46 | import org.apache.pdfbox.pdmodel.PDPageContentStream; 47 | import org.apache.pdfbox.pdmodel.PDResources; 48 | import org.apache.pdfbox.pdmodel.font.PDFont; 49 | import org.apache.pdfbox.pdmodel.graphics.PDXObject; 50 | import org.apache.pdfbox.pdmodel.graphics.color.PDDeviceGray; 51 | import org.apache.pdfbox.pdmodel.graphics.color.PDDeviceRGB; 52 | import org.apache.pdfbox.pdmodel.graphics.form.PDFormXObject; 53 | import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject; 54 | import org.apache.pdfbox.text.PDFTextStripper; 55 | import org.apache.pdfbox.text.TextPosition; 56 | import org.apache.pdfbox.tools.imageio.ImageIOUtil; 57 | import org.apache.pdfbox.util.Matrix; 58 | import org.apache.tika.exception.TikaException; 59 | import org.apache.tika.extractor.EmbeddedDocumentUtil; 60 | import org.apache.tika.io.TikaInputStream; 61 | import org.apache.tika.metadata.Metadata; 62 | import org.apache.tika.metadata.TikaCoreProperties; 63 | import org.apache.tika.parser.ParseContext; 64 | import org.apache.tika.parser.pdf.PDFParserConfig; 65 | import org.apache.tika.sax.EmbeddedContentHandler; 66 | import org.xml.sax.ContentHandler; 67 | import org.xml.sax.SAXException; 68 | import org.xml.sax.helpers.AttributesImpl; 69 | 70 | import java.awt.image.BufferedImage; 71 | import java.io.ByteArrayOutputStream; 72 | import java.io.IOException; 73 | import java.io.InputStream; 74 | import java.io.OutputStream; 75 | 
import java.io.Writer; 76 | import java.util.*; 77 | 78 | class PDF2XHTML extends AbstractPDF2XHTML { 79 | private static final List JPEG = Arrays.asList( 80 | COSName.DCT_DECODE.getName(), 81 | COSName.DCT_DECODE_ABBREVIATION.getName()); 82 | 83 | private static final List JP2 = 84 | Arrays.asList(COSName.JPX_DECODE.getName()); 85 | 86 | private static final List JB2 = Arrays.asList( 87 | COSName.JBIG2_DECODE.getName()); 88 | 89 | /** 90 | * This keeps track of the pdf object ids for inline 91 | * images that have been processed. 92 | * If {@link PDFParserConfig#getExtractUniqueInlineImagesOnly() 93 | * is true, this will be checked before extracting an embedded image. 94 | * The integer keeps track of the inlineImageCounter for that image. 95 | * This integer is used to identify images in the markup. 96 | * 97 | * This is used across the document. To avoid infinite recursion 98 | * TIKA-1742, we're limiting the export to one image per page. 99 | */ 100 | private Map processedInlineImages = new HashMap<>(); 101 | private int inlineImageCounter = 0; 102 | private PDF2XHTML(PDDocument document, ContentHandler handler, ParseContext context, Metadata metadata, 103 | PDFParserConfig config) 104 | throws IOException { 105 | super(document, handler, context, metadata, config); 106 | } 107 | 108 | /** 109 | * Converts the given PDF document (and related metadata) to a stream 110 | * of XHTML SAX events sent to the given content handler. 
111 | * 112 | * @param document PDF document 113 | * @param handler SAX content handler 114 | * @param metadata PDF metadata 115 | * @throws SAXException if the content handler fails to process SAX events 116 | * @throws TikaException if there was an exception outside of per page processing 117 | */ 118 | public static void process( 119 | PDDocument document, 120 | ContentHandler handler, 121 | ParseContext context, 122 | Metadata metadata, 123 | PDFParserConfig config) 124 | throws SAXException, TikaException { 125 | PDF2XHTML pdf2XHTML = null; 126 | try { 127 | // Extract text using a dummy Writer as we override the 128 | // key methods to output to the given content 129 | // handler. 130 | if (config.getDetectAngles()) { 131 | pdf2XHTML = new PDF2XHTML.AngleDetectingPDF2XHTML(document, handler, context, metadata, config); 132 | } else { 133 | pdf2XHTML = new PDF2XHTML(document, handler, context, metadata, config); 134 | } 135 | AlterPDFParserConfig.configureAlterPtf2Xhtml(config, pdf2XHTML); 136 | 137 | pdf2XHTML.writeText(document, new Writer() { 138 | @Override 139 | public void write(char[] cbuf, int off, int len) { 140 | } 141 | 142 | @Override 143 | public void flush() { 144 | } 145 | 146 | @Override 147 | public void close() { 148 | } 149 | }); 150 | } catch (IOException e) { 151 | if (e.getCause() instanceof SAXException) { 152 | throw (SAXException) e.getCause(); 153 | } else { 154 | throw new TikaException("Unable to extract PDF content", e); 155 | } 156 | } 157 | if (pdf2XHTML.exceptions.size() > 0) { 158 | //throw the first 159 | throw new TikaException("Unable to extract PDF content", pdf2XHTML.exceptions.get(0)); 160 | } 161 | } 162 | 163 | @Override 164 | public void processPage(PDPage page) throws IOException { 165 | try { 166 | super.processPage(page); 167 | } catch (IOException e) { 168 | handleCatchableIOE(e); 169 | endPage(page); 170 | } 171 | } 172 | 173 | @Override 174 | protected void endPage(PDPage page) throws IOException { 175 | try { 
176 | writeParagraphEnd(); 177 | try { 178 | extractImages(page.getResources(), new HashSet()); 179 | } catch (IOException e) { 180 | handleCatchableIOE(e); 181 | } 182 | super.endPage(page); 183 | } catch (SAXException e) { 184 | throw new IOException("Unable to end a page", e); 185 | } catch (IOException e) { 186 | handleCatchableIOE(e); 187 | } 188 | } 189 | 190 | private void extractImages(PDResources resources, Set seenThisPage) throws SAXException, IOException { 191 | if (resources == null || config.getExtractInlineImages() == false) { 192 | return; 193 | } 194 | 195 | for (COSName name : resources.getXObjectNames()) { 196 | 197 | PDXObject object = null; 198 | try { 199 | object = resources.getXObject(name); 200 | } catch (MissingImageReaderException e) { 201 | EmbeddedDocumentUtil.recordException(e, metadata); 202 | continue; 203 | } catch (IOException e) { 204 | EmbeddedDocumentUtil.recordEmbeddedStreamException(e, metadata); 205 | continue; 206 | } 207 | processImageObject(object, seenThisPage); 208 | } 209 | } 210 | 211 | private void processImageObject(PDXObject object, Set seenThisPage) throws SAXException, IOException { 212 | if (object == null) { 213 | return; 214 | } 215 | COSStream cosStream = object.getCOSObject(); 216 | if (seenThisPage.contains(cosStream)) { 217 | //avoid infinite recursion TIKA-1742 218 | return; 219 | } 220 | seenThisPage.add(cosStream); 221 | 222 | if (object instanceof PDFormXObject) { 223 | extractImages(((PDFormXObject) object).getResources(), seenThisPage); 224 | } else if (object instanceof PDImageXObject) { 225 | 226 | PDImageXObject image = (PDImageXObject) object; 227 | 228 | Metadata embeddedMetadata = new Metadata(); 229 | String extension = image.getSuffix(); 230 | 231 | if (extension == null || extension.equals("png")) { 232 | embeddedMetadata.set(Metadata.CONTENT_TYPE, "image/png"); 233 | extension = "png"; 234 | } else if (extension.equals("jpg")) { 235 | embeddedMetadata.set(Metadata.CONTENT_TYPE, 
"image/jpeg"); 236 | } else if (extension.equals("tiff")) { 237 | embeddedMetadata.set(Metadata.CONTENT_TYPE, "image/tiff"); 238 | extension = "tif"; 239 | } else if (extension.equals("jpx")) { 240 | embeddedMetadata.set(Metadata.CONTENT_TYPE, "image/jp2"); 241 | } else if (extension.equals("jb2")) { 242 | embeddedMetadata.set( 243 | Metadata.CONTENT_TYPE, "image/x-jbig2"); 244 | } else { 245 | //TODO: determine if we need to add more image types 246 | // throw new RuntimeException("EXTEN:" + extension); 247 | } 248 | Integer imageNumber = processedInlineImages.get(cosStream); 249 | if (imageNumber == null) { 250 | imageNumber = inlineImageCounter++; 251 | } 252 | String fileName = "image" + imageNumber + "."+extension; 253 | embeddedMetadata.set(Metadata.RESOURCE_NAME_KEY, fileName); 254 | 255 | // Output the img tag 256 | AttributesImpl attr = new AttributesImpl(); 257 | attr.addAttribute("", "src", "src", "CDATA", "embedded:" + fileName); 258 | attr.addAttribute("", "alt", "alt", "CDATA", fileName); 259 | xhtml.startElement("img", attr); 260 | xhtml.endElement("img"); 261 | 262 | //Do we only want to process unique COSObject ids? 263 | //If so, have we already processed this one? 
264 | if (config.getExtractUniqueInlineImagesOnly() == true) { 265 | if (processedInlineImages.containsKey(cosStream)) { 266 | return; 267 | } 268 | processedInlineImages.put(cosStream, imageNumber); 269 | } 270 | 271 | embeddedMetadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE, 272 | TikaCoreProperties.EmbeddedResourceType.INLINE.toString()); 273 | 274 | if (embeddedDocumentExtractor.shouldParseEmbedded(embeddedMetadata)) { 275 | ByteArrayOutputStream buffer = new ByteArrayOutputStream(); 276 | try { 277 | //extract the metadata contained outside of the image 278 | // TODO: fix IT 279 | //PDMetadataExtractor.extract(image.getMetadata(), 280 | // embeddedMetadata, context); 281 | try { 282 | writeToBuffer(image, extension, buffer); 283 | } catch (IOException e) { 284 | EmbeddedDocumentUtil.recordEmbeddedStreamException(e, metadata); 285 | return; 286 | } 287 | try (InputStream embeddedIs = TikaInputStream.get(buffer.toByteArray())) { 288 | embeddedDocumentExtractor.parseEmbedded( 289 | embeddedIs, 290 | new EmbeddedContentHandler(xhtml), 291 | embeddedMetadata, false); 292 | } 293 | } catch (IOException e) { 294 | handleCatchableIOE(e); 295 | } 296 | } 297 | } 298 | } 299 | 300 | //nearly directly copied from PDFBox ExtractImages 301 | private void writeToBuffer(PDImageXObject pdImage, String suffix, OutputStream out) 302 | throws IOException { 303 | 304 | BufferedImage image = pdImage.getImage(); 305 | if (image != null) { 306 | if ("jpg".equals(suffix)) { 307 | String colorSpaceName = pdImage.getColorSpace().getName(); 308 | //TODO: figure out if we want directJPEG as a configuration 309 | //previously: if (directJPeg || PDDeviceGray.... 
310 | if (PDDeviceGray.INSTANCE.getName().equals(colorSpaceName) || 311 | PDDeviceRGB.INSTANCE.getName().equals(colorSpaceName)) { 312 | // RGB or Gray colorspace: get and write the unmodifiedJPEG stream 313 | InputStream data = pdImage.getStream().createInputStream(JPEG); 314 | org.apache.pdfbox.io.IOUtils.copy(data, out); 315 | org.apache.pdfbox.io.IOUtils.closeQuietly(data); 316 | } else { 317 | // for CMYK and other "unusual" colorspaces, the JPEG will be converted 318 | ImageIOUtil.writeImage(image, suffix, out); 319 | } 320 | } else if ("jp2".equals(suffix) || "jpx".equals(suffix)) { 321 | InputStream data = pdImage.createInputStream(JP2); 322 | org.apache.pdfbox.io.IOUtils.copy(data, out); 323 | org.apache.pdfbox.io.IOUtils.closeQuietly(data); 324 | } else if ("jb2".equals(suffix)) { 325 | InputStream data = pdImage.createInputStream(JB2); 326 | org.apache.pdfbox.io.IOUtils.copy(data, out); 327 | org.apache.pdfbox.io.IOUtils.closeQuietly(data); 328 | } else{ 329 | ImageIOUtil.writeImage(image, suffix, out); 330 | } 331 | } 332 | out.flush(); 333 | } 334 | 335 | @Override 336 | protected void writeParagraphStart() throws IOException { 337 | super.writeParagraphStart(); 338 | try { 339 | xhtml.startElement("p"); 340 | } catch (SAXException e) { 341 | throw new IOException("Unable to start a paragraph", e); 342 | } 343 | } 344 | 345 | @Override 346 | protected void writeParagraphEnd() throws IOException { 347 | super.writeParagraphEnd(); 348 | try { 349 | xhtml.endElement("p"); 350 | } catch (SAXException e) { 351 | throw new IOException("Unable to end a paragraph", e); 352 | } 353 | } 354 | 355 | @Override 356 | protected void writeString(String text) throws IOException { 357 | try { 358 | xhtml.characters(text); 359 | } catch (SAXException e) { 360 | throw new IOException( 361 | "Unable to write a string: " + text, e); 362 | } 363 | } 364 | 365 | @Override 366 | protected void writeString(String text, List textPositions) throws IOException 367 | { 368 | try { 
369 | AttributesImpl atr = new AttributesImpl(); 370 | if (textPositions.size() > 0) { 371 | StringBuilder posStr = new StringBuilder(); 372 | for (TextPosition pos : textPositions) 373 | posStr.append( 374 | formatFloatNumbers(";", 375 | pos.getX(), pos.getY(), 376 | pos.getWidth(), pos.getHeight())); 377 | atr.addAttribute("", "pos", "pos", "string", posStr.toString()); 378 | } 379 | xhtml.startElement("span", atr); 380 | xhtml.characters(text); 381 | xhtml.endElement("span"); 382 | } catch (SAXException e) { 383 | throw new IOException( 384 | "Unable to write a string: " + text, e); 385 | } 386 | } 387 | 388 | @Override 389 | protected String normalizeString( 390 | String text, 391 | List textPositions, 392 | boolean tryFontMapping) throws IOException { 393 | if (text == null || text.length() == 0) 394 | return ""; 395 | 396 | if (tryFontMapping) { 397 | // replace \r\n with \n 398 | while (true) { 399 | int rnIndex = text.indexOf("\r\n"); 400 | if (rnIndex < 0) break; 401 | text = text.substring(0, rnIndex) + text.substring(rnIndex + 1); 402 | if (rnIndex < textPositions.size()) 403 | textPositions.remove(rnIndex); 404 | } 405 | } 406 | // replace \r (\f) with \n 407 | text = text.replace('\r', '\n'); 408 | text = text.replace('\f', '\n'); 409 | 410 | StringBuilder mappedString = new StringBuilder(); 411 | char[] charArray = text.toCharArray(); 412 | for (int i = 0; i < charArray.length; i++) { 413 | if (!xhtml.isCharacterInvalid(charArray[i])) { 414 | mappedString.append(charArray[i]); 415 | continue; 416 | } 417 | if (tryFontMapping) { 418 | PDFont txtFont = textPositions.get(i).getFont(); 419 | if (txtFont == null) { 420 | mappedString.append('?'); 421 | continue; 422 | } 423 | String uniStr = txtFont.toUnicode(charArray[i]); 424 | uniStr = normalizeString(uniStr, textPositions, false); 425 | mappedString.append(uniStr == null || uniStr.length() == 0 ? "?" 
: uniStr); 426 | } else 427 | mappedString.append("?"); 428 | } 429 | return mappedString.toString(); 430 | } 431 | 432 | @Override 433 | protected void dumpCDATA() throws SAXException { 434 | for (Map.Entry entry : cdataContent.entrySet()) { 435 | TagData tag = entry.getValue(); 436 | String containerName = tag.tagName; 437 | if (tag.attributeString.length() > 0) 438 | containerName += " " + tag.attributeString; 439 | xhtml.startElement("", containerName, containerName, new AttributesImpl()); 440 | if (tag.isCdata) 441 | //xhtml.charactersRaw(""); 446 | xhtml.endElement(tag.tagName); 447 | } 448 | } 449 | 450 | @Override 451 | protected void writeString(String text, TextPosition boundingPosition) throws IOException 452 | { 453 | writeString(text, new ArrayList() {{add(boundingPosition);}}); 454 | } 455 | 456 | @Override 457 | protected void writeCharacters(TextPosition text) throws IOException { 458 | try { 459 | xhtml.characters(text.getUnicode()); 460 | } catch (SAXException e) { 461 | throw new IOException( 462 | "Unable to write a character: " + text.getUnicode(), e); 463 | } 464 | } 465 | 466 | @Override 467 | protected void writeWordSeparator() throws IOException { 468 | try { 469 | xhtml.characters(getWordSeparator()); 470 | } catch (SAXException e) { 471 | throw new IOException( 472 | "Unable to write a space character", e); 473 | } 474 | } 475 | 476 | @Override 477 | protected void writeLineSeparator() throws IOException { 478 | try { 479 | xhtml.newline(); 480 | } catch (SAXException e) { 481 | throw new IOException( 482 | "Unable to write a newline character", e); 483 | } 484 | } 485 | 486 | class AngleCollector extends PDFTextStripper { 487 | Set angles = new HashSet<>(); 488 | 489 | public Set getAngles() { 490 | return angles; 491 | } 492 | 493 | /** 494 | * Instantiate a new PDFTextStripper object. 495 | * 496 | * @throws IOException If there is an error loading the properties. 
497 | */ 498 | AngleCollector() throws IOException { 499 | } 500 | 501 | @Override 502 | protected void processTextPosition(TextPosition text) { 503 | Matrix m = text.getTextMatrix(); 504 | m.concatenate(text.getFont().getFontMatrix()); 505 | int angle = (int) Math.round(Math.toDegrees(Math.atan2(m.getShearY(), m.getScaleY()))); 506 | angle = (angle + 360) % 360; 507 | angles.add(angle); 508 | } 509 | } 510 | 511 | private static class AngleDetectingPDF2XHTML extends PDF2XHTML { 512 | 513 | private AngleDetectingPDF2XHTML(PDDocument document, ContentHandler handler, ParseContext context, Metadata metadata, PDFParserConfig config) throws IOException { 514 | super(document, handler, context, metadata, config); 515 | } 516 | 517 | @Override 518 | protected void startPage(PDPage page) throws IOException { 519 | //no-op 520 | } 521 | 522 | @Override 523 | protected void endPage(PDPage page) throws IOException { 524 | //no-op 525 | } 526 | 527 | @Override 528 | public void processPage(PDPage page) throws IOException { 529 | try { 530 | super.startPage(page); 531 | detectAnglesAndProcessPage(page); 532 | } catch (IOException e) { 533 | handleCatchableIOE(e); 534 | } finally { 535 | super.endPage(page); 536 | } 537 | } 538 | 539 | private void detectAnglesAndProcessPage(PDPage page) throws IOException { 540 | //copied and pasted from https://issues.apache.org/jira/secure/attachment/12947452/ExtractAngledText.java 541 | //PDFBOX-4371 542 | PDF2XHTML.AngleCollector angleCollector = new PDF2XHTML.AngleCollector(); // alternatively, reset angles 543 | angleCollector.setStartPage(getCurrentPageNo()); 544 | angleCollector.setEndPage(getCurrentPageNo()); 545 | angleCollector.getText(document); 546 | 547 | int rotation = page.getRotation(); 548 | page.setRotation(0); 549 | 550 | for (Integer angle : angleCollector.getAngles()) { 551 | if (angle == 0) { 552 | try { 553 | super.processPage(page); 554 | } catch (IOException e) { 555 | handleCatchableIOE(e); 556 | } 557 | } else { 558 
| // prepend a transformation 559 | try (PDPageContentStream cs = new PDPageContentStream(document, page, PDPageContentStream.AppendMode.PREPEND, false)) { 560 | cs.transform(Matrix.getRotateInstance(-Math.toRadians(angle), 0, 0)); 561 | } 562 | 563 | try { 564 | super.processPage(page); 565 | } catch (IOException e) { 566 | handleCatchableIOE(e); 567 | } 568 | 569 | // remove transformation 570 | COSArray contents = (COSArray) page.getCOSObject().getItem(COSName.CONTENTS); 571 | contents.remove(0); 572 | } 573 | } 574 | page.setRotation(rotation); 575 | } 576 | 577 | @Override 578 | protected void processTextPosition(TextPosition text) { 579 | Matrix m = text.getTextMatrix(); 580 | m.concatenate(text.getFont().getFontMatrix()); 581 | int angle = (int) Math.round(Math.toDegrees(Math.atan2(m.getShearY(), m.getScaleY()))); 582 | if (angle == 0) { 583 | super.processTextPosition(text); 584 | } 585 | } 586 | } 587 | } 588 | 589 | -------------------------------------------------------------------------------- /lexpredict-tika/src/main/java/com/lexpredict/tika/AbstractPDF2XHTML.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | * 17 | * Modifications copyright (C) 2020 ContraxSuite, LLC 18 | */ 19 | 20 | package com.lexpredict.tika; 21 | 22 | import static org.apache.tika.parser.pdf.PDFParserConfig.OCR_STRATEGY.NO_OCR; 23 | 24 | import javax.xml.stream.XMLStreamException; 25 | import java.awt.image.BufferedImage; 26 | import java.io.BufferedInputStream; 27 | import java.io.ByteArrayInputStream; 28 | import java.io.IOException; 29 | import java.io.InputStream; 30 | import java.io.OutputStream; 31 | import java.lang.reflect.InvocationTargetException; 32 | import java.lang.reflect.Method; 33 | import java.nio.charset.StandardCharsets; 34 | import java.nio.file.Files; 35 | import java.nio.file.Path; 36 | import java.text.SimpleDateFormat; 37 | import java.util.ArrayList; 38 | import java.util.Calendar; 39 | import java.util.HashSet; 40 | import java.util.List; 41 | import java.util.ListIterator; 42 | import java.util.Locale; 43 | import java.util.Map; 44 | import java.util.Set; 45 | import java.util.TreeMap; 46 | 47 | import org.apache.commons.io.IOExceptionWithCause; 48 | import org.apache.commons.io.IOUtils; 49 | import org.apache.pdfbox.cos.COSName; 50 | import org.apache.pdfbox.pdmodel.PDDocument; 51 | import org.apache.pdfbox.pdmodel.PDDocumentCatalog; 52 | import org.apache.pdfbox.pdmodel.PDDocumentNameDictionary; 53 | import org.apache.pdfbox.pdmodel.PDEmbeddedFilesNameTreeNode; 54 | import org.apache.pdfbox.pdmodel.PDPage; 55 | import org.apache.pdfbox.pdmodel.PDPageTree; 56 | import org.apache.pdfbox.pdmodel.common.PDDestinationOrAction; 57 | import org.apache.pdfbox.pdmodel.common.PDNameTreeNode; 58 | import org.apache.pdfbox.pdmodel.common.PDRectangle; 59 | import org.apache.pdfbox.pdmodel.common.filespecification.PDComplexFileSpecification; 60 | import org.apache.pdfbox.pdmodel.common.filespecification.PDEmbeddedFile; 61 | import 
org.apache.pdfbox.pdmodel.common.filespecification.PDFileSpecification; 62 | import org.apache.pdfbox.pdmodel.common.filespecification.PDSimpleFileSpecification; 63 | import org.apache.pdfbox.pdmodel.font.PDFont; 64 | import org.apache.pdfbox.pdmodel.interactive.action.PDAction; 65 | import org.apache.pdfbox.pdmodel.interactive.action.PDActionImportData; 66 | import org.apache.pdfbox.pdmodel.interactive.action.PDActionJavaScript; 67 | import org.apache.pdfbox.pdmodel.interactive.action.PDActionLaunch; 68 | import org.apache.pdfbox.pdmodel.interactive.action.PDActionRemoteGoTo; 69 | import org.apache.pdfbox.pdmodel.interactive.action.PDActionURI; 70 | import org.apache.pdfbox.pdmodel.interactive.action.PDAnnotationAdditionalActions; 71 | import org.apache.pdfbox.pdmodel.interactive.action.PDDocumentCatalogAdditionalActions; 72 | import org.apache.pdfbox.pdmodel.interactive.action.PDFormFieldAdditionalActions; 73 | import org.apache.pdfbox.pdmodel.interactive.action.PDPageAdditionalActions; 74 | import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotation; 75 | import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationFileAttachment; 76 | import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationMarkup; 77 | import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationWidget; 78 | import org.apache.pdfbox.pdmodel.interactive.digitalsignature.PDSignature; 79 | import org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDDocumentOutline; 80 | import org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineItem; 81 | import org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineNode; 82 | import org.apache.pdfbox.pdmodel.interactive.form.PDAcroForm; 83 | import org.apache.pdfbox.pdmodel.interactive.form.PDField; 84 | import org.apache.pdfbox.pdmodel.interactive.form.PDNonTerminalField; 85 | import org.apache.pdfbox.pdmodel.interactive.form.PDSignatureField; 86 | import 
org.apache.pdfbox.pdmodel.interactive.form.PDXFAResource; 87 | import org.apache.pdfbox.rendering.PDFRenderer; 88 | import org.apache.pdfbox.tools.imageio.ImageIOUtil; 89 | import org.apache.pdfbox.util.Matrix; 90 | import org.apache.pdfbox.util.Vector; 91 | import org.apache.tika.exception.TikaException; 92 | import org.apache.tika.extractor.EmbeddedDocumentExtractor; 93 | import org.apache.tika.extractor.EmbeddedDocumentUtil; 94 | import org.apache.tika.io.TemporaryResources; 95 | import org.apache.tika.io.TikaInputStream; 96 | import org.apache.tika.metadata.Font; 97 | import org.apache.tika.metadata.Metadata; 98 | import org.apache.tika.metadata.PDF; 99 | import org.apache.tika.metadata.TikaCoreProperties; 100 | import org.apache.tika.parser.ParseContext; 101 | import org.apache.tika.parser.ocr.TesseractOCRConfig; 102 | import org.apache.tika.parser.ocr.TesseractOCRParser; 103 | import org.apache.tika.parser.pdf.PDFParserConfig; 104 | import org.apache.tika.sax.EmbeddedContentHandler; 105 | import org.xml.sax.ContentHandler; 106 | import org.xml.sax.SAXException; 107 | import org.xml.sax.helpers.AttributesImpl; 108 | 109 | class AbstractPDF2XHTML extends PDFTextStripper { 110 | 111 | enum ActionTrigger { 112 | AFTER_DOCUMENT_PRINT, 113 | AFTER_DOCUMENT_SAVE, 114 | ANNOTATION_CURSOR_ENTERS, 115 | ANNOTATION_CURSOR_EXIT, 116 | ANNOTATION_LOSE_INPUT_FOCUS, 117 | ANNOTATION_MOUSE_CLICK, 118 | ANNOTATION_MOUSE_RELEASED, 119 | ANNOTATION_PAGE_CLOSED, 120 | ANNOTATION_PAGE_NO_LONGER_VISIBLE, 121 | ANNOTATION_PAGE_OPENED, 122 | ANNOTATION_PAGE_VISIBLE, 123 | ANNOTATION_RECEIVES_FOCUS, 124 | ANNOTATION_WIDGET, 125 | BEFORE_DOCUMENT_CLOSE, 126 | BEFORE_DOCUMENT_PRINT, 127 | BEFORE_DOCUMENT_SAVE, 128 | DOCUMENT_OPEN, 129 | FORM_FIELD, 130 | FORM_FIELD_FORMATTED, 131 | FORM_FIELD_KEYSTROKE, 132 | FORM_FIELD_RECALCULATE, 133 | FORM_FIELD_VALUE_CHANGE, 134 | PAGE_CLOSE, 135 | PAGE_OPEN, BOOKMARK, 136 | }; 137 | 138 | /** 139 | * Maximum recursive depth during AcroForm 
processing. 140 | * Prevents theoretical AcroForm recursion bomb. 141 | */ 142 | private final static int MAX_ACROFORM_RECURSIONS = 10; 143 | 144 | private final static TesseractOCRConfig DEFAULT_TESSERACT_CONFIG = new TesseractOCRConfig(); 145 | 146 | /** 147 | * Format used for signature dates 148 | * TODO Make this thread-safe 149 | */ 150 | private final SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ssZ", Locale.ROOT); 151 | 152 | 153 | final List exceptions = new ArrayList<>(); 154 | final PDDocument pdDocument; 155 | final AlterXHTMLContentHandler xhtml; 156 | final ParseContext context; 157 | final Metadata metadata; 158 | final EmbeddedDocumentExtractor embeddedDocumentExtractor; 159 | final PDFParserConfig config; 160 | final TesseractOCRParser tesseractOCRParser;//can be null! 161 | 162 | //zero-based pageIndex 163 | int pageIndex = 0; 164 | int startPage = -1;//private in PDFTextStripper...must have own copy because we override processpages 165 | int unmappedUnicodeCharsPerPage = 0; 166 | int totalCharsPerPage = 0; 167 | 168 | private final Set fontNames = new HashSet<>(); 169 | 170 | AbstractPDF2XHTML(PDDocument pdDocument, ContentHandler handler, ParseContext context, Metadata metadata, 171 | PDFParserConfig config) throws IOException { 172 | this.pdDocument = pdDocument; 173 | this.xhtml = new AlterXHTMLContentHandler(handler, metadata); 174 | this.context = context; 175 | this.metadata = metadata; 176 | this.config = config; 177 | embeddedDocumentExtractor = EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context); 178 | if (config.getOcrStrategy() == NO_OCR) { 179 | tesseractOCRParser = null; 180 | } else { 181 | tesseractOCRParser = (TesseractOCRParser)EmbeddedDocumentUtil.tryToFindExistingLeafParser(TesseractOCRParser.class, context); 182 | } 183 | } 184 | 185 | @Override 186 | protected void startPage(PDPage page) throws IOException { 187 | try { 188 | AttributesImpl attrs = new AttributesImpl(); 189 | if 
(this.detalization != OutputDetalization.NO_EXTRA_DETAIL) { 190 | StringBuilder sb = new StringBuilder(); 191 | PDRectangle area = page.getMediaBox(); 192 | sb.append(area.getLowerLeftX()); 193 | sb.append(","); 194 | sb.append(area.getLowerLeftY()); 195 | sb.append(","); 196 | sb.append(area.getWidth()); 197 | sb.append(","); 198 | sb.append(area.getHeight()); 199 | attrs.addAttribute("", "area", "area", "string", sb.toString()); 200 | } 201 | attrs.addAttribute("", "class", "class", "string", "page"); 202 | xhtml.startElement("div", attrs); 203 | } catch (SAXException e) { 204 | throw new IOExceptionWithCause("Unable to start a page", e); 205 | } 206 | writeParagraphStart(); 207 | } 208 | 209 | private void extractEmbeddedDocuments(PDDocument document) 210 | throws IOException, SAXException, TikaException { 211 | PDDocumentNameDictionary namesDictionary = 212 | new PDDocumentNameDictionary(document.getDocumentCatalog()); 213 | PDEmbeddedFilesNameTreeNode efTree = namesDictionary.getEmbeddedFiles(); 214 | if (efTree == null) { 215 | return; 216 | } 217 | 218 | Map embeddedFileNames = efTree.getNames(); 219 | //For now, try to get the embeddedFileNames out of embeddedFiles or its kids. 220 | //This code follows: pdfbox/examples/pdmodel/ExtractEmbeddedFiles.java 221 | //If there is a need we could add a fully recursive search to find a non-null 222 | //Map that contains the doc info. 
223 | if (embeddedFileNames != null) { 224 | processEmbeddedDocNames(embeddedFileNames); 225 | } else { 226 | List> kids = efTree.getKids(); 227 | if (kids == null) { 228 | return; 229 | } 230 | for (PDNameTreeNode node : kids) { 231 | embeddedFileNames = node.getNames(); 232 | if (embeddedFileNames != null) { 233 | processEmbeddedDocNames(embeddedFileNames); 234 | } 235 | } 236 | } 237 | } 238 | 239 | private void processDoc(String name, PDFileSpecification spec, AttributesImpl attributes) throws TikaException, SAXException, IOException { 240 | if (spec instanceof PDSimpleFileSpecification) { 241 | attributes.addAttribute("", "class", "class", "CDATA", "linked"); 242 | attributes.addAttribute("", "id", "id", "CDATA", spec.getFile()); 243 | xhtml.startElement("div", attributes); 244 | xhtml.endElement("div"); 245 | } else if (spec instanceof PDComplexFileSpecification){ 246 | if (attributes.getIndex("source") < 0) { 247 | attributes.addAttribute("", "source", "source", "CDATA", "attachment"); 248 | } 249 | extractMultiOSPDEmbeddedFiles(name, (PDComplexFileSpecification)spec, attributes); 250 | } 251 | } 252 | 253 | private void processEmbeddedDocNames(Map embeddedFileNames) 254 | throws IOException, SAXException, TikaException { 255 | if (embeddedFileNames == null || embeddedFileNames.isEmpty()) { 256 | return; 257 | } 258 | 259 | for (Map.Entry ent : embeddedFileNames.entrySet()) { 260 | processDoc(ent.getKey(), ent.getValue(), new AttributesImpl()); 261 | } 262 | } 263 | 264 | private void extractMultiOSPDEmbeddedFiles(String displayName, 265 | PDComplexFileSpecification spec, AttributesImpl attributes) throws IOException, 266 | SAXException, TikaException { 267 | 268 | if (spec == null) { 269 | return; 270 | } 271 | //current strategy is to pull all, not just first non-null 272 | extractPDEmbeddedFile(displayName, spec.getFileUnicode(), 273 | spec.getFile(), spec.getEmbeddedFile(), attributes); 274 | extractPDEmbeddedFile(displayName, spec.getFileUnicode(), 275 
| spec.getFileMac(), spec.getEmbeddedFileMac(), attributes); 276 | extractPDEmbeddedFile(displayName, spec.getFileUnicode(), 277 | spec.getFileDos(), spec.getEmbeddedFileDos(), attributes); 278 | extractPDEmbeddedFile(displayName, spec.getFileUnicode(), 279 | spec.getFileUnix(), spec.getEmbeddedFileUnix(), attributes); 280 | } 281 | 282 | private void extractPDEmbeddedFile(String displayName, String unicodeFileName, 283 | String fileName, PDEmbeddedFile file, AttributesImpl attributes) 284 | throws SAXException, IOException, TikaException { 285 | 286 | if (file == null) { 287 | //skip silently 288 | return; 289 | } 290 | 291 | fileName = (fileName == null || "".equals(fileName.trim())) ? unicodeFileName : fileName; 292 | fileName = (fileName == null || "".equals(fileName.trim())) ? displayName : fileName; 293 | 294 | // TODO: other metadata? 295 | Metadata embeddedMetadata = new Metadata(); 296 | embeddedMetadata.set(Metadata.RESOURCE_NAME_KEY, fileName); 297 | embeddedMetadata.set(Metadata.CONTENT_TYPE, file.getSubtype()); 298 | embeddedMetadata.set(Metadata.CONTENT_LENGTH, Long.toString(file.getSize())); 299 | embeddedMetadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE, 300 | TikaCoreProperties.EmbeddedResourceType.ATTACHMENT.toString()); 301 | embeddedMetadata.set(TikaCoreProperties.ORIGINAL_RESOURCE_NAME, fileName); 302 | if (!embeddedDocumentExtractor.shouldParseEmbedded(embeddedMetadata)) { 303 | return; 304 | } 305 | TikaInputStream stream = null; 306 | try { 307 | stream = TikaInputStream.get(file.createInputStream()); 308 | } catch (IOException e) { 309 | //store this exception in the parent's metadata 310 | EmbeddedDocumentUtil.recordEmbeddedStreamException(e, metadata); 311 | return; 312 | } 313 | try { 314 | embeddedDocumentExtractor.parseEmbedded( 315 | stream, 316 | new EmbeddedContentHandler(xhtml), 317 | embeddedMetadata, false); 318 | 319 | attributes.addAttribute("", "class", "class", "CDATA", "embedded"); 320 | attributes.addAttribute("", 
"id", "id", "CDATA", fileName); 321 | xhtml.startElement("div", attributes); 322 | xhtml.endElement("div"); 323 | } finally { 324 | IOUtils.closeQuietly(stream); 325 | } 326 | 327 | } 328 | 329 | void handleCatchableIOE(IOException e) throws IOException { 330 | if (config.isCatchIntermediateIOExceptions()) { 331 | if (e.getCause() instanceof SAXException && e.getCause().getMessage() != null && 332 | e.getCause().getMessage().contains("Your document contained more than")) { 333 | //TODO -- is there a cleaner way of checking for: 334 | // WriteOutContentHandler.WriteLimitReachedException? 335 | throw e; 336 | } 337 | 338 | String msg = e.getMessage(); 339 | if (msg == null) { 340 | msg = "IOException, no message"; 341 | } 342 | metadata.add(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING, msg); 343 | exceptions.add(e); 344 | } else { 345 | throw e; 346 | } 347 | } 348 | 349 | void doOCROnCurrentPage() throws IOException, TikaException, SAXException { 350 | if (config.getOcrStrategy().equals(NO_OCR)) { 351 | return; 352 | } 353 | TesseractOCRConfig tesseractConfig = 354 | context.get(TesseractOCRConfig.class, tesseractOCRParser.getDefaultConfig()); 355 | 356 | if (! tesseractOCRParser.hasTesseract(tesseractConfig)) { 357 | throw new TikaException("Tesseract is not available. 
"+ 358 | "Please set the OCR_STRATEGY to NO_OCR or configure Tesseract correctly"); 359 | } 360 | 361 | PDFRenderer renderer = new PDFRenderer(pdDocument); 362 | TemporaryResources tmp = new TemporaryResources(); 363 | try { 364 | 365 | int dpi = config.getOcrDPI(); 366 | BufferedImage image = renderer.renderImageWithDPI(pageIndex, dpi, config.getOcrImageType()); 367 | Path tmpFile = tmp.createTempFile(); 368 | try (OutputStream os = Files.newOutputStream(tmpFile)) { 369 | //TODO: get output format from TesseractConfig 370 | ImageIOUtil.writeImage(image, config.getOcrImageFormatName(), 371 | os, dpi, config.getOcrImageQuality()); 372 | } 373 | try (InputStream is = TikaInputStream.get(tmpFile)) { 374 | tesseractOCRParser.parseInline(is, xhtml, tesseractConfig); 375 | } 376 | } catch (IOException e) { 377 | handleCatchableIOE(e); 378 | } catch (SAXException e) { 379 | throw new IOExceptionWithCause("error writing OCR content from PDF", e); 380 | } finally { 381 | tmp.dispose(); 382 | } 383 | } 384 | 385 | @Override 386 | protected void endPage(PDPage page) throws IOException { 387 | metadata.add(PDF.CHARACTERS_PER_PAGE, totalCharsPerPage); 388 | metadata.add(PDF.UNMAPPED_UNICODE_CHARS_PER_PAGE, 389 | unmappedUnicodeCharsPerPage); 390 | 391 | try { 392 | for (PDAnnotation annotation : page.getAnnotations()) { 393 | 394 | if (annotation instanceof PDAnnotationFileAttachment) { 395 | PDAnnotationFileAttachment fann = (PDAnnotationFileAttachment) annotation; 396 | PDComplexFileSpecification fileSpec = (PDComplexFileSpecification) fann.getFile(); 397 | try { 398 | AttributesImpl attributes = new AttributesImpl(); 399 | attributes.addAttribute("", "source", "source", "CDATA", "annotation"); 400 | extractMultiOSPDEmbeddedFiles(fann.getAttachmentName(), fileSpec, attributes); 401 | } catch (SAXException e) { 402 | throw new IOExceptionWithCause("file embedded in annotation sax exception", e); 403 | } catch (TikaException e) { 404 | throw new IOExceptionWithCause("file 
embedded in annotation tika exception", e); 405 | } catch (IOException e) { 406 | handleCatchableIOE(e); 407 | } 408 | } else if (annotation instanceof PDAnnotationWidget) { 409 | handleWidget((PDAnnotationWidget)annotation); 410 | } 411 | // TODO: remove once PDFBOX-1143 is fixed: 412 | if (config.getExtractAnnotationText()) { 413 | PDActionURI uri = getActionURI(annotation); 414 | if (uri != null) { 415 | String link = uri.getURI(); 416 | if (link != null && link.trim().length() > 0) { 417 | xhtml.startElement("div", "class", "annotation"); 418 | xhtml.startElement("a", "href", link); 419 | xhtml.characters(link); 420 | xhtml.endElement("a"); 421 | xhtml.endElement("div"); 422 | } 423 | } 424 | 425 | if (annotation instanceof PDAnnotationMarkup) { 426 | PDAnnotationMarkup annotationMarkup = (PDAnnotationMarkup) annotation; 427 | String title = annotationMarkup.getTitlePopup(); 428 | String subject = annotationMarkup.getSubject(); 429 | String contents = annotationMarkup.getContents(); 430 | // TODO: maybe also annotationMarkup.getRichContents()? 
431 | if (title != null || subject != null || contents != null) { 432 | xhtml.startElement("div", "class", "annotation"); 433 | 434 | if (title != null) { 435 | xhtml.startElement("div", "class", "annotationTitle"); 436 | xhtml.characters(title); 437 | xhtml.endElement("div"); 438 | } 439 | 440 | if (subject != null) { 441 | xhtml.startElement("div", "class", "annotationSubject"); 442 | xhtml.characters(subject); 443 | xhtml.endElement("div"); 444 | } 445 | 446 | if (contents != null) { 447 | xhtml.startElement("div", "class", "annotationContents"); 448 | xhtml.characters(contents); 449 | xhtml.endElement("div"); 450 | } 451 | 452 | xhtml.endElement("div"); 453 | } 454 | } 455 | } 456 | } 457 | if (config.getOcrStrategy().equals(PDFParserConfig.OCR_STRATEGY.OCR_AND_TEXT_EXTRACTION)) { 458 | doOCROnCurrentPage(); 459 | } else if (config.getOcrStrategy().equals(PDFParserConfig.OCR_STRATEGY.AUTO)) { 460 | //TODO add more sophistication 461 | if (totalCharsPerPage < 10 || unmappedUnicodeCharsPerPage > 10) { 462 | doOCROnCurrentPage(); 463 | } 464 | } 465 | 466 | PDPageAdditionalActions pageActions = page.getActions(); 467 | if (pageActions != null) { 468 | handleDestinationOrAction(pageActions.getC(), AbstractPDF2XHTML.ActionTrigger.PAGE_CLOSE); 469 | handleDestinationOrAction(pageActions.getO(), AbstractPDF2XHTML.ActionTrigger.PAGE_OPEN); 470 | } 471 | xhtml.endElement("div"); 472 | super.endPage(page); 473 | } catch (SAXException|TikaException e) { 474 | throw new IOExceptionWithCause("Unable to end a page", e); 475 | } catch (IOException e) { 476 | handleCatchableIOE(e); 477 | } finally { 478 | totalCharsPerPage = 0; 479 | unmappedUnicodeCharsPerPage = 0; 480 | } 481 | 482 | if (config.getExtractFontNames()) { 483 | 484 | for (COSName n : page.getResources().getFontNames()) { 485 | PDFont font = page.getResources().getFont(n); 486 | if (font != null && font.getFontDescriptor() != null) { 487 | String fontName = font.getFontDescriptor().getFontName(); 488 | if 
(fontName != null) { 489 | fontNames.add(fontName); 490 | } 491 | } 492 | } 493 | } 494 | } 495 | 496 | private void handleWidget(PDAnnotationWidget widget) throws TikaException, SAXException, IOException { 497 | if (widget == null) { 498 | return; 499 | } 500 | handleDestinationOrAction(widget.getAction(), AbstractPDF2XHTML.ActionTrigger.ANNOTATION_WIDGET); 501 | PDAnnotationAdditionalActions annotationActions = widget.getActions(); 502 | if (annotationActions != null) { 503 | handleDestinationOrAction(annotationActions.getBl(), AbstractPDF2XHTML.ActionTrigger.ANNOTATION_LOSE_INPUT_FOCUS); 504 | handleDestinationOrAction(annotationActions.getD(), AbstractPDF2XHTML.ActionTrigger.ANNOTATION_MOUSE_CLICK); 505 | handleDestinationOrAction(annotationActions.getE(), AbstractPDF2XHTML.ActionTrigger.ANNOTATION_CURSOR_ENTERS); 506 | handleDestinationOrAction(annotationActions.getFo(), AbstractPDF2XHTML.ActionTrigger.ANNOTATION_RECEIVES_FOCUS); 507 | handleDestinationOrAction(annotationActions.getPC(), AbstractPDF2XHTML.ActionTrigger.ANNOTATION_PAGE_CLOSED); 508 | handleDestinationOrAction(annotationActions.getPI(), AbstractPDF2XHTML.ActionTrigger.ANNOTATION_PAGE_NO_LONGER_VISIBLE); 509 | handleDestinationOrAction(annotationActions.getPO(), AbstractPDF2XHTML.ActionTrigger.ANNOTATION_PAGE_OPENED); 510 | handleDestinationOrAction(annotationActions.getPV(), AbstractPDF2XHTML.ActionTrigger.ANNOTATION_PAGE_VISIBLE); 511 | handleDestinationOrAction(annotationActions.getU(), AbstractPDF2XHTML.ActionTrigger.ANNOTATION_MOUSE_RELEASED); 512 | handleDestinationOrAction(annotationActions.getX(), AbstractPDF2XHTML.ActionTrigger.ANNOTATION_CURSOR_EXIT); 513 | } 514 | 515 | } 516 | 517 | @Override 518 | protected void startDocument(PDDocument pdf) throws IOException { 519 | try { 520 | xhtml.startDocument(); 521 | try { 522 | handleDestinationOrAction(pdf.getDocumentCatalog().getOpenAction(), AbstractPDF2XHTML.ActionTrigger.DOCUMENT_OPEN); 523 | } catch (IOException e) { 524 | //See 
PDFBOX-3773 525 | //swallow -- no need to report this 526 | } 527 | } catch (TikaException|SAXException e) { 528 | throw new IOExceptionWithCause("Unable to start a document", e); 529 | } 530 | } 531 | 532 | private void handleDestinationOrAction(PDDestinationOrAction action, 533 | AbstractPDF2XHTML.ActionTrigger actionTrigger) throws IOException, SAXException, TikaException { 534 | if (action == null || ! config.getExtractActions()) { 535 | return; 536 | } 537 | AttributesImpl attributes = new AttributesImpl(); 538 | String actionOrDestString = (action instanceof PDAction) ? "action" : "destination"; 539 | 540 | addNonNullAttribute("class", actionOrDestString, attributes); 541 | addNonNullAttribute("type", action.getClass().getSimpleName(), attributes); 542 | addNonNullAttribute("trigger", actionTrigger.name(), attributes); 543 | 544 | if (action instanceof PDActionImportData) { 545 | processDoc("", ((PDActionImportData)action).getFile(), attributes); 546 | } else if (action instanceof PDActionLaunch) { 547 | PDActionLaunch pdActionLaunch = (PDActionLaunch)action; 548 | addNonNullAttribute("id", pdActionLaunch.getF(), attributes); 549 | addNonNullAttribute("defaultDirectory", pdActionLaunch.getD(), attributes); 550 | addNonNullAttribute("operation", pdActionLaunch.getO(), attributes); 551 | addNonNullAttribute("parameters", pdActionLaunch.getP(), attributes); 552 | processDoc(pdActionLaunch.getF(), pdActionLaunch.getFile(), attributes); 553 | } else if (action instanceof PDActionRemoteGoTo) { 554 | PDActionRemoteGoTo remoteGoTo = (PDActionRemoteGoTo)action; 555 | processDoc("", remoteGoTo.getFile(), attributes); 556 | } else if (action instanceof PDActionJavaScript) { 557 | PDActionJavaScript jsAction = (PDActionJavaScript)action; 558 | Metadata m = new Metadata(); 559 | m.set(Metadata.CONTENT_TYPE, "application/javascript"); 560 | m.set(Metadata.CONTENT_ENCODING, StandardCharsets.UTF_8.toString()); 561 | m.set(PDF.ACTION_TRIGGER, actionTrigger.toString()); 562 | 
m.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE, TikaCoreProperties.EmbeddedResourceType.MACRO.name()); 563 | String js = jsAction.getAction(); 564 | js = (js == null) ? "" : js; 565 | if (embeddedDocumentExtractor.shouldParseEmbedded(m)) { 566 | try (InputStream is = TikaInputStream.get(js.getBytes(StandardCharsets.UTF_8))) { 567 | embeddedDocumentExtractor.parseEmbedded(is, xhtml, m, false); 568 | } 569 | } 570 | addNonNullAttribute("class", "javascript", attributes); 571 | addNonNullAttribute("type", jsAction.getType(), attributes); 572 | addNonNullAttribute("subtype", jsAction.getSubType(), attributes); 573 | xhtml.startElement("div", attributes); 574 | xhtml.endElement("div"); 575 | } else { 576 | xhtml.startElement("div", attributes); 577 | xhtml.endElement("div"); 578 | } 579 | } 580 | 581 | private static void addNonNullAttribute(String name, String value, AttributesImpl attributes) { 582 | if (name == null || value == null) { 583 | return; 584 | } 585 | attributes.addAttribute("", name, name, "CDATA", value); 586 | } 587 | 588 | @Override 589 | protected void endDocument(PDDocument pdf) throws IOException { 590 | try { 591 | // Extract text for any bookmarks: 592 | if(config.getExtractBookmarksText()) { 593 | extractBookmarkText(); 594 | } 595 | 596 | try { 597 | extractEmbeddedDocuments(pdf); 598 | } catch (IOException e) { 599 | handleCatchableIOE(e); 600 | } 601 | 602 | //extract acroform data at end of doc 603 | if (config.getExtractAcroFormContent() == true) { 604 | try { 605 | extractAcroForm(pdf); 606 | } catch (IOException e) { 607 | handleCatchableIOE(e); 608 | } 609 | } 610 | PDDocumentCatalogAdditionalActions additionalActions = pdf.getDocumentCatalog().getActions(); 611 | handleDestinationOrAction(additionalActions.getDP(), AbstractPDF2XHTML.ActionTrigger.AFTER_DOCUMENT_PRINT); 612 | handleDestinationOrAction(additionalActions.getDS(), AbstractPDF2XHTML.ActionTrigger.AFTER_DOCUMENT_SAVE); 613 | 
handleDestinationOrAction(additionalActions.getWC(), AbstractPDF2XHTML.ActionTrigger.BEFORE_DOCUMENT_CLOSE); 614 | handleDestinationOrAction(additionalActions.getWP(), AbstractPDF2XHTML.ActionTrigger.BEFORE_DOCUMENT_PRINT); 615 | handleDestinationOrAction(additionalActions.getWS(), AbstractPDF2XHTML.ActionTrigger.BEFORE_DOCUMENT_SAVE); 616 | 617 | if (cdataContent.size() > 0) 618 | dumpCDATA(); 619 | 620 | xhtml.endDocument(); 621 | } catch (TikaException e) { 622 | throw new IOExceptionWithCause("Unable to end a document", e); 623 | } catch (SAXException e) { 624 | throw new IOExceptionWithCause("Unable to end a document", e); 625 | } 626 | if (fontNames.size() > 0) { 627 | for (String fontName : fontNames) { 628 | metadata.add(Font.FONT_NAME, fontName); 629 | } 630 | } 631 | } 632 | 633 | void extractBookmarkText() throws SAXException, IOException, TikaException { 634 | PDDocumentOutline outline = document.getDocumentCatalog().getDocumentOutline(); 635 | if (outline != null) { 636 | extractBookmarkText(outline); 637 | } 638 | } 639 | 640 | void extractBookmarkText(PDOutlineNode bookmark) throws SAXException, IOException, TikaException { 641 | PDOutlineItem current = bookmark.getFirstChild(); 642 | 643 | if (current != null) { 644 | xhtml.startElement("ul"); 645 | while (current != null) { 646 | xhtml.startElement("li"); 647 | xhtml.characters(current.getTitle()); 648 | xhtml.endElement("li"); 649 | handleDestinationOrAction(current.getAction(), AbstractPDF2XHTML.ActionTrigger.BOOKMARK); 650 | // Recurse: 651 | extractBookmarkText(current); 652 | current = current.getNextSibling(); 653 | } 654 | xhtml.endElement("ul"); 655 | } 656 | } 657 | 658 | void extractAcroForm(PDDocument pdf) throws IOException, 659 | SAXException, TikaException { 660 | //Thank you, Ben Litchfield, for org.apache.pdfbox.examples.fdf.PrintFields 661 | //this code derives from Ben's code 662 | PDDocumentCatalog catalog = pdf.getDocumentCatalog(); 663 | 664 | if (catalog == null) 665 | return; 
666 | 667 | PDAcroForm form = catalog.getAcroForm(); 668 | if (form == null) 669 | return; 670 | 671 | //if it has xfa, try that. 672 | //if it doesn't exist or there's an exception, 673 | //go with traditional AcroForm 674 | PDXFAResource pdxfa = form.getXFA(); 675 | 676 | if (pdxfa != null) { 677 | //if successful, return 678 | XFAExtractor xfaExtractor = new XFAExtractor(); 679 | InputStream is = null; 680 | try { 681 | is = new BufferedInputStream( 682 | new ByteArrayInputStream(pdxfa.getBytes())); 683 | } catch (IOException e) { 684 | EmbeddedDocumentUtil.recordEmbeddedStreamException(e, metadata); 685 | } 686 | if (is != null) { 687 | try { 688 | xfaExtractor.extract(is, xhtml, metadata, context); 689 | return; 690 | } catch (XMLStreamException e) { 691 | //if there was an xml parse exception in xfa, try the AcroForm 692 | EmbeddedDocumentUtil.recordException(e, metadata); 693 | } finally { 694 | IOUtils.closeQuietly(is); 695 | } 696 | } 697 | } 698 | 699 | @SuppressWarnings("rawtypes") 700 | List fields = form.getFields(); 701 | 702 | if (fields == null) 703 | return; 704 | 705 | @SuppressWarnings("rawtypes") 706 | ListIterator itr = fields.listIterator(); 707 | 708 | if (itr == null) 709 | return; 710 | 711 | xhtml.startElement("div", "class", "acroform"); 712 | xhtml.startElement("ol"); 713 | 714 | while (itr.hasNext()) { 715 | Object obj = itr.next(); 716 | if (obj != null && obj instanceof PDField) { 717 | processAcroField((PDField) obj, 0); 718 | } 719 | } 720 | xhtml.endElement("ol"); 721 | xhtml.endElement("div"); 722 | } 723 | 724 | private void processAcroField(PDField field, final int currentRecursiveDepth) 725 | throws SAXException, IOException, TikaException { 726 | 727 | if (currentRecursiveDepth >= MAX_ACROFORM_RECURSIONS) { 728 | return; 729 | } 730 | 731 | PDFormFieldAdditionalActions pdFormFieldAdditionalActions = field.getActions(); 732 | if (pdFormFieldAdditionalActions != null) { 733 | 
handleDestinationOrAction(pdFormFieldAdditionalActions.getC(), AbstractPDF2XHTML.ActionTrigger.FORM_FIELD_RECALCULATE); 734 | handleDestinationOrAction(pdFormFieldAdditionalActions.getF(), AbstractPDF2XHTML.ActionTrigger.FORM_FIELD_FORMATTED); 735 | handleDestinationOrAction(pdFormFieldAdditionalActions.getK(), AbstractPDF2XHTML.ActionTrigger.FORM_FIELD_KEYSTROKE); 736 | handleDestinationOrAction(pdFormFieldAdditionalActions.getV(), AbstractPDF2XHTML.ActionTrigger.FORM_FIELD_VALUE_CHANGE); 737 | } 738 | if (field.getWidgets() != null) { 739 | for (PDAnnotationWidget widget : field.getWidgets()) { 740 | handleWidget(widget); 741 | } 742 | } 743 | 744 | 745 | addFieldString(field); 746 | if (field instanceof PDNonTerminalField) { 747 | int r = currentRecursiveDepth + 1; 748 | xhtml.startElement("ol"); 749 | for (PDField child : ((PDNonTerminalField)field).getChildren()) { 750 | processAcroField(child, r); 751 | } 752 | xhtml.endElement("ol"); 753 | } 754 | } 755 | 756 | private void addFieldString(PDField field) throws SAXException { 757 | //Pick partial name to present in content and altName for attribute 758 | //Ignoring FullyQualifiedName for now 759 | String partName = field.getPartialName(); 760 | String altName = field.getAlternateFieldName(); 761 | 762 | StringBuilder sb = new StringBuilder(); 763 | AttributesImpl attrs = new AttributesImpl(); 764 | 765 | if (partName != null) { 766 | sb.append(partName).append(": "); 767 | } 768 | if (altName != null) { 769 | attrs.addAttribute("", "altName", "altName", "CDATA", altName); 770 | } 771 | //return early if PDSignature field 772 | if (field instanceof PDSignatureField) { 773 | handleSignature(attrs, (PDSignatureField) field); 774 | return; 775 | } 776 | String value = field.getValueAsString(); 777 | if (value != null && !value.equals("null")) { 778 | sb.append(value); 779 | } 780 | 781 | if (attrs.getLength() > 0 || sb.length() > 0) { 782 | xhtml.startElement("li", attrs); 783 | xhtml.characters(sb.toString()); 
784 | xhtml.endElement("li"); 785 | } 786 | } 787 | 788 | private void handleSignature(AttributesImpl parentAttributes, PDSignatureField sigField) 789 | throws SAXException { 790 | 791 | PDSignature sig = sigField.getSignature(); 792 | if (sig == null) { 793 | return; 794 | } 795 | Map vals = new TreeMap<>(); 796 | vals.put("name", sig.getName()); 797 | vals.put("contactInfo", sig.getContactInfo()); 798 | vals.put("location", sig.getLocation()); 799 | vals.put("reason", sig.getReason()); 800 | 801 | Calendar cal = sig.getSignDate(); 802 | if (cal != null) { 803 | dateFormat.setTimeZone(cal.getTimeZone()); 804 | vals.put("date", dateFormat.format(cal.getTime())); 805 | } 806 | //see if there is any data 807 | int nonNull = 0; 808 | for (String val : vals.keySet()) { 809 | if (val != null && !val.equals("")) { 810 | nonNull++; 811 | } 812 | } 813 | //if there is, process it 814 | if (nonNull > 0) { 815 | metadata.set(TikaCoreProperties.HAS_SIGNATURE, "true"); 816 | xhtml.startElement("li", parentAttributes); 817 | 818 | AttributesImpl attrs = new AttributesImpl(); 819 | attrs.addAttribute("", "type", "type", "CDATA", "signaturedata"); 820 | 821 | xhtml.startElement("ol", attrs); 822 | for (Map.Entry e : vals.entrySet()) { 823 | if (e.getValue() == null || e.getValue().equals("")) { 824 | continue; 825 | } 826 | attrs = new AttributesImpl(); 827 | attrs.addAttribute("", "signdata", "signdata", "CDATA", e.getKey()); 828 | xhtml.startElement("li", attrs); 829 | xhtml.characters(e.getValue()); 830 | xhtml.endElement("li"); 831 | } 832 | xhtml.endElement("ol"); 833 | xhtml.endElement("li"); 834 | } 835 | } 836 | 837 | 838 | private static PDActionURI getActionURI(PDAnnotation annot) { 839 | //copied and pasted from PDFBox's PrintURLs 840 | 841 | // use reflection to catch all annotation types that have getAction() 842 | // If you can't use reflection, then check for classes 843 | // PDAnnotationLink and PDAnnotationWidget, and call getAction() and check for a 844 | // 
PDActionURI result type 845 | try { 846 | Method actionMethod = annot.getClass().getDeclaredMethod("getAction"); 847 | if (actionMethod.getReturnType().equals(PDAction.class)) { 848 | PDAction action = (PDAction) actionMethod.invoke(annot); 849 | if (action instanceof PDActionURI) { 850 | return (PDActionURI) action; 851 | } 852 | } 853 | } 854 | catch (NoSuchMethodException|IllegalAccessException|InvocationTargetException e) { 855 | } 856 | return null; 857 | } 858 | 859 | /** 860 | * we need to override this because we are overriding {@link #processPages(PDPageTree)} 861 | * @return 862 | */ 863 | @Override 864 | public int getCurrentPageNo() { 865 | return pageIndex+1; 866 | } 867 | 868 | /** 869 | * See TIKA-2845 for why we need to override this. 870 | * 871 | * @param pages 872 | * @throws IOException 873 | */ 874 | @Override 875 | protected void processPages(PDPageTree pages) throws IOException, SAXException { 876 | //we currently need this hack because we aren't able to increment 877 | //the private currentPageNo in PDFTextStripper, 878 | //and PDFTextStripper's processPage relies on that variable 879 | //being >= startPage when deciding whether or not to process a page 880 | // See: 881 | // if (currentPageNo >= startPage && currentPageNo <= endPage 882 | // && (startBookmarkPageNumber == -1 || currentPageNo >= startBookmarkPageNumber) 883 | // && (endBookmarkPageNumber == -1 || currentPageNo <= endBookmarkPageNumber)) 884 | // { 885 | super.setStartPage(-1); 886 | for (PDPage page : pages) { 887 | if (getCurrentPageNo() >= getStartPage() 888 | && getCurrentPageNo() <= getEndPage()) { 889 | processPage(page); 890 | } 891 | pageIndex++; 892 | } 893 | } 894 | 895 | @Override 896 | public void setStartBookmark(PDOutlineItem pdOutlineItem) { 897 | throw new UnsupportedOperationException("We don't currently support this -- See PDFTextStripper's processPages() for how to implement this."); 898 | } 899 | 900 | @Override 901 | public void 
setEndBookmark(PDOutlineItem pdOutlineItem) { 902 | throw new UnsupportedOperationException("We don't currently support this -- See PDFTextStripper's processPages() for how to implement this."); 903 | } 904 | 905 | @Override 906 | public void setStartPage(int startPage) { 907 | this.startPage = startPage; 908 | } 909 | 910 | @Override 911 | public int getStartPage() { 912 | return startPage; 913 | } 914 | 915 | @Override 916 | protected void showGlyph(Matrix textRenderingMatrix, PDFont font, int code, String unicode, Vector displacement) throws IOException 917 | { 918 | super.showGlyph(textRenderingMatrix, font, code, unicode, displacement); 919 | if (unicode == null || unicode.isEmpty()) { 920 | unmappedUnicodeCharsPerPage++; 921 | } 922 | totalCharsPerPage++; 923 | } 924 | } 925 | --------------------------------------------------------------------------------