() {
29 | {
30 | put("v", CommonParseFlag.VERBOSE);
31 | put("-verbose", CommonParseFlag.VERBOSE);
32 | put("pdf-parse", CommonParseFlag.PDF_PARSE_METHOD);
33 | }
34 | };
35 |
36 | private static HttpRequestParamsReader single_instance = null;
37 |
38 | private boolean initialized = false;
39 |
/** Private so that instances can only be created from inside this class. */
private HttpRequestParamsReader() { }
43 |
44 | // static method to create instance of Singleton class
45 | public static HttpRequestParamsReader getInstance()
46 | {
47 | if (single_instance == null)
48 | single_instance = new HttpRequestParamsReader();
49 | return single_instance;
50 | }
51 |
52 | public void initialize(InputStream stream) {
53 | if (initialized)
54 | return;
55 | initialized = true;
56 | MetaData metaDict = getMetaDataField(stream);
57 | if (metaDict == null)
58 | return;
59 |
60 | HttpFields fields = metaDict.getFields();
61 | for (HttpField field : fields)
62 | rawParams.put(field.getName(), field.getValue());
63 | GetCommonFlags();
64 | }
65 |
66 | public boolean IsVerbose() {
67 | return typedParams.containsKey(CommonParseFlag.VERBOSE);
68 | }
69 |
70 | public void outIfVerbose(String s) {
71 | if (!IsVerbose()) return;
72 | System.out.println(s);
73 | }
74 |
75 | // just check the value specified in the dictionary passed
76 | public boolean checkParamValue(CommonParseFlag ptrName, String expectedValue) {
77 | return typedParams.containsKey(ptrName) &&
78 | typedParams.get(ptrName).equalsIgnoreCase(
79 | expectedValue);
80 | }
81 |
82 | private void GetCommonFlags() {
83 | rawParams.entrySet().forEach(entry -> {
84 | flagByName.entrySet().forEach(fl -> {
85 | if (fl.getKey().equals(entry.getKey()))
86 | typedParams.put(fl.getValue(), entry.getValue());
87 | });
88 | });
89 | }
90 |
91 | // read metadata from HttpRequest
92 | private static MetaData getMetaDataField(Object stream) {
93 | while (true) {
94 | try {
95 | Field field = FieldLookup.findField(stream.getClass(), "val$req");
96 | if (field != null) {
97 | field.setAccessible(true);
98 | HttpServletRequest req = (HttpServletRequest) field.get(stream);
99 | field = FieldLookup.findField(req.getClass(), "_metaData");
100 | if (field == null)
101 | return null;
102 |
103 | field.setAccessible(true);
104 | return (MetaData) field.get(req);
105 | }
106 | } catch (IllegalAccessException ex) {
107 | return null;
108 | }
109 |
110 | Field inField = FieldLookup.findField(stream.getClass(), "in");
111 | if (inField == null)
112 | return null;
113 |
114 | inField.setAccessible(true);
115 | try {
116 | stream = inField.get(stream);
117 | } catch (IllegalAccessException e) {
118 | return null;
119 | }
120 | }
121 | }
122 | }
123 |
--------------------------------------------------------------------------------
/lexpredict-tika/src/main/java/com/lexpredict/tika/PDFEncodedStringDecoder.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | *
17 | * Modifications copyright (C) 2020 ContraxSuite, LLC
18 | */
19 |
20 | package com.lexpredict.tika;
21 |
22 | import static java.nio.charset.StandardCharsets.ISO_8859_1;
23 |
24 | import java.io.ByteArrayInputStream;
25 | import java.io.IOException;
26 | import java.io.InputStream;
27 |
28 | import org.apache.pdfbox.cos.COSString;
29 | import org.apache.pdfbox.io.RandomAccessBuffer;
30 | import org.apache.pdfbox.io.RandomAccessRead;
31 | import org.apache.pdfbox.pdfparser.COSParser;
32 |
33 | /**
34 | * In fairly rare cases, a PDF's XMP will contain a string that
35 | * has incorrectly been encoded with PDFEncoding: an octal for non-ascii and
36 | * ascii for ascii, e.g. "\376\377\000M\000i\000c\000r\000o\000s\000o\000f\000t\000"
37 | *
38 | * This class can be used to decode those strings.
39 | *
40 | * See TIKA-1678. Many thanks to Andrew Jackson for raising this issue
41 | * and Tilman Hausherr for the solution.
42 | *
43 | * As of this writing, we are only handling strings that start with
44 | * an encoded BOM. Andrew Jackson found a handful of other examples (e.g.
45 | * this ISO-8859-7 string:
46 | * "Microsoft Word - \\323\\365\\354\\354\\345\\364\\357\\367\\336
47 | * \\364\\347\\362 PRAKSIS \\363\\364\\357")
48 | * that we aren't currently handling.
49 | */
50 | class PDFEncodedStringDecoder {
51 |
52 | private static final String[] PDF_ENCODING_BOMS = {
53 | "\\376\\377", //UTF-16BE
54 | "\\377\\376", //UTF-16LE
55 | "\\357\\273\\277"//UTF-8
56 | };
57 |
58 | /**
59 | * Does this string contain an octal-encoded UTF BOM?
60 | * Call this statically to determine if you should bother creating a new parser to parse it.
61 | * @param s
62 | * @return
63 | */
64 | static boolean shouldDecode(String s) {
65 | if (s == null || s.length() < 8) {
66 | return false;
67 | }
68 | for (String BOM : PDF_ENCODING_BOMS) {
69 | if (s.startsWith(BOM)) {
70 | return true;
71 | }
72 | }
73 | return false;
74 | }
75 |
76 | /**
77 | * This assumes that {@link #shouldDecode(String)} has been called
78 | * and has returned true. If you run this on a non-octal encoded string,
79 | * disaster will happen!
80 | *
81 | * @param value
82 | * @return
83 | */
84 | String decode(String value) {
85 | try {
86 | byte[] bytes = new String("(" + value + ")").getBytes(ISO_8859_1);
87 | InputStream is = new ByteArrayInputStream(bytes);
88 | PDFEncodedStringDecoder.COSStringParser p = new PDFEncodedStringDecoder.COSStringParser(new RandomAccessBuffer(is));
89 | String parsed = p.myParseCOSString();
90 | if (parsed != null) {
91 | return parsed;
92 | }
93 | } catch (IOException e) {
94 | //oh well, we tried.
95 | }
96 | //just return value if something went wrong
97 | return value;
98 | }
99 |
100 | class COSStringParser extends COSParser {
101 |
102 | COSStringParser(RandomAccessRead buffer) throws IOException {
103 | super(buffer);
104 | }
105 |
106 | /**
107 | *
108 | * @return parsed string or null if something went wrong.
109 | */
110 | String myParseCOSString() {
111 | try {
112 | COSString cosString = parseCOSString();
113 | if (cosString != null) {
114 | return cosString.getString();
115 | }
116 | } catch (IOException e) {
117 | }
118 | return null;
119 | }
120 | }
121 | }
122 |
--------------------------------------------------------------------------------
/lexpredict-tika/src/main/java/com/lexpredict/tika/PdfContentTypeChecker.java:
--------------------------------------------------------------------------------
1 | package com.lexpredict.tika;
2 |
3 | import org.apache.pdfbox.contentstream.operator.Operator;
4 | import org.apache.pdfbox.cos.COSName;
5 | import org.apache.pdfbox.pdfparser.PDFStreamParser;
6 | import org.apache.pdfbox.pdmodel.PDDocument;
7 | import org.apache.pdfbox.pdmodel.PDPage;
8 | import org.apache.pdfbox.pdmodel.PDPageTree;
9 | import org.apache.pdfbox.pdmodel.PDResources;
10 | import org.apache.pdfbox.pdmodel.graphics.PDXObject;
11 | import org.apache.pdfbox.pdmodel.graphics.form.PDFormXObject;
12 | import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject;
13 | import org.xml.sax.SAXException;
14 |
15 | import java.io.IOException;
16 | import java.io.InputStream;
17 | import java.util.List;
18 |
19 |
20 | // import org.apache.tika.parser.pdf.PDFParser;
21 | // class MyPDF2XHTML extends PDF2XHTML {
22 |
23 |
24 |
25 | // determine content of the PDDocument passed:
26 | // whether it contains text, images, text + images or just nothing
27 | public class PdfContentTypeChecker {
/** Kind of content found in a PDF document. */
public enum PdfContent {
    /** Neither images nor text blocks were counted. */
    EMPTY,
    /** Only text blocks were counted. */
    TEXT,
    /** Only images were counted. */
    IMAGES,
    /** Both images and text blocks were counted. */
    MIXED,
    /** The document could not be loaded or analysed. */
    UNKNOWN
}
31 |
32 | private PdfContent docContent = PdfContent.EMPTY;
33 |
34 | private int pageCount = 0;
35 |
36 | private int imagesCount = 0;
37 |
38 | private int textBlocks = 0;
39 |
40 | private int fullTextLength = 0;
41 |
42 | private PDFTextStripper pdfTextStripper;
43 |
44 | public int getImagesCount() {
45 | return imagesCount;
46 | }
47 |
48 | public int getTextBlocks() {
49 | return textBlocks;
50 | }
51 |
52 | // reads PDDocument from the stream and calls determineDocContentType
53 | public PdfContent determineDocContentType(InputStream stream) {
54 | try {
55 | PDDocument document = PDDocument.load(stream);
56 | return determineDocContentType(document);
57 | } catch (Exception e) {
58 | return PdfContent.UNKNOWN;
59 | }
60 | }
61 |
62 | public PdfContent determineDocContentType(PDDocument document) throws IOException {
63 | try {
64 | calculateObjectsInDocument(document);
65 | } catch (Exception e) {
66 | return PdfContent.UNKNOWN;
67 | }
68 | int totalCount = imagesCount + textBlocks;
69 | docContent = totalCount == 0 ? PdfContent.EMPTY
70 | : imagesCount > 0 && textBlocks > 0 ? PdfContent.MIXED
71 | : imagesCount > 0 ? PdfContent.IMAGES
72 | : PdfContent.TEXT;
73 | return docContent;
74 | }
75 |
76 | // calculate count of text blocks (textBlocks member) and
77 | // images (imagesCount) in the document
78 | private void calculateObjectsInDocument(PDDocument document) throws IOException {
79 | this.pdfTextStripper = new PDFTextStripper();
80 |
81 | try {
82 | PDPageTree allPages = document.getDocumentCatalog().getPages();
83 | this.pageCount = allPages.getCount();
84 | for (int i = 0; i < allPages.getCount(); i++) {
85 | PDPage page = allPages.get(i);
86 | readObjectsOnPage(page);
87 | calculateTextLengthOnPage(document, i + 1);
88 | }
89 | } catch (Exception e) {
90 | e.printStackTrace();
91 | }
92 | }
93 |
94 | // calculate objects' count for the page passed
95 | private void readObjectsOnPage(PDPage page) throws IOException {
96 | getImagesFromResources(page.getResources());
97 | calculateTextObjectsOnPage(page);
98 | }
99 |
100 |
101 | private void calculateTextLengthOnPage(PDDocument doc, int pageNum1Based) throws IOException, SAXException {
102 | this.pdfTextStripper.setStartPage(pageNum1Based);
103 | this.pdfTextStripper.setEndPage(pageNum1Based);
104 | String text = this.pdfTextStripper.getText(doc);
105 | if (text != null) {
106 | text = text.trim().replaceAll("\\s+", " ");
107 | this.fullTextLength += text.length();
108 | }
109 | }
110 |
111 | private void calculateTextObjectsOnPage(PDPage page) throws IOException {
112 | PDFStreamParser parser = new PDFStreamParser(page);
113 | parser.parse();
114 | List