├── .gitignore
├── CODE_OF_CONDUCT.md
├── CONTRIBUTING.md
├── LICENSE
├── README.md
└── src
    ├── SearchablePDF.zip
    └── SearchablePDF
        ├── .classpath
        ├── .idea
            ├── .name
            ├── compiler.xml
            ├── misc.xml
            ├── uiDesigner.xml
            └── workspace.xml
        ├── .project
        ├── .settings
            ├── org.eclipse.jdt.apt.core.prefs
            ├── org.eclipse.jdt.core.prefs
            └── org.eclipse.m2e.core.prefs
        ├── SearchablePDF.iml
        ├── documents
            ├── SampleInput.pdf
            ├── SampleInput.png
            └── SampleOutput.pdf
        ├── pom.xml
        └── src
            └── main
                └── java
                    ├── Demo.java
                    ├── DemoLambda.java
                    ├── DemoPdfFromLocalImage.java
                    ├── DemoPdfFromLocalPdf.java
                    ├── DemoPdfFromS3Image.java
                    ├── DemoPdfFromS3Pdf.java
                    ├── DemoPdfFromS3PdfAppend.java
                    └── com
                        └── amazon
                            └── textract
                                └── pdf
                                    ├── FontInfo.java
                                    ├── ImageType.java
                                    ├── PDFDocument.java
                                    └── TextLine.java


/.gitignore:
--------------------------------------------------------------------------------
1 | .DS_Store


--------------------------------------------------------------------------------
/CODE_OF_CONDUCT.md:
--------------------------------------------------------------------------------
1 | ## Code of Conduct
2 | This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct).
3 | For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact
4 | opensource-codeofconduct@amazon.com with any additional questions or comments.
5 | 


--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
 1 | # Contributing Guidelines
 2 | 
 3 | Thank you for your interest in contributing to our project. Whether it's a bug report, new feature, correction, or additional
 4 | documentation, we greatly value feedback and contributions from our community.
 5 | 
 6 | Please read through this document before submitting any issues or pull requests to ensure we have all the necessary
 7 | information to effectively respond to your bug report or contribution.
 8 | 
 9 | 
10 | ## Reporting Bugs/Feature Requests
11 | 
12 | We welcome you to use the GitHub issue tracker to report bugs or suggest features.
13 | 
14 | When filing an issue, please check existing open, or recently closed, issues to make sure somebody else hasn't already
15 | reported the issue. Please try to include as much information as you can. Details like these are incredibly useful:
16 | 
17 | * A reproducible test case or series of steps
18 | * The version of our code being used
19 | * Any modifications you've made relevant to the bug
20 | * Anything unusual about your environment or deployment
21 | 
22 | 
23 | ## Contributing via Pull Requests
24 | Contributions via pull requests are much appreciated. Before sending us a pull request, please ensure that:
25 | 
26 | 1. You are working against the latest source on the *master* branch.
27 | 2. You check existing open, and recently merged, pull requests to make sure someone else hasn't addressed the problem already.
28 | 3. You open an issue to discuss any significant work - we would hate for your time to be wasted.
29 | 
30 | To send us a pull request, please:
31 | 
32 | 1. Fork the repository.
33 | 2. Modify the source; please focus on the specific change you are contributing. If you also reformat all the code, it will be hard for us to focus on your change.
34 | 3. Ensure local tests pass.
35 | 4. Commit to your fork using clear commit messages.
36 | 5. Send us a pull request, answering any default questions in the pull request interface.
37 | 6. Pay attention to any automated CI failures reported in the pull request, and stay involved in the conversation.
38 | 
39 | GitHub provides additional documentation on [forking a repository](https://help.github.com/articles/fork-a-repo/) and
40 | [creating a pull request](https://help.github.com/articles/creating-a-pull-request/).
41 | 
42 | 
43 | ## Finding contributions to work on
44 | Looking at the existing issues is a great way to find something to contribute on. As our projects, by default, use the default GitHub issue labels (enhancement/bug/duplicate/help wanted/invalid/question/wontfix), looking at any 'help wanted' issues is a great place to start.
45 | 
46 | 
47 | ## Code of Conduct
48 | This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct).
49 | For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact
50 | opensource-codeofconduct@amazon.com with any additional questions or comments.
51 | 
52 | 
53 | ## Security issue notifications
54 | If you discover a potential security issue in this project we ask that you notify AWS/Amazon Security via our [vulnerability reporting page](http://aws.amazon.com/security/vulnerability-reporting/). Please do **not** create a public github issue.
55 | 
56 | 
57 | ## Licensing
58 | 
59 | See the [LICENSE](LICENSE) file for our project's licensing. We will ask you to confirm the licensing of your contribution.
60 | 
61 | We may ask you to sign a [Contributor License Agreement (CLA)](http://en.wikipedia.org/wiki/Contributor_License_Agreement) for larger changes.
62 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved.
 2 | 
 3 | Permission is hereby granted, free of charge, to any person obtaining a copy of
 4 | this software and associated documentation files (the "Software"), to deal in
 5 | the Software without restriction, including without limitation the rights to
 6 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
 7 | the Software, and to permit persons to whom the Software is furnished to do so.
 8 | 
 9 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
10 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
11 | FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
12 | COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
13 | IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
14 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
15 | 
16 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | ## Generate Searchable PDF documents with Amazon Textract
 2 | 
 3 | This repository contains sample library and code examples showing how Amazon Textract can be used to extract text from documents and generate searchable pdf documents.
 4 | 
 5 | ## How is searchable PDF generated
 6 | 
 7 | To generate a searchable PDF, we use [Amazon Textract](https://aws.amazon.com/textract/) to extract text from documents and then add the extracted text as a layer to the image in the PDF document. Amazon Textract detects and analyzes text in input documents and returns information about detected items such as pages, words, lines, form data (key-value pairs), tables, selection elements, etc. It also provides bounding box information, which is an axis-aligned coarse representation of the location of the recognized item on the document page. We use the detected text and its bounding box information to appropriately place text in the PDF page.
 8 | 
 9 | [SampleInput.pdf](https://github.com/aws-samples/amazon-textract-searchable-pdf/raw/master/src/SearchablePDF/documents/SampleInput.pdf) is an example input document where text is locked inside the image. [SampleOutput.pdf](https://github.com/aws-samples/amazon-textract-searchable-pdf/raw/master/src/SearchablePDF/documents/SampleOutput.pdf) is an example of a searchable pdf document where you can select and copy text and search within the document.
10 | 
11 | The [PDFDocument](./src/SearchablePDF/src/main/java/com/amazon/textract/pdf/PDFDocument.java) library wraps all the necessary logic to generate a searchable PDF document using output from Amazon Textract. It uses the open source Java library [Apache PDFBox](https://pdfbox.apache.org/) to create the PDF document, but there are similar PDF processing libraries available in other programming languages.
12 | 
13 | ```
14 |     ...
15 |     
16 |     //Extract text using Amazon Textract
17 |     List<TextLine> lines = extractText(imageBytes);
18 |         
19 |     //Create new pdf document
20 |     PDFDocument pdfDocument = new PDFDocument();
21 | 
22 |     //Add page with text layer and image in the pdf document
23 |     pdfDocument.addPage(image, imageType, lines);
24 |     
25 |     //Save PDF to local disk
26 |     try(OutputStream outputStream = new FileOutputStream(outputDocumentName)) {
27 |         pdfDocument.save(outputStream);
28 |         pdfDocument.close();
29 |     }
30 | ```
31 | 
32 | ## Code examples
33 | [Sample project](./src/SearchablePDF.zip) has five different examples:
34 | 
35 | - [Create searchable PDF from image on local drive](./src/SearchablePDF/src/main/java/DemoPdfFromLocalImage.java)
36 | - [Create searchable PDF from pdf on local drive](./src/SearchablePDF/src/main/java/DemoPdfFromLocalPdf.java)
37 | - [Create searchable PDF from image in Amazon S3 bucket](./src/SearchablePDF/src/main/java/DemoPdfFromS3Image.java)
38 | - [Create searchable PDF from pdf in Amazon S3 bucket](./src/SearchablePDF/src/main/java/DemoPdfFromS3Pdf.java)
39 | - [Create searchable PDF from pdf in Amazon S3 bucket - by appending input document](./src/SearchablePDF/src/main/java/DemoPdfFromS3PdfAppend.java)
40 | 
41 | ## Run code examples on local machine
42 | 
43 | 1. Setup AWS Account and AWS CLI using [getting started with Amazon Textract](https://docs.aws.amazon.com/textract/latest/dg/getting-started.html).
44 | 2. Download and unzip the [sample project](./src/SearchablePDF.zip).
45 | 3. Install [Apache Maven](https://maven.apache.org/index.html) if it is not already installed.
46 | 4. In the project directory run "mvn package".
47 | 5. Run: "java -cp target/searchable-pdf-1.0.jar Demo" to run Java project with [Demo](./src/SearchablePDF/src/main/java/Demo.java) as main class.
48 | 
49 | By default, only the first example (create a searchable PDF from an image on the local drive) is enabled. Uncomment the relevant lines in [Demo](./src/SearchablePDF/src/main/java/Demo.java) to run the other examples.
50 | 
51 | ## Run code examples in AWS Lambda
52 | 
53 | 1. Download and unzip the [sample project](./src/SearchablePDF.zip).
54 | 2. Install [Apache Maven](https://maven.apache.org/index.html) if it is not already installed.
55 | 3. In the project directory run "mvn package".
56 | 
57 | The build creates a .jar in project-dir/target/searchable-pdf-1.0.jar, using information in the pom.xml to do the necessary transforms. This is a standalone .jar (.zip file) that includes all the dependencies. This is your [deployment package](https://docs.aws.amazon.com/lambda/latest/dg/lambda-java-how-to-create-deployment-package.html) that you can upload to AWS Lambda to create a Lambda function.  [DemoLambda](./src/SearchablePDF/src/main/java/DemoLambda.java) has all the necessary code to read S3 events and take action based on the type of input document.
58 | 
59 | 4. Create an Amazon S3 bucket.
60 | 5. Create a folder “documents” in Amazon S3 bucket.
61 | 6. Create an AWS Lambda function with the Java 17 runtime and an IAM role that has read and write permissions to the S3 bucket you created earlier.
62 | 7. Configure the IAM role to have permissions to call Amazon Textract.
63 | 8. Set handler to "DemoLambda::handleRequest".
64 | 9. Increase timeout to 5 minutes.
65 | 10. Upload the jar file you built earlier.
66 | 
67 | 11. Add a trigger in the Lambda function such that when an object is uploaded to the folder “documents” in your Amazon S3 bucket, Lambda function gets executed.
68 | 
69 | Make sure that you set the trigger for the “documents” folder only. If you add the trigger for the whole bucket, the Lambda function will be triggered every time an output PDF document is generated, resulting in an endless invocation loop.
70 | 
71 | 12. Upload an image (jpeg, png) or pdf document to documents folder in your Amazon S3 bucket.
72 | 
73 | In a few seconds you should see a searchable PDF document generated in the S3 bucket.
74 | 
75 | These steps show simple Amazon S3 and Lambda integration. In production you should consider [scalable architecture similar to this reference architecture](https://github.com/aws-samples/amazon-textract-serverless-large-scale-document-processing).
76 | 
77 | ## Cost
78 | - As you run these samples they call different Amazon Textract APIs in your AWS account. You will get charged for all the API calls made as part of the analysis.
79 | 
80 | ## Other Resources
81 | 
82 | - [Large scale document processing with Amazon Textract - Reference Architecture](https://github.com/aws-samples/amazon-textract-serverless-large-scale-document-processing)
83 | - [Amazon Textract code samples](https://github.com/aws-samples/amazon-textract-code-samples)
84 | - [Batch processing tool](https://github.com/aws-samples/amazon-textract-textractor)
85 | - [JSON response parser](https://github.com/aws-samples/amazon-textract-response-parser)
86 | 
87 | ## License
88 | 
89 | This library is licensed under the MIT-0 License. See the LICENSE file.
90 | 
91 | 


--------------------------------------------------------------------------------
/src/SearchablePDF.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-samples/amazon-textract-searchable-pdf/4b01e49831b67a2c72b461f1b183d61d9200f50d/src/SearchablePDF.zip


--------------------------------------------------------------------------------
/src/SearchablePDF/.classpath:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0" encoding="UTF-8"?>
 2 | <classpath>
 3 | 	<classpathentry kind="src" output="target/classes" path="src/main/java">
 4 | 		<attributes>
 5 | 			<attribute name="optional" value="true"/>
 6 | 			<attribute name="maven.pomderived" value="true"/>
 7 | 		</attributes>
 8 | 	</classpathentry>
 9 | 	<classpathentry excluding="**" kind="src" output="target/classes" path="src/main/resources">
10 | 		<attributes>
11 | 			<attribute name="maven.pomderived" value="true"/>
12 | 		</attributes>
13 | 	</classpathentry>
14 | 	<classpathentry kind="src" output="target/test-classes" path="src/test/java">
15 | 		<attributes>
16 | 			<attribute name="optional" value="true"/>
17 | 			<attribute name="maven.pomderived" value="true"/>
18 | 			<attribute name="test" value="true"/>
19 | 		</attributes>
20 | 	</classpathentry>
21 | 	<classpathentry kind="con" path="org.eclipse.jdt.launching.JRE_CONTAINER/org.eclipse.jdt.internal.debug.ui.launcher.StandardVMType/JavaSE-1.8">
22 | 		<attributes>
23 | 			<attribute name="maven.pomderived" value="true"/>
24 | 		</attributes>
25 | 	</classpathentry>
26 | 	<classpathentry kind="con" path="org.eclipse.m2e.MAVEN2_CLASSPATH_CONTAINER">
27 | 		<attributes>
28 | 			<attribute name="maven.pomderived" value="true"/>
29 | 		</attributes>
30 | 	</classpathentry>
31 | 	<classpathentry kind="src" path="target/generated-sources/annotations">
32 | 		<attributes>
33 | 			<attribute name="optional" value="true"/>
34 | 			<attribute name="maven.pomderived" value="true"/>
35 | 			<attribute name="ignore_optional_problems" value="true"/>
36 | 			<attribute name="m2e-apt" value="true"/>
37 | 		</attributes>
38 | 	</classpathentry>
39 | 	<classpathentry kind="src" output="target/test-classes" path="target/generated-test-sources/test-annotations">
40 | 		<attributes>
41 | 			<attribute name="optional" value="true"/>
42 | 			<attribute name="maven.pomderived" value="true"/>
43 | 			<attribute name="ignore_optional_problems" value="true"/>
44 | 			<attribute name="m2e-apt" value="true"/>
45 | 			<attribute name="test" value="true"/>
46 | 		</attributes>
47 | 	</classpathentry>
48 | 	<classpathentry kind="output" path="target/classes"/>
49 | </classpath>
50 | 


--------------------------------------------------------------------------------
/src/SearchablePDF/.idea/.name:
--------------------------------------------------------------------------------
1 | searchablepdf


--------------------------------------------------------------------------------
/src/SearchablePDF/.idea/compiler.xml:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0" encoding="UTF-8"?>
 2 | <project version="4">
 3 |   <component name="CompilerConfiguration">
 4 |     <annotationProcessing>
 5 |       <profile name="Maven default annotation processors profile" enabled="true">
 6 |         <sourceOutputDir name="target/generated-sources/annotations" />
 7 |         <sourceTestOutputDir name="target/generated-test-sources/test-annotations" />
 8 |         <outputRelativeToContentRoot value="true" />
 9 |         <module name="SearchablePDF" />
10 |       </profile>
11 |     </annotationProcessing>
12 |   </component>
13 | </project>


--------------------------------------------------------------------------------
/src/SearchablePDF/.idea/misc.xml:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0" encoding="UTF-8"?>
 2 | <project version="4">
 3 |   <component name="ExternalStorageConfigurationManager" enabled="true" />
 4 |   <component name="MavenProjectsManager">
 5 |     <option name="originalFiles">
 6 |       <list>
 7 |         <option value="$PROJECT_DIR$/pom.xml" />
 8 |       </list>
 9 |     </option>
10 |   </component>
11 |   <component name="ProjectRootManager" version="2" languageLevel="JDK_12" project-jdk-name="12" project-jdk-type="JavaSDK">
12 |     <output url="file://$PROJECT_DIR$/out" />
13 |   </component>
14 | </project>


--------------------------------------------------------------------------------
/src/SearchablePDF/.idea/uiDesigner.xml:
--------------------------------------------------------------------------------
  1 | <?xml version="1.0" encoding="UTF-8"?>
  2 | <project version="4">
  3 |   <component name="Palette2">
  4 |     <group name="Swing">
  5 |       <item class="com.intellij.uiDesigner.HSpacer" tooltip-text="Horizontal Spacer" icon="/com/intellij/uiDesigner/icons/hspacer.png" removable="false" auto-create-binding="false" can-attach-label="false">
  6 |         <default-constraints vsize-policy="1" hsize-policy="6" anchor="0" fill="1" />
  7 |       </item>
  8 |       <item class="com.intellij.uiDesigner.VSpacer" tooltip-text="Vertical Spacer" icon="/com/intellij/uiDesigner/icons/vspacer.png" removable="false" auto-create-binding="false" can-attach-label="false">
  9 |         <default-constraints vsize-policy="6" hsize-policy="1" anchor="0" fill="2" />
 10 |       </item>
 11 |       <item class="javax.swing.JPanel" icon="/com/intellij/uiDesigner/icons/panel.png" removable="false" auto-create-binding="false" can-attach-label="false">
 12 |         <default-constraints vsize-policy="3" hsize-policy="3" anchor="0" fill="3" />
 13 |       </item>
 14 |       <item class="javax.swing.JScrollPane" icon="/com/intellij/uiDesigner/icons/scrollPane.png" removable="false" auto-create-binding="false" can-attach-label="true">
 15 |         <default-constraints vsize-policy="7" hsize-policy="7" anchor="0" fill="3" />
 16 |       </item>
 17 |       <item class="javax.swing.JButton" icon="/com/intellij/uiDesigner/icons/button.png" removable="false" auto-create-binding="true" can-attach-label="false">
 18 |         <default-constraints vsize-policy="0" hsize-policy="3" anchor="0" fill="1" />
 19 |         <initial-values>
 20 |           <property name="text" value="Button" />
 21 |         </initial-values>
 22 |       </item>
 23 |       <item class="javax.swing.JRadioButton" icon="/com/intellij/uiDesigner/icons/radioButton.png" removable="false" auto-create-binding="true" can-attach-label="false">
 24 |         <default-constraints vsize-policy="0" hsize-policy="3" anchor="8" fill="0" />
 25 |         <initial-values>
 26 |           <property name="text" value="RadioButton" />
 27 |         </initial-values>
 28 |       </item>
 29 |       <item class="javax.swing.JCheckBox" icon="/com/intellij/uiDesigner/icons/checkBox.png" removable="false" auto-create-binding="true" can-attach-label="false">
 30 |         <default-constraints vsize-policy="0" hsize-policy="3" anchor="8" fill="0" />
 31 |         <initial-values>
 32 |           <property name="text" value="CheckBox" />
 33 |         </initial-values>
 34 |       </item>
 35 |       <item class="javax.swing.JLabel" icon="/com/intellij/uiDesigner/icons/label.png" removable="false" auto-create-binding="false" can-attach-label="false">
 36 |         <default-constraints vsize-policy="0" hsize-policy="0" anchor="8" fill="0" />
 37 |         <initial-values>
 38 |           <property name="text" value="Label" />
 39 |         </initial-values>
 40 |       </item>
 41 |       <item class="javax.swing.JTextField" icon="/com/intellij/uiDesigner/icons/textField.png" removable="false" auto-create-binding="true" can-attach-label="true">
 42 |         <default-constraints vsize-policy="0" hsize-policy="6" anchor="8" fill="1">
 43 |           <preferred-size width="150" height="-1" />
 44 |         </default-constraints>
 45 |       </item>
 46 |       <item class="javax.swing.JPasswordField" icon="/com/intellij/uiDesigner/icons/passwordField.png" removable="false" auto-create-binding="true" can-attach-label="true">
 47 |         <default-constraints vsize-policy="0" hsize-policy="6" anchor="8" fill="1">
 48 |           <preferred-size width="150" height="-1" />
 49 |         </default-constraints>
 50 |       </item>
 51 |       <item class="javax.swing.JFormattedTextField" icon="/com/intellij/uiDesigner/icons/formattedTextField.png" removable="false" auto-create-binding="true" can-attach-label="true">
 52 |         <default-constraints vsize-policy="0" hsize-policy="6" anchor="8" fill="1">
 53 |           <preferred-size width="150" height="-1" />
 54 |         </default-constraints>
 55 |       </item>
 56 |       <item class="javax.swing.JTextArea" icon="/com/intellij/uiDesigner/icons/textArea.png" removable="false" auto-create-binding="true" can-attach-label="true">
 57 |         <default-constraints vsize-policy="6" hsize-policy="6" anchor="0" fill="3">
 58 |           <preferred-size width="150" height="50" />
 59 |         </default-constraints>
 60 |       </item>
 61 |       <item class="javax.swing.JTextPane" icon="/com/intellij/uiDesigner/icons/textPane.png" removable="false" auto-create-binding="true" can-attach-label="true">
 62 |         <default-constraints vsize-policy="6" hsize-policy="6" anchor="0" fill="3">
 63 |           <preferred-size width="150" height="50" />
 64 |         </default-constraints>
 65 |       </item>
 66 |       <item class="javax.swing.JEditorPane" icon="/com/intellij/uiDesigner/icons/editorPane.png" removable="false" auto-create-binding="true" can-attach-label="true">
 67 |         <default-constraints vsize-policy="6" hsize-policy="6" anchor="0" fill="3">
 68 |           <preferred-size width="150" height="50" />
 69 |         </default-constraints>
 70 |       </item>
 71 |       <item class="javax.swing.JComboBox" icon="/com/intellij/uiDesigner/icons/comboBox.png" removable="false" auto-create-binding="true" can-attach-label="true">
 72 |         <default-constraints vsize-policy="0" hsize-policy="2" anchor="8" fill="1" />
 73 |       </item>
 74 |       <item class="javax.swing.JTable" icon="/com/intellij/uiDesigner/icons/table.png" removable="false" auto-create-binding="true" can-attach-label="false">
 75 |         <default-constraints vsize-policy="6" hsize-policy="6" anchor="0" fill="3">
 76 |           <preferred-size width="150" height="50" />
 77 |         </default-constraints>
 78 |       </item>
 79 |       <item class="javax.swing.JList" icon="/com/intellij/uiDesigner/icons/list.png" removable="false" auto-create-binding="true" can-attach-label="false">
 80 |         <default-constraints vsize-policy="6" hsize-policy="2" anchor="0" fill="3">
 81 |           <preferred-size width="150" height="50" />
 82 |         </default-constraints>
 83 |       </item>
 84 |       <item class="javax.swing.JTree" icon="/com/intellij/uiDesigner/icons/tree.png" removable="false" auto-create-binding="true" can-attach-label="false">
 85 |         <default-constraints vsize-policy="6" hsize-policy="6" anchor="0" fill="3">
 86 |           <preferred-size width="150" height="50" />
 87 |         </default-constraints>
 88 |       </item>
 89 |       <item class="javax.swing.JTabbedPane" icon="/com/intellij/uiDesigner/icons/tabbedPane.png" removable="false" auto-create-binding="true" can-attach-label="false">
 90 |         <default-constraints vsize-policy="3" hsize-policy="3" anchor="0" fill="3">
 91 |           <preferred-size width="200" height="200" />
 92 |         </default-constraints>
 93 |       </item>
 94 |       <item class="javax.swing.JSplitPane" icon="/com/intellij/uiDesigner/icons/splitPane.png" removable="false" auto-create-binding="false" can-attach-label="false">
 95 |         <default-constraints vsize-policy="3" hsize-policy="3" anchor="0" fill="3">
 96 |           <preferred-size width="200" height="200" />
 97 |         </default-constraints>
 98 |       </item>
 99 |       <item class="javax.swing.JSpinner" icon="/com/intellij/uiDesigner/icons/spinner.png" removable="false" auto-create-binding="true" can-attach-label="true">
100 |         <default-constraints vsize-policy="0" hsize-policy="6" anchor="8" fill="1" />
101 |       </item>
102 |       <item class="javax.swing.JSlider" icon="/com/intellij/uiDesigner/icons/slider.png" removable="false" auto-create-binding="true" can-attach-label="false">
103 |         <default-constraints vsize-policy="0" hsize-policy="6" anchor="8" fill="1" />
104 |       </item>
105 |       <item class="javax.swing.JSeparator" icon="/com/intellij/uiDesigner/icons/separator.png" removable="false" auto-create-binding="false" can-attach-label="false">
106 |         <default-constraints vsize-policy="6" hsize-policy="6" anchor="0" fill="3" />
107 |       </item>
108 |       <item class="javax.swing.JProgressBar" icon="/com/intellij/uiDesigner/icons/progressbar.png" removable="false" auto-create-binding="true" can-attach-label="false">
109 |         <default-constraints vsize-policy="0" hsize-policy="6" anchor="0" fill="1" />
110 |       </item>
111 |       <item class="javax.swing.JToolBar" icon="/com/intellij/uiDesigner/icons/toolbar.png" removable="false" auto-create-binding="false" can-attach-label="false">
112 |         <default-constraints vsize-policy="0" hsize-policy="6" anchor="0" fill="1">
113 |           <preferred-size width="-1" height="20" />
114 |         </default-constraints>
115 |       </item>
116 |       <item class="javax.swing.JToolBar$Separator" icon="/com/intellij/uiDesigner/icons/toolbarSeparator.png" removable="false" auto-create-binding="false" can-attach-label="false">
117 |         <default-constraints vsize-policy="0" hsize-policy="0" anchor="0" fill="1" />
118 |       </item>
119 |       <item class="javax.swing.JScrollBar" icon="/com/intellij/uiDesigner/icons/scrollbar.png" removable="false" auto-create-binding="true" can-attach-label="false">
120 |         <default-constraints vsize-policy="6" hsize-policy="0" anchor="0" fill="2" />
121 |       </item>
122 |     </group>
123 |   </component>
124 | </project>


--------------------------------------------------------------------------------
/src/SearchablePDF/.idea/workspace.xml:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0" encoding="UTF-8"?>
 2 | <project version="4">
 3 |   <component name="ChangeListManager">
 4 |     <list default="true" id="6080fa7e-eed8-44ce-a422-791e11b1c2c4" name="Default Changelist" comment="" />
 5 |     <option name="EXCLUDED_CONVERTED_TO_IGNORED" value="true" />
 6 |     <option name="SHOW_DIALOG" value="false" />
 7 |     <option name="HIGHLIGHT_CONFLICTS" value="true" />
 8 |     <option name="HIGHLIGHT_NON_ACTIVE_CHANGELIST" value="false" />
 9 |     <option name="LAST_RESOLUTION" value="IGNORE" />
10 |   </component>
11 |   <component name="FileTemplateManagerImpl">
12 |     <option name="RECENT_TEMPLATES">
13 |       <list>
14 |         <option value="Enum" />
15 |         <option value="Class" />
16 |       </list>
17 |     </option>
18 |   </component>
19 |   <component name="MavenImportPreferences">
20 |     <option name="importingSettings">
21 |       <MavenImportingSettings>
22 |         <option name="importAutomatically" value="true" />
23 |       </MavenImportingSettings>
24 |     </option>
25 |   </component>
26 |   <component name="ProjectId" id="1Qi6LV9idCQTcC1RQMNV41emoLn" />
27 |   <component name="PropertiesComponent">
28 |     <property name="project.structure.last.edited" value="Modules" />
29 |     <property name="project.structure.proportion" value="0.15" />
30 |     <property name="project.structure.side.proportion" value="0.2" />
31 |     <property name="settings.editor.selected.configurable" value="configurable.group.build" />
32 |   </component>
33 |   <component name="RecentsManager">
34 |     <key name="MoveClassesOrPackagesDialog.RECENTS_KEY">
35 |       <recent name="" />
36 |     </key>
37 |     <key name="MoveFile.RECENT_KEYS">
38 |       <recent name="$PROJECT_DIR$/documents" />
39 |       <recent name="$PROJECT_DIR$/src/main/java" />
40 |     </key>
41 |   </component>
42 |   <component name="RunDashboard">
43 |     <option name="ruleStates">
44 |       <list>
45 |         <RuleState>
46 |           <option name="name" value="ConfigurationTypeDashboardGroupingRule" />
47 |         </RuleState>
48 |         <RuleState>
49 |           <option name="name" value="StatusDashboardGroupingRule" />
50 |         </RuleState>
51 |       </list>
52 |     </option>
53 |   </component>
54 |   <component name="RunManager">
55 |     <configuration name="Demo" type="Application" factoryName="Application" temporary="true" nameIsGenerated="true">
56 |       <option name="MAIN_CLASS_NAME" value="Demo" />
57 |       <module name="SearchablePDF" />
58 |       <method v="2">
59 |         <option name="Make" enabled="true" />
60 |       </method>
61 |     </configuration>
62 |     <recent_temporary>
63 |       <list>
64 |         <item itemvalue="Application.Demo" />
65 |       </list>
66 |     </recent_temporary>
67 |   </component>
68 |   <component name="SvnConfiguration">
69 |     <configuration />
70 |   </component>
71 |   <component name="TaskManager">
72 |     <task active="true" id="Default" summary="Default task">
73 |       <changelist id="6080fa7e-eed8-44ce-a422-791e11b1c2c4" name="Default Changelist" comment="" />
74 |       <created>1568241013627</created>
75 |       <option name="number" value="Default" />
76 |       <option name="presentableId" value="Default" />
77 |       <updated>1568241013627</updated>
78 |     </task>
79 |     <servers />
80 |   </component>
81 | </project>


--------------------------------------------------------------------------------
/src/SearchablePDF/.project:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0" encoding="UTF-8"?>
 2 | <projectDescription>
 3 | 	<name>searchable-pdf</name>
 4 | 	<comment></comment>
 5 | 	<projects>
 6 | 	</projects>
 7 | 	<buildSpec>
 8 | 		<buildCommand>
 9 | 			<name>org.eclipse.jdt.core.javabuilder</name>
10 | 			<arguments>
11 | 			</arguments>
12 | 		</buildCommand>
13 | 		<buildCommand>
14 | 			<name>org.eclipse.m2e.core.maven2Builder</name>
15 | 			<arguments>
16 | 			</arguments>
17 | 		</buildCommand>
18 | 	</buildSpec>
19 | 	<natures>
20 | 		<nature>org.eclipse.jdt.core.javanature</nature>
21 | 		<nature>org.eclipse.m2e.core.maven2Nature</nature>
22 | 	</natures>
23 | </projectDescription>
24 | 


--------------------------------------------------------------------------------
/src/SearchablePDF/.settings/org.eclipse.jdt.apt.core.prefs:
--------------------------------------------------------------------------------
1 | eclipse.preferences.version=1
2 | org.eclipse.jdt.apt.aptEnabled=false
3 | 


--------------------------------------------------------------------------------
/src/SearchablePDF/.settings/org.eclipse.jdt.core.prefs:
--------------------------------------------------------------------------------
 1 | eclipse.preferences.version=1
 2 | org.eclipse.jdt.core.compiler.codegen.targetPlatform=1.8
 3 | org.eclipse.jdt.core.compiler.compliance=1.8
 4 | org.eclipse.jdt.core.compiler.problem.enablePreviewFeatures=disabled
 5 | org.eclipse.jdt.core.compiler.problem.forbiddenReference=warning
 6 | org.eclipse.jdt.core.compiler.problem.reportPreviewFeatures=ignore
 7 | org.eclipse.jdt.core.compiler.processAnnotations=disabled
 8 | org.eclipse.jdt.core.compiler.release=enabled
 9 | org.eclipse.jdt.core.compiler.source=1.8
10 | 


--------------------------------------------------------------------------------
/src/SearchablePDF/.settings/org.eclipse.m2e.core.prefs:
--------------------------------------------------------------------------------
1 | activeProfiles=
2 | eclipse.preferences.version=1
3 | resolveWorkspaceProjects=true
4 | version=1
5 | 


--------------------------------------------------------------------------------
/src/SearchablePDF/SearchablePDF.iml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <module type="JAVA_MODULE" version="4" />


--------------------------------------------------------------------------------
/src/SearchablePDF/documents/SampleInput.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-samples/amazon-textract-searchable-pdf/4b01e49831b67a2c72b461f1b183d61d9200f50d/src/SearchablePDF/documents/SampleInput.pdf


--------------------------------------------------------------------------------
/src/SearchablePDF/documents/SampleInput.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-samples/amazon-textract-searchable-pdf/4b01e49831b67a2c72b461f1b183d61d9200f50d/src/SearchablePDF/documents/SampleInput.png


--------------------------------------------------------------------------------
/src/SearchablePDF/documents/SampleOutput.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-samples/amazon-textract-searchable-pdf/4b01e49831b67a2c72b461f1b183d61d9200f50d/src/SearchablePDF/documents/SampleOutput.pdf


--------------------------------------------------------------------------------
/src/SearchablePDF/pom.xml:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0" encoding="UTF-8"?>
 2 | <project xmlns="http://maven.apache.org/POM/4.0.0"
 3 |          xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
 4 |          xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
 5 |     <modelVersion>4.0.0</modelVersion>
 6 | 
 7 |     <groupId>com.amazon.textract.solutions</groupId>
 8 |     <artifactId>searchable-pdf</artifactId>
 9 |     <version>1.0</version>
10 |     <dependencies>
11 |         <!-- https://mvnrepository.com/artifact/com.amazonaws/aws-java-sdk-textract -->
12 |         <dependency>
13 |             <groupId>com.amazonaws</groupId>
14 |             <artifactId>aws-java-sdk-textract</artifactId>
15 |             <version>1.12.529</version>
16 |         </dependency>
17 |         <!-- https://mvnrepository.com/artifact/com.amazonaws/aws-java-sdk-s3 -->
18 |         <dependency>
19 |             <groupId>com.amazonaws</groupId>
20 |             <artifactId>aws-java-sdk-s3</artifactId>
21 |             <version>1.12.529</version>
22 |         </dependency>
23 |         <!-- https://mvnrepository.com/artifact/com.amazonaws/aws-lambda-java-core -->
24 |         <dependency>
25 |             <groupId>com.amazonaws</groupId>
26 |             <artifactId>aws-lambda-java-core</artifactId>
27 |             <version>1.2.2</version>
28 |         </dependency>
29 |         <!-- https://mvnrepository.com/artifact/com.amazonaws/aws-lambda-java-events -->
30 |         <dependency>
31 |             <groupId>com.amazonaws</groupId>
32 |             <artifactId>aws-lambda-java-events</artifactId>
33 |             <version>3.11.2</version>
34 |         </dependency>
35 |         <!-- https://mvnrepository.com/artifact/org.apache.pdfbox/pdfbox -->
36 |         <dependency>
37 |             <groupId>org.apache.pdfbox</groupId>
38 |             <artifactId>pdfbox</artifactId>
39 |             <version>2.0.29</version>
40 |         </dependency>
41 |         <dependency>
42 |             <groupId>org.apache.pdfbox</groupId>
43 |             <artifactId>fontbox</artifactId>
44 |             <version>2.0.29</version>
45 |         </dependency>
46 |         <dependency>
47 |             <groupId>org.apache.pdfbox</groupId>
48 |             <artifactId>pdfbox-tools</artifactId>
49 |             <version>2.0.29</version>
50 |         </dependency>
51 |         <dependency>
52 |             <groupId>org.apache.pdfbox</groupId>
53 |             <artifactId>jbig2-imageio</artifactId>
54 |             <version>3.0.4</version>
55 |         </dependency>
56 |         <!-- https://mvnrepository.com/artifact/javax.xml.bind/jaxb-api -->
57 |         <dependency>
58 |             <groupId>javax.xml.bind</groupId>
59 |             <artifactId>jaxb-api</artifactId>
60 |             <version>2.3.1</version>
61 |         </dependency>
62 | 
63 |         <!-- https://mvnrepository.com/artifact/com.github.jai-imageio/jai-imageio-jpeg2000 -->
64 |         <dependency>
65 |             <groupId>com.github.jai-imageio</groupId>
66 |             <artifactId>jai-imageio-jpeg2000</artifactId>
67 |             <version>1.4.0</version>
68 |         </dependency>
69 | 
70 |     </dependencies>
71 |     <build>
72 |         <plugins>
73 |             <plugin>
74 |                 <groupId>org.apache.maven.plugins</groupId>
75 |                 <artifactId>maven-compiler-plugin</artifactId>
76 |                 <version>3.8.0</version>
77 |                 <configuration>
78 |                     <release>8</release>
79 |                 </configuration>
80 |             </plugin>
81 |             <plugin>
82 |                 <groupId>org.apache.maven.plugins</groupId>
83 |                 <artifactId>maven-shade-plugin</artifactId>
84 |                 <version>3.2.1</version>
85 |                 <configuration>
86 |                     <createDependencyReducedPom>false</createDependencyReducedPom>
87 |                 </configuration>
88 |                 <executions>
89 |                     <execution>
90 |                         <phase>package</phase>
91 |                         <goals>
92 |                             <goal>shade</goal>
93 |                         </goals>
94 |                     </execution>
95 |                 </executions>
96 |             </plugin>
97 |         </plugins>
98 |     </build>
99 | </project>


--------------------------------------------------------------------------------
/src/SearchablePDF/src/main/java/Demo.java:
--------------------------------------------------------------------------------
 1 | public class Demo {
 2 |     public static void main(String args[]) {
 3 |         try {
 4 |             //Generate searchable PDF from local image
 5 |             DemoPdfFromLocalImage localImage = new DemoPdfFromLocalImage();
 6 |             localImage.run("./documents/SampleInput.png", "./documents/SampleOutput.pdf");
 7 | 
 8 | //            //Generate searchable PDF from local pdf
 9 | //            DemoPdfFromLocalPdf localPdf = new DemoPdfFromLocalPdf();
10 | //            localPdf.run("./documents/SampleInput.pdf", "./documents/SampleOutput.pdf");
11 | //
12 | //            //Generate searchable PDF from image in Amazon S3 bucket
13 | //            DemoPdfFromS3Image s3Image = new DemoPdfFromS3Image();
14 | //            s3Image.run("ki-textract-demo-docs", "SampleInput.png", "SampleOutput.pdf");
15 | //
16 | //            //Generate searchable PDF from pdf in Amazon S3 bucket
17 | //            DemoPdfFromS3Pdf s3Pdf = new DemoPdfFromS3Pdf();
18 | //            s3Pdf.run("ki-textract-demo-docs", "SampleInput.pdf", "SampleOutput.pdf");
19 | //
20 | //            //Generate searchable PDF from pdf in Amazon S3 bucket
21 | //            //(by adding text to the input pdf document)
22 | //            DemoPdfFromS3PdfAppend s3PdfAppend = new DemoPdfFromS3PdfAppend();
23 | //            s3PdfAppend.run("ki-textract-demo-docs", "SampleInput.pdf", "SampleOutput.pdf");
24 | 
25 |         } catch (Exception e) {
26 |             e.printStackTrace();
27 |         }
28 |     }
29 | }
30 | 


--------------------------------------------------------------------------------
/src/SearchablePDF/src/main/java/DemoLambda.java:
--------------------------------------------------------------------------------
 1 | import com.amazonaws.services.lambda.runtime.Context;
 2 | import com.amazonaws.services.lambda.runtime.RequestHandler;
 3 | import com.amazonaws.services.lambda.runtime.events.S3Event;
 4 | import com.amazonaws.services.lambda.runtime.events.models.s3.S3EventNotification;
 5 | 
 6 | public class DemoLambda implements RequestHandler<S3Event, String> {
 7 | 
 8 |     @Override
 9 |     public String handleRequest(S3Event event, Context ctx) {
10 | 
11 |         S3EventNotification.S3EventNotificationRecord record = event.getRecords().get(0);
12 | 
13 |         String bucketName = record.getS3().getBucket().getName();
14 |         String keyName = record.getS3().getObject().getKey();
15 |         String keyNameLower = record.getS3().getObject().getKey().toLowerCase();
16 |         String filename = keyName.substring(keyName.lastIndexOf("/") + 1);
17 | 
18 |         System.out.println("Bucket Name is " + bucketName);
19 |         System.out.println("File Path is " + keyName);
20 | 
21 |         try {
22 |             if (keyNameLower.endsWith("pdf")) {
23 |                 DemoPdfFromS3Pdf s3Pdf = new DemoPdfFromS3Pdf();
24 |                 s3Pdf.run(bucketName, keyName, filename);
25 | 
26 |             } else if (keyNameLower.endsWith("jpg") || keyNameLower.endsWith("jpeg") || keyNameLower.endsWith("png")) {
27 |                 DemoPdfFromS3Image s3Image = new DemoPdfFromS3Image();
28 |                 s3Image.run(bucketName, keyName, filename);
29 |             }
30 |         } catch (Exception e) {
31 |             e.printStackTrace();
32 |             System.out.println(e.getMessage());
33 |         }
34 |         return null;
35 |     }
36 | }


--------------------------------------------------------------------------------
/src/SearchablePDF/src/main/java/DemoPdfFromLocalImage.java:
--------------------------------------------------------------------------------
 1 | import com.amazon.textract.pdf.ImageType;
 2 | import com.amazon.textract.pdf.PDFDocument;
 3 | import com.amazon.textract.pdf.TextLine;
 4 | import com.amazonaws.services.textract.AmazonTextract;
 5 | import com.amazonaws.services.textract.AmazonTextractClientBuilder;
 6 | import com.amazonaws.services.textract.model.*;
 7 | import com.amazonaws.util.IOUtils;
 8 | import javax.imageio.ImageIO;
 9 | import java.awt.image.BufferedImage;
10 | import java.io.*;
11 | import java.nio.ByteBuffer;
12 | import java.util.ArrayList;
13 | import java.util.List;
14 | 
15 | public class DemoPdfFromLocalImage {
16 | 
17 |     public void run(String documentName, String outputDocumentName) throws IOException {
18 | 
19 |         System.out.println("Generating searchable pdf from: " + documentName);
20 | 
21 |         ImageType imageType = ImageType.JPEG;
22 |         if(documentName.toLowerCase().endsWith(".png"))
23 |             imageType = ImageType.PNG;
24 | 
25 |         //Get image bytes
26 |         ByteBuffer imageBytes = null;
27 |         try(InputStream in = new FileInputStream(documentName)) {
28 |             imageBytes = ByteBuffer.wrap(IOUtils.toByteArray(in));
29 |         }
30 | 
31 |         //Extract text
32 |         List<TextLine> lines = extractText(imageBytes);
33 | 
34 |         //Get Image
35 |         BufferedImage image = getImage(documentName);
36 | 
37 |         //Create new pdf document
38 |         PDFDocument pdfDocument = new PDFDocument();
39 | 
40 |         //Add page with text layer and image in the pdf document
41 |         pdfDocument.addPage(image, imageType, lines);
42 | 
43 |         //Save PDF to local disk
44 |         try(OutputStream outputStream = new FileOutputStream(outputDocumentName)) {
45 |             pdfDocument.save(outputStream);
46 |             pdfDocument.close();
47 |         }
48 | 
49 |         System.out.println("Generated searchable pdf: " + outputDocumentName);
50 |     }
51 | 
52 |     private BufferedImage getImage(String documentName) throws IOException {
53 | 
54 |         BufferedImage image = null;
55 | 
56 |         try(InputStream in = new FileInputStream(documentName)) {
57 |             image = ImageIO.read(in);
58 |         }
59 | 
60 |         return image;
61 |     }
62 | 
63 |     private List<TextLine> extractText(ByteBuffer imageBytes) {
64 | 
65 |         AmazonTextract client = AmazonTextractClientBuilder.defaultClient();
66 | 
67 |         DetectDocumentTextRequest request = new DetectDocumentTextRequest()
68 |                 .withDocument(new Document()
69 |                         .withBytes(imageBytes));
70 | 
71 |         DetectDocumentTextResult result = client.detectDocumentText(request);
72 | 
73 |         List<TextLine> lines = new ArrayList<TextLine>();
74 |         List<Block> blocks = result.getBlocks();
75 |         BoundingBox boundingBox = null;
76 |         for (Block block : blocks) {
77 |             if ((block.getBlockType()).equals("LINE")) {
78 |                 boundingBox = block.getGeometry().getBoundingBox();
79 |                 lines.add(new TextLine(boundingBox.getLeft(),
80 |                         boundingBox.getTop(),
81 |                         boundingBox.getWidth(),
82 |                         boundingBox.getHeight(),
83 |                         block.getText()));
84 |             }
85 |         }
86 | 
87 |         return lines;
88 |     }
89 | }
90 | 


--------------------------------------------------------------------------------
/src/SearchablePDF/src/main/java/DemoPdfFromLocalPdf.java:
--------------------------------------------------------------------------------
 1 | import com.amazon.textract.pdf.ImageType;
 2 | import com.amazon.textract.pdf.PDFDocument;
 3 | import com.amazon.textract.pdf.TextLine;
 4 | import com.amazonaws.services.textract.AmazonTextract;
 5 | import com.amazonaws.services.textract.AmazonTextractClientBuilder;
 6 | import com.amazonaws.services.textract.model.*;
 7 | import org.apache.pdfbox.pdmodel.PDDocument;
 8 | import org.apache.pdfbox.rendering.PDFRenderer;
 9 | import org.apache.pdfbox.tools.imageio.ImageIOUtil;
10 | import java.awt.image.BufferedImage;
11 | import java.io.*;
12 | import java.nio.ByteBuffer;
13 | import java.util.ArrayList;
14 | import java.util.List;
15 | 
16 | public class DemoPdfFromLocalPdf {
17 | 
18 |     private List<TextLine> extractText(ByteBuffer imageBytes){
19 | 
20 |         AmazonTextract client = AmazonTextractClientBuilder.defaultClient();
21 | 
22 |         DetectDocumentTextRequest request = new DetectDocumentTextRequest()
23 |                 .withDocument(new Document()
24 |                         .withBytes(imageBytes));
25 | 
26 |         DetectDocumentTextResult result = client.detectDocumentText(request);
27 | 
28 |         List<TextLine> lines = new ArrayList<TextLine>();
29 |         List<Block> blocks = result.getBlocks();
30 |         BoundingBox boundingBox = null;
31 |         for (Block block : blocks) {
32 |             if ((block.getBlockType()).equals("LINE")) {
33 |                 boundingBox = block.getGeometry().getBoundingBox();
34 |                 lines.add(new TextLine(boundingBox.getLeft(),
35 |                         boundingBox.getTop(),
36 |                         boundingBox.getWidth(),
37 |                         boundingBox.getHeight(),
38 |                         block.getText()));
39 |             }
40 |         }
41 | 
42 |         return lines;
43 |     }
44 | 
45 |     public void run(String documentName, String outputDocumentName) throws IOException {
46 | 
47 |         System.out.println("Generating searchable pdf from: " + documentName);
48 | 
49 |         PDFDocument pdfDocument = new PDFDocument();
50 | 
51 |         List<TextLine> lines = null;
52 |         BufferedImage image = null;
53 |         ByteArrayOutputStream byteArrayOutputStream = null;
54 |         ByteBuffer imageBytes = null;
55 | 
56 |         //Load pdf document and process each page as image
57 |         PDDocument inputDocument = PDDocument.load(new File(documentName));
58 |         PDFRenderer pdfRenderer = new PDFRenderer(inputDocument);
59 |         for (int page = 0; page < inputDocument.getNumberOfPages(); ++page) {
60 | 
61 |             //Render image
62 |             image = pdfRenderer.renderImageWithDPI(page, 300, org.apache.pdfbox.rendering.ImageType.RGB);
63 | 
64 |             //Get image bytes
65 |             byteArrayOutputStream = new ByteArrayOutputStream();
66 |             ImageIOUtil.writeImage(image, "jpeg", byteArrayOutputStream);
67 |             byteArrayOutputStream.flush();
68 |             imageBytes = ByteBuffer.wrap(byteArrayOutputStream.toByteArray());
69 | 
70 |             //Extract text
71 |             lines = extractText(imageBytes);
72 | 
73 |             //Add extracted text to pdf page
74 |             pdfDocument.addPage(image, ImageType.JPEG, lines);
75 | 
76 |             System.out.println("Processed page index: " + page);
77 |         }
78 | 
79 |         inputDocument.close();
80 | 
81 |         //Save PDF to local disk
82 |         try (OutputStream outputStream = new FileOutputStream(outputDocumentName)) {
83 |             pdfDocument.save(outputStream);
84 |             pdfDocument.close();
85 |         }
86 | 
87 |         System.out.println("Generated searchable pdf: " + outputDocumentName);
88 |     }
89 | }
90 | 


--------------------------------------------------------------------------------
/src/SearchablePDF/src/main/java/DemoPdfFromS3Image.java:
--------------------------------------------------------------------------------
 1 | import com.amazon.textract.pdf.ImageType;
 2 | import com.amazon.textract.pdf.PDFDocument;
 3 | import com.amazon.textract.pdf.TextLine;
 4 | import com.amazonaws.services.s3.AmazonS3;
 5 | import com.amazonaws.services.s3.AmazonS3ClientBuilder;
 6 | import com.amazonaws.services.s3.model.GetObjectRequest;
 7 | import com.amazonaws.services.s3.model.ObjectMetadata;
 8 | import com.amazonaws.services.s3.model.PutObjectRequest;
 9 | import com.amazonaws.services.textract.AmazonTextract;
10 | import com.amazonaws.services.textract.AmazonTextractClientBuilder;
11 | import com.amazonaws.services.textract.model.*;
12 | import javax.imageio.ImageIO;
13 | import java.awt.image.BufferedImage;
14 | import java.io.*;
15 | import java.util.ArrayList;
16 | import java.util.List;
17 | 
18 | public class DemoPdfFromS3Image {
19 | 
20 |     private List<TextLine> extractText(String bucketName, String documentName){
21 |         AmazonTextract client = AmazonTextractClientBuilder.defaultClient();
22 | 
23 |         DetectDocumentTextRequest request = new DetectDocumentTextRequest()
24 |                 .withDocument(new Document()
25 |                         .withS3Object(new S3Object()
26 |                                 .withName(documentName)
27 |                                 .withBucket(bucketName)));
28 | 
29 |         DetectDocumentTextResult result = client.detectDocumentText(request);
30 | 
31 |         List<TextLine> lines = new ArrayList<TextLine>();
32 |         List<Block> blocks = result.getBlocks();
33 |         BoundingBox boundingBox = null;
34 |         for (Block block : blocks) {
35 |             if ((block.getBlockType()).equals("LINE")) {
36 |                 boundingBox = block.getGeometry().getBoundingBox();
37 |                 lines.add(new TextLine(boundingBox.getLeft(),
38 |                         boundingBox.getTop(),
39 |                         boundingBox.getWidth(),
40 |                         boundingBox.getHeight(),
41 |                         block.getText()));
42 |             }
43 |         }
44 | 
45 |         return lines;
46 |     }
47 | 
48 |     private BufferedImage getImageFromS3(String bucketName, String documentName) throws IOException {
49 | 
50 |         AmazonS3 s3client = AmazonS3ClientBuilder.defaultClient();
51 |         com.amazonaws.services.s3.model.S3Object fullObject = s3client.getObject(new GetObjectRequest(bucketName, documentName));
52 |         BufferedImage image = ImageIO.read(fullObject.getObjectContent());
53 |         return image;
54 |     }
55 | 
56 |     private void UploadToS3(String bucketName, String objectName, String contentType, byte[] bytes) throws IOException {
57 |         AmazonS3 s3client = AmazonS3ClientBuilder.defaultClient();
58 |         ByteArrayInputStream baInputStream = new ByteArrayInputStream(bytes);
59 |         ObjectMetadata metadata = new ObjectMetadata();
60 |         metadata.setContentLength(bytes.length);
61 |         metadata.setContentType(contentType);
62 |         PutObjectRequest putRequest = new PutObjectRequest(bucketName, objectName, baInputStream, metadata);
63 |         s3client.putObject(putRequest);
64 |     }
65 | 
66 |     public void run(String bucketName, String documentName, String outputDocumentName) throws IOException {
67 | 
68 |         System.out.println("Generating searchable pdf from: " + bucketName + "/" + documentName);
69 | 
70 |         ImageType imageType = ImageType.JPEG;
71 |         if(documentName.toLowerCase().endsWith(".png"))
72 |             imageType = ImageType.PNG;
73 | 
74 |         //Extract text
75 |         List<TextLine> lines = extractText(bucketName, documentName);
76 | 
77 |         //Get image from S3
78 |         BufferedImage image = getImageFromS3(bucketName, documentName);
79 | 
80 |         //Create PDF document
81 |         PDFDocument pdfDocument = new PDFDocument();
82 | 
83 |         //Add page with text layer and image in the pdf document
84 |         pdfDocument.addPage(image, imageType, lines);
85 | 
86 |         //Save PDF to stream
87 |         ByteArrayOutputStream os = new ByteArrayOutputStream();
88 |         pdfDocument.save(os);
89 |         pdfDocument.close();
90 | 
91 |         //Upload PDF to S3
92 |         UploadToS3(bucketName, outputDocumentName, "application/pdf", os.toByteArray());
93 | 
94 |         System.out.println("Generated searchable pdf: " + bucketName + "/" + outputDocumentName);
95 |     }
96 | }
97 | 


--------------------------------------------------------------------------------
/src/SearchablePDF/src/main/java/DemoPdfFromS3Pdf.java:
--------------------------------------------------------------------------------
  1 | import com.amazon.textract.pdf.ImageType;
  2 | import com.amazon.textract.pdf.PDFDocument;
  3 | import com.amazon.textract.pdf.TextLine;
  4 | import com.amazonaws.services.s3.AmazonS3;
  5 | import com.amazonaws.services.s3.AmazonS3ClientBuilder;
  6 | import com.amazonaws.services.s3.model.GetObjectRequest;
  7 | import com.amazonaws.services.s3.model.ObjectMetadata;
  8 | import com.amazonaws.services.s3.model.PutObjectRequest;
  9 | import com.amazonaws.services.textract.AmazonTextract;
 10 | import com.amazonaws.services.textract.AmazonTextractClientBuilder;
 11 | import com.amazonaws.services.textract.model.*;
 12 | import org.apache.pdfbox.pdmodel.PDDocument;
 13 | import org.apache.pdfbox.rendering.PDFRenderer;
 14 | import java.awt.image.BufferedImage;
 15 | import java.io.*;
 16 | import java.util.ArrayList;
 17 | import java.util.List;
 18 | import java.util.concurrent.TimeUnit;
 19 | 
 20 | public class DemoPdfFromS3Pdf {
 21 |     public void run(String bucketName, String documentName, String outputDocumentName) throws IOException, InterruptedException {
 22 | 
 23 |         System.out.println("Generating searchable pdf from: " + bucketName + "/" + documentName);
 24 | 
 25 |         //Extract text using Amazon Textract
 26 |         List<ArrayList<TextLine>> linesInPages = extractText(bucketName, documentName);
 27 | 
 28 |         //Get input pdf document from Amazon S3
 29 |         InputStream inputPdf = getPdfFromS3(bucketName, documentName);
 30 | 
 31 |         //Create new PDF document
 32 |         PDFDocument pdfDocument = new PDFDocument();
 33 | 
 34 |         //For each page add text layer and image in the pdf document
 35 |         PDDocument inputDocument = PDDocument.load(inputPdf);
 36 |         PDFRenderer pdfRenderer = new PDFRenderer(inputDocument);
 37 |         BufferedImage image = null;
 38 |         for (int page = 0; page < inputDocument.getNumberOfPages(); ++page) {
 39 |             image = pdfRenderer.renderImageWithDPI(page, 300, org.apache.pdfbox.rendering.ImageType.RGB);
 40 | 
 41 |             pdfDocument.addPage(image, ImageType.JPEG, linesInPages.get(page));
 42 | 
 43 |             System.out.println("Processed page index: " + page);
 44 |         }
 45 | 
 46 |         //Save PDF to stream
 47 |         ByteArrayOutputStream os = new ByteArrayOutputStream();
 48 |         pdfDocument.save(os);
 49 |         pdfDocument.close();
 50 |         inputDocument.close();
 51 | 
 52 |         //Upload PDF to S3
 53 |         UploadToS3(bucketName, outputDocumentName, "application/pdf", os.toByteArray());
 54 | 
 55 |         System.out.println("Generated searchable pdf: " + bucketName + "/" + outputDocumentName);
 56 |     }
 57 | 
 58 |     private List<ArrayList<TextLine>> extractText(String bucketName, String documentName) throws InterruptedException {
 59 | 
 60 |         AmazonTextract client = AmazonTextractClientBuilder.defaultClient();
 61 | 
 62 |         StartDocumentTextDetectionRequest req = new StartDocumentTextDetectionRequest()
 63 |                 .withDocumentLocation(new DocumentLocation()
 64 |                         .withS3Object(new S3Object()
 65 |                                 .withBucket(bucketName)
 66 |                                 .withName(documentName)))
 67 |                 .withJobTag("DetectingText");
 68 | 
 69 |         StartDocumentTextDetectionResult startDocumentTextDetectionResult = client.startDocumentTextDetection(req);
 70 |         String startJobId = startDocumentTextDetectionResult.getJobId();
 71 | 
 72 |         System.out.println("Text detection job started with Id: " + startJobId);
 73 | 
 74 |         GetDocumentTextDetectionRequest documentTextDetectionRequest = null;
 75 |         GetDocumentTextDetectionResult response = null;
 76 | 
 77 |         String jobStatus = "IN_PROGRESS";
 78 | 
 79 |         while (jobStatus.equals("IN_PROGRESS")) {
 80 |             System.out.println("Waiting for job to complete...");
 81 |             TimeUnit.SECONDS.sleep(10);
 82 |             documentTextDetectionRequest = new GetDocumentTextDetectionRequest()
 83 |                     .withJobId(startJobId)
 84 |                     .withMaxResults(1);
 85 | 
 86 |             response = client.getDocumentTextDetection(documentTextDetectionRequest);
 87 |             jobStatus = response.getJobStatus();
 88 |         }
 89 | 
 90 |         int maxResults = 1000;
 91 |         String paginationToken = null;
 92 |         Boolean finished = false;
 93 | 
 94 |         List<ArrayList<TextLine>> pages = new ArrayList<ArrayList<TextLine>>();
 95 |         ArrayList<TextLine> page = null;
 96 |         BoundingBox boundingBox = null;
 97 | 
 98 |         while (finished == false) {
 99 |             documentTextDetectionRequest = new GetDocumentTextDetectionRequest()
100 |                     .withJobId(startJobId)
101 |                     .withMaxResults(maxResults)
102 |                     .withNextToken(paginationToken);
103 |             response = client.getDocumentTextDetection(documentTextDetectionRequest);
104 | 
105 |             //Show blocks information
106 |             List<Block> blocks = response.getBlocks();
107 |             for (Block block : blocks) {
108 |                 if (block.getBlockType().equals("PAGE")) {
109 |                     page = new ArrayList<TextLine>();
110 |                     pages.add(page);
111 |                 } else if (block.getBlockType().equals("LINE")) {
112 |                     boundingBox = block.getGeometry().getBoundingBox();
113 |                     page.add(new TextLine(boundingBox.getLeft(),
114 |                             boundingBox.getTop(),
115 |                             boundingBox.getWidth(),
116 |                             boundingBox.getHeight(),
117 |                             block.getText()));
118 |                 }
119 |             }
120 |             paginationToken = response.getNextToken();
121 |             if (paginationToken == null)
122 |                 finished = true;
123 |         }
124 | 
125 |         return pages;
126 |     }
127 | 
128 |     private InputStream getPdfFromS3(String bucketName, String documentName) throws IOException {
129 | 
130 |         AmazonS3 s3client = AmazonS3ClientBuilder.defaultClient();
131 |         com.amazonaws.services.s3.model.S3Object fullObject = s3client.getObject(new GetObjectRequest(bucketName, documentName));
132 |         InputStream in = fullObject.getObjectContent();
133 |         return in;
134 |     }
135 | 
136 |     private void UploadToS3(String bucketName, String objectName, String contentType, byte[] bytes) {
137 |         AmazonS3 s3client = AmazonS3ClientBuilder.defaultClient();
138 |         ByteArrayInputStream baInputStream = new ByteArrayInputStream(bytes);
139 |         ObjectMetadata metadata = new ObjectMetadata();
140 |         metadata.setContentLength(bytes.length);
141 |         metadata.setContentType(contentType);
142 |         PutObjectRequest putRequest = new PutObjectRequest(bucketName, objectName, baInputStream, metadata);
143 |         s3client.putObject(putRequest);
144 |     }
145 | }
146 | 


--------------------------------------------------------------------------------
/src/SearchablePDF/src/main/java/DemoPdfFromS3PdfAppend.java:
--------------------------------------------------------------------------------
  1 | import com.amazon.textract.pdf.PDFDocument;
  2 | import com.amazon.textract.pdf.TextLine;
  3 | import com.amazonaws.services.s3.AmazonS3;
  4 | import com.amazonaws.services.s3.AmazonS3ClientBuilder;
  5 | import com.amazonaws.services.s3.model.GetObjectRequest;
  6 | import com.amazonaws.services.s3.model.ObjectMetadata;
  7 | import com.amazonaws.services.s3.model.PutObjectRequest;
  8 | import com.amazonaws.services.textract.AmazonTextract;
  9 | import com.amazonaws.services.textract.AmazonTextractClientBuilder;
 10 | import com.amazonaws.services.textract.model.*;
 11 | import java.io.ByteArrayInputStream;
 12 | import java.io.ByteArrayOutputStream;
 13 | import java.io.IOException;
 14 | import java.io.InputStream;
 15 | import java.util.ArrayList;
 16 | import java.util.List;
 17 | import java.util.concurrent.TimeUnit;
 18 | 
 19 | public class DemoPdfFromS3PdfAppend {
 20 |     private List<ArrayList<TextLine>> extractText(String bucketName, String documentName) throws InterruptedException {
 21 | 
 22 |         AmazonTextract client = AmazonTextractClientBuilder.defaultClient();
 23 | 
 24 |         StartDocumentTextDetectionRequest req = new StartDocumentTextDetectionRequest()
 25 |                 .withDocumentLocation(new DocumentLocation()
 26 |                         .withS3Object(new S3Object()
 27 |                                 .withBucket(bucketName)
 28 |                                 .withName(documentName)))
 29 |                 .withJobTag("DetectingText");
 30 | 
 31 |         StartDocumentTextDetectionResult startDocumentTextDetectionResult = client.startDocumentTextDetection(req);
 32 |         String startJobId = startDocumentTextDetectionResult.getJobId();
 33 | 
 34 |         System.out.println("Text detection job started with Id: " + startJobId);
 35 | 
 36 |         GetDocumentTextDetectionRequest documentTextDetectionRequest = null;
 37 |         GetDocumentTextDetectionResult response = null;
 38 | 
 39 |         String jobStatus = "IN_PROGRESS";
 40 | 
 41 |         while(jobStatus.equals("IN_PROGRESS")){
 42 |             System.out.println("Waiting for job to complete...");
 43 |             TimeUnit.SECONDS.sleep(10);
 44 |             documentTextDetectionRequest= new GetDocumentTextDetectionRequest()
 45 |                     .withJobId(startJobId)
 46 |                     .withMaxResults(1);
 47 | 
 48 |             response = client.getDocumentTextDetection(documentTextDetectionRequest);
 49 |             jobStatus = response.getJobStatus();
 50 |         }
 51 | 
 52 |         int maxResults=1000;
 53 |         String paginationToken=null;
 54 |         Boolean finished=false;
 55 | 
 56 |         List<ArrayList<TextLine>> pages = new ArrayList<ArrayList<TextLine>>();
 57 |         ArrayList<TextLine> page = null;
 58 |         BoundingBox boundingBox = null;
 59 | 
 60 |         while (finished==false)
 61 |         {
 62 |             documentTextDetectionRequest= new GetDocumentTextDetectionRequest()
 63 |                     .withJobId(startJobId)
 64 |                     .withMaxResults(maxResults)
 65 |                     .withNextToken(paginationToken);
 66 |             response = client.getDocumentTextDetection(documentTextDetectionRequest);
 67 | 
 68 |             //Show blocks information
 69 |             List<Block> blocks= response.getBlocks();
 70 |             for (Block block : blocks) {
 71 |                 if(block.getBlockType().equals("PAGE")) {
 72 |                     page = new ArrayList<TextLine>();
 73 |                     pages.add(page);
 74 |                 }
 75 |                 else if(block.getBlockType().equals("LINE")){
 76 |                     boundingBox = block.getGeometry().getBoundingBox();
 77 |                     page.add(new TextLine(boundingBox.getLeft(),
 78 |                             boundingBox.getTop(),
 79 |                             boundingBox.getWidth(),
 80 |                             boundingBox.getHeight(),
 81 |                             block.getText()));
 82 |                 }
 83 |             }
 84 |             paginationToken=response.getNextToken();
 85 |             if (paginationToken==null)
 86 |                 finished=true;
 87 |         }
 88 | 
 89 |         return pages;
 90 |     }
 91 | 
 92 |     private InputStream getPdfFromS3(String bucketName, String documentName) throws IOException {
 93 | 
 94 |         AmazonS3 s3client = AmazonS3ClientBuilder.defaultClient();
 95 |         com.amazonaws.services.s3.model.S3Object fullObject = s3client.getObject(new GetObjectRequest(bucketName, documentName));
 96 |         InputStream in = fullObject.getObjectContent();
 97 |         return in;
 98 |     }
 99 | 
100 |     private void UploadToS3(String bucketName, String objectName, String contentType, byte[] bytes){
101 |         AmazonS3 s3client = AmazonS3ClientBuilder.defaultClient();
102 |         ByteArrayInputStream baInputStream = new ByteArrayInputStream(bytes);
103 |         ObjectMetadata metadata = new ObjectMetadata();
104 |         metadata.setContentLength(bytes.length);
105 |         metadata.setContentType(contentType);
106 |         PutObjectRequest putRequest = new PutObjectRequest(bucketName, objectName, baInputStream, metadata);
107 |         s3client.putObject(putRequest);
108 |     }
109 | 
110 |     public void run(String bucketName, String documentName, String outputDocumentName) throws IOException, InterruptedException {
111 | 
112 |         System.out.println("Generating searchable pdf from: " + bucketName + "/" + documentName);
113 | 
114 |         //Extract text using Amazon Textract
115 |         List<ArrayList<TextLine>> linesInPages = extractText(bucketName, documentName);
116 | 
117 |         //Get input pdf document from Amazon S3
118 |         InputStream inputPdf = getPdfFromS3(bucketName, documentName);
119 | 
120 |         //Generate searchable PDF
121 |         PDFDocument pdfDocument = new PDFDocument(inputPdf);
122 |         int pageIndex = 0;
123 |         for (List<TextLine> linesInPage : linesInPages) {
124 |             //Add extracted text to input pdf document
125 |             pdfDocument.addText(pageIndex, linesInPage);
126 |             pageIndex++;
127 |         }
128 | 
129 |         //Save PDF to stream
130 |         ByteArrayOutputStream os = new ByteArrayOutputStream();
131 |         pdfDocument.save(os);
132 |         pdfDocument.close();
133 | 
134 |         //Upload PDF to S3
135 |         UploadToS3(bucketName, outputDocumentName, "application/pdf", os.toByteArray());
136 | 
137 |         System.out.println("Generated searchable pdf: " + bucketName + "/" + outputDocumentName);
138 |     }
139 | }


--------------------------------------------------------------------------------
/src/SearchablePDF/src/main/java/com/amazon/textract/pdf/FontInfo.java:
--------------------------------------------------------------------------------
1 | package com.amazon.textract.pdf;
2 | 
/**
 * Plain holder for the outcome of a font-size fitting pass: the chosen size and the
 * text dimensions computed for that size.
 */
public class FontInfo {
    /** Rendered width of the text at {@link #fontSize}. */
    float textWidth;

    /** Height of the font's bounding box at {@link #fontSize}. */
    float textHeight;

    /** Selected font size. */
    int fontSize;
}
8 | 
9 | 


--------------------------------------------------------------------------------
/src/SearchablePDF/src/main/java/com/amazon/textract/pdf/ImageType.java:
--------------------------------------------------------------------------------
1 | package com.amazon.textract.pdf;
2 | 
/** Image encodings that can be selected when a page is built from an image. */
public enum ImageType {
    /** JPEG image. */
    JPEG,

    /** PNG image. */
    PNG
}
6 | 


--------------------------------------------------------------------------------
/src/SearchablePDF/src/main/java/com/amazon/textract/pdf/PDFDocument.java:
--------------------------------------------------------------------------------
  1 | package com.amazon.textract.pdf;
  2 | 
  3 | import org.apache.pdfbox.pdmodel.PDDocument;
  4 | import org.apache.pdfbox.pdmodel.PDPage;
  5 | import org.apache.pdfbox.pdmodel.PDPageContentStream;
  6 | import org.apache.pdfbox.pdmodel.common.PDRectangle;
  7 | import org.apache.pdfbox.pdmodel.font.PDFont;
  8 | import org.apache.pdfbox.pdmodel.font.PDType1Font;
  9 | import org.apache.pdfbox.pdmodel.graphics.image.JPEGFactory;
 10 | import org.apache.pdfbox.pdmodel.graphics.image.LosslessFactory;
 11 | import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject;
 12 | import org.apache.pdfbox.pdmodel.graphics.state.RenderingMode;
 13 | import java.awt.image.BufferedImage;
 14 | import java.io.*;
 15 | import java.util.List;
 16 | 
 17 | public class PDFDocument {
 18 | 
 19 |     final PDFont font = PDType1Font.COURIER;
 20 | 
 21 |     private PDDocument document;
 22 | 
 23 |     public PDFDocument(){
 24 |         this.document = new PDDocument();
 25 |     }
 26 | 
 27 |     public PDFDocument(InputStream inputDocument) throws IOException {
 28 |         this.document = PDDocument.load(inputDocument);
 29 |     }
 30 | 
 31 |     public void addText(int pageIndex, List<TextLine> lines) throws IOException {
 32 |         PDPage page = document.getPage(pageIndex);
 33 | 
 34 |         float height = page.getMediaBox().getHeight();
 35 | 
 36 |         float width = page.getMediaBox().getWidth();
 37 | 
 38 |         PDPageContentStream contentStream = new PDPageContentStream(document, page, PDPageContentStream.AppendMode.APPEND, false );
 39 |         contentStream.setRenderingMode(RenderingMode.NEITHER);
 40 | 
 41 |         for (TextLine cline : lines){
 42 |             FontInfo fontInfo = calculateFontSize(cline.text, (float)cline.width*width, (float)cline.height*height);
 43 | 
 44 |             //System.out.println("FontSize: " + fontInfo.fontSize + " => for text: " + cline.text);
 45 |             contentStream.beginText();
 46 |             contentStream.setFont(this.font, fontInfo.fontSize);
 47 |             contentStream.newLineAtOffset((float)cline.left*width, (float)(height-height*cline.top-fontInfo.textHeight));
 48 |             contentStream.showText(cline.text);
 49 |             contentStream.endText();
 50 |         }
 51 | 
 52 |         contentStream.close();
 53 |     }
 54 | 
 55 |     private FontInfo calculateFontSize(String text, float bbWidth, float bbHeight) throws IOException {
 56 | 
 57 |         //PDFont font = PDType1Font.TIMES_ROMAN;
 58 | 
 59 |         int fontSize = 17;
 60 |         float textWidth = font.getStringWidth(text) / 1000 * fontSize;
 61 |         float textHeight = font.getFontDescriptor().getFontBoundingBox().getHeight() / 1000 * fontSize;
 62 | 
 63 |         if(textWidth > bbWidth){
 64 |             while(textWidth > bbWidth){
 65 |                 fontSize -= 1;
 66 |                 textWidth = font.getStringWidth(text) / 1000 * fontSize;
 67 |                 textHeight = font.getFontDescriptor().getFontBoundingBox().getHeight() / 1000 * fontSize;
 68 |             }
 69 |         }
 70 |         else if(textWidth < bbWidth){
 71 |             while(textWidth < bbWidth){
 72 |                 fontSize += 1;
 73 |                 textWidth = font.getStringWidth(text) / 1000 * fontSize;
 74 |                 textHeight = font.getFontDescriptor().getFontBoundingBox().getHeight() / 1000 * fontSize;
 75 |             }
 76 |         }
 77 | 
 78 |         //System.out.println("Text height before returning font size: " + textHeight);
 79 | 
 80 |         FontInfo fi = new FontInfo();
 81 |         fi.fontSize = fontSize;
 82 |         fi.textHeight = textHeight;
 83 |         fi.textWidth = textWidth;
 84 | 
 85 |         return fi;
 86 |     }
 87 | 
 88 |     public void addPage(BufferedImage image, ImageType imageType, List<TextLine> lines) throws IOException {
 89 | 
 90 |         float width = image.getWidth();
 91 |         float height = image.getHeight();
 92 | 
 93 |         PDRectangle box = new PDRectangle(width, height);
 94 |         PDPage page = new PDPage(box);
 95 |         page.setMediaBox(box);
 96 |         this.document.addPage(page);
 97 | 
 98 |         PDImageXObject pdImage = null;
 99 | 
100 |         if(imageType == ImageType.JPEG){
101 |             pdImage = JPEGFactory.createFromImage(this.document, image);
102 |         }
103 |         else {
104 |             pdImage = LosslessFactory.createFromImage(this.document, image);
105 |         }
106 | 
107 |         PDPageContentStream contentStream = new PDPageContentStream(document, page);
108 | 
109 |         contentStream.drawImage(pdImage, 0, 0);
110 | 
111 |         contentStream.setRenderingMode(RenderingMode.NEITHER);
112 | 
113 |         for (TextLine cline : lines){
114 |             FontInfo fontInfo = calculateFontSize(cline.text, (float)cline.width*width, (float)cline.height*height);
115 |             contentStream.beginText();
116 |             contentStream.setFont(this.font, fontInfo.fontSize);
117 |             contentStream.newLineAtOffset((float)cline.left*width, (float)(height-height*cline.top-fontInfo.textHeight));
118 |             contentStream.showText(cline.text);
119 |             contentStream.endText();
120 |         }
121 | 
122 |         contentStream.close();
123 |     }
124 | 
125 |     public void save(String path) throws IOException {
126 |         this.document.save(new File(path));
127 |     }
128 | 
129 |     public void save(OutputStream os) throws IOException {
130 |         this.document.save(os);
131 |     }
132 | 
133 |     public void close() throws IOException {
134 |         this.document.close();
135 |     }
136 | }
137 | 


--------------------------------------------------------------------------------
/src/SearchablePDF/src/main/java/com/amazon/textract/pdf/TextLine.java:
--------------------------------------------------------------------------------
 1 | package com.amazon.textract.pdf;
 2 | 
 3 | public class TextLine {
 4 |     public double left;
 5 |     public double top;
 6 |     public double width;
 7 |     public double height;
 8 |     public String text;
 9 | 
10 |     public TextLine(double left, double top, double width, double height, String text) {
11 |         this.left = left;
12 |         this.top = top;
13 |         this.width = width;
14 |         this.height = height;
15 |         this.text = text;
16 |     }
17 | }


--------------------------------------------------------------------------------