├── .gitignore
├── LICENSE
├── README.md
├── cli
    ├── pom.xml
    └── src
    │   └── main
    │       └── java
    │           └── ocr
    │               └── cli
    │                   └── CLI.java
├── common
    ├── pom.xml
    └── src
    │   └── main
    │       └── java
    │           └── ocr
    │               └── common
    │                   └── Util.java
├── conversion
    ├── README.md
    ├── pom.xml
    └── src
    │   ├── main
    │       ├── java
    │       │   └── ocr
    │       │   │   └── conversion
    │       │   │       └── AlmostSimpleRenderer.java
    │       ├── resources
    │       │   ├── log4j.properties
    │       │   └── logback.xml
    │       └── scala
    │       │   └── ocr
    │       │       └── conversion
    │       │           ├── ConfigOptions.scala
    │       │           ├── Converter.scala
    │       │           └── Driver.scala
    │   └── test
    │       ├── resources
    │           └── text-detection.pdf
    │       └── scala
    │           └── ocr
    │               └── conversion
    │                   └── ConverterSpec.scala
├── extraction
    ├── README.md
    ├── pom.xml
    └── src
    │   ├── main
    │       ├── java
    │       │   └── ocr
    │       │   │   └── extraction
    │       │   │       └── tesseract
    │       │   │           └── TesseractUtil.java
    │       └── resources
    │       │   ├── log4j.properties
    │       │   └── logback.xml
    │   └── test
    │       ├── java
    │           └── ocr
    │           │   └── extraction
    │           │       └── tesseract
    │           │           └── TesseractUtilTest.java
    │       └── resources
    │           └── pdf-test.tiff
├── nifi
    ├── README.md
    ├── pom.xml
    └── src
    │   ├── main
    │       ├── java
    │       │   └── ocr
    │       │   │   └── nifi
    │       │   │       ├── conversion
    │       │   │           └── ConversionProcessor.java
    │       │   │       ├── extraction
    │       │   │           └── ExtractionProcessor.java
    │       │   │       ├── preprocessing
    │       │   │           └── PreprocessingProcessor.java
    │       │   │       ├── util
    │       │   │           └── JSONUtils.java
    │       │   │       └── validation
    │       │   │           ├── JsonValidator.java
    │       │   │           └── Validation.java
    │       ├── nifi
    │       │   └── templates
    │       │   │   └── scalable-ocr.xml
    │       └── resources
    │       │   ├── META-INF
    │       │       └── services
    │       │       │   └── org.apache.nifi.processor.Processor
    │       │   ├── log4j.properties
    │       │   └── logback.xml
    │   └── test
    │       └── java
    │           └── ocr
    │               └── nifi
    │                   ├── conversion
    │                       └── ConversionProcessorTest.java
    │                   ├── extraction
    │                       └── ExtractionProcessorTest.java
    │                   └── preprocessing
    │                       └── PreprocessingTest.java
├── pom.xml
├── preprocessing
    ├── README.md
    ├── pom.xml
    └── src
    │   ├── main
    │       ├── java
    │       │   └── ocr
    │       │   │   └── preprocessing
    │       │   │       └── conversion
    │       │   │           ├── CLIUtils.java
    │       │   │           ├── CleaningOptions.java
    │       │   │           ├── CommandFailedException.java
    │       │   │           ├── Handler.java
    │       │   │           ├── ImageUtils.java
    │       │   │           ├── TextCleaner.java
    │       │   │           └── handler
    │       │   │               ├── AdaptiveBlurringHandler.java
    │       │   │               ├── EnhancingHandler.java
    │       │   │               ├── FilterHandler.java
    │       │   │               ├── GrayscaleHandler.java
    │       │   │               ├── LayoutHandler.java
    │       │   │               ├── OffsetHandler.java
    │       │   │               ├── PadHandler.java
    │       │   │               ├── RotationHandler.java
    │       │   │               ├── SaturationHandler.java
    │       │   │               ├── SharpenHandler.java
    │       │   │               ├── SmoothingThresholdHandler.java
    │       │   │               ├── TrimHandler.java
    │       │   │               └── UnrotateHandler.java
    │       └── resources
    │       │   ├── log4j.properties
    │       │   └── logback.xml
    │   └── test
    │       ├── java
    │           └── ocr
    │           │   └── preprocessing
    │           │       └── conversion
    │           │           └── TextCleanerTest.java
    │       └── resources
    │           └── images
    │               ├── abbott2.jpg
    │               ├── brscan_original_r90-out.jpg
    │               └── brscan_original_r90.jpg
├── presentation
    ├── README.md
    ├── conference-rules.txt
    ├── scalable-ocr-hadoop-summit-2016.pptx
    └── text-detection.pdf
└── scripts
    ├── clinton_email_grabber.py
    └── metadata.csv


/.gitignore:
--------------------------------------------------------------------------------
 1 | # Covers Eclipse
 2 | # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and Webstorm
 3 | # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839
 4 | 
 5 | # User-specific stuff:
 6 | .idea/workspace.xml
 7 | .idea/tasks.xml
 8 | .idea/dictionaries
 9 | 
10 | # Sensitive or high-churn files:
11 | .idea/dataSources.ids
12 | .idea/dataSources.xml
13 | .idea/sqlDataSources.xml
14 | .idea/dynamic.xml
15 | .idea/uiDesigner.xml
16 | 
17 | # Gradle:
18 | .idea/gradle.xml
19 | .idea/libraries
20 | 
21 | # Mongo Explorer plugin:
22 | .idea/mongoSettings.xml
23 | 
24 | ## File-based project format:
25 | *.iws
26 | 
27 | ## Plugin-specific files:
28 | 
29 | # IntelliJ
30 | /out/
31 | 
32 | # mpeltonen/sbt-idea plugin
33 | .idea_modules/
34 | 
35 | # JIRA plugin
36 | atlassian-ide-plugin.xml
37 | 
38 | # Crashlytics plugin (for Android Studio and IntelliJ)
39 | com_crashlytics_export_strings.xml
40 | crashlytics.properties
41 | crashlytics-build.properties
42 | fabric.properties
43 | 
44 | # Intellij
45 | .idea/
46 | *.iml
47 | *.iws
48 | 
49 | # Mobile Tools for Java (J2ME)
50 | .mtj.tmp/
51 | 
52 | # virtual machine crash logs, see http://www.java.com/en/download/help/error_hotspot.xml
53 | hs_err_pid*
54 | 
55 | *~
56 | *.swp
57 | target/
58 | pig*.log
59 | logs/*
60 | temp/
61 | mockito-capture/
62 | 
63 | *.pydevproject
64 | .project
65 | .metadata
66 | bin/**
67 | tmp/**
68 | tmp/**/*
69 | *.tmp
70 | *.bak
71 | *.swp
72 | *~.nib
73 | local.properties
74 | .classpath
75 | .settings/
76 | .loadpath
77 | target/
78 | *.class
79 | *.factorypath
80 | 
81 | # External tool builders
82 | .externalToolBuilders/
83 | 
84 | # Locally stored "Eclipse launch configurations"
85 | *.launch
86 | 
87 | # CDT-specific
88 | .cproject
89 | 
90 | # PDT-specific
91 | .buildpath
92 | 
93 | .DS_Store
94 | 
95 | dependency-reduced-pom.xml
96 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
  1 |                                  Apache License
  2 |                            Version 2.0, January 2004
  3 |                         http://www.apache.org/licenses/
  4 | 
  5 |    TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
  6 | 
  7 |    1. Definitions.
  8 | 
  9 |       "License" shall mean the terms and conditions for use, reproduction,
 10 |       and distribution as defined by Sections 1 through 9 of this document.
 11 | 
 12 |       "Licensor" shall mean the copyright owner or entity authorized by
 13 |       the copyright owner that is granting the License.
 14 | 
 15 |       "Legal Entity" shall mean the union of the acting entity and all
 16 |       other entities that control, are controlled by, or are under common
 17 |       control with that entity. For the purposes of this definition,
 18 |       "control" means (i) the power, direct or indirect, to cause the
 19 |       direction or management of such entity, whether by contract or
 20 |       otherwise, or (ii) ownership of fifty percent (50%) or more of the
 21 |       outstanding shares, or (iii) beneficial ownership of such entity.
 22 | 
 23 |       "You" (or "Your") shall mean an individual or Legal Entity
 24 |       exercising permissions granted by this License.
 25 | 
 26 |       "Source" form shall mean the preferred form for making modifications,
 27 |       including but not limited to software source code, documentation
 28 |       source, and configuration files.
 29 | 
 30 |       "Object" form shall mean any form resulting from mechanical
 31 |       transformation or translation of a Source form, including but
 32 |       not limited to compiled object code, generated documentation,
 33 |       and conversions to other media types.
 34 | 
 35 |       "Work" shall mean the work of authorship, whether in Source or
 36 |       Object form, made available under the License, as indicated by a
 37 |       copyright notice that is included in or attached to the work
 38 |       (an example is provided in the Appendix below).
 39 | 
 40 |       "Derivative Works" shall mean any work, whether in Source or Object
 41 |       form, that is based on (or derived from) the Work and for which the
 42 |       editorial revisions, annotations, elaborations, or other modifications
 43 |       represent, as a whole, an original work of authorship. For the purposes
 44 |       of this License, Derivative Works shall not include works that remain
 45 |       separable from, or merely link (or bind by name) to the interfaces of,
 46 |       the Work and Derivative Works thereof.
 47 | 
 48 |       "Contribution" shall mean any work of authorship, including
 49 |       the original version of the Work and any modifications or additions
 50 |       to that Work or Derivative Works thereof, that is intentionally
 51 |       submitted to Licensor for inclusion in the Work by the copyright owner
 52 |       or by an individual or Legal Entity authorized to submit on behalf of
 53 |       the copyright owner. For the purposes of this definition, "submitted"
 54 |       means any form of electronic, verbal, or written communication sent
 55 |       to the Licensor or its representatives, including but not limited to
 56 |       communication on electronic mailing lists, source code control systems,
 57 |       and issue tracking systems that are managed by, or on behalf of, the
 58 |       Licensor for the purpose of discussing and improving the Work, but
 59 |       excluding communication that is conspicuously marked or otherwise
 60 |       designated in writing by the copyright owner as "Not a Contribution."
 61 | 
 62 |       "Contributor" shall mean Licensor and any individual or Legal Entity
 63 |       on behalf of whom a Contribution has been received by Licensor and
 64 |       subsequently incorporated within the Work.
 65 | 
 66 |    2. Grant of Copyright License. Subject to the terms and conditions of
 67 |       this License, each Contributor hereby grants to You a perpetual,
 68 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 69 |       copyright license to reproduce, prepare Derivative Works of,
 70 |       publicly display, publicly perform, sublicense, and distribute the
 71 |       Work and such Derivative Works in Source or Object form.
 72 | 
 73 |    3. Grant of Patent License. Subject to the terms and conditions of
 74 |       this License, each Contributor hereby grants to You a perpetual,
 75 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 76 |       (except as stated in this section) patent license to make, have made,
 77 |       use, offer to sell, sell, import, and otherwise transfer the Work,
 78 |       where such license applies only to those patent claims licensable
 79 |       by such Contributor that are necessarily infringed by their
 80 |       Contribution(s) alone or by combination of their Contribution(s)
 81 |       with the Work to which such Contribution(s) was submitted. If You
 82 |       institute patent litigation against any entity (including a
 83 |       cross-claim or counterclaim in a lawsuit) alleging that the Work
 84 |       or a Contribution incorporated within the Work constitutes direct
 85 |       or contributory patent infringement, then any patent licenses
 86 |       granted to You under this License for that Work shall terminate
 87 |       as of the date such litigation is filed.
 88 | 
 89 |    4. Redistribution. You may reproduce and distribute copies of the
 90 |       Work or Derivative Works thereof in any medium, with or without
 91 |       modifications, and in Source or Object form, provided that You
 92 |       meet the following conditions:
 93 | 
 94 |       (a) You must give any other recipients of the Work or
 95 |           Derivative Works a copy of this License; and
 96 | 
 97 |       (b) You must cause any modified files to carry prominent notices
 98 |           stating that You changed the files; and
 99 | 
100 |       (c) You must retain, in the Source form of any Derivative Works
101 |           that You distribute, all copyright, patent, trademark, and
102 |           attribution notices from the Source form of the Work,
103 |           excluding those notices that do not pertain to any part of
104 |           the Derivative Works; and
105 | 
106 |       (d) If the Work includes a "NOTICE" text file as part of its
107 |           distribution, then any Derivative Works that You distribute must
108 |           include a readable copy of the attribution notices contained
109 |           within such NOTICE file, excluding those notices that do not
110 |           pertain to any part of the Derivative Works, in at least one
111 |           of the following places: within a NOTICE text file distributed
112 |           as part of the Derivative Works; within the Source form or
113 |           documentation, if provided along with the Derivative Works; or,
114 |           within a display generated by the Derivative Works, if and
115 |           wherever such third-party notices normally appear. The contents
116 |           of the NOTICE file are for informational purposes only and
117 |           do not modify the License. You may add Your own attribution
118 |           notices within Derivative Works that You distribute, alongside
119 |           or as an addendum to the NOTICE text from the Work, provided
120 |           that such additional attribution notices cannot be construed
121 |           as modifying the License.
122 | 
123 |       You may add Your own copyright statement to Your modifications and
124 |       may provide additional or different license terms and conditions
125 |       for use, reproduction, or distribution of Your modifications, or
126 |       for any such Derivative Works as a whole, provided Your use,
127 |       reproduction, and distribution of the Work otherwise complies with
128 |       the conditions stated in this License.
129 | 
130 |    5. Submission of Contributions. Unless You explicitly state otherwise,
131 |       any Contribution intentionally submitted for inclusion in the Work
132 |       by You to the Licensor shall be under the terms and conditions of
133 |       this License, without any additional terms or conditions.
134 |       Notwithstanding the above, nothing herein shall supersede or modify
135 |       the terms of any separate license agreement you may have executed
136 |       with Licensor regarding such Contributions.
137 | 
138 |    6. Trademarks. This License does not grant permission to use the trade
139 |       names, trademarks, service marks, or product names of the Licensor,
140 |       except as required for reasonable and customary use in describing the
141 |       origin of the Work and reproducing the content of the NOTICE file.
142 | 
143 |    7. Disclaimer of Warranty. Unless required by applicable law or
144 |       agreed to in writing, Licensor provides the Work (and each
145 |       Contributor provides its Contributions) on an "AS IS" BASIS,
146 |       WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 |       implied, including, without limitation, any warranties or conditions
148 |       of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 |       PARTICULAR PURPOSE. You are solely responsible for determining the
150 |       appropriateness of using or redistributing the Work and assume any
151 |       risks associated with Your exercise of permissions under this License.
152 | 
153 |    8. Limitation of Liability. In no event and under no legal theory,
154 |       whether in tort (including negligence), contract, or otherwise,
155 |       unless required by applicable law (such as deliberate and grossly
156 |       negligent acts) or agreed to in writing, shall any Contributor be
157 |       liable to You for damages, including any direct, indirect, special,
158 |       incidental, or consequential damages of any character arising as a
159 |       result of this License or out of the use or inability to use the
160 |       Work (including but not limited to damages for loss of goodwill,
161 |       work stoppage, computer failure or malfunction, or any and all
162 |       other commercial damages or losses), even if such Contributor
163 |       has been advised of the possibility of such damages.
164 | 
165 |    9. Accepting Warranty or Additional Liability. While redistributing
166 |       the Work or Derivative Works thereof, You may choose to offer,
167 |       and charge a fee for, acceptance of support, warranty, indemnity,
168 |       or other liability obligations and/or rights consistent with this
169 |       License. However, in accepting such obligations, You may act only
170 |       on Your own behalf and on Your sole responsibility, not on behalf
171 |       of any other Contributor, and only if You agree to indemnify,
172 |       defend, and hold each Contributor harmless for any liability
173 |       incurred by, or claims asserted against, such Contributor by reason
174 |       of your accepting any such warranty or additional liability.
175 | 
176 |    END OF TERMS AND CONDITIONS
177 | 
178 |    APPENDIX: How to apply the Apache License to your work.
179 | 
180 |       To apply the Apache License to your work, attach the following
181 |       boilerplate notice, with the fields enclosed by brackets "{}"
182 |       replaced with your own identifying information. (Don't include
183 |       the brackets!)  The text should be enclosed in the appropriate
184 |       comment syntax for the file format. We also recommend that a
185 |       file or class name and description of purpose be included on the
186 |       same "printed page" as the copyright notice for easier
187 |       identification within third-party archives.
188 | 
189 |    Copyright {yyyy} {name of copyright owner}
190 | 
191 |    Licensed under the Apache License, Version 2.0 (the "License");
192 |    you may not use this file except in compliance with the License.
193 |    You may obtain a copy of the License at
194 | 
195 |        http://www.apache.org/licenses/LICENSE-2.0
196 | 
197 |    Unless required by applicable law or agreed to in writing, software
198 |    distributed under the License is distributed on an "AS IS" BASIS,
199 |    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 |    See the License for the specific language governing permissions and
201 |    limitations under the License.
202 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Scalable OCR
 2 | 
 3 | Welcome to the project
 4 | 
 5 | So much of our data is represented as human readable scans of documents.
 6 | However, analyzing such scans document by document does not scale, and
 7 | the need to ingest large numbers of PDFs or scanned documents is becoming
 8 | ever more common in almost all sectors. Inevitably these
 9 | scanned documents must be converted to text for analysis. And since
10 | dealing with unstructured data is one of the main selling points for a
11 | platform like Hadoop, it means that we must convert large volumes of
12 | potentially large documents into a textual representation. We will show
13 | you how to use scalable open source tooling (Apache NiFi and Tesseract) to convert large volumes of PDFs and ingest them into a platform that will allow you to analyze this data at scale.
14 | 
15 | # Modules
16 | 
17 | ### Core Modules
18 | - conversion - convert multi-page PDFs to single-page TIFF files
19 | - preprocessing - image correction for better text extraction during OCR
20 | - extraction - OCR images and output text
21 | 
22 | ### Utility
23 | - CLI - command line tool for manual pipeline process execution
24 | - NiFi - custom processors for exposing the core modules via NiFi. Workflow template.
25 | 
26 | # Developers
27 | 
28 | #### Cutting a release for ocr
29 | 
30 | ```bash
31 | mvn release:prepare -Dscm-connection.url=<scm readonly url> -Dscm-developer-connection.url=<scm read-write url>
32 | ```
33 | 
34 | **Note**: The main pom assumes `scm:git:<url>` - simply pass in the URL portion as a build parameter as shown above.
35 | 
36 | Examples: [maven scm](http://maven.apache.org/scm/git.html)
37 | 
38 | 1. local git - file://localhost/foo/bar/mygitrepodir
39 | 1. github connection url (readonly) - git://github.com/mmiklavc/myproject.git
40 | 1. github developer connection url (read/write) - git@github.com:mmiklavc/myproject.git
41 | 
42 | Performing the release prepare will do the following high-level steps:
43 | 
44 | 1. Change pom versions from X.X-SNAPSHOT to X.X
45 | 1. Commit the new poms for the release to Git
46 | 1. Tag the release commit in Git
47 | 1. Increment poms to a new SNAPSHOT version, e.g. Update from X.0-SNAPSHOT to X.1-SNAPSHOT
48 | 1. Commit the updated SNAPSHOT poms
49 | 
50 | *See the [Maven release prepare](http://maven.apache.org/maven-release/maven-release-plugin/examples/prepare-release.html) documentation for more detail*
51 | 
52 | 


--------------------------------------------------------------------------------
/cli/pom.xml:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0" encoding="UTF-8"?>
 2 | <project xmlns="http://maven.apache.org/POM/4.0.0"
 3 |          xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
 4 |          xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
 5 |     <modelVersion>4.0.0</modelVersion>
 6 | 
 7 |     <parent>
 8 |         <groupId>ocr</groupId>
 9 |         <artifactId>ocr</artifactId>
10 |         <version>1.0-SNAPSHOT</version>
11 |     </parent>
12 | 
13 |     <artifactId>cli</artifactId>
14 |     <name>cli</name>
15 | 
16 | 
17 |     <dependencies>
18 |         <dependency>
19 |             <groupId>ocr</groupId>
20 |             <artifactId>conversion</artifactId>
21 |             <version>${project.parent.version}</version>
22 |         </dependency>
23 |         <dependency>
24 |             <groupId>ocr</groupId>
25 |             <artifactId>extraction</artifactId>
26 |             <version>${project.parent.version}</version>
27 |         </dependency>
28 |         <dependency>
29 |             <groupId>ocr</groupId>
30 |             <artifactId>preprocessing</artifactId>
31 |             <version>${project.parent.version}</version>
32 |         </dependency>
33 | 
34 |     </dependencies>
35 |     <build>
36 |         <plugins>
37 |             <plugin>
38 |                 <artifactId>maven-compiler-plugin</artifactId>
39 |                 <version>3.1</version>
40 |                 <configuration>
41 |                     <source>1.8</source>
42 |                     <target>1.8</target>
43 |                 </configuration>
44 |             </plugin>
45 |             <plugin>
46 |                 <groupId>org.apache.maven.plugins</groupId>
47 |                 <artifactId>maven-shade-plugin</artifactId>
48 |                 <version>2.4.3</version>
49 |                 <executions>
50 |                     <execution>
51 |                         <phase>package</phase>
52 |                         <goals>
53 |                             <goal>shade</goal>
54 |                         </goals>
55 |                         <configuration>
56 |                             <filters>
57 |                                 <filter>
58 |                                     <artifact>*:*</artifact>
59 |                                     <excludes>
60 |                                         <exclude>META-INF/*.SF</exclude>
61 |                                         <exclude>META-INF/*.DSA</exclude>
62 |                                         <exclude>META-INF/*.RSA</exclude>
63 |                                     </excludes>
64 |                                 </filter>
65 |                             </filters>
66 |                             <!-- Additional configuration. -->
67 |                             <transformers>
68 |                                 <transformer implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer">
69 |                                     <manifestEntries>
70 |                                         <Main-Class>ocr.cli.CLI</Main-Class>
71 |                                         <Specification-Title>Java Advanced Imaging Image I/O Tools</Specification-Title>
72 |                                         <Specification-Version>1.1</Specification-Version>
73 |                                         <Specification-Vendor>Sun Microsystems, Inc.</Specification-Vendor>
74 |                                         <Implementation-Title>com.sun.media.imageio</Implementation-Title>
75 |                                         <Implementation-Version>1.1</Implementation-Version>
76 |                                         <Implementation-Vendor>Sun Microsystems, Inc.</Implementation-Vendor>
77 |                                         <Extension-Name>com.sun.media.imageio</Extension-Name>
78 |                                     </manifestEntries>
79 |                                 </transformer>
80 |                                 <transformer implementation="org.apache.maven.plugins.shade.resource.ServicesResourceTransformer" />
81 |                             </transformers>
82 |                         </configuration>
83 |                     </execution>
84 |                 </executions>
85 |             </plugin>
86 |         </plugins>
87 |     </build>
88 | </project>
89 | 


--------------------------------------------------------------------------------
/cli/src/main/java/ocr/cli/CLI.java:
--------------------------------------------------------------------------------
  1 | package ocr.cli;
  2 | 
  3 | import com.google.common.base.Joiner;
  4 | import com.google.common.base.Splitter;
  5 | import com.google.common.collect.Iterables;
  6 | import net.sourceforge.tess4j.TesseractException;
  7 | import ocr.conversion.Converter;
  8 | import ocr.extraction.tesseract.TesseractUtil;
  9 | import ocr.preprocessing.conversion.CLIUtils;
 10 | import ocr.preprocessing.conversion.CleaningOptions;
 11 | import ocr.preprocessing.conversion.CommandFailedException;
 12 | import ocr.preprocessing.conversion.TextCleaner;
 13 | import org.apache.commons.cli.*;
 14 | import org.apache.commons.io.IOUtils;
 15 | 
 16 | import java.io.*;
 17 | import java.util.*;
 18 | import java.util.function.Function;
 19 | 
 20 | public class CLI {
 21 |   public static enum OcrOptions { // Command-line options of the OCR CLI; each constant builds and owns its commons-cli Option.
 22 |     HELP("h", code -> {
 23 |       Option o = new Option(code, "help", false, "This screen");
 24 |       o.setRequired(false);
 25 |       return o;
 26 |     }),
 27 |     INPUT("i", code -> {
 28 |       Option o = new Option(code, "input", true, "Single Input File");
 29 |       o.setRequired(false);
 30 |       o.setArgName("INPUT");
 31 |       return o;
 32 |     }),
 33 |     INPUT_DIR("id", code -> {
 34 |       Option o = new Option(code, "input_dir", true, "Input Directory"); // long name must differ from INPUT's "input"
 35 |       o.setRequired(false);
 36 |       o.setArgName("DIR");
 37 |       return o;
 38 |     }),
 39 |     INPUT_FILE("if", code -> {
 40 |       Option o = new Option(code, "input_file", true, "Input File");
 41 |       o.setRequired(false);
 42 |       o.setArgName("FILE");
 43 |       return o;
 44 |     }),
 45 |     OUTPUT("o", code -> {
 46 |       Option o = new Option(code, "output", true, "Output Directory");
 47 |       o.setRequired(false);
 48 |       o.setArgName("DIR");
 49 |       return o;
 50 |     }),
 51 |     PREPROCESSING("p", code -> {
 52 |       Option o = new Option(code, "preprocessing", true, "Preprocessing Config");
 53 |       o.setRequired(false);
 54 |       o.setArgName("Preprocessing Configs");
 55 |       return o;
 56 |     }),
 57 |     TEMP_DIR("t", code -> {
 58 |       Option o = new Option(code, "temp_dir", true, "Temp Dir");
 59 |       o.setRequired(false);
 60 |       o.setArgName("DIR");
 61 |       return o;
 62 |     }),
 63 |     LIB_PATH("l", code -> {
 64 |       Option o = new Option(code, "lib_path", true, "jna library path");
 65 |       o.setRequired(false);
 66 |       o.setArgName("DIR");
 67 |       return o;
 68 |     }),
 69 |     CONVERT_PATH("c", code -> {
 70 |       Option o = new Option(code, "convert_path", true, "Path to the Convert utility");
 71 |       o.setRequired(false);
 72 |       o.setArgName("PATH");
 73 |       return o;
 74 |     }),
 75 |     TESSDATA_PATH("d", code -> {
 76 |       Option o = new Option(code, "tess_data_path", true, "Path to TESS_DATA");
 77 |       o.setRequired(false);
 78 |       o.setArgName("PATH");
 79 |       return o;
 80 |     }),
 81 |     TESSPROPERTIES("D", code -> // two-argument option with a value separator; read back through getProperties
 82 |      OptionBuilder.withArgName( "property=value" )
 83 |               .hasArgs(2)
 84 |               .withValueSeparator()
 85 |               .withDescription( "Tesseract variables" )
 86 |               .create( code )
 87 | 
 88 |     ),
 89 |     PHASE("ph", code -> {
 90 |       Option o = new Option(code, "phases", true, "Which phases to run: [convert|preprocess|ocr]");
 91 |       o.setRequired(false);
 92 |       o.setArgName("PHASE");
 93 |       return o;
 94 |     });
 95 |     Option option; // the commons-cli definition built by this constant's handler
 96 |     String shortCode; // short flag used both to build the option and to look it up after parsing
 97 |     OcrOptions(String shortCode
 98 |               , Function<String, Option> optionHandler
 99 |                  ) {
100 |       this.shortCode = shortCode;
101 |       this.option = optionHandler.apply(shortCode);
102 | 
103 |     }
104 |     /** True when this option was supplied on the parsed command line. */
105 |     public boolean has(CommandLine cli) {
106 |       return cli.hasOption(shortCode);
107 |     }
108 |     /** Value supplied for this option, or null when CommandLine.getOptionValue has none to return. */
109 |     public String get(CommandLine cli) {
110 |       return cli.getOptionValue(shortCode);
111 |     }
112 |     public String get(CommandLine cli, String def) { // same as get(cli), but yields def when the option is absent
113 |       return has(cli)?cli.getOptionValue(shortCode):def;
114 |     }
115 |     /** Copies this option's parsed property=value pairs into a String-to-String map. */
116 |     public Map<String, String> getProperties(CommandLine cli) {
117 |       Properties p = cli.getOptionProperties(shortCode);
118 |       Map<String, String> ret = new HashMap<>();
119 |       for(Map.Entry<Object, Object> kv : p.entrySet()) {
120 |         ret.put(kv.getKey().toString(), kv.getValue().toString());
121 |       }
122 |       return ret;
123 |     }
124 | 
125 |     /** Parses args against getOptions(); on help prints usage and exits 0, on a parse failure prints usage and rethrows. */
126 |     public static CommandLine parse(CommandLineParser parser, String[] args) throws ParseException {
127 |       try {
128 |         CommandLine cli = parser.parse(getOptions(), args);
129 |         if(HELP.has(cli)) {
130 |           printHelp();
131 |           System.exit(0);
132 |         }
133 |         return cli;
134 |       } catch (ParseException e) {
135 |         System.err.println("Unable to parse args: " + Joiner.on(' ').join(args));
136 |         e.printStackTrace(System.err);
137 |         printHelp();
138 |         throw e;
139 |       }
140 |     }
141 |     /** Prints the usage screen, titled "OCRCLI", for every option returned by getOptions(). */
142 |     public static void printHelp() {
143 |       HelpFormatter formatter = new HelpFormatter();
144 |       formatter.printHelp( "OCRCLI", getOptions());
145 |     }
146 |     /** Builds a fresh Options instance holding the Option of every enum constant. */
147 |     public static Options getOptions() {
148 |       Options ret = new Options();
149 |       for(OcrOptions o : OcrOptions.values()) {
150 |         ret.addOption(o.option);
151 |       }
152 |       return ret;
153 |     }
154 |   }
155 | 
156 |   public static Set<String> getAlreadyProcessed(File outputDir) {
157 |     Set<String> ret = new HashSet<>();
158 |     for(File f : outputDir.listFiles()) {
159 |       ret.add(stripSuffix(f.getName()));
160 |     }
161 |     return ret;
162 |   }
163 | 
164 |   public static String stripSuffix(String filename) {
165 |     if(filename.contains(".")) {
166 |       return Iterables.getFirst(Splitter.on(".").split(filename), null);
167 |     }
168 |     else {
169 |       return filename;
170 |     }
171 |   }
172 | 
173 |   public static List<File> filterFilesToProcess(Iterable<File> files, Set<String> alreadyProcessed) {
174 |     List<File> ret = new ArrayList<>();
175 |     for(File f : files) {
176 |       if(!alreadyProcessed.contains(stripSuffix(f.getName()))) {
177 |         ret.add(f);
178 |       }
179 |       else {
180 |         System.out.println("Skipping " + f.getName());
181 |       }
182 |     }
183 |     return ret;
184 |   }
185 | 
186 |   public static List<File> extractFilesFromFile(File inputFile) throws IOException {
187 |     BufferedReader br = new BufferedReader(new FileReader(inputFile));
188 |     List<File> ret = new ArrayList<>();
189 |     for(String line = null; (line = br.readLine()) != null;) {
190 |       ret.add(new File(line));
191 |     }
192 |     return ret;
193 |   }
194 | 
195 |   public static List<File> extractFilesFromDirectory(File inputDir) throws IOException {
196 |     List<File> ret = new ArrayList<>();
197 |     for(File f : inputDir.listFiles()) {
198 |       ret.add(f);
199 |     }
200 |     return ret;
201 |   }
202 | 
203 |   public static void main(String... argv) throws ParseException, IOException, CommandFailedException, TesseractException {
204 |     PosixParser parser = new PosixParser();
205 |     CommandLine cli = OcrOptions.parse(parser, argv);
206 |     String phase = "all";
207 |     if(OcrOptions.PHASE.has(cli)) {
208 |       phase = OcrOptions.PHASE.get(cli);
209 |     }
210 |     System.getProperties().setProperty("jna.library.path", OcrOptions.LIB_PATH.get(cli, "/opt/local/lib"));
211 |     String preprocessingDef = OcrOptions.PREPROCESSING.get(cli);
212 |     String tempDirStr = OcrOptions.TEMP_DIR.get(cli, "/tmp");
213 |     List<File> files = null;
214 |     File outDir = new File(OcrOptions.OUTPUT.get(cli, "."));
215 |     Set<String> alreadyProcessed = getAlreadyProcessed(outDir);
216 |     Map<String, String> tessProperties = OcrOptions.TESSPROPERTIES.getProperties(cli);
217 |     if(OcrOptions.INPUT.has(cli)) {
218 |       files = Arrays.asList(new File(OcrOptions.INPUT.get(cli)));
219 |     } else if(OcrOptions.INPUT_FILE.has(cli)) {
220 |       files = filterFilesToProcess(extractFilesFromFile(new File(OcrOptions.INPUT_FILE.get(cli))), alreadyProcessed);
221 |     } else if(OcrOptions.INPUT_DIR.has(cli)){
222 |       files = filterFilesToProcess(extractFilesFromDirectory(new File(OcrOptions.INPUT_DIR.get(cli))), alreadyProcessed);
223 |     } else {
224 |       throw new IllegalStateException("Must specify one of input, input directory or input file");
225 |     }
226 |     File tempDir = new File(tempDirStr);
227 |     String convertPath = OcrOptions.CONVERT_PATH.get(cli, "/usr/local/bin/convert");
228 |     File tessDataPath = new File(OcrOptions.TESSDATA_PATH.get(cli,"/usr/local/Cellar/tesseract/3.04.01_1/share/tessdata/"));
229 |     CommandLine cleaningCli = CleaningOptions.parse(new DefaultParser(), CLIUtils.translateCommandline(preprocessingDef) );
230 |     final TextCleaner cleaner = CleaningOptions.createTextCleaner(cleaningCli, convertPath, tempDirStr);
231 |     int i = 0;
232 |     for(File f : files) {
233 |       System.out.println("Processing " + f.getName() + " (" + i++ + " / " + files.size()+ ")");
234 |       int pageNumber = 0;
235 |       if("all".equals(phase)) {
236 |         for (Map.Entry<File, Boolean> page : toPages(new BufferedInputStream(new FileInputStream(f)), tempDir)) {
237 |           pageNumber++;
238 |           System.out.println("Page " + pageNumber);
239 |           try {
240 |             if (page.getValue()) {
241 |               byte[] converted = cleaner.convert(new BufferedInputStream(new FileInputStream(page.getKey())));
242 |               writePreprocessed(converted, new File(outDir, f.getName() + "-" + pageNumber + ".tiff"));
243 |               String pageText = TesseractUtil.INSTANCE.ocr(converted, tessDataPath, tessProperties);
244 |               String fileName = f.getName() + "-" + pageNumber + ".txt";
245 |               File outFile = new File(outDir, fileName);
246 |               try (PrintWriter pw = new PrintWriter(outFile)) {
247 |                 IOUtils.write(pageText, pw);
248 |                 pw.flush();
249 |               }
250 |             }
251 |           } finally {
252 |             page.getKey().delete();
253 |           }
254 |         }
255 |       } else {
256 |         switch(phase) {
257 |           case "convert" :
258 |             toPages(new BufferedInputStream(new FileInputStream(f)), outDir);
259 |             return;
260 |           case "preprocess" :
261 |             byte[] converted = cleaner.convert(new BufferedInputStream(new FileInputStream(f)));
262 |             writePreprocessed(converted, new File(outDir, f.getName() + "-" + pageNumber + ".tiff"));
263 |             return;
264 |           case "ocr" :
265 |             try (FileInputStream fis = new FileInputStream(f)) {
266 |               byte[] inFile = IOUtils.toByteArray(fis);
267 |               String pageText = TesseractUtil.INSTANCE.ocr(inFile, tessDataPath, tessProperties);
268 |               String fileName = f.getName() + "-" + pageNumber + ".txt";
269 |               File outFile = new File(outDir, fileName);
270 |               try (PrintWriter pw = new PrintWriter(outFile)) {
271 |                 IOUtils.write(pageText, pw);
272 |                 pw.flush();
273 |               }
274 |             }
275 |             return;
276 |           default :
277 |             throw new IllegalArgumentException("Unknown phase: " + phase);
278 |         }
279 |       }
280 |     }
281 |   }
282 | 
283 |   private static void writePreprocessed(byte[] converted, File file) {
284 |     try(FileOutputStream fos = new FileOutputStream(file)) {
285 |       IOUtils.write(converted, fos);
286 |       fos.flush();
287 |     } catch (IOException e) {
288 |       e.printStackTrace();
289 |     }
290 |   }
291 | 
292 |   private static List<Map.Entry<File, Boolean>> toPages(InputStream in, File tempDir) {
293 |     Converter converter = new Converter();
294 |     if(!tempDir.exists()) {
295 |       tempDir.mkdirs();
296 |     }
297 |     List<Map.Entry<File, Boolean>> ret = new ArrayList<>();
298 |     for(Map.Entry<File, Boolean> kv : converter.toJava(converter.convert(in, tempDir))) {
299 |       ret.add(new AbstractMap.SimpleEntry<>(kv.getKey(), kv.getValue()));
300 |     }
301 |     return ret;
302 |   }
303 | }
304 | 


--------------------------------------------------------------------------------
/common/pom.xml:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0" encoding="UTF-8"?>
 2 | <project xmlns="http://maven.apache.org/POM/4.0.0"
 3 |          xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
 4 |          xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
 5 |     <parent>
 6 |         <artifactId>ocr</artifactId>
 7 |         <groupId>ocr</groupId>
 8 |         <version>1.0-SNAPSHOT</version>
 9 |     </parent>
10 |     <modelVersion>4.0.0</modelVersion>
11 | 
12 |     <artifactId>common</artifactId>
13 | 
14 | </project>
15 | 


--------------------------------------------------------------------------------
/common/src/main/java/ocr/common/Util.java:
--------------------------------------------------------------------------------
 1 | package ocr.common;
 2 | 
 3 | import java.io.File;
 4 | import java.util.Optional;
 5 | import java.util.function.Function;
 6 | 
 7 | public class Util {
 8 |     public enum Locations {
 9 |         CONVERT(new String[]{
10 |                 "/usr/local/bin/convert",
11 |                 "/opt/local/bin/convert"
12 |         }, t -> findFile(t, "convert tool")),
13 |         TESSDATA(new String[]{
14 |                 "/opt/local/share/tessdata/",
15 |                 "/usr/local/Cellar/tesseract/3.04.01_1/share/tessdata/"
16 |         }, t -> findFile(t, "tessdata")),
17 |         JNA(new String[]{
18 |                 "/opt/local/lib"
19 |         }, t -> findDir(t, "jna library"));
20 | 
21 |         private final String[] locs;
22 |         private final Function<String[], Optional<File>> searchHandler;
23 | 
24 |         Locations(String[] locs, Function<String[], Optional<File>> searchHandler) {
25 |             this.locs = locs;
26 |             this.searchHandler = searchHandler;
27 |         }
28 | 
29 |         public Optional<File> find() {
30 |             return searchHandler.apply(locs);
31 |         }
32 | 
33 |         public Optional<File> find(Optional<?> path) {
34 |             if (path.isPresent()) {
35 |                 File f = new File(path.get().toString());
36 |                 if (f.exists()) {
37 |                     return Optional.of(f);
38 |                 }
39 |             }
40 |             return find();
41 |         }
42 |     }
43 | 
44 |     public static Optional<File> findFile(String[] locs, String item) {
45 |         return findFile(locs, item, false);
46 |     }
47 | 
48 |     public static Optional<File> findDir(String[] locs, String item) {
49 |         return findFile(locs, item, true);
50 |     }
51 | 
52 |     public static Optional<File> findFile(String[] locs, String item, boolean checkIsDir) {
53 |         for (String loc : locs) {
54 |             File binPath = new File(loc);
55 |             if (binPath.exists()) {
56 |                 if (checkIsDir) {
57 |                     if (binPath.isDirectory()) {
58 |                         return Optional.of(binPath);
59 |                     }
60 |                     continue;
61 |                 }
62 |                 return Optional.of(binPath);
63 |             }
64 |         }
65 |         return Optional.empty();
66 |     }
67 | 
68 | }
69 | 


--------------------------------------------------------------------------------
/conversion/README.md:
--------------------------------------------------------------------------------
 1 | # Conversion
 2 | 
 3 | Convert images from PDF to an image format
 4 | 
 5 | #### Requirements
 6 | 
 7 | Must have Ghostscript installed
 8 | 
 9 | Example on Mac
10 | 
11 | ```bash
12 | sudo port install ghostscript
13 | ```
14 | 
15 | #### Running project from command line
16 | 
17 | Note: On Mac you will need to specify the JNA native library path, as the Maven JARs for Ghostscript do not contain the required binary dependencies.
18 | More details can be found in this [Stack Overflow response](http://stackoverflow.com/a/36533605/2163229)
19 | 
20 | ```bash
21 | $ mvn exec:java -Dexec.mainClass="ocr.conversion.Driver" -Dexec.args="pdf-file out-dir"
22 | 
23 | or, including a custom JNA path
24 | 
25 | $ mvn -Djna.library.path=/opt/local/lib/ exec:java -Dexec.mainClass="ocr.conversion.Driver" -Dexec.args="pdf-file out-dir"
26 | ```
27 | 
28 | **Reference**
29 | 
30 | * [Ghost4j](http://www.ghost4j.org/)
31 | 


--------------------------------------------------------------------------------
/conversion/pom.xml:
--------------------------------------------------------------------------------
  1 | <?xml version="1.0" encoding="UTF-8"?>
  2 | <project xmlns="http://maven.apache.org/POM/4.0.0"
  3 |          xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
  4 |          xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  5 |     <modelVersion>4.0.0</modelVersion>
  6 | 
  7 |     <parent>
  8 |         <groupId>ocr</groupId>
  9 |         <artifactId>ocr</artifactId>
 10 |         <version>1.0-SNAPSHOT</version>
 11 |     </parent>
 12 | 
 13 |     <artifactId>conversion</artifactId>
 14 |     <name>conversion</name>
 15 |     <repositories>
 16 |         <repository>
 17 |             <!-- Not needed for "*-geoapi-3.0" versions -->
 18 |             <id>geotoolkit</id>
 19 |             <name>Geotk repository</name>
 20 |             <url>http://maven.geotoolkit.org</url>
 21 |         </repository>
 22 |     </repositories>
 23 |     <properties>
 24 |         <!-- plugin versions -->
 25 | 
 26 |         <!-- main dependency versions -->
 27 |         <ghost4j.version>1.0.1</ghost4j.version>
 28 |         <bouncycastle.version>1.46</bouncycastle.version>
 29 |     </properties>
 30 | 
 31 |     <dependencies>
 32 |         <dependency>
 33 |             <groupId>org.bouncycastle</groupId>
 34 |             <artifactId>bcprov-jdk15on</artifactId>
 35 |             <version>${bouncycastle.version}</version>
 36 |         </dependency>
 37 |         <dependency>
 38 |             <groupId>org.bouncycastle</groupId>
 39 |             <artifactId>bcmail-jdk15on</artifactId>
 40 |             <version>${bouncycastle.version}</version>
 41 |         </dependency>
 42 |         <dependency>
 43 |             <groupId>org.geotoolkit</groupId>
 44 |             <artifactId>geotk-coverageio</artifactId>
 45 |             <version>3.17</version>
 46 |         </dependency>
 47 |         <dependency>
 48 |             <groupId>org.scala-lang</groupId>
 49 |             <artifactId>scala-library</artifactId>
 50 |         </dependency>
 51 |         <dependency>
 52 |             <groupId>org.scalactic</groupId>
 53 |             <artifactId>scalactic_${scala.binary.version}</artifactId>
 54 |         </dependency>
 55 |         <dependency>
 56 |             <groupId>org.scalatest</groupId>
 57 |             <artifactId>scalatest_${scala.binary.version}</artifactId>
 58 |         </dependency>
 59 |         <dependency>
 60 |             <groupId>org.ghost4j</groupId>
 61 |             <artifactId>ghost4j</artifactId>
 62 |             <version>${ghost4j.version}</version>
 63 |         </dependency>
 64 | 
 65 |     </dependencies>
 66 | 
 67 |     <build>
 68 |         <plugins>
 69 |             <plugin>
 70 |                 <groupId>net.alchim31.maven</groupId>
 71 |                 <artifactId>scala-maven-plugin</artifactId>
 72 |             </plugin>
 73 |             <!-- disable surefire -->
 74 |             <plugin>
 75 |                 <groupId>org.apache.maven.plugins</groupId>
 76 |                 <artifactId>maven-surefire-plugin</artifactId>
 77 |                 <configuration>
 78 |                     <skipTests>true</skipTests>
 79 |                 </configuration>
 80 |             </plugin>
 81 |             <!-- enable scalatest -->
 82 |             <plugin>
 83 |                 <groupId>org.scalatest</groupId>
 84 |                 <artifactId>scalatest-maven-plugin</artifactId>
 85 |             </plugin>
 86 |         </plugins>
 87 |     </build>
 88 | 
 89 |     <reporting>
 90 |         <plugins>
 91 |             <plugin>
 92 |                 <groupId>net.alchim31.maven</groupId>
 93 |                 <artifactId>scala-maven-plugin</artifactId>
 94 |                 <reportSets>
 95 |                     <reportSet>
 96 |                         <reports>
 97 |                             <report>doc-jar</report>
 98 |                             <report>doc</report>
 99 |                         </reports>
100 |                     </reportSet>
101 |                 </reportSets>
102 |             </plugin>
103 |         </plugins>
104 |     </reporting>
105 | 
106 | </project>
107 | 


--------------------------------------------------------------------------------
/conversion/src/main/java/ocr/conversion/AlmostSimpleRenderer.java:
--------------------------------------------------------------------------------
  1 | /*
  2 | * Ghost4J: a Java wrapper for Ghostscript API.
  3 | *
  4 | * Distributable under LGPL license.
  5 | * See terms of license at http://www.gnu.org/licenses/lgpl.html.
  6 | */
  7 | package ocr.conversion;
  8 | 
  9 | import org.ghost4j.Ghostscript;
 10 | import org.ghost4j.GhostscriptException;
 11 | import org.ghost4j.display.PageRaster;
 12 | import org.ghost4j.display.PageRasterDisplayCallback;
 13 | import org.ghost4j.document.Document;
 14 | import org.ghost4j.document.DocumentException;
 15 | import org.ghost4j.document.PDFDocument;
 16 | import org.ghost4j.document.PSDocument;
 17 | import org.ghost4j.renderer.AbstractRemoteRenderer;
 18 | import org.ghost4j.renderer.RendererException;
 19 | import org.ghost4j.util.DiskStore;
 20 | 
 21 | import java.io.IOException;
 22 | import java.util.Arrays;
 23 | import java.util.List;
 24 | 
 25 | public class AlmostSimpleRenderer extends AbstractRemoteRenderer {
 26 | 
 27 |     public static final int OPTION_ANTIALIASING_NONE = 0;
 28 |     public static final int OPTION_ANTIALIASING_LOW = 1;
 29 |     public static final int OPTION_ANTIALIASING_MEDIUM = 2;
 30 |     public static final int OPTION_ANTIALIASING_HIGH = 4;
 31 | 
 32 |     /**
 33 |      * Define subsample antialiasing level (default is high).
 34 |      */
 35 |     private int antialiasing = OPTION_ANTIALIASING_HIGH;
 36 | 
 37 |     /**
 38 |      * Define renderer output resolution in DPI (default is 75dpi).
 39 |      */
 40 |     private int resolution = 75;
 41 | 
 42 |     public AlmostSimpleRenderer() {
 43 | 
 44 |         // set supported classes
 45 |         supportedDocumentClasses = new Class[2];
 46 |         supportedDocumentClasses[0] = PDFDocument.class;
 47 |         supportedDocumentClasses[1] = PSDocument.class;
 48 |     }
 49 | 
 50 |     /**
 51 |      * Main method used to start the renderer in standalone 'slave mode'.
 52 |      *
 53 |      * @param args
 54 |      * @throws RendererException
 55 |      */
 56 |     public static void main(String[] args) throws RendererException {
 57 | 
 58 |         startRemoteRenderer(new org.ghost4j.renderer.SimpleRenderer());
 59 |     }
 60 | 
 61 |     @Override
 62 |     public List<PageRaster> run(Document document, int begin, int end)
 63 |             throws IOException, RendererException, DocumentException {
 64 | 
 65 |         // assert document is supported
 66 |         this.assertDocumentSupported(document);
 67 | 
 68 |         // get Ghostscript instance
 69 |         Ghostscript gs = Ghostscript.getInstance();
 70 | 
 71 |         // generate a unique diskstore key for input file
 72 |         DiskStore diskStore = DiskStore.getInstance();
 73 |         String inputDiskStoreKey = diskStore.generateUniqueKey();
 74 |         // write document to input file
 75 |         document.write(diskStore.addFile(inputDiskStoreKey));
 76 | 
 77 |         // create display callback
 78 |         PageRasterDisplayCallback displayCallback = new PageRasterDisplayCallback();
 79 | 
 80 |         // prepare args
 81 |         // ** ADDED REDIRECTION OF OUTPUT TO SILENCE LOG NOISE **
 82 |         String[] gsArgs = {"-sstdout=%stderr", "-dQUIET", "-dNOPAUSE", "-dBATCH", "-dSAFER",
 83 |                 "-dFirstPage=" + (begin + 1), "-dLastPage=" + (end + 1),
 84 |                 "-sDEVICE=display", "-sDisplayHandle=0",
 85 |                 "-dDisplayFormat=16#804", "-r" + this.getResolution()};
 86 | 
 87 |         // antialiasing
 88 |         if (this.antialiasing != OPTION_ANTIALIASING_NONE) {
 89 |             gsArgs = Arrays.copyOf(gsArgs, gsArgs.length + 2);
 90 |             gsArgs[gsArgs.length - 2] = "-dTextAlphaBits=" + this.antialiasing;
 91 |             gsArgs[gsArgs.length - 1] = "-dGraphicsAlphaBits="
 92 |                     + this.antialiasing;
 93 |         }
 94 | 
 95 |         // add file path args
 96 |         gsArgs = Arrays.copyOf(gsArgs, gsArgs.length + 2);
 97 |         gsArgs[gsArgs.length - 2] = "-f";
 98 |         gsArgs[gsArgs.length - 1] = diskStore.getFile(inputDiskStoreKey).getAbsolutePath();
 99 | 
100 |         // execute and exit interpreter
101 |         try {
102 |             synchronized (gs) {
103 | 
104 |                 // set display callback
105 |                 gs.setDisplayCallback(displayCallback);
106 | 
107 |                 gs.initialize(gsArgs);
108 |                 gs.exit();
109 | 
110 |             }
111 |         } catch (GhostscriptException e) {
112 | 
113 |             throw new RendererException(e);
114 | 
115 |         } finally {
116 | 
117 |             // delete Ghostscript instance
118 |             try {
119 |                 Ghostscript.deleteInstance();
120 |             } catch (GhostscriptException e) {
121 |                 throw new RendererException(e);
122 |             }
123 | 
124 |             // remove temporary file
125 |             diskStore.removeFile(inputDiskStoreKey);
126 |         }
127 | 
128 |         return displayCallback.getRasters();
129 | 
130 |     }
131 | 
132 |     public int getAntialiasing() {
133 |         return antialiasing;
134 |     }
135 | 
136 |     public void setAntialiasing(int antialiasing) {
137 |         this.antialiasing = antialiasing;
138 |     }
139 | 
140 |     public int getResolution() {
141 |         return resolution;
142 |     }
143 | 
144 |     public void setResolution(int resolution) {
145 |         this.resolution = resolution;
146 |     }
147 | }
148 | 
149 | 


--------------------------------------------------------------------------------
/conversion/src/main/resources/log4j.properties:
--------------------------------------------------------------------------------
 1 | # Set root logger level to INFO and its only appender to STDOUT.
 2 | log4j.rootLogger=INFO, STDOUT
 3 | 
 4 | # STDOUT is set to be a ConsoleAppender.
 5 | log4j.appender.STDOUT=org.apache.log4j.ConsoleAppender
 6 | 
 7 | # STDOUT uses PatternLayout.
 8 | log4j.appender.STDOUT.layout=org.apache.log4j.PatternLayout
 9 | log4j.appender.STDOUT.layout.ConversionPattern=%-4r [%t] %-5p %c %x - %m %C%n
10 | 


--------------------------------------------------------------------------------
/conversion/src/main/resources/logback.xml:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0" encoding="UTF-8"?>
 2 | 
 3 | <!-- For assistance related to logback-translator or configuration  -->
 4 | <!-- files in general, please contact the logback user mailing list -->
 5 | <!-- at http://www.qos.ch/mailman/listinfo/logback-user             -->
 6 | <!--                                                                -->
 7 | <!-- For professional support please see                            -->
 8 | <!--    http://www.qos.ch/shop/products/professionalSupport         -->
 9 | <!--                                                                -->
10 | <configuration>
11 |     <appender name="STDOUT" class="ch.qos.logback.core.ConsoleAppender">
12 |         <encoder>
13 |             <pattern>%d{HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n</pattern>
14 |         </encoder>
15 |     </appender>
16 |     <root level="INFO">
17 |         <appender-ref ref="STDOUT"/>
18 |     </root>
19 | </configuration>
20 | 


--------------------------------------------------------------------------------
/conversion/src/main/scala/ocr/conversion/ConfigOptions.scala:
--------------------------------------------------------------------------------
 1 | package ocr.conversion
 2 | 
 3 | import java.io.File
 4 | 
 5 | class ConfigOptions(pdf: File, outDir: File, jnaLibPath: Option[String]) {
 6 |     def getPdfFile() = pdf
 7 | 
 8 |     def getOutDir() = outDir
 9 | 
10 |     def getJnaLibPath = jnaLibPath
11 | }
12 | 


--------------------------------------------------------------------------------
/conversion/src/main/scala/ocr/conversion/Converter.scala:
--------------------------------------------------------------------------------
 1 | package ocr.conversion
 2 | 
 3 | import java.awt.image.RenderedImage
 4 | import java.io.{File, FileInputStream, InputStream}
 5 | import java.util.UUID
 6 | import javax.imageio.ImageIO
 7 | import javax.imageio.spi.IIORegistry
 8 | 
 9 | import com.google.common.base.Splitter
10 | import com.google.common.collect.Iterables
11 | import org.apache.commons.io.IOUtils
12 | import org.geotoolkit.image.io.plugin.RawTiffImageReader
13 | import org.ghost4j.document.PDFDocument
14 | 
15 | import scala.collection.JavaConversions._
16 | 
/**
  * Renders every page of a PDF to a TIFF file at 300 dpi.
  */
class Converter {

    // NOTE(review): a nested Scala `object` is initialised lazily on first reference, and
    // nothing in this class references StaticConfig, so this registration may never run --
    // confirm whether the raw TIFF reader is actually needed.
    object StaticConfig {
        IIORegistry.getDefaultInstance()
            .registerServiceProvider(new RawTiffImageReader.Spi());
    }

    /**
      * Renders the PDF read from `in` and writes one TIFF per page into `outDir`.
      *
      * Files are named `<uuid>-<pageIndex>.tiff`, with one random UUID per call and a
      * zero-based page index. The stream is read but not closed here.
      *
      * @param in     stream holding the PDF
      * @param outDir existing directory that receives the page images
      * @return one pair per page: the target file and the result of `ImageIO.write` for it
      */
    def convert(in: InputStream, outDir: File): List[Tuple2[File, Boolean]] = {
        val document = new PDFDocument()
        document.load(IOUtils.toBufferedInputStream(in))

        val renderer: AlmostSimpleRenderer = new AlmostSimpleRenderer()
        renderer.setResolution(300)
        val images = renderer.render(document)
        val uuid = UUID.randomUUID().toString
        images.toList
            .zipWithIndex.map {
            case (img, i) => {
                val outFile = new File(outDir, uuid + "-" + i + ".tiff")
                Tuple2(outFile, ImageIO.write(img.asInstanceOf[RenderedImage], "tif", outFile))
            }
        }
    }

    /**
      * Converts the PDF named by `config`, creating the output directory if necessary.
      *
      * @param config conversion settings
      * @return the page image files, in page order
      */
    def convert(config: ConfigOptions): List[File] = {
        config.getOutDir().mkdirs()
        val in = new FileInputStream(config.getPdfFile())
        // The result is a strict List, so all pages are written before the stream is closed.
        try {
            convert(in, config.getOutDir())
                .map {
                    case (f, _) => f
                }
        } finally {
            in.close()
        }
    }

    /**
      * Copies the Scala result of `convert` into a Java list of map entries for Java callers.
      *
      * @param in pairs as returned by `convert`
      * @return a new `java.util.ArrayList` with one entry per pair, in the same order
      */
    def toJava(in: List[Tuple2[File, Boolean]]): java.util.List[java.util.Map.Entry[java.io.File, java.lang.Boolean]] = {
        val ret = new java.util.ArrayList[java.util.Map.Entry[java.io.File, java.lang.Boolean]]
        in.foreach {
            case (f, b) => ret.add(new java.util.AbstractMap.SimpleEntry[java.io.File, java.lang.Boolean](f, b))
        }
        ret
    }

    /**
      * Extracts the page index from a name of the form `<prefix>-<index>.tiff`.
      *
      * @param fileName page image file name
      * @return the number following the last `-` in the text before the first ".tiff"
      * @throws NumberFormatException when that text is not an integer
      */
    def getPageNumber(fileName: String): Integer = {
        val it = Splitter.on(".tiff").split(fileName)
        val first = Iterables.getFirst(it, null)
        Integer.parseInt(Iterables.getLast(Splitter.on("-").split(first)))
    }

}
64 | 


--------------------------------------------------------------------------------
/conversion/src/main/scala/ocr/conversion/Driver.scala:
--------------------------------------------------------------------------------
 1 | package ocr.conversion
 2 | 
 3 | import java.io.File
 4 | 
 5 | import scala.collection.JavaConversions._
 6 | 
 7 | object Driver {
 8 | 
 9 |     val usage =
10 |         """
11 |           |Usage: convert pdfFile outDir [jnaLibPath]
12 |         """.stripMargin
13 | 
14 |     /**
15 |       * Convert each page of PDF to TIFF file
16 |       *
17 |       * @param args
18 |       */
19 |     def main(args: Array[String]): Unit = {
20 |         run(args)
21 |     }
22 | 
23 |     def run(args: Array[String]): List[File] = {
24 |         val config: ConfigOptions = buildConfig(args)
25 |         setupJnaLibPath(config)
26 |         setupOutputLocation(config)
27 |         val converter = new Converter
28 |         converter.convert(config)
29 |     }
30 | 
31 |     def setupOutputLocation(config: ConfigOptions): Unit = {
32 |         config.getOutDir().mkdirs()
33 |     }
34 | 
35 |     def buildConfig(args: Array[String]): ConfigOptions = {
36 |         if (args.length < 2) {
37 |             println("Incorrect arguments: \n" + usage)
38 |             System.exit(1)
39 |         }
40 |         val argsList = args.toList
41 | 
42 |         val config = new ConfigOptions(new File(argsList.get(0)), new File(argsList.get(1)), getJnaLibPath(argsList))
43 |         config
44 |     }
45 | 
46 |     def setupJnaLibPath(config: ConfigOptions): Any = {
47 |         config.getJnaLibPath match {
48 |             case Some(s) => System.getProperties.setProperty("jna.library.path", s)
49 |             case None => println("No jna lib path set")
50 |         }
51 |     }
52 | 
53 |     def getJnaLibPath(argsList: List[String]): Option[String] = {
54 |         if (argsList.isDefinedAt(3)) {
55 |             return Some(argsList.get(3))
56 |         } else if (System.getProperty("os.name").toLowerCase.contains("mac os x")) {
57 |             return Some("/opt/local/lib/")
58 |         } else {
59 |             return None
60 |         }
61 |     }
62 | 
63 | }
64 | 


--------------------------------------------------------------------------------
/conversion/src/test/resources/text-detection.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mmiklavc/scalable-ocr/9c9e42c4844799c860a3cf344a2d0eb218a6d438/conversion/src/test/resources/text-detection.pdf


--------------------------------------------------------------------------------
/conversion/src/test/scala/ocr/conversion/ConverterSpec.scala:
--------------------------------------------------------------------------------
 1 | package ocr.conversion
 2 | 
 3 | import java.io.File
 4 | import javax.imageio.ImageIO
 5 | 
 6 | import org.apache.commons.io.FileUtils
 7 | import org.scalatest.{BeforeAndAfter, FlatSpec, Matchers}
 8 | 
 9 | class ConverterSpec extends FlatSpec with Matchers with BeforeAndAfter {
10 | 
11 |     val outDir = new File("target/converter-output")
12 | 
13 |     before {
14 |         FileUtils.deleteDirectory(outDir)
15 |     }
16 | 
17 |     "converter" should "put 2 files in destination directory" in {
18 |         val samplePDF = "target/test-classes/text-detection.pdf"
19 |         val outFiles = Driver.run(Array(samplePDF, outDir.getAbsolutePath))
20 |         outDir.exists() shouldBe true
21 |         outDir.listFiles().length shouldBe outFiles.length
22 |         outDir.listFiles().length shouldBe 2
23 |         outDir.listFiles.map(f => f.getName).toSet shouldBe outFiles.map(f => f.getName).toSet
24 |         outFiles.map( f => ImageIO.read(f)).foreach( bi => bi.getHeight > 0 && bi.getWidth > 0 shouldBe true)
25 |     }
26 | 
27 | }
28 | 


--------------------------------------------------------------------------------
/extraction/README.md:
--------------------------------------------------------------------------------
1 | # Extraction
2 | 
3 | Dependencies:
4 | 
5 | - Install Tesseract using Homebrew or MacPorts
6 | 
7 | 


--------------------------------------------------------------------------------
/extraction/pom.xml:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0" encoding="UTF-8"?>
 2 | <project xmlns="http://maven.apache.org/POM/4.0.0"
 3 |          xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
 4 |          xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
 5 |     <modelVersion>4.0.0</modelVersion>
 6 | 
 7 |     <parent>
 8 |         <groupId>ocr</groupId>
 9 |         <artifactId>ocr</artifactId>
10 |         <version>1.0-SNAPSHOT</version>
11 |     </parent>
12 | 
13 |     <artifactId>extraction</artifactId>
14 |     <name>extraction</name>
15 | 
16 |     <properties>
17 |         <!-- plugin versions -->
18 | 
19 |         <!-- main dependency versions -->
20 |     </properties>
21 | 
22 |     <dependencies>
23 |         <dependency>
24 |             <groupId>ocr</groupId>
25 |             <artifactId>common</artifactId>
26 |             <version>1.0-SNAPSHOT</version>
27 |         </dependency>
28 |         <dependency>
29 |             <groupId>net.sourceforge.tess4j</groupId>
30 |             <artifactId>tess4j</artifactId>
31 |             <version>3.2.1</version>
32 |         </dependency>
33 |     </dependencies>
34 | 
35 |     <build>
36 |         <plugins>
37 |         </plugins>
38 |     </build>
39 | 
40 | </project>
41 | 


--------------------------------------------------------------------------------
/extraction/src/main/java/ocr/extraction/tesseract/TesseractUtil.java:
--------------------------------------------------------------------------------
 1 | package ocr.extraction.tesseract;
 2 | 
 3 | import net.sourceforge.tess4j.Tesseract;
 4 | import net.sourceforge.tess4j.TesseractException;
 5 | import org.apache.commons.io.IOUtils;
 6 | 
 7 | import javax.imageio.ImageIO;
 8 | import java.awt.image.BufferedImage;
 9 | import java.io.ByteArrayInputStream;
10 | import java.io.File;
11 | import java.io.IOException;
12 | import java.io.InputStream;
13 | import java.util.Map;
14 | 
15 | public enum TesseractUtil {
16 |     INSTANCE;
17 | 
18 |     public String ocr(byte[] img, File dataPath) throws IOException, TesseractException {
19 |         return ocr(new ByteArrayInputStream(img), dataPath);
20 |     }
21 | 
22 |     public String ocr(InputStream is, File dataPath) throws IOException, TesseractException {
23 |         Tesseract instance = new Tesseract();
24 |         instance.setDatapath(dataPath.getPath());
25 |         BufferedImage bi = ImageIO.read(IOUtils.toBufferedInputStream(is));
26 |         return instance.doOCR(bi);
27 |     }
28 | 
29 |     public String ocr(byte[] img, File dataPath, Map<String, String> variables) throws IOException, TesseractException {
30 |         return ocr(new ByteArrayInputStream(img), dataPath, variables);
31 |     }
32 | 
33 |     public String ocr(InputStream is, File dataPath, Map<String, String> variables) throws IOException, TesseractException {
34 |         Tesseract instance = new Tesseract();
35 |         for (Map.Entry<String, String> kv : variables.entrySet()) {
36 |             instance.setTessVariable(kv.getKey(), kv.getValue());
37 |         }
38 |         instance.setDatapath(dataPath.getPath());
39 |         BufferedImage bi = ImageIO.read(IOUtils.toBufferedInputStream(is));
40 |         return instance.doOCR(bi);
41 |     }
42 | 
43 | }
44 | 


--------------------------------------------------------------------------------
/extraction/src/main/resources/log4j.properties:
--------------------------------------------------------------------------------
 1 | # Set root logger level to INFO and its only appender to STDOUT.
 2 | log4j.rootLogger=INFO, STDOUT
 3 | 
 4 | # STDOUT is set to be a ConsoleAppender.
 5 | log4j.appender.STDOUT=org.apache.log4j.ConsoleAppender
 6 | 
 7 | # STDOUT uses PatternLayout.
 8 | log4j.appender.STDOUT.layout=org.apache.log4j.PatternLayout
 9 | log4j.appender.STDOUT.layout.ConversionPattern=%-4r [%t] %-5p %c %x - %m %C%n
10 | 


--------------------------------------------------------------------------------
/extraction/src/main/resources/logback.xml:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0" encoding="UTF-8"?>
 2 | 
 3 | <!-- For assistance related to logback-translator or configuration  -->
 4 | <!-- files in general, please contact the logback user mailing list -->
 5 | <!-- at http://www.qos.ch/mailman/listinfo/logback-user             -->
 6 | <!--                                                                -->
 7 | <!-- For professional support please see                            -->
 8 | <!--    http://www.qos.ch/shop/products/professionalSupport         -->
 9 | <!--                                                                -->
10 | <configuration>
11 |     <appender name="STDOUT" class="ch.qos.logback.core.ConsoleAppender">
12 |         <encoder>
13 |             <pattern>%d{HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n</pattern>
14 |         </encoder>
15 |     </appender>
16 |     <root level="INFO">
17 |         <appender-ref ref="STDOUT"/>
18 |     </root>
19 | </configuration>
20 | 


--------------------------------------------------------------------------------
/extraction/src/test/java/ocr/extraction/tesseract/TesseractUtilTest.java:
--------------------------------------------------------------------------------
 1 | package ocr.extraction.tesseract;
 2 | 
 3 | import ocr.common.Util;
 4 | import org.junit.Assert;
 5 | import org.junit.Test;
 6 | 
 7 | import java.io.File;
 8 | import java.nio.file.Files;
 9 | import java.util.HashMap;
10 | 
11 | public class TesseractUtilTest {
12 | 
13 |     @Test
14 |     public void testTesseractHappyPath() throws Exception {
15 |         System.getProperties().setProperty("jna.library.path", Util.Locations.JNA.find().get().getAbsolutePath());
16 |         File inFile = new File("src/test/resources/pdf-test.tiff");
17 |         File txtFile = new File("src/test/resources/pdf-test.txt");
18 |         String text = TesseractUtil.INSTANCE.ocr(Files.readAllBytes(inFile.toPath()), Util.Locations.TESSDATA.find().get(), new HashMap<>());
19 |         Assert.assertTrue(text.contains("Congratulations, your computer is equipped with a PDF (Portable Document Format)\nreader!"));
20 |     }
21 | 
22 | }
23 | 


--------------------------------------------------------------------------------
/extraction/src/test/resources/pdf-test.tiff:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mmiklavc/scalable-ocr/9c9e42c4844799c860a3cf344a2d0eb218a6d438/extraction/src/test/resources/pdf-test.tiff


--------------------------------------------------------------------------------
/nifi/README.md:
--------------------------------------------------------------------------------
1 | # NiFi
2 | 
3 | 


--------------------------------------------------------------------------------
/nifi/pom.xml:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0" encoding="UTF-8"?>
 2 | <project xmlns="http://maven.apache.org/POM/4.0.0"
 3 |          xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
 4 |          xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
 5 |     <modelVersion>4.0.0</modelVersion>
 6 | 
 7 |     <parent>
 8 |         <groupId>ocr</groupId>
 9 |         <artifactId>ocr</artifactId>
10 |         <version>1.0-SNAPSHOT</version>
11 |     </parent>
12 | 
13 |     <artifactId>nifi</artifactId>
14 |     <name>nifi</name>
15 |     <packaging>nar</packaging>
16 | 
17 |     <properties>
18 |         <!-- plugin versions -->
19 |         <nifi.nar.version>1.1.0</nifi.nar.version>
20 | 
21 |         <!-- main dependency versions -->
22 |         <nifi.version>0.6.1</nifi.version>
23 |     </properties>
24 | 
25 |     <dependencies>
26 |         <dependency>
27 |             <groupId>ocr</groupId>
28 |             <artifactId>common</artifactId>
29 |             <version>1.0-SNAPSHOT</version>
30 |         </dependency>
31 |         <dependency>
32 |             <groupId>ocr</groupId>
33 |             <artifactId>conversion</artifactId>
34 |             <version>${project.parent.version}</version>
35 |         </dependency>
36 |         <dependency>
37 |             <groupId>ocr</groupId>
38 |             <artifactId>extraction</artifactId>
39 |             <version>${project.parent.version}</version>
40 |         </dependency>
41 |         <dependency>
42 |             <groupId>ocr</groupId>
43 |             <artifactId>preprocessing</artifactId>
44 |             <version>${project.parent.version}</version>
45 |         </dependency>
46 |         <dependency>
47 |             <groupId>org.apache.nifi</groupId>
48 |             <artifactId>nifi-api</artifactId>
49 |             <version>${nifi.version}</version>
50 |         </dependency>
51 |         <dependency>
52 |             <groupId>org.apache.nifi</groupId>
53 |             <artifactId>nifi-utils</artifactId>
54 |             <version>${nifi.version}</version>
55 |         </dependency>
56 |         <dependency>
57 |             <groupId>org.apache.nifi</groupId>
58 |             <artifactId>nifi-processor-utils</artifactId>
59 |             <version>${nifi.version}</version>
60 |         </dependency>
61 |         <dependency>
62 |             <groupId>org.apache.nifi</groupId>
63 |             <artifactId>nifi-mock</artifactId>
64 |             <version>${nifi.version}</version>
65 |             <scope>test</scope>
66 |         </dependency>
67 |         <dependency>
68 |             <groupId>com.fasterxml.jackson.core</groupId>
69 |             <artifactId>jackson-databind</artifactId>
70 |             <version>2.7.4</version>
71 |         </dependency>
72 |     </dependencies>
73 | 
74 |     <build>
75 |         <plugins>
76 |             <plugin>
77 |                 <groupId>org.apache.nifi</groupId>
78 |                 <artifactId>nifi-nar-maven-plugin</artifactId>
79 |                 <version>${nifi.nar.version}</version>
80 |                 <extensions>true</extensions>
81 |             </plugin>
82 | 
83 |         </plugins>
84 |     </build>
85 | 
86 | </project>
87 | 


--------------------------------------------------------------------------------
/nifi/src/main/java/ocr/nifi/conversion/ConversionProcessor.java:
--------------------------------------------------------------------------------
  1 | package ocr.nifi.conversion;
  2 | 
  3 | import com.google.common.base.Splitter;
  4 | import com.google.common.collect.ImmutableList;
  5 | import com.google.common.collect.ImmutableSet;
  6 | import com.google.common.collect.Iterables;
  7 | import ocr.conversion.Converter;
  8 | import org.apache.commons.io.IOUtils;
  9 | import org.apache.nifi.annotation.behavior.SideEffectFree;
 10 | import org.apache.nifi.annotation.documentation.CapabilityDescription;
 11 | import org.apache.nifi.annotation.documentation.Tags;
 12 | import org.apache.nifi.components.PropertyDescriptor;
 13 | import org.apache.nifi.flowfile.FlowFile;
 14 | import org.apache.nifi.logging.ProcessorLog;
 15 | import org.apache.nifi.processor.AbstractProcessor;
 16 | import org.apache.nifi.processor.ProcessContext;
 17 | import org.apache.nifi.processor.ProcessSession;
 18 | import org.apache.nifi.processor.Relationship;
 19 | import org.apache.nifi.processor.exception.ProcessException;
 20 | import org.apache.nifi.processor.util.StandardValidators;
 21 | 
 22 | import java.io.BufferedInputStream;
 23 | import java.io.File;
 24 | import java.io.FileInputStream;
 25 | import java.io.InputStream;
 26 | import java.util.*;
 27 | import java.util.concurrent.atomic.AtomicReference;
 28 | 
 29 | @SideEffectFree
 30 | @Tags({"ocr preprocessing", "image manipulation"})
 31 | @CapabilityDescription("Preprocess images of text documents extract pages and data")
 32 | public class ConversionProcessor extends AbstractProcessor {
 33 |   static PropertyDescriptor JNI_PATH = new PropertyDescriptor.Builder()
 34 |                                                              .name("jni_path")
 35 |                                                              .description("JNI Path")
 36 |                                                              .required(true)
 37 |                                                              .addValidator(StandardValidators.FILE_EXISTS_VALIDATOR)
 38 |                                                              .build();
 39 |   static PropertyDescriptor TEMP_DIR = new PropertyDescriptor.Builder()
 40 |                                                              .name("temp_space")
 41 |                                                              .description("Temporary directory to be used.")
 42 |                                                              .required(false)
 43 |                                                              .addValidator(StandardValidators.FILE_EXISTS_VALIDATOR)
 44 |                                                              .build();
 45 |   static Relationship SUCCESS  = new Relationship.Builder()
 46 |                                                  .name("SUCCESS")
 47 |                                                  .description("Success relationship")
 48 |                                                  .build();
 49 |   static Relationship RAW = new Relationship.Builder()
 50 |                                             .name("RAW")
 51 |                                             .description("Raw data")
 52 |                                             .build();
 53 |   private List<PropertyDescriptor> properties = ImmutableList.of(TEMP_DIR, JNI_PATH);
 54 | 
 55 |   private Set<Relationship> relationships = ImmutableSet.of( SUCCESS, RAW );
 56 |   @Override
 57 |   public void onTrigger(ProcessContext context, ProcessSession session) throws ProcessException {
 58 |     final ProcessorLog log = this.getLogger();
 59 |     final AtomicReference<List<Map.Entry<File, Boolean>>> value = new AtomicReference<>();
 60 |     final File tempDir = new File(context.getProperty(TEMP_DIR).getValue());
 61 |     System.getProperties().setProperty("jna.library.path", context.getProperty(JNI_PATH).getValue());
 62 |     FlowFile flowfile = session.get();
 63 |     session.read(flowfile, in -> {
 64 |       try {
 65 |         value.set(convert(in, tempDir));
 66 |       }
 67 |       catch(Exception e) {
 68 |         log.error("Unable to convert: " + e.getMessage(), e);
 69 |       }
 70 |     });
 71 |     if(value.get() != null) {
 72 |       for(Map.Entry<File, Boolean> kv : value.get()) {
 73 |         final File convertedFile = kv.getKey();
 74 |         try {
 75 |           final int pageNumber = getPageNumber(convertedFile.getName());
 76 |           if(kv.getValue()) {
 77 |             FlowFile ff = session.clone(flowfile);
 78 |             ff = session.putAttribute(ff, "pageNumber", "" + pageNumber);
 79 |             ff = session.write(ff, out -> IOUtils.copy(new BufferedInputStream(new FileInputStream(convertedFile)), out));
 80 |             session.transfer(ff, SUCCESS);
 81 |           }
 82 |         }
 83 |         finally {
 84 |           if(convertedFile != null && convertedFile.exists()) {
 85 |             convertedFile.delete();
 86 |           }
 87 |         }
 88 |       }
 89 |     }
 90 |     session.transfer(flowfile, RAW);
 91 |   }
 92 | 
 93 |   private int getPageNumber(String fileName) {
 94 |     Iterable<String> it = Splitter.on(".tiff").split(fileName);
 95 |     String first = Iterables.getFirst(it, null);
 96 |     return Integer.parseInt(Iterables.getLast(Splitter.on("-").split(first)));
 97 |   }
 98 |   private List<Map.Entry<File, Boolean>> convert(InputStream in, File tempDir) {
 99 |     Converter converter = new Converter();
100 |     if(!tempDir.exists()) {
101 |       tempDir.mkdirs();
102 |     }
103 |     List<Map.Entry<File, Boolean>> ret = new ArrayList<>();
104 |     for(Map.Entry<File, Boolean> kv : converter.toJava(converter.convert(in, tempDir))) {
105 |       ret.add(new AbstractMap.SimpleEntry<>(kv.getKey(), kv.getValue()));
106 |     }
107 |     return ret;
108 |   }
109 | 
110 |   @Override
111 |   public Set<Relationship> getRelationships() {
112 |     return relationships;
113 |   }
114 | 
115 |   @Override
116 |   protected List<PropertyDescriptor> getSupportedPropertyDescriptors() {
117 |     return properties;
118 |   }
119 | }
120 | 


--------------------------------------------------------------------------------
/nifi/src/main/java/ocr/nifi/extraction/ExtractionProcessor.java:
--------------------------------------------------------------------------------
  1 | package ocr.nifi.extraction;
  2 | 
  3 | import com.fasterxml.jackson.core.type.TypeReference;
  4 | import com.google.common.collect.ImmutableList;
  5 | import com.google.common.collect.ImmutableSet;
  6 | import ocr.extraction.tesseract.TesseractUtil;
  7 | import ocr.nifi.util.JSONUtils;
  8 | import ocr.nifi.validation.Validation;
  9 | import org.apache.nifi.annotation.behavior.SideEffectFree;
 10 | import org.apache.nifi.annotation.documentation.CapabilityDescription;
 11 | import org.apache.nifi.annotation.documentation.Tags;
 12 | import org.apache.nifi.components.PropertyDescriptor;
 13 | import org.apache.nifi.flowfile.FlowFile;
 14 | import org.apache.nifi.logging.ProcessorLog;
 15 | import org.apache.nifi.processor.AbstractProcessor;
 16 | import org.apache.nifi.processor.ProcessContext;
 17 | import org.apache.nifi.processor.ProcessSession;
 18 | import org.apache.nifi.processor.Relationship;
 19 | import org.apache.nifi.processor.exception.ProcessException;
 20 | import org.apache.nifi.processor.util.StandardValidators;
 21 | 
 22 | import java.io.File;
 23 | import java.util.HashMap;
 24 | import java.util.List;
 25 | import java.util.Map;
 26 | import java.util.Set;
 27 | import java.util.concurrent.atomic.AtomicReference;
 28 | @SideEffectFree
 29 | @Tags({"ocr"})
 30 | @CapabilityDescription("Extracts text from images")
 31 | public class ExtractionProcessor extends AbstractProcessor {
 32 |   static PropertyDescriptor JNI_PATH = new PropertyDescriptor.Builder()
 33 |                                                              .name("jni_path")
 34 |                                                              .description("JNI Path")
 35 |                                                              .required(true)
 36 |                                                              .addValidator(StandardValidators.FILE_EXISTS_VALIDATOR)
 37 |                                                              .build();
 38 |   static PropertyDescriptor TESS_DATA = new PropertyDescriptor.Builder()
 39 |                                                               .name("tess_data_dir")
 40 |                                                               .description("Tesseract data directory")
 41 |                                                               .required(true)
 42 |                                                               .addValidator(StandardValidators.FILE_EXISTS_VALIDATOR)
 43 |                                                               .build();
 44 |   static PropertyDescriptor TESS_PROPERTIES = new PropertyDescriptor.Builder()
 45 |                                                               .name("tess_properties")
 46 |                                                               .description("Tesseract properties")
 47 |                                                               .required(false)
 48 |                                                               .addValidator(Validation.Validators.JSON_MAP)
 49 |                                                               .build();
 50 |   static Relationship SUCCESS  = new Relationship.Builder()
 51 |                                                  .name("SUCCESS")
 52 |                                                  .description("Success relationship")
 53 |                                                  .build();
 54 |   private List<PropertyDescriptor> properties = ImmutableList.of(TESS_DATA, JNI_PATH, TESS_PROPERTIES);
 55 | 
 56 |   private Set<Relationship> relationships = ImmutableSet.of( SUCCESS );
 57 | 
 58 |   private Map<String, String> toProperties(String properties) throws ProcessException {
 59 |     Map<String, String> ret = new HashMap<>();
 60 |     if(properties == null) {
 61 |       return ret;
 62 |     }
 63 |     else {
 64 |       try {
 65 |         return JSONUtils.INSTANCE.load(properties, new TypeReference<Map<String, String>>() {
 66 |         });
 67 |       }
 68 |       catch(Throwable t) {
 69 |         throw new ProcessException("Unable to load properties: " + t.getMessage(), t);
 70 |       }
 71 |     }
 72 |   }
 73 | 
 74 |   @Override
 75 |   public void onTrigger(ProcessContext context, ProcessSession session) throws ProcessException {
 76 |     final ProcessorLog log = this.getLogger();
 77 |     final AtomicReference<String> value = new AtomicReference<>();
 78 |     final Map<String, String> tessProperties = toProperties(context.getProperty(TESS_PROPERTIES).getValue());
 79 |     final File tessDataDir = new File(context.getProperty(TESS_DATA).getValue());
 80 |     System.getProperties().setProperty("jna.library.path", context.getProperty(JNI_PATH).getValue());
 81 |     FlowFile flowfile = session.get();
 82 |     if (null != flowfile) {
 83 |       session.read(flowfile, in -> {
 84 |         try {
 85 |           value.set(TesseractUtil.INSTANCE.ocr(in, tessDataDir, tessProperties));
 86 |         } catch (Exception e) {
 87 |           log.error("Unable to ocr: " + e.getMessage(), e);
 88 |         }
 89 |       });
 90 | 
 91 |       flowfile = session.write(flowfile, out -> {
 92 |         out.write(value.get().getBytes());
 93 |         out.flush();
 94 |       });
 95 |       session.transfer(flowfile, SUCCESS);
 96 |     } else {
 97 |       log.warn("NULL flow file");
 98 |     }
 99 |   }
100 | 
101 |   @Override
102 |   public Set<Relationship> getRelationships() {
103 |     return relationships;
104 |   }
105 | 
106 |   @Override
107 |   protected List<PropertyDescriptor> getSupportedPropertyDescriptors() {
108 |     return properties;
109 |   }
110 | }
111 | 


--------------------------------------------------------------------------------
/nifi/src/main/java/ocr/nifi/preprocessing/PreprocessingProcessor.java:
--------------------------------------------------------------------------------
  1 | package ocr.nifi.preprocessing;
  2 | 
  3 | import com.google.common.collect.ImmutableList;
  4 | import com.google.common.collect.ImmutableSet;
  5 | import ocr.preprocessing.conversion.CLIUtils;
  6 | import ocr.preprocessing.conversion.CleaningOptions;
  7 | import ocr.preprocessing.conversion.TextCleaner;
  8 | import org.apache.commons.cli.CommandLine;
  9 | import org.apache.commons.cli.DefaultParser;
 10 | import org.apache.commons.io.IOUtils;
 11 | import org.apache.nifi.annotation.behavior.SideEffectFree;
 12 | import org.apache.nifi.annotation.documentation.CapabilityDescription;
 13 | import org.apache.nifi.annotation.documentation.Tags;
 14 | import org.apache.nifi.components.PropertyDescriptor;
 15 | import org.apache.nifi.components.ValidationResult;
 16 | import org.apache.nifi.flowfile.FlowFile;
 17 | import org.apache.nifi.logging.ProcessorLog;
 18 | import org.apache.nifi.processor.*;
 19 | import org.apache.nifi.processor.exception.ProcessException;
 20 | import org.apache.nifi.processor.util.StandardValidators;
 21 | 
 22 | import java.util.List;
 23 | import java.util.Set;
 24 | import java.util.concurrent.atomic.AtomicReference;
 25 | 
 26 | @SideEffectFree
 27 | @Tags({"ocr preprocessing", "image manipulation"})
 28 | @CapabilityDescription("Preprocess images of text documents to clean them")
 29 | public class PreprocessingProcessor extends AbstractProcessor {
 30 |   static PropertyDescriptor DEFINITIONS = new PropertyDescriptor.Builder()
 31 |                                                                         .name("definition")
 32 |                                                                         .description(CleaningOptions.getUsage())
 33 |                                                                         .required(true)
 34 |                                                                         .addValidator(StandardValidators.NON_EMPTY_VALIDATOR)
 35 |                                                                         .addValidator(
 36 |                                                                   (subject, value, context) ->  {
 37 |                                                                     boolean valid = true;
 38 |                                                                     String explanation = "";
 39 |                                                                     try {
 40 |                                                                       CleaningOptions.parse(new DefaultParser()
 41 |                                                                                            , CLIUtils.translateCommandline(value)
 42 |                                                                                            );
 43 |                                                                     }
 44 |                                                                     catch(Throwable t) {
 45 |                                                                       valid = false;
 46 |                                                                       explanation = t.getMessage();
 47 |                                                                     }
 48 |                                                                     return
 49 |                                                                     new ValidationResult.Builder()
 50 |                                                                                         .subject(subject)
 51 |                                                                                         .input(value)
 52 |                                                                                         .valid(valid)
 53 |                                                                                         .explanation(explanation)
 54 |                                                                                         .build();
 55 |                                                                   }
 56 |                                                                                      )
 57 |                                                                         .build();
 58 |   static PropertyDescriptor TEMP_DIR = new PropertyDescriptor.Builder()
 59 |                                                                      .name("temp_space")
 60 |                                                                      .description("Temporary directory to be used.")
 61 |                                                                      .required(false)
 62 |                                                                      .addValidator(StandardValidators.FILE_EXISTS_VALIDATOR)
 63 |                                                                      .build();
 64 |   static PropertyDescriptor CONVERT_PATH = new PropertyDescriptor.Builder()
 65 |                                                                      .name("convert_bin_path")
 66 |                                                                      .description("The path to the convert (imagemagick) utility")
 67 |                                                                      .required(true)
 68 |                                                                      .addValidator(StandardValidators.FILE_EXISTS_VALIDATOR)
 69 |                                                                      .build();
 70 |   static Relationship SUCCESS  = new Relationship.Builder()
 71 |                                                          .name("SUCCESS")
 72 |                                                          .description("Success relationship")
 73 |                                                          .build();
 74 |   private List<PropertyDescriptor> properties = ImmutableList.of( DEFINITIONS ,TEMP_DIR, CONVERT_PATH );
 75 | 
 76 |   private Set<Relationship> relationships = ImmutableSet.of( SUCCESS );
 77 | 
 78 |   @Override
 79 |   protected void init(ProcessorInitializationContext context) {
 80 |   }
 81 | 
 82 |   @Override
 83 |   public void onTrigger(ProcessContext context, ProcessSession session) throws ProcessException {
 84 |     final ProcessorLog log = this.getLogger();
 85 |     final AtomicReference<byte[]> value = new AtomicReference<>();
 86 |     String preprocessingDef = context.getProperty(DEFINITIONS).getValue();
 87 |     String tempDir = context.getProperty(TEMP_DIR).getValue();
 88 |     String convertPath = context.getProperty(CONVERT_PATH).getValue();
 89 |     CommandLine cli = CleaningOptions.parse(new DefaultParser(), CLIUtils.translateCommandline(preprocessingDef) );
 90 |     final TextCleaner cleaner = CleaningOptions.createTextCleaner(cli, convertPath, tempDir);
 91 |     FlowFile flowfile = session.get();
 92 |     session.read(flowfile, in -> {
 93 |       try {
 94 |         value.set(cleaner.convert(in));
 95 |       } catch (Exception e) {
 96 |         value.set(IOUtils.toByteArray(in));
 97 |         log.error("Unable to execute command: " + e.getMessage(), e);
 98 |       }
 99 |     });
100 |     flowfile = session.write(flowfile, out -> {
101 |       out.write(value.get());
102 |       out.flush();
103 |     });
104 |     session.transfer(flowfile, SUCCESS);
105 |   }
106 | 
107 |   @Override
108 |   public Set<Relationship> getRelationships() {
109 |     return relationships;
110 |   }
111 | 
112 |   @Override
113 |   protected List<PropertyDescriptor> getSupportedPropertyDescriptors() {
114 |     return properties;
115 |   }
116 | }
117 | 


--------------------------------------------------------------------------------
/nifi/src/main/java/ocr/nifi/util/JSONUtils.java:
--------------------------------------------------------------------------------
 1 | package ocr.nifi.util;
 2 | import com.fasterxml.jackson.core.JsonProcessingException;
 3 | import com.fasterxml.jackson.core.type.TypeReference;
 4 | import com.fasterxml.jackson.databind.ObjectMapper;
 5 | 
 6 | import java.io.*;
 7 | 
 8 | public enum JSONUtils {
 9 |   INSTANCE;
10 |   private static ThreadLocal<ObjectMapper> _mapper = new ThreadLocal<ObjectMapper>() {
11 |     /**
12 |      * Returns the current thread's "initial value" for this
13 |      * thread-local variable.  This method will be invoked the first
14 |      * time a thread accesses the variable with the {@link #get}
15 |      * method, unless the thread previously invoked the {@link #set}
16 |      * method, in which case the {@code initialValue} method will not
17 |      * be invoked for the thread.  Normally, this method is invoked at
18 |      * most once per thread, but it may be invoked again in case of
19 |      * subsequent invocations of {@link #remove} followed by {@link #get}.
20 |      * <p>
21 |      * <p>This implementation simply returns {@code null}; if the
22 |      * programmer desires thread-local variables to have an initial
23 |      * value other than {@code null}, {@code ThreadLocal} must be
24 |      * subclassed, and this method overridden.  Typically, an
25 |      * anonymous inner class will be used.
26 |      *
27 |      * @return the initial value for this thread-local
28 |      */
29 |     @Override
30 |     protected ObjectMapper initialValue() {
31 |       return new ObjectMapper();
32 |     }
33 |   };
34 | 
35 |   public <T> T load(InputStream is, TypeReference<T> ref) throws IOException {
36 |     return _mapper.get().readValue(is, ref);
37 |   }
38 | 
39 |   public <T> T load(String is, TypeReference<T> ref) throws IOException {
40 |     return _mapper.get().readValue(is, ref);
41 |   }
42 | 
43 |   public <T> T load(File f, TypeReference<T> ref) throws IOException {
44 |     try (InputStream is = new BufferedInputStream(new FileInputStream(f))) {
45 |       return _mapper.get().readValue(is, ref);
46 |     }
47 |   }
48 | 
49 |   public <T> T load(InputStream is, Class<T> clazz) throws IOException {
50 |     return _mapper.get().readValue(is, clazz);
51 |   }
52 | 
53 |   public <T> T load(File f, Class<T> clazz) throws IOException {
54 |     try (InputStream is = new BufferedInputStream(new FileInputStream(f))) {
55 |       return _mapper.get().readValue(is, clazz);
56 |     }
57 |   }
58 | 
59 |   public <T> T load(String is, Class<T> clazz) throws IOException {
60 |     return _mapper.get().readValue(is, clazz);
61 |   }
62 | 
63 |   public String toJSON(Object o, boolean pretty) throws JsonProcessingException {
64 |     if (pretty) {
65 |       return _mapper.get().writerWithDefaultPrettyPrinter().writeValueAsString(o);
66 |     } else {
67 |       return _mapper.get().writeValueAsString(o);
68 |     }
69 |   }
70 | 
71 |   public byte[] toJSON(Object config) throws JsonProcessingException {
72 |     return _mapper.get().writeValueAsBytes(config);
73 |   }
74 | }
75 | 


--------------------------------------------------------------------------------
/nifi/src/main/java/ocr/nifi/validation/JsonValidator.java:
--------------------------------------------------------------------------------
 1 | package ocr.nifi.validation;
 2 | 
 3 | import com.fasterxml.jackson.core.type.TypeReference;
 4 | import ocr.nifi.util.JSONUtils;
 5 | import org.apache.nifi.components.ValidationContext;
 6 | import org.apache.nifi.components.ValidationResult;
 7 | import org.apache.nifi.components.Validator;
 8 | 
 9 | import java.io.IOException;
10 | import java.util.Map;
11 | 
12 | import static com.sun.corba.se.spi.activation.IIOP_CLEAR_TEXT.value;
13 | 
14 | public class JsonValidator implements Validator {
15 |     @Override
16 |     public ValidationResult validate(String subject, String input, ValidationContext context) {
17 |         try {
18 |             JSONUtils.INSTANCE.load(input, new TypeReference<Map<String, String>>() {
19 |             });
20 |         } catch (IOException e) {
21 |             return new ValidationResult.Builder()
22 |                     .subject(subject)
23 |                     .input(value)
24 |                     .valid(false)
25 |                     .explanation("Not a valid JSON map value: " + e.getMessage())
26 |                     .build();
27 |         }
28 |         return new ValidationResult.Builder()
29 |                 .valid(true)
30 |                 .input(value)
31 |                 .subject(subject)
32 |                 .build();
33 |     }
34 | }
35 | 


--------------------------------------------------------------------------------
/nifi/src/main/java/ocr/nifi/validation/Validation.java:
--------------------------------------------------------------------------------
 1 | package ocr.nifi.validation;
 2 | 
 3 | import org.apache.nifi.components.ValidationContext;
 4 | import org.apache.nifi.components.ValidationResult;
 5 | import org.apache.nifi.components.Validator;
 6 | 
 7 | public class Validation {
 8 |     public enum Validators implements Validator {
 9 |         JSON_MAP(new JsonValidator());
10 | 
11 |         private Validator validator;
12 | 
13 |         Validators(Validator validator) {
14 |             this.validator = validator;
15 |         }
16 | 
17 |         @Override
18 |         public ValidationResult validate(String subject, String input, ValidationContext context) {
19 |             return validator.validate(subject, input, context);
20 |         }
21 |     }
22 | }
23 | 


--------------------------------------------------------------------------------
/nifi/src/main/nifi/templates/scalable-ocr.xml:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0" encoding="UTF-8" standalone="yes"?><template><description>OCR flow</description><name>scalable-ocr</name><snippet><connections><id>72556dd6-f10a-4949-b34d-2058a81ffd49</id><parentGroupId>68279867-d055-46a8-9934-0e601af986eb</parentGroupId><backPressureDataSizeThreshold>0 MB</backPressureDataSizeThreshold><backPressureObjectThreshold>0</backPressureObjectThreshold><destination><groupId>68279867-d055-46a8-9934-0e601af986eb</groupId><id>5ded0d2e-2a49-4a28-b3a0-2f4328711971</id><type>PROCESSOR</type></destination><flowFileExpiration>0 sec</flowFileExpiration><labelIndex>1</labelIndex><name></name><selectedRelationships>success</selectedRelationships><source><groupId>68279867-d055-46a8-9934-0e601af986eb</groupId><id>b6321e52-b9f8-4d2e-9bf7-d8c974463922</id><type>PROCESSOR</type></source><zIndex>0</zIndex></connections><connections><id>7b3a6f52-0dbb-4269-8cc3-46b99ca5e940</id><parentGroupId>68279867-d055-46a8-9934-0e601af986eb</parentGroupId><backPressureDataSizeThreshold>0 MB</backPressureDataSizeThreshold><backPressureObjectThreshold>0</backPressureObjectThreshold><destination><groupId>68279867-d055-46a8-9934-0e601af986eb</groupId><id>b80561a8-9dca-4994-bb68-32ac1c4b0de9</id><type>PROCESSOR</type></destination><flowFileExpiration>0 sec</flowFileExpiration><labelIndex>1</labelIndex><name></name><selectedRelationships>SUCCESS</selectedRelationships><source><groupId>68279867-d055-46a8-9934-0e601af986eb</groupId><id>d8fb1883-cc34-4416-9568-95684b29f0d3</id><type>PROCESSOR</type></source><zIndex>0</zIndex></connections><connections><id>e14a5184-6334-4b4e-9bfe-cff55aa92bc4</id><parentGroupId>68279867-d055-46a8-9934-0e601af986eb</parentGroupId><backPressureDataSizeThreshold>0 MB</backPressureDataSizeThreshold><backPressureObjectThreshold>0</backPressureObjectThreshold><destination><groupId>68279867-d055-46a8-9934-0e601af986eb</groupId><id>c77d797f-6d16-4947-92e7-1879586dd7e5</id><type>PROCESSOR</type></destination><flowFileExpiration>0 
sec</flowFileExpiration><labelIndex>1</labelIndex><name></name><selectedRelationships>success</selectedRelationships><source><groupId>68279867-d055-46a8-9934-0e601af986eb</groupId><id>d4ca6eb5-7ef8-4976-a91b-5fe7763e203e</id><type>PROCESSOR</type></source><zIndex>0</zIndex></connections><connections><id>afea66f5-e56b-43f7-b7f3-c4f3dc3de0c0</id><parentGroupId>68279867-d055-46a8-9934-0e601af986eb</parentGroupId><backPressureDataSizeThreshold>0 MB</backPressureDataSizeThreshold><backPressureObjectThreshold>0</backPressureObjectThreshold><destination><groupId>68279867-d055-46a8-9934-0e601af986eb</groupId><id>b6321e52-b9f8-4d2e-9bf7-d8c974463922</id><type>PROCESSOR</type></destination><flowFileExpiration>0 sec</flowFileExpiration><labelIndex>1</labelIndex><name></name><selectedRelationships>SUCCESS</selectedRelationships><source><groupId>68279867-d055-46a8-9934-0e601af986eb</groupId><id>c77d797f-6d16-4947-92e7-1879586dd7e5</id><type>PROCESSOR</type></source><zIndex>0</zIndex></connections><connections><id>81227c53-f04e-4d63-b542-d8e8fa53f7de</id><parentGroupId>68279867-d055-46a8-9934-0e601af986eb</parentGroupId><backPressureDataSizeThreshold>0 MB</backPressureDataSizeThreshold><backPressureObjectThreshold>0</backPressureObjectThreshold><destination><groupId>68279867-d055-46a8-9934-0e601af986eb</groupId><id>d8fb1883-cc34-4416-9568-95684b29f0d3</id><type>PROCESSOR</type></destination><flowFileExpiration>0 sec</flowFileExpiration><labelIndex>1</labelIndex><name></name><selectedRelationships>SUCCESS</selectedRelationships><source><groupId>68279867-d055-46a8-9934-0e601af986eb</groupId><id>5ded0d2e-2a49-4a28-b3a0-2f4328711971</id><type>PROCESSOR</type></source><zIndex>0</zIndex></connections><connections><id>c487ddea-6081-492b-b36a-c28b96419df2</id><parentGroupId>68279867-d055-46a8-9934-0e601af986eb</parentGroupId><backPressureDataSizeThreshold>0 
MB</backPressureDataSizeThreshold><backPressureObjectThreshold>0</backPressureObjectThreshold><destination><groupId>68279867-d055-46a8-9934-0e601af986eb</groupId><id>e6a96a18-8f0a-4a6b-b4a7-aefe1f37259a</id><type>PROCESSOR</type></destination><flowFileExpiration>0 sec</flowFileExpiration><labelIndex>1</labelIndex><name></name><selectedRelationships>RAW</selectedRelationships><source><groupId>68279867-d055-46a8-9934-0e601af986eb</groupId><id>c77d797f-6d16-4947-92e7-1879586dd7e5</id><type>PROCESSOR</type></source><zIndex>0</zIndex></connections><processors><id>d8fb1883-cc34-4416-9568-95684b29f0d3</id><parentGroupId>68279867-d055-46a8-9934-0e601af986eb</parentGroupId><position><x>277.01132507324223</x><y>665.235888671875</y></position><config><bulletinLevel>WARN</bulletinLevel><comments></comments><concurrentlySchedulableTaskCount>1</concurrentlySchedulableTaskCount><defaultConcurrentTasks><entry><key>TIMER_DRIVEN</key><value>1</value></entry><entry><key>EVENT_DRIVEN</key><value>0</value></entry><entry><key>CRON_DRIVEN</key><value>1</value></entry></defaultConcurrentTasks><defaultSchedulingPeriod><entry><key>TIMER_DRIVEN</key><value>0 sec</value></entry><entry><key>CRON_DRIVEN</key><value>* * * * * ?</value></entry></defaultSchedulingPeriod><descriptors><entry><key>tess_data_dir</key><value><description>Tesseract data directory</description><displayName>tess_data_dir</displayName><dynamic>false</dynamic><name>tess_data_dir</name><required>true</required><sensitive>false</sensitive><supportsEl>false</supportsEl></value></entry><entry><key>jni_path</key><value><description>JNI Path</description><displayName>jni_path</displayName><dynamic>false</dynamic><name>jni_path</name><required>true</required><sensitive>false</sensitive><supportsEl>false</supportsEl></value></entry><entry><key>tess_properties</key><value><description>Tesseract 
properties</description><displayName>tess_properties</displayName><dynamic>false</dynamic><name>tess_properties</name><required>false</required><sensitive>false</sensitive><supportsEl>false</supportsEl></value></entry></descriptors><lossTolerant>false</lossTolerant><penaltyDuration>30 sec</penaltyDuration><properties><entry><key>tess_data_dir</key><value>/usr/share/tesseract/tessdata</value></entry><entry><key>jni_path</key><value>/usr/lib</value></entry><entry><key>tess_properties</key><value>{
 2 | &quot;tessedit_char_whitelist&quot;: &quot;abcdefghijklmnopqrstuvwxyz0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ-.,'\&quot;():@?&gt;&lt;;&quot;
 3 | }</value></entry></properties><runDurationMillis>0</runDurationMillis><schedulingPeriod>0 sec</schedulingPeriod><schedulingStrategy>TIMER_DRIVEN</schedulingStrategy><yieldDuration>1 sec</yieldDuration></config><name>ExtractionProcessor</name><relationships><autoTerminate>false</autoTerminate><description>Success relationship</description><name>SUCCESS</name></relationships><state>RUNNING</state><style/><supportsEventDriven>false</supportsEventDriven><supportsParallelProcessing>true</supportsParallelProcessing><type>ocr.nifi.extraction.ExtractionProcessor</type></processors><processors><id>e6a96a18-8f0a-4a6b-b4a7-aefe1f37259a</id><parentGroupId>68279867-d055-46a8-9934-0e601af986eb</parentGroupId><position><x>818.601038641043</x><y>87.89188647302024</y></position><config><bulletinLevel>WARN</bulletinLevel><comments></comments><concurrentlySchedulableTaskCount>1</concurrentlySchedulableTaskCount><defaultConcurrentTasks><entry><key>TIMER_DRIVEN</key><value>1</value></entry><entry><key>EVENT_DRIVEN</key><value>0</value></entry><entry><key>CRON_DRIVEN</key><value>1</value></entry></defaultConcurrentTasks><defaultSchedulingPeriod><entry><key>TIMER_DRIVEN</key><value>0 sec</value></entry><entry><key>CRON_DRIVEN</key><value>* * * * * ?</value></entry></defaultSchedulingPeriod><descriptors><entry><key>Hadoop Configuration Resources</key><value><description>A file or comma separated list of files which contains the Hadoop file system configuration. Without this, Hadoop will search the classpath for a 'core-site.xml' and 'hdfs-site.xml' file or will revert to a default configuration.</description><displayName>Hadoop Configuration Resources</displayName><dynamic>false</dynamic><name>Hadoop Configuration Resources</name><required>false</required><sensitive>false</sensitive><supportsEl>false</supportsEl></value></entry><entry><key>Kerberos Principal</key><value><description>Kerberos principal to authenticate as. 
Requires nifi.kerberos.krb5.file to be set in your nifi.properties</description><displayName>Kerberos Principal</displayName><dynamic>false</dynamic><name>Kerberos Principal</name><required>false</required><sensitive>false</sensitive><supportsEl>false</supportsEl></value></entry><entry><key>Kerberos Keytab</key><value><description>Kerberos keytab associated with the principal. Requires nifi.kerberos.krb5.file to be set in your nifi.properties</description><displayName>Kerberos Keytab</displayName><dynamic>false</dynamic><name>Kerberos Keytab</name><required>false</required><sensitive>false</sensitive><supportsEl>false</supportsEl></value></entry><entry><key>Kerberos Relogin Period</key><value><defaultValue>4 hours</defaultValue><description>Period of time which should pass before attempting a kerberos relogin</description><displayName>Kerberos Relogin Period</displayName><dynamic>false</dynamic><name>Kerberos Relogin Period</name><required>false</required><sensitive>false</sensitive><supportsEl>false</supportsEl></value></entry><entry><key>Directory</key><value><description>The parent HDFS directory to which files should be written</description><displayName>Directory</displayName><dynamic>false</dynamic><name>Directory</name><required>true</required><sensitive>false</sensitive><supportsEl>true</supportsEl></value></entry><entry><key>Conflict Resolution Strategy</key><value><allowableValues><displayName>replace</displayName><value>replace</value></allowableValues><allowableValues><displayName>ignore</displayName><value>ignore</value></allowableValues><allowableValues><displayName>fail</displayName><value>fail</value></allowableValues><defaultValue>fail</defaultValue><description>Indicates what should happen when a file with the same name already exists in the output directory</description><displayName>Conflict Resolution Strategy</displayName><dynamic>false</dynamic><name>Conflict Resolution 
Strategy</name><required>true</required><sensitive>false</sensitive><supportsEl>false</supportsEl></value></entry><entry><key>Block Size</key><value><description>Size of each block as written to HDFS. This overrides the Hadoop Configuration</description><displayName>Block Size</displayName><dynamic>false</dynamic><name>Block Size</name><required>false</required><sensitive>false</sensitive><supportsEl>false</supportsEl></value></entry><entry><key>IO Buffer Size</key><value><description>Amount of memory to use to buffer file contents during IO. This overrides the Hadoop Configuration</description><displayName>IO Buffer Size</displayName><dynamic>false</dynamic><name>IO Buffer Size</name><required>false</required><sensitive>false</sensitive><supportsEl>false</supportsEl></value></entry><entry><key>Replication</key><value><description>Number of times that HDFS will replicate each file. This overrides the Hadoop Configuration</description><displayName>Replication</displayName><dynamic>false</dynamic><name>Replication</name><required>false</required><sensitive>false</sensitive><supportsEl>false</supportsEl></value></entry><entry><key>Permissions umask</key><value><description>A umask represented as an octal number which determines the permissions of files written to HDFS. This overrides the Hadoop Configuration dfs.umaskmode</description><displayName>Permissions umask</displayName><dynamic>false</dynamic><name>Permissions umask</name><required>false</required><sensitive>false</sensitive><supportsEl>false</supportsEl></value></entry><entry><key>Remote Owner</key><value><description>Changes the owner of the HDFS file to this value after it is written. 
This only works if NiFi is running as a user that has HDFS super user privilege to change owner</description><displayName>Remote Owner</displayName><dynamic>false</dynamic><name>Remote Owner</name><required>false</required><sensitive>false</sensitive><supportsEl>false</supportsEl></value></entry><entry><key>Remote Group</key><value><description>Changes the group of the HDFS file to this value after it is written. This only works if NiFi is running as a user that has HDFS super user privilege to change group</description><displayName>Remote Group</displayName><dynamic>false</dynamic><name>Remote Group</name><required>false</required><sensitive>false</sensitive><supportsEl>false</supportsEl></value></entry><entry><key>Compression codec</key><value><allowableValues><displayName>NONE</displayName><value>NONE</value></allowableValues><allowableValues><displayName>DEFAULT</displayName><value>DEFAULT</value></allowableValues><allowableValues><displayName>BZIP</displayName><value>BZIP</value></allowableValues><allowableValues><displayName>GZIP</displayName><value>GZIP</value></allowableValues><allowableValues><displayName>LZ4</displayName><value>LZ4</value></allowableValues><allowableValues><displayName>SNAPPY</displayName><value>SNAPPY</value></allowableValues><allowableValues><displayName>AUTOMATIC</displayName><value>AUTOMATIC</value></allowableValues><defaultValue>NONE</defaultValue><description></description><displayName>Compression codec</displayName><dynamic>false</dynamic><name>Compression codec</name><required>true</required><sensitive>false</sensitive><supportsEl>false</supportsEl></value></entry></descriptors><lossTolerant>false</lossTolerant><penaltyDuration>30 sec</penaltyDuration><properties><entry><key>Hadoop Configuration Resources</key><value>/etc/hadoop/conf/hdfs-site.xml,/etc/hadoop/conf/core-site.xml</value></entry><entry><key>Kerberos Principal</key></entry><entry><key>Kerberos Keytab</key></entry><entry><key>Kerberos Relogin 
Period</key></entry><entry><key>Directory</key><value>/data/ocr/raw</value></entry><entry><key>Conflict Resolution Strategy</key></entry><entry><key>Block Size</key></entry><entry><key>IO Buffer Size</key></entry><entry><key>Replication</key></entry><entry><key>Permissions umask</key></entry><entry><key>Remote Owner</key></entry><entry><key>Remote Group</key></entry><entry><key>Compression codec</key></entry></properties><runDurationMillis>0</runDurationMillis><schedulingPeriod>0 sec</schedulingPeriod><schedulingStrategy>TIMER_DRIVEN</schedulingStrategy><yieldDuration>1 sec</yieldDuration></config><name>PutHDFS</name><relationships><autoTerminate>true</autoTerminate><description>Files that could not be written to HDFS for some reason are transferred to this relationship</description><name>failure</name></relationships><relationships><autoTerminate>true</autoTerminate><description>Files that have been successfully written to HDFS are transferred to this relationship</description><name>success</name></relationships><state>RUNNING</state><style/><supportsEventDriven>false</supportsEventDriven><supportsParallelProcessing>true</supportsParallelProcessing><type>org.apache.nifi.processors.hadoop.PutHDFS</type></processors><processors><id>5ded0d2e-2a49-4a28-b3a0-2f4328711971</id><parentGroupId>68279867-d055-46a8-9934-0e601af986eb</parentGroupId><position><x>275.31129455566406</x><y>467.29913330078125</y></position><config><bulletinLevel>WARN</bulletinLevel><comments></comments><concurrentlySchedulableTaskCount>1</concurrentlySchedulableTaskCount><defaultConcurrentTasks><entry><key>TIMER_DRIVEN</key><value>1</value></entry><entry><key>EVENT_DRIVEN</key><value>0</value></entry><entry><key>CRON_DRIVEN</key><value>1</value></entry></defaultConcurrentTasks><defaultSchedulingPeriod><entry><key>TIMER_DRIVEN</key><value>0 sec</value></entry><entry><key>CRON_DRIVEN</key><value>* * * * * 
?</value></entry></defaultSchedulingPeriod><descriptors><entry><key>definition</key><value><description>-a,--adaptiveblur &lt;BLUR_AMOUNT&gt;alternate text smoothing using adaptive blur;
 4 |                                floats&gt;=0; default=0 (no smoothing)
 5 | -e,--enhance &lt;TYPE&gt;            enhance image brightness before cleaning; choices
 6 |                                are: none, stretch or normalize; default=none
 7 | -f,--filtersize &lt;SIZE&gt;         size of filter used to clean background;
 8 |                                integer&gt;0; default=15
 9 | -g,--greyscale                 convert document to grayscale before enhancing
10 | -l,--layout &lt;LAYOUT&gt;           desired layout; options are p (or portrait) or l
11 |                                (or landscape); default=portrait
12 | -o,--offset &lt;SIZE&gt;             offset of filter in percent used to reduce noise;
13 |                                integer&gt;=0; default=5
14 | -p,--padamt &lt;BLUR_AMOUNT&gt;      border pad amount around outer part of image;
15 |                                integer&gt;=0; default=0
16 | -r,--rotate &lt;DIRECTION&gt;        rotate image 90 degrees in direction specified if
17 |                                aspect ratio does not match layout; options are
18 |                                cw (or clockwise), ccw (or counterclockwise) and
19 |                                n (or none); default=none or no rotation
20 | -s,--sharpamt &lt;NUM_PIXELS&gt;     sharpening amount in pixels; float&gt;=0; nominal
21 |                                about 1; default=0
22 | -S,--saturation &lt;SATURATION&gt;   color saturation expressed as percent;
23 |                                integer&gt;=0; only applicable if -g not set; a
24 |                                value of 100 is no change; default=200 (double
25 |                                saturation)
26 | -t,--threshold &lt;THRESHOLD&gt;     text smoothing threshold; 0&lt;=threshold&lt;=100;
27 |                                nominal value is about 50; default is no
28 |                                smoothing
29 | -T,--trim                      trim background around outer part of image
30 | -u,--unrotate                  unrotate image; cannot unrotate more than about 5
31 |                                degrees
32 | </description><displayName>definition</displayName><dynamic>false</dynamic><name>definition</name><required>true</required><sensitive>false</sensitive><supportsEl>false</supportsEl></value></entry><entry><key>temp_space</key><value><description>Temporary directory to be used.</description><displayName>temp_space</displayName><dynamic>false</dynamic><name>temp_space</name><required>false</required><sensitive>false</sensitive><supportsEl>false</supportsEl></value></entry><entry><key>convert_bin_path</key><value><description>The path to the convert (imagemagick) utility</description><displayName>convert_bin_path</displayName><dynamic>false</dynamic><name>convert_bin_path</name><required>true</required><sensitive>false</sensitive><supportsEl>false</supportsEl></value></entry></descriptors><lossTolerant>false</lossTolerant><penaltyDuration>30 sec</penaltyDuration><properties><entry><key>definition</key><value>-g -e stretch -f 25 -o 10 -s 1 -T</value></entry><entry><key>temp_space</key><value>/tmp</value></entry><entry><key>convert_bin_path</key><value>/usr/bin/convert</value></entry></properties><runDurationMillis>0</runDurationMillis><schedulingPeriod>0 sec</schedulingPeriod><schedulingStrategy>TIMER_DRIVEN</schedulingStrategy><yieldDuration>1 sec</yieldDuration></config><name>PreprocessingProcessor</name><relationships><autoTerminate>false</autoTerminate><description>Success 
relationship</description><name>SUCCESS</name></relationships><state>RUNNING</state><style/><supportsEventDriven>false</supportsEventDriven><supportsParallelProcessing>true</supportsParallelProcessing><type>ocr.nifi.preprocessing.PreprocessingProcessor</type></processors><processors><id>b6321e52-b9f8-4d2e-9bf7-d8c974463922</id><parentGroupId>68279867-d055-46a8-9934-0e601af986eb</parentGroupId><position><x>265.16381562944866</x><y>284.07849564980825</y></position><config><bulletinLevel>WARN</bulletinLevel><comments></comments><concurrentlySchedulableTaskCount>1</concurrentlySchedulableTaskCount><defaultConcurrentTasks><entry><key>TIMER_DRIVEN</key><value>1</value></entry><entry><key>EVENT_DRIVEN</key><value>0</value></entry><entry><key>CRON_DRIVEN</key><value>1</value></entry></defaultConcurrentTasks><defaultSchedulingPeriod><entry><key>TIMER_DRIVEN</key><value>0 sec</value></entry><entry><key>CRON_DRIVEN</key><value>* * * * * ?</value></entry></defaultSchedulingPeriod><descriptors><entry><key>Delete Attributes Expression</key><value><description>Regular expression for attributes to be deleted from flowfiles.</description><displayName>Delete Attributes Expression</displayName><dynamic>false</dynamic><name>Delete Attributes Expression</name><required>false</required><sensitive>false</sensitive><supportsEl>true</supportsEl></value></entry><entry><key>filename</key><value><description></description><displayName>filename</displayName><dynamic>true</dynamic><name>filename</name><required>false</required><sensitive>false</sensitive><supportsEl>true</supportsEl></value></entry><entry><key>originalFilename</key><value><description></description><displayName>originalFilename</displayName><dynamic>true</dynamic><name>originalFilename</name><required>false</required><sensitive>false</sensitive><supportsEl>true</supportsEl></value></entry></descriptors><lossTolerant>false</lossTolerant><penaltyDuration>30 sec</penaltyDuration><properties><entry><key>Delete Attributes 
Expression</key></entry><entry><key>filename</key><value>${filename}-${pageNumber}.txt</value></entry><entry><key>originalFilename</key><value>${filename}</value></entry></properties><runDurationMillis>0</runDurationMillis><schedulingPeriod>0 sec</schedulingPeriod><schedulingStrategy>TIMER_DRIVEN</schedulingStrategy><yieldDuration>1 sec</yieldDuration></config><name>UpdateAttribute</name><relationships><autoTerminate>false</autoTerminate><description>All FlowFiles are routed to this relationship</description><name>success</name></relationships><state>RUNNING</state><style/><supportsEventDriven>true</supportsEventDriven><supportsParallelProcessing>true</supportsParallelProcessing><type>org.apache.nifi.processors.attributes.UpdateAttribute</type></processors><processors><id>d4ca6eb5-7ef8-4976-a91b-5fe7763e203e</id><parentGroupId>68279867-d055-46a8-9934-0e601af986eb</parentGroupId><position><x>-328.7200042724609</x><y>84.86003997802726</y></position><config><bulletinLevel>WARN</bulletinLevel><comments></comments><concurrentlySchedulableTaskCount>1</concurrentlySchedulableTaskCount><defaultConcurrentTasks><entry><key>TIMER_DRIVEN</key><value>1</value></entry><entry><key>EVENT_DRIVEN</key><value>0</value></entry><entry><key>CRON_DRIVEN</key><value>1</value></entry></defaultConcurrentTasks><defaultSchedulingPeriod><entry><key>TIMER_DRIVEN</key><value>0 sec</value></entry><entry><key>CRON_DRIVEN</key><value>* * * * * ?</value></entry></defaultSchedulingPeriod><descriptors><entry><key>Input Directory</key><value><description>The input directory from which to pull files</description><displayName>Input Directory</displayName><dynamic>false</dynamic><name>Input Directory</name><required>true</required><sensitive>false</sensitive><supportsEl>true</supportsEl></value></entry><entry><key>File Filter</key><value><defaultValue>[^\.].*</defaultValue><description>Only files whose names match the given regular expression will be picked up</description><displayName>File 
Filter</displayName><dynamic>false</dynamic><name>File Filter</name><required>true</required><sensitive>false</sensitive><supportsEl>false</supportsEl></value></entry><entry><key>Path Filter</key><value><description>When Recurse Subdirectories is true, then only subdirectories whose path matches the given regular expression will be scanned</description><displayName>Path Filter</displayName><dynamic>false</dynamic><name>Path Filter</name><required>false</required><sensitive>false</sensitive><supportsEl>false</supportsEl></value></entry><entry><key>Batch Size</key><value><defaultValue>10</defaultValue><description>The maximum number of files to pull in each iteration</description><displayName>Batch Size</displayName><dynamic>false</dynamic><name>Batch Size</name><required>true</required><sensitive>false</sensitive><supportsEl>false</supportsEl></value></entry><entry><key>Keep Source File</key><value><allowableValues><displayName>true</displayName><value>true</value></allowableValues><allowableValues><displayName>false</displayName><value>false</value></allowableValues><defaultValue>false</defaultValue><description>If true, the file is not deleted after it has been copied to the Content Repository; this causes the file to be picked up continually and is useful for testing purposes.  
If not keeping original NiFi will need write permissions on the directory it is pulling from otherwise it will ignore the file.</description><displayName>Keep Source File</displayName><dynamic>false</dynamic><name>Keep Source File</name><required>true</required><sensitive>false</sensitive><supportsEl>false</supportsEl></value></entry><entry><key>Recurse Subdirectories</key><value><allowableValues><displayName>true</displayName><value>true</value></allowableValues><allowableValues><displayName>false</displayName><value>false</value></allowableValues><defaultValue>true</defaultValue><description>Indicates whether or not to pull files from subdirectories</description><displayName>Recurse Subdirectories</displayName><dynamic>false</dynamic><name>Recurse Subdirectories</name><required>true</required><sensitive>false</sensitive><supportsEl>false</supportsEl></value></entry><entry><key>Polling Interval</key><value><defaultValue>0 sec</defaultValue><description>Indicates how long to wait before performing a directory listing</description><displayName>Polling Interval</displayName><dynamic>false</dynamic><name>Polling Interval</name><required>true</required><sensitive>false</sensitive><supportsEl>false</supportsEl></value></entry><entry><key>Ignore Hidden Files</key><value><allowableValues><displayName>true</displayName><value>true</value></allowableValues><allowableValues><displayName>false</displayName><value>false</value></allowableValues><defaultValue>true</defaultValue><description>Indicates whether or not hidden files should be ignored</description><displayName>Ignore Hidden Files</displayName><dynamic>false</dynamic><name>Ignore Hidden Files</name><required>true</required><sensitive>false</sensitive><supportsEl>false</supportsEl></value></entry><entry><key>Minimum File Age</key><value><defaultValue>0 sec</defaultValue><description>The minimum age that a file must be in order to be pulled; any file younger than this amount of time (according to last modification date) 
will be ignored</description><displayName>Minimum File Age</displayName><dynamic>false</dynamic><name>Minimum File Age</name><required>true</required><sensitive>false</sensitive><supportsEl>false</supportsEl></value></entry><entry><key>Maximum File Age</key><value><description>The maximum age that a file must be in order to be pulled; any file older than this amount of time (according to last modification date) will be ignored</description><displayName>Maximum File Age</displayName><dynamic>false</dynamic><name>Maximum File Age</name><required>false</required><sensitive>false</sensitive><supportsEl>false</supportsEl></value></entry><entry><key>Minimum File Size</key><value><defaultValue>0 B</defaultValue><description>The minimum size that a file must be in order to be pulled</description><displayName>Minimum File Size</displayName><dynamic>false</dynamic><name>Minimum File Size</name><required>true</required><sensitive>false</sensitive><supportsEl>false</supportsEl></value></entry><entry><key>Maximum File Size</key><value><description>The maximum size that a file can be in order to be pulled</description><displayName>Maximum File Size</displayName><dynamic>false</dynamic><name>Maximum File Size</name><required>false</required><sensitive>false</sensitive><supportsEl>false</supportsEl></value></entry></descriptors><lossTolerant>false</lossTolerant><penaltyDuration>30 sec</penaltyDuration><properties><entry><key>Input Directory</key><value>/root/scale_ocr/indir</value></entry><entry><key>File Filter</key><value>[^\.].*</value></entry><entry><key>Path Filter</key></entry><entry><key>Batch Size</key><value>10</value></entry><entry><key>Keep Source File</key><value>false</value></entry><entry><key>Recurse Subdirectories</key><value>true</value></entry><entry><key>Polling Interval</key><value>0 sec</value></entry><entry><key>Ignore Hidden Files</key><value>true</value></entry><entry><key>Minimum File Age</key><value>0 sec</value></entry><entry><key>Maximum File 
Age</key></entry><entry><key>Minimum File Size</key><value>0 B</value></entry><entry><key>Maximum File Size</key></entry></properties><runDurationMillis>0</runDurationMillis><schedulingPeriod>0 sec</schedulingPeriod><schedulingStrategy>TIMER_DRIVEN</schedulingStrategy><yieldDuration>1 sec</yieldDuration></config><name>GetFile</name><relationships><autoTerminate>false</autoTerminate><description>All files are routed to success</description><name>success</name></relationships><state>RUNNING</state><style/><supportsEventDriven>false</supportsEventDriven><supportsParallelProcessing>true</supportsParallelProcessing><type>org.apache.nifi.processors.standard.GetFile</type></processors><processors><id>c77d797f-6d16-4947-92e7-1879586dd7e5</id><parentGroupId>68279867-d055-46a8-9934-0e601af986eb</parentGroupId><position><x>267.3999786376953</x><y>80.68000030517578</y></position><config><bulletinLevel>WARN</bulletinLevel><comments></comments><concurrentlySchedulableTaskCount>1</concurrentlySchedulableTaskCount><defaultConcurrentTasks><entry><key>TIMER_DRIVEN</key><value>1</value></entry><entry><key>EVENT_DRIVEN</key><value>0</value></entry><entry><key>CRON_DRIVEN</key><value>1</value></entry></defaultConcurrentTasks><defaultSchedulingPeriod><entry><key>TIMER_DRIVEN</key><value>0 sec</value></entry><entry><key>CRON_DRIVEN</key><value>* * * * * ?</value></entry></defaultSchedulingPeriod><descriptors><entry><key>temp_space</key><value><description>Temporary directory to be used.</description><displayName>temp_space</displayName><dynamic>false</dynamic><name>temp_space</name><required>false</required><sensitive>false</sensitive><supportsEl>false</supportsEl></value></entry><entry><key>jni_path</key><value><description>JNI 
Path</description><displayName>jni_path</displayName><dynamic>false</dynamic><name>jni_path</name><required>true</required><sensitive>false</sensitive><supportsEl>false</supportsEl></value></entry></descriptors><lossTolerant>false</lossTolerant><penaltyDuration>30 sec</penaltyDuration><properties><entry><key>temp_space</key><value>/tmp</value></entry><entry><key>jni_path</key><value>/usr/lib</value></entry></properties><runDurationMillis>0</runDurationMillis><schedulingPeriod>0 sec</schedulingPeriod><schedulingStrategy>TIMER_DRIVEN</schedulingStrategy><yieldDuration>1 sec</yieldDuration></config><name>ConversionProcessor</name><relationships><autoTerminate>false</autoTerminate><description>Raw data</description><name>RAW</name></relationships><relationships><autoTerminate>false</autoTerminate><description>Success relationship</description><name>SUCCESS</name></relationships><state>RUNNING</state><style/><supportsEventDriven>false</supportsEventDriven><supportsParallelProcessing>true</supportsParallelProcessing><type>ocr.nifi.conversion.ConversionProcessor</type></processors><processors><id>b80561a8-9dca-4994-bb68-32ac1c4b0de9</id><parentGroupId>68279867-d055-46a8-9934-0e601af986eb</parentGroupId><position><x>-324.4262384033204</x><y>653.7907958984374</y></position><config><bulletinLevel>WARN</bulletinLevel><comments></comments><concurrentlySchedulableTaskCount>1</concurrentlySchedulableTaskCount><defaultConcurrentTasks><entry><key>TIMER_DRIVEN</key><value>1</value></entry><entry><key>EVENT_DRIVEN</key><value>0</value></entry><entry><key>CRON_DRIVEN</key><value>1</value></entry></defaultConcurrentTasks><defaultSchedulingPeriod><entry><key>TIMER_DRIVEN</key><value>0 sec</value></entry><entry><key>CRON_DRIVEN</key><value>* * * * * ?</value></entry></defaultSchedulingPeriod><descriptors><entry><key>Hadoop Configuration Resources</key><value><description>A file or comma separated list of files which contains the Hadoop file system configuration. 
Without this, Hadoop will search the classpath for a 'core-site.xml' and 'hdfs-site.xml' file or will revert to a default configuration.</description><displayName>Hadoop Configuration Resources</displayName><dynamic>false</dynamic><name>Hadoop Configuration Resources</name><required>false</required><sensitive>false</sensitive><supportsEl>false</supportsEl></value></entry><entry><key>Kerberos Principal</key><value><description>Kerberos principal to authenticate as. Requires nifi.kerberos.krb5.file to be set in your nifi.properties</description><displayName>Kerberos Principal</displayName><dynamic>false</dynamic><name>Kerberos Principal</name><required>false</required><sensitive>false</sensitive><supportsEl>false</supportsEl></value></entry><entry><key>Kerberos Keytab</key><value><description>Kerberos keytab associated with the principal. Requires nifi.kerberos.krb5.file to be set in your nifi.properties</description><displayName>Kerberos Keytab</displayName><dynamic>false</dynamic><name>Kerberos Keytab</name><required>false</required><sensitive>false</sensitive><supportsEl>false</supportsEl></value></entry><entry><key>Kerberos Relogin Period</key><value><defaultValue>4 hours</defaultValue><description>Period of time which should pass before attempting a kerberos relogin</description><displayName>Kerberos Relogin Period</displayName><dynamic>false</dynamic><name>Kerberos Relogin Period</name><required>false</required><sensitive>false</sensitive><supportsEl>false</supportsEl></value></entry><entry><key>Directory</key><value><description>The parent HDFS directory to which files should be written</description><displayName>Directory</displayName><dynamic>false</dynamic><name>Directory</name><required>true</required><sensitive>false</sensitive><supportsEl>true</supportsEl></value></entry><entry><key>Conflict Resolution 
Strategy</key><value><allowableValues><displayName>replace</displayName><value>replace</value></allowableValues><allowableValues><displayName>ignore</displayName><value>ignore</value></allowableValues><allowableValues><displayName>fail</displayName><value>fail</value></allowableValues><defaultValue>fail</defaultValue><description>Indicates what should happen when a file with the same name already exists in the output directory</description><displayName>Conflict Resolution Strategy</displayName><dynamic>false</dynamic><name>Conflict Resolution Strategy</name><required>true</required><sensitive>false</sensitive><supportsEl>false</supportsEl></value></entry><entry><key>Block Size</key><value><description>Size of each block as written to HDFS. This overrides the Hadoop Configuration</description><displayName>Block Size</displayName><dynamic>false</dynamic><name>Block Size</name><required>false</required><sensitive>false</sensitive><supportsEl>false</supportsEl></value></entry><entry><key>IO Buffer Size</key><value><description>Amount of memory to use to buffer file contents during IO. This overrides the Hadoop Configuration</description><displayName>IO Buffer Size</displayName><dynamic>false</dynamic><name>IO Buffer Size</name><required>false</required><sensitive>false</sensitive><supportsEl>false</supportsEl></value></entry><entry><key>Replication</key><value><description>Number of times that HDFS will replicate each file. This overrides the Hadoop Configuration</description><displayName>Replication</displayName><dynamic>false</dynamic><name>Replication</name><required>false</required><sensitive>false</sensitive><supportsEl>false</supportsEl></value></entry><entry><key>Permissions umask</key><value><description>A umask represented as an octal number which determines the permissions of files written to HDFS. 
This overrides the Hadoop Configuration dfs.umaskmode</description><displayName>Permissions umask</displayName><dynamic>false</dynamic><name>Permissions umask</name><required>false</required><sensitive>false</sensitive><supportsEl>false</supportsEl></value></entry><entry><key>Remote Owner</key><value><description>Changes the owner of the HDFS file to this value after it is written. This only works if NiFi is running as a user that has HDFS super user privilege to change owner</description><displayName>Remote Owner</displayName><dynamic>false</dynamic><name>Remote Owner</name><required>false</required><sensitive>false</sensitive><supportsEl>false</supportsEl></value></entry><entry><key>Remote Group</key><value><description>Changes the group of the HDFS file to this value after it is written. This only works if NiFi is running as a user that has HDFS super user privilege to change group</description><displayName>Remote Group</displayName><dynamic>false</dynamic><name>Remote Group</name><required>false</required><sensitive>false</sensitive><supportsEl>false</supportsEl></value></entry><entry><key>Compression codec</key><value><allowableValues><displayName>NONE</displayName><value>NONE</value></allowableValues><allowableValues><displayName>DEFAULT</displayName><value>DEFAULT</value></allowableValues><allowableValues><displayName>BZIP</displayName><value>BZIP</value></allowableValues><allowableValues><displayName>GZIP</displayName><value>GZIP</value></allowableValues><allowableValues><displayName>LZ4</displayName><value>LZ4</value></allowableValues><allowableValues><displayName>SNAPPY</displayName><value>SNAPPY</value></allowableValues><allowableValues><displayName>AUTOMATIC</displayName><value>AUTOMATIC</value></allowableValues><defaultValue>NONE</defaultValue><description></description><displayName>Compression codec</displayName><dynamic>false</dynamic><name>Compression 
codec</name><required>true</required><sensitive>false</sensitive><supportsEl>false</supportsEl></value></entry></descriptors><lossTolerant>false</lossTolerant><penaltyDuration>30 sec</penaltyDuration><properties><entry><key>Hadoop Configuration Resources</key><value>/etc/hadoop/conf/hdfs-site.xml,/etc/hadoop/conf/core-site.xml</value></entry><entry><key>Kerberos Principal</key></entry><entry><key>Kerberos Keytab</key></entry><entry><key>Kerberos Relogin Period</key><value>4 hours</value></entry><entry><key>Directory</key><value>/data/ocr/processed/${originalFilename}</value></entry><entry><key>Conflict Resolution Strategy</key><value>fail</value></entry><entry><key>Block Size</key></entry><entry><key>IO Buffer Size</key></entry><entry><key>Replication</key></entry><entry><key>Permissions umask</key></entry><entry><key>Remote Owner</key></entry><entry><key>Remote Group</key></entry><entry><key>Compression codec</key><value>NONE</value></entry></properties><runDurationMillis>0</runDurationMillis><schedulingPeriod>0 sec</schedulingPeriod><schedulingStrategy>TIMER_DRIVEN</schedulingStrategy><yieldDuration>1 sec</yieldDuration></config><name>PutHDFS</name><relationships><autoTerminate>true</autoTerminate><description>Files that could not be written to HDFS for some reason are transferred to this relationship</description><name>failure</name></relationships><relationships><autoTerminate>true</autoTerminate><description>Files that have been successfully written to HDFS are transferred to this relationship</description><name>success</name></relationships><state>RUNNING</state><style/><supportsEventDriven>false</supportsEventDriven><supportsParallelProcessing>true</supportsParallelProcessing><type>org.apache.nifi.processors.hadoop.PutHDFS</type></processors></snippet><timestamp>06/23/2016 16:31:02 UTC</timestamp></template>


--------------------------------------------------------------------------------
/nifi/src/main/resources/META-INF/services/org.apache.nifi.processor.Processor:
--------------------------------------------------------------------------------
1 | ocr.nifi.preprocessing.PreprocessingProcessor
2 | ocr.nifi.conversion.ConversionProcessor
3 | ocr.nifi.extraction.ExtractionProcessor
4 | 


--------------------------------------------------------------------------------
/nifi/src/main/resources/log4j.properties:
--------------------------------------------------------------------------------
 1 | # Set root logger level to INFO and its only appender to STDOUT.
 2 | log4j.rootLogger=INFO, STDOUT
 3 | 
 4 | # STDOUT is set to be a ConsoleAppender.
 5 | log4j.appender.STDOUT=org.apache.log4j.ConsoleAppender
 6 | 
 7 | # STDOUT uses PatternLayout.
 8 | log4j.appender.STDOUT.layout=org.apache.log4j.PatternLayout
 9 | log4j.appender.STDOUT.layout.ConversionPattern=%-4r [%t] %-5p %c %x - %m %C%n
10 | 


--------------------------------------------------------------------------------
/nifi/src/main/resources/logback.xml:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0" encoding="UTF-8"?>
 2 | 
 3 | <!-- For assistance related to logback-translator or configuration  -->
 4 | <!-- files in general, please contact the logback user mailing list -->
 5 | <!-- at http://www.qos.ch/mailman/listinfo/logback-user             -->
 6 | <!--                                                                -->
 7 | <!-- For professional support please see                            -->
 8 | <!--    http://www.qos.ch/shop/products/professionalSupport         -->
 9 | <!--                                                                -->
10 | <configuration>
11 |     <appender name="STDOUT" class="ch.qos.logback.core.ConsoleAppender">
12 |         <encoder>
13 |             <pattern>%d{HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n</pattern>
14 |         </encoder>
15 |     </appender>
16 |     <root level="INFO">
17 |         <appender-ref ref="STDOUT"/>
18 |     </root>
19 | </configuration>
20 | 


--------------------------------------------------------------------------------
/nifi/src/test/java/ocr/nifi/conversion/ConversionProcessorTest.java:
--------------------------------------------------------------------------------
 1 | package ocr.nifi.conversion;
 2 | 
 3 | import ocr.preprocessing.conversion.ImageUtils;
 4 | import org.apache.nifi.util.MockFlowFile;
 5 | import org.apache.nifi.util.TestRunner;
 6 | import org.apache.nifi.util.TestRunners;
 7 | import org.ghost4j.document.PDFDocument;
 8 | import org.junit.Assert;
 9 | import org.junit.Test;
10 | 
11 | import java.awt.image.BufferedImage;
12 | import java.io.ByteArrayInputStream;
13 | import java.io.File;
14 | import java.io.FileInputStream;
15 | import java.util.List;
16 | 
17 | import static org.junit.Assert.assertEquals;
18 | import static org.junit.Assert.assertTrue;
19 | 
20 | /**
21 |  * Runs a sample PDF through {@link ConversionProcessor} using the NiFi mock framework and checks
22 |  * the flow files emitted on the SUCCESS and RAW relationships.
23 |  */
24 | public class ConversionProcessorTest {
25 |   /**
26 |    * Enqueues the sample PDF, triggers the processor once and verifies that two readable page
27 |    * images arrive on SUCCESS and that one loadable PDF arrives on RAW.
28 |    *
29 |    * @throws Exception if the input cannot be read or an emitted flow file cannot be decoded
30 |    */
31 |   @Test
32 |   public void test() throws Exception {
33 |     // Generate a test runner to mock a processor in a flow
34 |     TestRunner runner = TestRunners.newTestRunner(new ConversionProcessor());
35 |     File inputFile = new File("../conversion/src/test/resources/text-detection.pdf");
36 |     // Add properties
37 |     // NOTE(review): this JNI path is hard-coded and environment-specific; confirm it matches the build host
38 |     runner.setProperty(ConversionProcessor.JNI_PATH, "/opt/local/lib");
39 |     runner.setProperty(ConversionProcessor.TEMP_DIR, "/tmp");
40 | 
41 |     // Add the content to the runner; close the stream once it has been handed over so the
42 |     // file handle is not leaked
43 |     try (FileInputStream input = new FileInputStream(inputFile)) {
44 |       runner.enqueue(input);
45 |     }
46 | 
47 |     // Run the enqueued content, it also takes an int = number of contents queued
48 |     runner.run(1);
49 | 
50 |     // All results were processed without failure
51 |     runner.assertQueueEmpty();
52 | 
53 |     // If you need to read or do additional tests on results you can access the content
54 |     List<MockFlowFile> results = runner.getFlowFilesForRelationship(ConversionProcessor.SUCCESS);
55 |     assertEquals(2, results.size());
56 |     for (MockFlowFile result : results) {
57 |       byte[] value = runner.getContentAsByteArray(result);
58 |       BufferedImage bi = ImageUtils.INSTANCE.readImage(value);
59 |       assertTrue(bi.getHeight() > 0);
60 |       assertTrue(bi.getWidth() > 0);
61 |       String pageNum = result.getAttribute("pageNumber");
62 |       // Constant-first equals: a missing attribute fails the assertion instead of throwing an NPE
63 |       assertTrue("0".equals(pageNum) || "1".equals(pageNum));
64 |     }
65 |     List<MockFlowFile> rawResults = runner.getFlowFilesForRelationship(ConversionProcessor.RAW);
66 |     assertEquals(1, rawResults.size());
67 |     byte[] rawValue = runner.getContentAsByteArray(rawResults.get(0));
68 |     // Loading is the check here; presumably ghost4j throws if the bytes are not a PDF - TODO confirm
69 |     PDFDocument doc = new PDFDocument();
70 |     doc.load(new ByteArrayInputStream(rawValue));
71 |   }
72 | }
73 | 


--------------------------------------------------------------------------------
/nifi/src/test/java/ocr/nifi/extraction/ExtractionProcessorTest.java:
--------------------------------------------------------------------------------
 1 | package ocr.nifi.extraction;
 2 | 
 3 | import ocr.common.Util;
 4 | import org.apache.nifi.util.MockFlowFile;
 5 | import org.apache.nifi.util.TestRunner;
 6 | import org.apache.nifi.util.TestRunners;
 7 | import org.junit.Assert;
 8 | import org.junit.Test;
 9 | 
10 | import java.io.File;
11 | import java.io.FileInputStream;
12 | import java.io.FileNotFoundException;
13 | import java.util.List;
14 | 
15 | /**
16 |  * Feeds a sample TIFF through {@link ExtractionProcessor} using the NiFi mock framework and checks
17 |  * the text that comes out on the SUCCESS relationship.
18 |  */
19 | public class ExtractionProcessorTest {
20 | 
21 |     /**
22 |      * Enqueues the sample image, triggers the processor once and verifies that exactly one flow
23 |      * file is emitted on SUCCESS and that its content contains the expected sentence.
24 |      *
25 |      * @throws Exception if the sample image cannot be opened, read or closed
26 |      */
27 |     @Test
28 |     public void test() throws Exception {
29 |         // Generate a test runner to mock a processor in a flow
30 |         TestRunner runner = TestRunners.newTestRunner(new ExtractionProcessor());
31 |         File inputFile = new File("../extraction/src/test/resources/pdf-test.tiff");
32 |         // Add properties
33 |         runner.setProperty(ExtractionProcessor.JNI_PATH, Util.Locations.JNA.find().get().getAbsolutePath());
34 |         runner.setProperty(ExtractionProcessor.TESS_DATA, Util.Locations.TESSDATA.find().get().getAbsolutePath());
35 |         // Add the content to the runner; close the stream once it has been handed over so the
36 |         // file handle is not leaked
37 |         try (FileInputStream input = new FileInputStream(inputFile)) {
38 |             runner.enqueue(input);
39 |         }
40 | 
41 |         // Run the enqueued content, it also takes an int = number of contents queued
42 |         runner.run(1);
43 | 
44 |         // All results were processed without failure
45 |         runner.assertQueueEmpty();
46 | 
47 |         // If you need to read or do additional tests on results you can access the content
48 |         List<MockFlowFile> results = runner.getFlowFilesForRelationship(ExtractionProcessor.SUCCESS);
49 |         Assert.assertEquals(1, results.size());
50 |         // Decode with an explicit charset so the result does not depend on the platform default
51 |         String text = new String(runner.getContentAsByteArray(results.get(0)), java.nio.charset.StandardCharsets.UTF_8);
52 |         Assert.assertTrue(text.contains("Congratulations, your computer is equipped with a PDF (Portable Document Format)\nreader!"));
53 |     }
54 | 
55 | }
56 | 


--------------------------------------------------------------------------------
/nifi/src/test/java/ocr/nifi/preprocessing/PreprocessingTest.java:
--------------------------------------------------------------------------------
 1 | package ocr.nifi.preprocessing;
 2 | 
 3 | import ocr.common.Util;
 4 | import ocr.preprocessing.conversion.ImageUtils;
 5 | import org.apache.nifi.util.MockFlowFile;
 6 | import org.apache.nifi.util.TestRunner;
 7 | import org.apache.nifi.util.TestRunners;
 8 | import org.junit.Assert;
 9 | import org.junit.Test;
10 | 
11 | import java.awt.image.BufferedImage;
12 | import java.io.File;
13 | import java.io.FileInputStream;
14 | import java.util.List;
15 | 
16 | /**
17 |  * Runs a sample scanned image through {@link PreprocessingProcessor} using the NiFi mock framework
18 |  * and checks the dimensions of the image emitted on the SUCCESS relationship.
19 |  */
20 | public class PreprocessingTest {
21 | 
22 |     /**
23 |      * Enqueues the sample image, triggers the processor once and verifies that exactly one flow
24 |      * file is emitted on SUCCESS and that it decodes to an image 812 wide and 1074 high.
25 |      *
26 |      * @throws Exception if the sample image cannot be read or the result cannot be decoded
27 |      */
28 |     @Test
29 |     public void test() throws Exception {
30 |         // Generate a test runner to mock a processor in a flow
31 |         TestRunner runner = TestRunners.newTestRunner(new PreprocessingProcessor());
32 |         File inputFile = new File("../preprocessing/src/test/resources/images/brscan_original_r90.jpg");
33 |         // Add properties
34 |         runner.setProperty(PreprocessingProcessor.CONVERT_PATH, Util.Locations.CONVERT.find().get().getAbsolutePath());
35 |         runner.setProperty(PreprocessingProcessor.TEMP_DIR, "/tmp");
36 |         runner.setProperty(PreprocessingProcessor.DEFINITIONS, "-g -e normalize -f 15 -o 10 -u -s 2 -T -p 20");
37 |         // Add the content to the runner; close the stream once it has been handed over so the
38 |         // file handle is not leaked
39 |         try (FileInputStream input = new FileInputStream(inputFile)) {
40 |             runner.enqueue(input);
41 |         }
42 | 
43 |         // Run the enqueued content, it also takes an int = number of contents queued
44 |         runner.run(1);
45 | 
46 |         // All results were processed without failure
47 |         runner.assertQueueEmpty();
48 | 
49 |         // If you need to read or do additional tests on results you can access the content
50 |         List<MockFlowFile> results = runner.getFlowFilesForRelationship(PreprocessingProcessor.SUCCESS);
51 |         Assert.assertEquals(1, results.size());
52 |         byte[] value = runner.getContentAsByteArray(results.get(0));
53 |         BufferedImage bi = ImageUtils.INSTANCE.readImage(value);
54 |         Assert.assertEquals(1074, bi.getHeight());
55 |         Assert.assertEquals(812, bi.getWidth());
56 |     }
57 | }
58 | 


--------------------------------------------------------------------------------
/pom.xml:
--------------------------------------------------------------------------------
  1 | <?xml version="1.0" encoding="UTF-8"?>
  2 | <project xmlns="http://maven.apache.org/POM/4.0.0"
  3 |          xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
  4 |          xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  5 |     <modelVersion>4.0.0</modelVersion>
  6 | 
  7 |     <groupId>ocr</groupId>
  8 |     <artifactId>ocr</artifactId>
  9 |     <version>1.0-SNAPSHOT</version>
 10 |     <packaging>pom</packaging>
 11 | 
 12 |     <name>ocr</name>
 13 | 
 14 |     <url>https://github.com/mmiklavc/scalable-ocr</url>
 15 | 
 16 |     <description>
 17 |         So much of our data is represented as human-readable scans of documents. However, this kind of
 18 |         document-by-document analysis does not scale, and the need to ingest large numbers
 19 |         of PDFs or scanned documents shows up in almost all sectors. Inevitably these scanned documents must be
 20 |         converted to text for analysis. And since dealing with unstructured data is one of the main selling points for a
 21 |         platform like Hadoop, it means that we must convert large volumes of potentially large documents into a textual
 22 |         representation. We will show you how to use open source tooling (Apache NiFi and Tesseract) to scalably convert
 23 |         volumes of PDFs and ingest into a platform that will allow you to analyze this data at scale.
 24 |     </description>
 25 | 
 26 |     <developers>
 27 |         <developer>
 28 |             <name>Casey Stella</name>
 29 |             <email>cestella@gmail.com</email>
 30 |             <organizationUrl>http://www.caseystella.com</organizationUrl>
 31 |         </developer>
 32 |         <developer>
 33 |             <name>Michael Miklavcic</name>
 34 |             <email>michael@clevelandflash.com</email>
 35 |             <organizationUrl>http://blog.michaelmiklavcic.com</organizationUrl>
 36 |         </developer>
 37 |     </developers>
 38 | 
 39 |     <modules>
 40 |         <module>conversion</module>
 41 |         <module>preprocessing</module>
 42 |         <module>extraction</module>
 43 |         <module>nifi</module>
 44 |         <module>common</module>
 45 |         <module>cli</module>
 46 |     </modules>
 47 | 
 48 |     <properties>
 49 |         <!-- platform encoding -->
 50 |         <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
 51 |         <project.reporting.outputEncoding>UTF-8</project.reporting.outputEncoding>
 52 | 
 53 |         <!-- source code versions -->
 54 |         <java-source.version>1.8</java-source.version>
 55 |         <java-target.version>1.8</java-target.version>
 56 | 
 57 |         <!-- scm locations -->
 58 |         <scm-connection.url>file://localhost/path/to/localrepo</scm-connection.url>
 59 |         <scm-developer-connection.url>file://localhost/path/to/localrepo</scm-developer-connection.url>
 60 | 
 61 |         <!-- build plugin versions -->
 62 |         <maven-shade-plugin.version>2.3</maven-shade-plugin.version>
 63 |         <maven-compiler-plugin.version>3.1</maven-compiler-plugin.version>
 64 |         <maven-surefire-plugin.version>2.18.1</maven-surefire-plugin.version>
 65 |         <maven-failsafe-plugin.version>2.17</maven-failsafe-plugin.version>
 66 |         <maven-source-plugin.version>3.0.0</maven-source-plugin.version>
 67 |         <maven-javadoc-plugin.version>2.10.3</maven-javadoc-plugin.version>
 68 |         <maven-release-plugin.version>2.5.3</maven-release-plugin.version>
 69 |         <scala-maven-plugin.version>3.2.1</scala-maven-plugin.version>
 70 |         <scalatest-maven-plugin.version>1.0</scalatest-maven-plugin.version>
 71 | 
 72 |         <!-- report plugin versions -->
 73 |         <maven-project-info-reports-plugin>2.9</maven-project-info-reports-plugin>
 74 | 
 75 |         <!-- main dependency versions -->
 76 |         <commons-io.version>2.4</commons-io.version>
 77 |         <guava.version>19.0</guava.version>
 78 |         <logback.version>1.1.6</logback.version>
 79 |         <hadoop.version>2.7.2</hadoop.version>
 80 |         <scala.version>2.11.8</scala.version>
 81 |         <scala.binary.version>2.11</scala.binary.version>
 82 | 
 83 |         <!-- test dependency versions -->
 84 |         <!-- junit version 4.12 does not require junit-dep to work with hamcrest matchers. hamcrest-core is now included as a dependency -->
 85 |         <junit.version>4.12</junit.version>
 86 |         <mockito.version>1.10.19</mockito.version>
 87 |         <scalatest.version>2.2.6</scalatest.version>
 88 |         <multiline-string.version>0.1.2</multiline-string.version>
 89 |     </properties>
 90 | 
 91 |     <scm>
 92 |         <connection>scm:git:${scm-connection.url}</connection>
 93 |         <developerConnection>scm:git:${scm-developer-connection.url}</developerConnection>
 94 |     </scm>
 95 | 
 96 |     <repositories>
 97 |         <repository>
 98 |             <id>multiline-release-repo</id>
 99 |             <url>https://raw.github.com/benelog/multiline/master/maven-repository</url>
100 |             <snapshots>
101 |                 <enabled>false</enabled>
102 |             </snapshots>
103 |         </repository>
104 |     </repositories>
105 | 
106 |     <dependencyManagement>
107 |         <dependencies>
108 |             <dependency>
109 |                 <groupId>org.scala-lang</groupId>
110 |                 <artifactId>scala-library</artifactId>
111 |                 <version>${scala.version}</version>
112 |             </dependency>
113 |             <dependency>
114 |                 <groupId>org.scalactic</groupId>
115 |                 <artifactId>scalactic_${scala.binary.version}</artifactId>
116 |                 <version>${scalatest.version}</version>
117 |             </dependency>
118 |             <dependency>
119 |                 <groupId>org.scalatest</groupId>
120 |                 <artifactId>scalatest_${scala.binary.version}</artifactId>
121 |                 <version>${scalatest.version}</version>
122 |             </dependency>
123 |         </dependencies>
124 |     </dependencyManagement>
125 | 
126 |     <dependencies>
127 |         <dependency>
128 |             <groupId>ch.qos.logback</groupId>
129 |             <artifactId>logback-classic</artifactId>
130 |             <version>${logback.version}</version>
131 |         </dependency>
132 |         <dependency>
133 |             <groupId>commons-io</groupId>
134 |             <artifactId>commons-io</artifactId>
135 |             <version>${commons-io.version}</version>
136 |         </dependency>
137 |         <dependency>
138 |             <groupId>com.google.guava</groupId>
139 |             <artifactId>guava</artifactId>
140 |             <version>${guava.version}</version>
141 |         </dependency>
142 |         <!--    uncomment to include the hadoop client
143 |                 <dependency>
144 |                     <groupId>org.apache.hadoop</groupId>
145 |                     <artifactId>hadoop-client</artifactId>
146 |                     <version>${hadoop.version}</version>
147 |                     <exclusions>
148 |                         <exclusion>
149 |                             <groupId>org.slf4j</groupId>
150 |                             <artifactId>slf4j-log4j12</artifactId>
151 |                         </exclusion>
152 |                     </exclusions>
153 |                 </dependency>
154 |         -->
155 |         <dependency>
156 |             <groupId>junit</groupId>
157 |             <artifactId>junit</artifactId>
158 |             <version>${junit.version}</version>
159 |             <scope>test</scope>
160 |         </dependency>
161 |         <dependency>
162 |             <groupId>org.mockito</groupId>
163 |             <artifactId>mockito-all</artifactId>
164 |             <version>${mockito.version}</version>
165 |             <scope>test</scope>
166 |         </dependency>
167 |         <dependency>
168 |             <groupId>org.adrianwalker</groupId>
169 |             <artifactId>multiline-string</artifactId>
170 |             <version>${multiline-string.version}</version>
171 |             <scope>test</scope>
172 |         </dependency>
173 |     </dependencies>
174 | 
175 |     <build>
176 |         <pluginManagement>
177 |             <plugins>
178 |                 <plugin>
179 |                     <groupId>net.alchim31.maven</groupId>
180 |                     <artifactId>scala-maven-plugin</artifactId>
181 |                     <version>${scala-maven-plugin.version}</version>
182 |                     <configuration>
183 |                         <recompileMode>incremental</recompileMode>
184 |                         <useZincServer>true</useZincServer>
185 |                         <javacArgs>
186 |                             <javacArg>-Xlint:unchecked</javacArg>
187 |                             <javacArg>-Xlint:deprecation</javacArg>
188 |                         </javacArgs>
189 |                         <args>
190 |                             <arg>-unchecked</arg>
191 |                             <arg>-deprecation</arg>
192 |                             <arg>-explaintypes</arg>
193 |                             <!-- work-around for https://issues.scala-lang.org/browse/SI-8358 -->
194 |                             <arg>-nobootcp</arg>
195 |                         </args>
196 |                     </configuration>
197 |                     <executions>
198 |                         <execution>
199 |                             <id>scala-compile-first</id>
200 |                             <phase>process-resources</phase>
201 |                             <goals>
202 |                                 <goal>add-source</goal>
203 |                                 <goal>compile</goal>
204 |                             </goals>
205 |                         </execution>
206 |                         <execution>
207 |                             <id>scala-test-compile</id>
208 |                             <phase>process-test-resources</phase>
209 |                             <goals>
210 |                                 <goal>testCompile</goal>
211 |                             </goals>
212 |                         </execution>
213 |                     </executions>
214 |                 </plugin>
215 |                 <plugin>
216 |                     <groupId>org.scalatest</groupId>
217 |                     <artifactId>scalatest-maven-plugin</artifactId>
218 |                     <version>${scalatest-maven-plugin.version}</version>
219 |                     <configuration>
220 |                         <reportsDirectory>${project.build.directory}/surefire-reports</reportsDirectory>
221 |                         <junitxml>.</junitxml>
222 |                         <filereports>WDF TestSuite.txt</filereports>
223 |                     </configuration>
224 |                     <executions>
225 |                         <execution>
226 |                             <id>test</id>
227 |                             <goals>
228 |                                 <goal>test</goal>
229 |                             </goals>
230 |                         </execution>
231 |                     </executions>
232 |                 </plugin>
233 |                 <plugin>
234 |                     <groupId>org.apache.maven.plugins</groupId>
235 |                     <artifactId>maven-shade-plugin</artifactId>
236 |                     <version>${maven-shade-plugin.version}</version>
237 |                     <executions>
238 |                         <execution>
239 |                             <phase>package</phase>
240 |                             <goals>
241 |                                 <goal>shade</goal>
242 |                             </goals>
243 |                         </execution>
244 |                     </executions>
245 |                 </plugin>
246 |             </plugins>
247 |         </pluginManagement>
248 | 
249 |         <plugins>
250 |             <plugin>
251 |                 <groupId>org.apache.maven.plugins</groupId>
252 |                 <artifactId>maven-compiler-plugin</artifactId>
253 |                 <version>${maven-compiler-plugin.version}</version>
254 |                 <configuration>
255 |                     <source>${java-source.version}</source>
256 |                     <target>${java-target.version}</target>
257 |                 </configuration>
258 |             </plugin>
259 |             <plugin>
260 |                 <groupId>org.apache.maven.plugins</groupId>
261 |                 <artifactId>maven-surefire-plugin</artifactId>
262 |                 <version>${maven-surefire-plugin.version}</version>
263 |                 <configuration>
264 |                     <excludes>
265 |                         <exclude>**/*AcceptanceTest*.java</exclude>
266 |                     </excludes>
267 |                 </configuration>
268 |             </plugin>
269 |             <plugin>
270 |                 <groupId>org.apache.maven.plugins</groupId>
271 |                 <artifactId>maven-failsafe-plugin</artifactId>
272 |                 <version>${maven-failsafe-plugin.version}</version>
273 |                 <configuration>
274 |                     <includes>
275 |                         <include>**/*AcceptanceTest*.java</include>
276 |                     </includes>
277 |                 </configuration>
278 |                 <executions>
279 |                     <execution>
280 |                         <goals>
281 |                             <goal>integration-test</goal>
282 |                             <goal>verify</goal>
283 |                         </goals>
284 |                     </execution>
285 |                 </executions>
286 |             </plugin>
287 |             <plugin>
288 |                 <groupId>org.apache.maven.plugins</groupId>
289 |                 <artifactId>maven-source-plugin</artifactId>
290 |                 <version>${maven-source-plugin.version}</version>
291 |                 <executions>
292 |                     <execution>
293 |                         <id>attach-sources</id>
294 |                         <goals>
295 |                             <goal>jar-no-fork</goal>
296 |                         </goals>
297 |                     </execution>
298 |                 </executions>
299 |             </plugin>
300 |             <plugin>
301 |                 <groupId>org.apache.maven.plugins</groupId>
302 |                 <artifactId>maven-release-plugin</artifactId>
303 |                 <version>${maven-release-plugin.version}</version>
304 |                 <configuration>
305 |                     <autoVersionSubmodules>true</autoVersionSubmodules>
306 |                 </configuration>
307 |             </plugin>
308 |         </plugins>
309 |     </build>
310 | 
311 |     <reporting>
312 |         <plugins>
313 |             <plugin>
314 |                 <groupId>org.apache.maven.plugins</groupId>
315 |                 <artifactId>maven-project-info-reports-plugin</artifactId>
316 |                 <version>${maven-project-info-reports-plugin}</version>
317 |                 <configuration>
318 |                     <dependencyLocationsEnabled>false</dependencyLocationsEnabled>
319 |                 </configuration>
320 |             </plugin>
321 |             <plugin>
322 |                 <groupId>org.apache.maven.plugins</groupId>
323 |                 <artifactId>maven-javadoc-plugin</artifactId>
324 |                 <version>${maven-javadoc-plugin.version}</version>
325 |                 <reportSets>
326 |                     <reportSet>
327 |                         <id>default</id>
328 |                         <reports>
329 |                             <report>javadoc</report>
330 |                             <report>test-javadoc</report>
331 |                         </reports>
332 |                     </reportSet>
333 |                     <reportSet>
334 |                         <id>aggregate</id>
335 |                         <reports>
336 |                             <report>aggregate</report>
337 |                         </reports>
338 |                     </reportSet>
339 |                 </reportSets>
340 |             </plugin>
341 |         </plugins>
342 |     </reporting>
343 | 
344 | </project>
345 | 


--------------------------------------------------------------------------------
/preprocessing/README.md:
--------------------------------------------------------------------------------
1 | # Preprocess
2 | 
3 | 


--------------------------------------------------------------------------------
/preprocessing/pom.xml:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0" encoding="UTF-8"?>
 2 | <project xmlns="http://maven.apache.org/POM/4.0.0"
 3 |          xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
 4 |          xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
 5 |     <modelVersion>4.0.0</modelVersion>
 6 | 
 7 |     <parent>
 8 |         <groupId>ocr</groupId>
 9 |         <artifactId>ocr</artifactId>
10 |         <version>1.0-SNAPSHOT</version>
11 |     </parent>
12 | 
13 |     <artifactId>preprocessing</artifactId>
14 |     <name>preprocessing</name>
15 |     <repositories>
16 |         <repository>
17 |             <id>OpenIMAJ maven releases repository</id>
18 |             <url>http://maven.openimaj.org</url>
19 |         </repository>
20 |         <repository>
21 |             <id>OpenIMAJ maven snapshots repository</id>
22 |             <url>http://snapshots.openimaj.org</url>
23 |         </repository>
24 |     </repositories>
25 |     <properties>
26 |         <!-- plugin versions -->
27 |         <!-- main dependency versions -->
28 |         <openimaj.version>1.3.1</openimaj.version>
29 |     </properties>
30 | 
31 |     <dependencies>
32 |         <dependency>
33 |             <groupId>ocr</groupId>
34 |             <artifactId>common</artifactId>
35 |             <version>1.0-SNAPSHOT</version>
36 |         </dependency>
37 |         <dependency>
38 |             <groupId>org.im4java</groupId>
39 |             <artifactId>im4java</artifactId>
40 |             <version>1.4.0</version>
41 |         </dependency>
42 |         <dependency>
43 |             <groupId>commons-cli</groupId>
44 |             <artifactId>commons-cli</artifactId>
45 |             <version>1.3.1</version>
46 |         </dependency>
47 | 
48 |     </dependencies>
49 | 
50 |     <build>
51 |         <plugins>
52 |         </plugins>
53 |     </build>
54 | 
55 | </project>
56 | 


--------------------------------------------------------------------------------
/preprocessing/src/main/java/ocr/preprocessing/conversion/CLIUtils.java:
--------------------------------------------------------------------------------
 1 | package ocr.preprocessing.conversion;
 2 | 
 3 | import java.util.ArrayList;
 4 | import java.util.StringTokenizer;
 5 | 
 6 | public class CLIUtils {
 7 | 
 8 |   /**
 9 |    * Crack a command line.
10 |    * @param toProcess the command line to process.
11 |    * @return the command line broken into strings.
12 |    * An empty or null toProcess parameter results in a zero sized array.
13 |    */
14 |   public static String[] translateCommandline(String toProcess) {
15 |     if (toProcess == null || toProcess.length() == 0) {
16 |       //no command? no string
17 |       return new String[0];
18 |     }
19 |     // parse with a simple finite state machine
20 | 
21 |     final int normal = 0;
22 |     final int inQuote = 1;
23 |     final int inDoubleQuote = 2;
24 |     int state = normal;
25 |     final StringTokenizer tok = new StringTokenizer(toProcess, "\"\' ", true);
26 |     final ArrayList<String> result = new ArrayList<String>();
27 |     final StringBuilder current = new StringBuilder();
28 |     boolean lastTokenHasBeenQuoted = false;
29 | 
30 |     while (tok.hasMoreTokens()) {
31 |       String nextTok = tok.nextToken();
32 |       switch (state) {
33 |         case inQuote:
34 |           if ("\'".equals(nextTok)) {
35 |             lastTokenHasBeenQuoted = true;
36 |             state = normal;
37 |           } else {
38 |             current.append(nextTok);
39 |           }
40 |           break;
41 |         case inDoubleQuote:
42 |           if ("\"".equals(nextTok)) {
43 |             lastTokenHasBeenQuoted = true;
44 |             state = normal;
45 |           } else {
46 |             current.append(nextTok);
47 |           }
48 |           break;
49 |         default:
50 |           if ("\'".equals(nextTok)) {
51 |             state = inQuote;
52 |           } else if ("\"".equals(nextTok)) {
53 |             state = inDoubleQuote;
54 |           } else if (" ".equals(nextTok)) {
55 |             if (lastTokenHasBeenQuoted || current.length() != 0) {
56 |               result.add(current.toString());
57 |               current.setLength(0);
58 |             }
59 |           } else {
60 |             current.append(nextTok);
61 |           }
62 |           lastTokenHasBeenQuoted = false;
63 |           break;
64 |       }
65 |     }
66 |     if (lastTokenHasBeenQuoted || current.length() != 0) {
67 |       result.add(current.toString());
68 |     }
69 |     if (state == inQuote || state == inDoubleQuote) {
70 |       throw new IllegalStateException("unbalanced quotes in " + toProcess);
71 |     }
72 |     return result.toArray(new String[result.size()]);
73 |   }
74 | }
75 | 


--------------------------------------------------------------------------------
/preprocessing/src/main/java/ocr/preprocessing/conversion/CleaningOptions.java:
--------------------------------------------------------------------------------
  1 | package ocr.preprocessing.conversion;
  2 | 
  3 | import com.google.common.base.Joiner;
  4 | import com.google.common.collect.Iterables;
  5 | import com.sun.xml.internal.rngom.ast.builder.BuildException;
  6 | import ocr.preprocessing.conversion.handler.*;
  7 | import org.apache.commons.cli.*;
  8 | 
  9 | import java.io.PrintWriter;
 10 | import java.io.StringWriter;
 11 | import java.util.ArrayList;
 12 | import java.util.Arrays;
 13 | import java.util.EnumMap;
 14 | import java.util.StringTokenizer;
 15 | import java.util.function.Function;
 16 | 
 17 | public enum CleaningOptions {
 18 | 
 19 | 
 20 |    ROTATE("r", s -> Option.builder(s).hasArg()
 21 |                                      .longOpt("rotate")
 22 |                                      .argName("DIRECTION")
 23 |                                      .desc("rotate image 90 degrees in direction specified if aspect ratio does not match layout; options are cw (or clockwise), ccw (or counterclockwise) and n (or none); default=none or no rotation")
 24 |                                      .build()
 25 |          , new RotationHandler()
 26 |          )
 27 |   ,LAYOUT("l", s -> Option.builder(s).hasArg()
 28 |                                      .longOpt("layout")
 29 |                                      .argName("LAYOUT")
 30 |                                      .desc("desired layout; options are p (or portrait) or l (or landscape); default=portrait")
 31 |                                      .build()
 32 |           , new LayoutHandler()
 33 |          )
 34 |   ,GREYSCALE("g", s -> Option.builder(s)
 35 |                                      .longOpt("greyscale")
 36 |                                      .desc("convert document to grayscale before enhancing")
 37 |                                      .build()
 38 |             , new GrayscaleHandler()
 39 |             )
 40 |   ,ENHANCE("e", s -> Option.builder(s).hasArg()
 41 |                                      .longOpt("enhance")
 42 |                                      .argName("TYPE")
 43 |                                      .desc("enhance image brightness before cleaning; choices are: none, stretch or normalize; default=none")
 44 |                                      .build()
 45 |           , new EnhancingHandler()
 46 |           )
 47 |   ,FILTER("f", s -> Option.builder(s).hasArg()
 48 |                                      .longOpt("filtersize")
 49 |                                      .argName("SIZE")
 50 |                                      .type(Integer.class)
 51 |                                      .desc("size of filter used to clean background; integer>0; default=15")
 52 |                                      .build()
 53 |           , new FilterHandler()
 54 |           )
 55 |   ,OFFSET("o", s -> Option.builder(s).hasArg()
 56 |                                      .longOpt("offset")
 57 |                                      .argName("SIZE")
 58 |                                      .type(Integer.class)
 59 |                                      .desc("offset of filter in percent used to reduce noise; integer>=0; default=5")
 60 |                                      .build()
 61 |           , new OffsetHandler()
 62 |           )
 63 |   ,UNROTATE("u", s -> Option.builder(s).longOpt("unrotate")
 64 |                                      .desc("unrotate image; cannot unrotate more than about 5 degrees")
 65 |                                      .build()
 66 |           , new UnrotateHandler()
 67 |           )
 68 |   ,SMOOTHING_THRESHOLD("t", s -> Option.builder(s).hasArg()
 69 |                                      .longOpt("threshold")
 70 |                                      .argName("THRESHOLD")
 71 |                                      .type(Integer.class)
 72 |                                      .desc("text smoothing threshold; 0<=threshold<=100; nominal value is about 50; default is no smoothing")
 73 |                                      .build()
 74 |           , new SmoothingThresholdHandler()
 75 |           )
 76 |   ,SHARPEN("s", s -> Option.builder(s).hasArg()
 77 |                                      .longOpt("sharpamt")
 78 |                                      .argName("NUM_PIXELS")
 79 |                                      .type(Integer.class)
 80 |                                      .desc("sharpening amount in pixels; float>=0; nominal about 1; default=0")
 81 |                                      .build()
 82 |           , new SharpenHandler()
 83 |           )
 84 |   ,SATURATION("S", s -> Option.builder(s).hasArg()
 85 |                                      .longOpt("saturation")
 86 |                                      .argName("SATURATION")
 87 |                                      .type(Integer.class)
 88 |                                      .desc("color saturation expressed as percent; integer>=0; only applicable if -g not set; a value of 100 is no change; default=200 (double saturation)")
 89 |                                      .build()
 90 |           , new SaturationHandler()
 91 |           )
 92 |   ,ADAPTIVE_BLUR("a", s -> Option.builder(s).hasArg()
 93 |                                      .longOpt("adaptiveblur")
 94 |                                      .argName("BLUR_AMOUNT")
 95 |                                      .type(Double.class)
 96 |                                      .desc("alternate text smoothing using adaptive blur; floats>=0; default=0 (no smoothing)")
 97 |                                      .build()
 98 |           , new AdaptiveBlurringHandler()
 99 |           )
100 |   ,TRIM("T", s -> Option.builder(s).longOpt("trim")
101 |                                      .desc("trim background around outer part of image ")
102 |                                      .build()
103 |           , new TrimHandler()
104 |           )
105 |   ,PAD("p", s -> Option.builder(s).hasArg()
106 |                                   .longOpt("padamt")
107 |                                   .argName("BLUR_AMOUNT")
108 |                                   .type(Integer.class)
109 |                                   .desc("border pad amount around outer part of image; integer>=0; default=0")
110 |                                   .build()
111 |       , new PadHandler()
112 |       )
113 |   ;
114 |   String shortCode;
115 |   Option option;
116 |   Handler handler;
117 |   CleaningOptions( String shortCode
118 |                  , Function<String, Option> optionCreator
119 |                  , Handler handler
120 |                  )
121 |   {
122 |     this.shortCode = shortCode;
123 |     this.option = optionCreator.apply(shortCode);
124 |     this.handler = handler;
125 |   }
126 | 
127 |   public boolean has(CommandLine cli) {
128 |     return cli.hasOption(shortCode);
129 |   }
130 | 
131 |   public String get(CommandLine cli) {
132 |     return cli.getOptionValue(shortCode);
133 |   }
134 | 
135 |   public static CommandLine parse(CommandLineParser parser, String[] args) {
136 |     try {
137 |       CommandLine cli = parser.parse(getOptions(), args);
138 |       return cli;
139 |     } catch (ParseException e) {
140 |       System.err.println("Unable to parse args: " + Joiner.on(' ').join(args));
141 |       e.printStackTrace(System.err);
142 |       printHelp();
143 |       System.exit(-1);
144 |       return null;
145 |     }
146 |   }
147 | 
148 |   private static Iterable<Handler> getHandlers() {
149 |     return Iterables.transform(Arrays.asList(values()), x -> x.handler);
150 |   }
151 | 
152 |   public static TextCleaner createTextCleaner(CommandLine cli, String convertCommand, String outputDir) {
153 |     return new TextCleaner(getHandlers(), cli, convertCommand, outputDir);
154 |   }
155 | 
156 |   public static TextCleaner createTextCleaner(CommandLine cli, String convertCommand) {
157 |     return createTextCleaner(cli, convertCommand, null);
158 |   }
159 | 
160 |   public static void printHelp() {
161 |     HelpFormatter formatter = new HelpFormatter();
162 |     formatter.printHelp( "Preprocessor", getOptions());
163 |   }
164 | 
165 |   public static String getUsage() {
166 |     HelpFormatter formatter = new HelpFormatter();
167 |     StringWriter sw = new StringWriter();
168 |     PrintWriter pw = new PrintWriter(sw);
169 |     formatter.printOptions(pw, 80, getOptions(), 0, 0);
170 |     pw.flush();
171 |     return sw.toString();
172 |   }
173 | 
174 |   public static Options getOptions() {
175 |     Options ret = new Options();
176 |     for(CleaningOptions o : CleaningOptions.values()) {
177 |       ret.addOption(o.option);
178 |     }
179 |     return ret;
180 |   }
181 | 
182 | }
183 | 


--------------------------------------------------------------------------------
/preprocessing/src/main/java/ocr/preprocessing/conversion/CommandFailedException.java:
--------------------------------------------------------------------------------
 1 | package ocr.preprocessing.conversion;
 2 | 
 3 | public class CommandFailedException extends Exception {
 4 |   public CommandFailedException(String reason, Throwable t) {
 5 |     super(reason, t);
 6 |   }
 7 |   public CommandFailedException(String reason) {
 8 |     super(reason);
 9 |   }
10 | }
11 | 


--------------------------------------------------------------------------------
/preprocessing/src/main/java/ocr/preprocessing/conversion/Handler.java:
--------------------------------------------------------------------------------
1 | package ocr.preprocessing.conversion;
2 | 
3 | import org.apache.commons.cli.CommandLine;
4 | 
/**
 * Strategy for a single cleaning option: transfers the option's value from a
 * parsed command line onto a {@link TextCleaner} and renders the command
 * fragment for that option.
 */
public interface Handler {
  /**
   * Apply this handler's option to the cleaner. Called once per handler from
   * the {@link TextCleaner} constructor.
   *
   * @param cli     the parsed command line
   * @param cleaner the cleaner to configure
   */
  void set(CommandLine cli, TextCleaner cleaner);

  /**
   * Render the command fragment for this handler's option. The result is
   * spliced into the argument list built by {@code TextCleaner.getCommandLine}.
   *
   * @param aspectRatio as passed by {@code TextCleaner.getCommandLine}: 1 when the
   *                    image's height/width ratio is at least 1, otherwise 0
   * @param cleaner     the cleaner holding the current settings
   * @return the command fragment; NOTE(review): presumably an empty string when
   *         the option contributes nothing; confirm against the implementations
   */
  String getIMCommand(int aspectRatio, TextCleaner cleaner);
}
9 | 


--------------------------------------------------------------------------------
/preprocessing/src/main/java/ocr/preprocessing/conversion/ImageUtils.java:
--------------------------------------------------------------------------------
 1 | package ocr.preprocessing.conversion;
 2 | 
 3 | import javax.imageio.ImageIO;
 4 | import java.awt.image.BufferedImage;
 5 | import java.io.ByteArrayInputStream;
 6 | import java.io.File;
 7 | import java.io.IOException;
 8 | 
 9 | public enum ImageUtils {
10 |   INSTANCE;
11 | 
12 |   public BufferedImage readImage(byte[] inputImage) throws IOException {
13 |     return ImageIO.read(new ByteArrayInputStream(inputImage));
14 |   }
15 |   public BufferedImage readImage(File inputFile) throws IOException {
16 |     return ImageIO.read(inputFile);
17 |   }
18 | 
19 |   public int getHeight(BufferedImage img) {
20 |     return img.getHeight();
21 |   }
22 | 
23 |   public int getWidth(BufferedImage img) {
24 |     return img.getWidth();
25 |   }
26 | 
27 |   public double getAspectRatio(BufferedImage img) {
28 | 
29 |     return (1.0*getHeight(img))/getWidth(img);
30 |   }
31 | 
32 | }
33 | 


--------------------------------------------------------------------------------
/preprocessing/src/main/java/ocr/preprocessing/conversion/TextCleaner.java:
--------------------------------------------------------------------------------
  1 | package ocr.preprocessing.conversion;
  2 | 
  3 | import com.google.common.base.Joiner;
  4 | import ocr.common.Util;
  5 | import org.apache.commons.cli.CommandLine;
  6 | import org.apache.commons.io.IOUtils;
  7 | 
  8 | import java.awt.image.BufferedImage;
  9 | import java.io.File;
 10 | import java.io.FileNotFoundException;
 11 | import java.io.IOException;
 12 | import java.io.InputStream;
 13 | import java.nio.file.Files;
 14 | import java.nio.file.StandardCopyOption;
 15 | import java.util.ArrayList;
 16 | import java.util.EnumMap;
 17 | import java.util.List;
 18 | import java.util.Optional;
 19 | 
 20 | public class TextCleaner {
 21 |   public interface Aliased {
 22 |     String getAlias();
 23 |   }
 24 |   public enum Rotation implements Aliased {
 25 |     CLOCKWISE("cw"), COUNTERCLOCKWISE("ccw"), NONE("none");
 26 |     String alias;
 27 |     Rotation(String alias) {
 28 |       this.alias = alias;
 29 |     }
 30 | 
 31 |     @Override
 32 |     public String getAlias() {
 33 |       return alias;
 34 |     }
 35 | 
 36 |     public static Rotation getDefault() { return NONE;}
 37 |   }
 38 | 
 39 |   public enum Layout implements Aliased {
 40 |     PORTRAIT("portrait"), LANDSCAPE("landscape")
 41 |     ;
 42 |     String alias;
 43 |     Layout(String alias) {
 44 |       this.alias = alias;
 45 |     }
 46 | 
 47 |     @Override
 48 |     public String getAlias() {
 49 |       return alias;
 50 |     }
 51 | 
 52 |     public static Layout getDefault() { return PORTRAIT;}
 53 |   }
 54 |   public enum Enhance implements Aliased {
 55 |     STRETCH("stretch"), NORMALIZE("normalize"), NONE("none")
 56 |     ;
 57 |     String alias;
 58 |     Enhance(String alias) {
 59 |       this.alias = alias;
 60 |     }
 61 | 
 62 |     @Override
 63 |     public String getAlias() {
 64 |       return alias;
 65 |     }
 66 | 
 67 |     public static Enhance getDefault() { return NONE;}
 68 |   }
 69 | 
 70 | 
 71 |   private Rotation rotate = Rotation.getDefault();
 72 |   private Layout layout = Layout.getDefault();
 73 |   private boolean grayscale = false;
 74 |   private Enhance enhance = Enhance.getDefault();
 75 |   private int filterSize = 15;
 76 |   private int offset = 5;
 77 |   private Optional<Integer> threshold = Optional.empty();
 78 |   private int sharpAmt = 0;
 79 |   private int saturation = 200;
 80 |   private int adaptiveBlur = 0;
 81 |   private boolean unrotate = false;
 82 |   private boolean trim = false;
 83 |   private int padAmt = 0;
 84 |   private String bgColor = "white";
 85 |   private Optional<String> convertPath = Optional.empty();
 86 |   private Optional<String> tmpPath = Optional.empty();
 87 |   private Iterable<Handler> handlers;
 88 | 
 89 |   public TextCleaner(Iterable<Handler> handlers
 90 |                     ,CommandLine cli
 91 |                     )
 92 |   {
 93 |     this(handlers, cli, null, null);
 94 |   }
 95 |   public TextCleaner(Iterable<Handler> handlers
 96 |                     ,CommandLine cli
 97 |                     ,String convertPath
 98 |                     ,String tmpPath
 99 |                     )
100 |   {
101 |     if(convertPath != null) {
102 |       this.convertPath = Optional.of(convertPath);
103 |     }
104 |     if(tmpPath != null) {
105 |       this.tmpPath = Optional.of(tmpPath);
106 |     }
107 |     this.handlers = handlers;
108 |     for(Handler h : handlers) {
109 |       h.set(cli, this);
110 |     }
111 |   }
112 | 
113 |   public Rotation getRotate() {
114 |     return rotate;
115 |   }
116 | 
117 |   public void setRotate(Rotation rotate) {
118 |     this.rotate = rotate;
119 |   }
120 | 
121 |   public Layout getLayout() {
122 |     return layout;
123 |   }
124 | 
125 |   public void setLayout(Layout layout) {
126 |     this.layout = layout;
127 |   }
128 | 
129 |   public boolean isGrayscale() {
130 |     return grayscale;
131 |   }
132 | 
133 |   public void setGrayscale(boolean grayscale) {
134 |     this.grayscale = grayscale;
135 |   }
136 | 
137 |   public Enhance getEnhance() {
138 |     return enhance;
139 |   }
140 | 
141 |   public void setEnhance(Enhance enhance) {
142 |     this.enhance = enhance;
143 |   }
144 | 
145 |   public int getFilterSize() {
146 |     return filterSize;
147 |   }
148 | 
149 |   public void setFilterSize(int filterSize) {
150 |     this.filterSize = filterSize;
151 |   }
152 | 
153 |   public int getOffset() {
154 |     return offset;
155 |   }
156 | 
157 |   public void setOffset(int offset) {
158 |     this.offset = offset;
159 |   }
160 | 
161 |   public Optional<Integer> getThreshold() {
162 |     return threshold;
163 |   }
164 | 
165 |   public void setThreshold(Optional<Integer> threshold) {
166 |     this.threshold = threshold;
167 |   }
168 | 
169 |   public int getSharpAmt() {
170 |     return sharpAmt;
171 |   }
172 | 
173 |   public void setSharpAmt(int sharpAmt) {
174 |     this.sharpAmt = sharpAmt;
175 |   }
176 | 
177 |   public int getSaturation() {
178 |     return saturation;
179 |   }
180 | 
181 |   public void setSaturation(int saturation) {
182 |     this.saturation = saturation;
183 |   }
184 | 
185 |   public int getAdaptiveBlur() {
186 |     return adaptiveBlur;
187 |   }
188 | 
189 |   public void setAdaptiveBlur(int adaptiveBlur) {
190 |     this.adaptiveBlur = adaptiveBlur;
191 |   }
192 | 
193 |   public boolean isUnrotate() {
194 |     return unrotate;
195 |   }
196 | 
197 |   public void setUnrotate(boolean unrotate) {
198 |     this.unrotate = unrotate;
199 |   }
200 | 
201 |   public boolean isTrim() {
202 |     return trim;
203 |   }
204 | 
205 |   public void setTrim(boolean trim) {
206 |     this.trim = trim;
207 |   }
208 | 
209 |   public int getPadAmt() {
210 |     return padAmt;
211 |   }
212 | 
213 |   public void setPadAmt(int padAmt) {
214 |     this.padAmt = padAmt;
215 |   }
216 | 
217 |   public String getBgColor() {
218 |     return bgColor;
219 |   }
220 | 
221 |   public void setBgColor(String bgColor) {
222 |     this.bgColor = bgColor;
223 |   }
224 | 
225 |   private synchronized File getTmpOutputFile(String suffix) throws IOException {
226 |     String dottedSuffix = suffix.charAt(0) == '.'?suffix:("." + suffix);
227 |     if(tmpPath.isPresent()) {
228 |       return File.createTempFile("textCleaner", dottedSuffix, new File(tmpPath.get()));
229 |     }
230 |     else {
231 |       return File.createTempFile("textCleaner", dottedSuffix);
232 |     }
233 |   }
234 | 
235 |   public String[] getCommandLine( String inputFile
236 |                               , String outputFile
237 |                               ) throws IOException
238 |   {
239 |     BufferedImage img = ImageUtils.INSTANCE.readImage(new File(inputFile));
240 |     int aspectRatio = 0;
241 |     {
242 |       double a = ImageUtils.INSTANCE.getAspectRatio(img);
243 |       if(a >= 1) {
244 |         aspectRatio = 1;
245 |       }
246 |     }
247 |     EnumMap<CleaningOptions, String> options = getOptions(aspectRatio);
248 |     List<String> ret = new ArrayList<>();
249 |     ret.add("-respect-parenthesis");
250 |     {
251 |       ret.add("'('");
252 |       ret.add(inputFile);
253 |       ret.add(options.get(CleaningOptions.ROTATE));
254 |       ret.add(options.get(CleaningOptions.GREYSCALE));
255 |       ret.add(options.get(CleaningOptions.ENHANCE));
256 |       ret.add("')'");
257 |     }
258 |     {
259 |       ret.add("'('");
260 |       ret.add("-clone 0");
261 |       ret.add("-colorspace gray");
262 |       ret.add("-negate");
263 |       ret.add(options.get(CleaningOptions.FILTER));
264 |       ret.add("-contrast-stretch 0");
265 |       ret.add(options.get(CleaningOptions.SMOOTHING_THRESHOLD));
266 |       ret.add("')'");
267 |     }
268 |     ret.add("-compose copy_opacity");
269 |     ret.add("-composite -fill white");
270 |     ret.add("-opaque none");
271 |     ret.add("-alpha off");
272 |     ret.add(options.get(CleaningOptions.UNROTATE));
273 |     ret.add(options.get(CleaningOptions.SHARPEN));
274 |     ret.add(options.get(CleaningOptions.SATURATION));
275 |     ret.add(options.get(CleaningOptions.ADAPTIVE_BLUR));
276 |     ret.add(options.get(CleaningOptions.TRIM));
277 |     ret.add(options.get(CleaningOptions.PAD));
278 |     ret.add(outputFile);
279 |     return CLIUtils.translateCommandline(Joiner.on(" ").join(ret));
280 |   }
281 | 
282 |   public byte[] convert(InputStream is) throws IOException, CommandFailedException {
283 |     String suffix = ".tiff";
284 |     File file = getTmpOutputFile(suffix);
285 |     try {
286 |       Files.copy(is, file.toPath(), StandardCopyOption.REPLACE_EXISTING);
287 |       return convert(file.getAbsolutePath(), suffix);
288 |     }
289 |     finally {
290 |       if(file != null && file.exists()) {
291 |         file.delete();
292 |       }
293 |     }
294 |   }
295 | 
296 |   public byte[] convert(String inputFile, String suffix) throws IOException, CommandFailedException {
297 |     File outFile = null;
298 |     try {
299 |       outFile = getTmpOutputFile(suffix);
300 |       if(!new File(inputFile).exists()) {
301 |         throw new FileNotFoundException("Unable to find input file: " + inputFile);
302 |       }
303 |       ArrayList<String> completeCommand = new ArrayList<>();
304 |       {
305 |         String command = Util.Locations.CONVERT.find(convertPath).orElseThrow(() -> new IllegalStateException("Bad baby.")).getAbsolutePath();
306 |         completeCommand.add(command);
307 |       }
308 |       for(String s : getCommandLine(inputFile, outFile.getAbsolutePath())) {
309 |         completeCommand.add(s);
310 |       }
311 | 
312 |       Process p = new ProcessBuilder(completeCommand).start();
313 |       if(p.waitFor() != 0) {
314 |         String stderr = Joiner.on("\n").join(IOUtils.readLines(p.getErrorStream()));
315 |         String stdout = Joiner.on("\n").join(IOUtils.readLines(p.getInputStream()));
316 |         throw new CommandFailedException("Unable to execute convert.  Stderr is: " +  stderr + "\nStdout is: " + stdout);
317 |       }
318 |       byte[] ret = Files.readAllBytes(outFile.toPath());
319 |       if(ret.length == 0) {
320 |         File iFile = new File(inputFile);
321 |         throw new IllegalStateException("Wrote out a zero-byte file. Input file was " + iFile.getAbsolutePath() + " (" + Files.readAllBytes(iFile.toPath()) + ")");
322 |       }
323 |       return ret;
324 |     } catch (InterruptedException e) {
325 |       throw new CommandFailedException("Unable to complete process!", e);
326 |     } finally {
327 |       if(outFile != null && outFile.exists()) {
328 |         outFile.delete();
329 |       }
330 |     }
331 |   }
332 | 
333 |   public EnumMap<CleaningOptions, String> getOptions(int aspectRatio) {
334 |     EnumMap<CleaningOptions, String> ret = new EnumMap(CleaningOptions.class);
335 |     for(CleaningOptions op : CleaningOptions.values()) {
336 |       ret.put(op, op.handler.getIMCommand(aspectRatio, this));
337 |     }
338 |     return ret;
339 |   }
340 | 
341 |   public static <T extends Enum<T> & Aliased > T getByAlias(String alias, Enum<T> def) {
342 |     for(Enum<T> r : def.getClass().getEnumConstants()) {
343 |        if(((T)r).getAlias().equals(alias)) {
344 |          return (T)r;
345 |        }
346 |     }
347 |     return (T)def;
348 |   }
349 | }
350 | 


--------------------------------------------------------------------------------
/preprocessing/src/main/java/ocr/preprocessing/conversion/handler/AdaptiveBlurringHandler.java:
--------------------------------------------------------------------------------
 1 | package ocr.preprocessing.conversion.handler;
 2 | 
 3 | import ocr.preprocessing.conversion.CleaningOptions;
 4 | import ocr.preprocessing.conversion.Handler;
 5 | import ocr.preprocessing.conversion.TextCleaner;
 6 | import org.apache.commons.cli.CommandLine;
 7 | 
 8 | public class AdaptiveBlurringHandler implements Handler {
 9 |   @Override
10 |   public void set(CommandLine cli, TextCleaner cleaner) {
11 |     if(CleaningOptions.ADAPTIVE_BLUR.has(cli)) {
12 |       cleaner.setAdaptiveBlur(Integer.parseInt(CleaningOptions.ADAPTIVE_BLUR.get(cli)));
13 |     }
14 |   }
15 | 
16 |   @Override
17 |   public String getIMCommand(int aspectRatio, TextCleaner cleaner) {
18 |     if(cleaner.getAdaptiveBlur() != 0) {
19 |       return "-adaptive-blur " + cleaner.getAdaptiveBlur();
20 |     }
21 |     return "";
22 |   }
23 | }
24 | 


--------------------------------------------------------------------------------
/preprocessing/src/main/java/ocr/preprocessing/conversion/handler/EnhancingHandler.java:
--------------------------------------------------------------------------------
 1 | package ocr.preprocessing.conversion.handler;
 2 | 
 3 | import ocr.preprocessing.conversion.CleaningOptions;
 4 | import ocr.preprocessing.conversion.Handler;
 5 | import ocr.preprocessing.conversion.TextCleaner;
 6 | import org.apache.commons.cli.CommandLine;
 7 | 
 8 | public class EnhancingHandler implements Handler {
 9 | 
10 |   @Override
11 |   public void set(CommandLine cli, TextCleaner cleaner) {
12 |   cleaner.setEnhance(TextCleaner.getByAlias( CleaningOptions.ENHANCE.get(cli)
13 |                                             , TextCleaner.Enhance.getDefault()
14 |                                             )
15 |                      );
16 |   }
17 | 
18 |   @Override
19 |   public String getIMCommand(int aspectRatio, TextCleaner cleaner) {
20 |     if(cleaner.getEnhance() == TextCleaner.Enhance.STRETCH) {
21 |       return "-contrast-stretch 0";
22 |     }
23 |     else if(cleaner.getEnhance() == TextCleaner.Enhance.STRETCH) {
24 |       return "-normalize 0";
25 |     }
26 |     return "";
27 |   }
28 | }
29 | 


--------------------------------------------------------------------------------
/preprocessing/src/main/java/ocr/preprocessing/conversion/handler/FilterHandler.java:
--------------------------------------------------------------------------------
 1 | package ocr.preprocessing.conversion.handler;
 2 | 
 3 | import ocr.preprocessing.conversion.CleaningOptions;
 4 | import ocr.preprocessing.conversion.Handler;
 5 | import ocr.preprocessing.conversion.TextCleaner;
 6 | import org.apache.commons.cli.CommandLine;
 7 | 
 8 | public class FilterHandler implements Handler {
 9 |   @Override
10 |   public void set(CommandLine cli, TextCleaner cleaner) {
11 | 
12 |     if(CleaningOptions.FILTER.has(cli)) {
13 |       cleaner.setFilterSize(Integer.parseInt(CleaningOptions.FILTER.get(cli)));
14 |     }
15 |   }
16 | 
17 |   @Override
18 |   public String getIMCommand(int aspectRatio, TextCleaner cleaner) {
19 |     return "-lat " + cleaner.getFilterSize() + "x" + cleaner.getFilterSize() + "+" + cleaner.getOffset() + "%";
20 |   }
21 | }
22 | 


--------------------------------------------------------------------------------
/preprocessing/src/main/java/ocr/preprocessing/conversion/handler/GrayscaleHandler.java:
--------------------------------------------------------------------------------
 1 | package ocr.preprocessing.conversion.handler;
 2 | 
 3 | import ocr.preprocessing.conversion.CleaningOptions;
 4 | import ocr.preprocessing.conversion.Handler;
 5 | import ocr.preprocessing.conversion.TextCleaner;
 6 | import org.apache.commons.cli.CommandLine;
 7 | 
 8 | public class GrayscaleHandler implements Handler {
 9 |   @Override
10 |   public void set(CommandLine cli, TextCleaner cleaner) {
11 |     cleaner.setGrayscale(CleaningOptions.GREYSCALE.has(cli));
12 |   }
13 | 
14 |   @Override
15 |   public String getIMCommand(int aspectRatio, TextCleaner cleaner) {
16 |     if(cleaner.isGrayscale()) {
17 |       return "-colorspace gray -type grayscale";
18 |     }
19 |     return "";
20 |   }
21 | }
22 | 


--------------------------------------------------------------------------------
/preprocessing/src/main/java/ocr/preprocessing/conversion/handler/LayoutHandler.java:
--------------------------------------------------------------------------------
 1 | package ocr.preprocessing.conversion.handler;
 2 | 
 3 | import ocr.preprocessing.conversion.CleaningOptions;
 4 | import ocr.preprocessing.conversion.Handler;
 5 | import ocr.preprocessing.conversion.TextCleaner;
 6 | import org.apache.commons.cli.CommandLine;
 7 | 
 8 | public class LayoutHandler implements Handler{
 9 |   @Override
10 |   public void set(CommandLine cli, TextCleaner cleaner) {
11 |     cleaner.setLayout(TextCleaner.getByAlias( CleaningOptions.LAYOUT.get(cli)
12 |                                             , TextCleaner.Layout.getDefault()
13 |                                             )
14 |                      );
15 |   }
16 | 
17 |   @Override
18 |   public String getIMCommand(int aspectRatio, TextCleaner cleaner) {
19 |     return "";
20 |   }
21 | }
22 | 


--------------------------------------------------------------------------------
/preprocessing/src/main/java/ocr/preprocessing/conversion/handler/OffsetHandler.java:
--------------------------------------------------------------------------------
 1 | package ocr.preprocessing.conversion.handler;
 2 | 
 3 | import ocr.preprocessing.conversion.CleaningOptions;
 4 | import ocr.preprocessing.conversion.Handler;
 5 | import ocr.preprocessing.conversion.TextCleaner;
 6 | import org.apache.commons.cli.CommandLine;
 7 | 
 8 | public class OffsetHandler implements Handler {
 9 |   @Override
10 |   public void set(CommandLine cli, TextCleaner cleaner) {
11 |    if(CleaningOptions.OFFSET.has(cli)) {
12 |       cleaner.setOffset(Integer.parseInt(CleaningOptions.OFFSET.get(cli)));
13 |     }
14 |   }
15 | 
16 |   @Override
17 |   public String getIMCommand(int aspectRatio, TextCleaner cleaner) {
18 |     return "";
19 |   }
20 | }
21 | 


--------------------------------------------------------------------------------
/preprocessing/src/main/java/ocr/preprocessing/conversion/handler/PadHandler.java:
--------------------------------------------------------------------------------
 1 | package ocr.preprocessing.conversion.handler;
 2 | 
 3 | import ocr.preprocessing.conversion.CleaningOptions;
 4 | import ocr.preprocessing.conversion.Handler;
 5 | import ocr.preprocessing.conversion.TextCleaner;
 6 | import org.apache.commons.cli.CommandLine;
 7 | 
 8 | public class PadHandler implements Handler {
 9 | 
10 |   @Override
11 |   public void set(CommandLine cli, TextCleaner cleaner) {
12 | 
13 |     if(CleaningOptions.PAD.has(cli)) {
14 |       cleaner.setPadAmt(Integer.parseInt(CleaningOptions.PAD.get(cli)));
15 |     }
16 |   }
17 | 
18 |   @Override
19 |   public String getIMCommand(int aspectRatio, TextCleaner cleaner) {
20 |     if(cleaner.getPadAmt() > 0) {
21 |       return "-compose over -bordercolor white -border " + cleaner.getPadAmt();
22 |     }
23 |     return "";
24 |   }
25 | }
26 | 


--------------------------------------------------------------------------------
/preprocessing/src/main/java/ocr/preprocessing/conversion/handler/RotationHandler.java:
--------------------------------------------------------------------------------
 1 | package ocr.preprocessing.conversion.handler;
 2 | 
 3 | import ocr.preprocessing.conversion.CleaningOptions;
 4 | import ocr.preprocessing.conversion.Handler;
 5 | import ocr.preprocessing.conversion.TextCleaner;
 6 | import org.apache.commons.cli.CommandLine;
 7 | 
 8 | public class RotationHandler implements Handler {
 9 | 
10 |   @Override
11 |   public void set(CommandLine cli, TextCleaner cleaner) {
12 |     cleaner.setRotate(TextCleaner.getByAlias( CleaningOptions.ROTATE.get(cli)
13 |                                             , TextCleaner.Rotation.getDefault()
14 |                                             )
15 |                      );
16 |   }
17 | 
18 |   @Override
19 |   public String getIMCommand(int aspectRatio, TextCleaner cleaner) {
20 |     if(cleaner.getLayout() == TextCleaner.Layout.PORTRAIT
21 |             && aspectRatio == 0
22 |             && cleaner.getRotate() == TextCleaner.Rotation.CLOCKWISE
23 |             )
24 |     {
25 |       return "-rotate 90";
26 |     }
27 |     else if(cleaner.getLayout() == TextCleaner.Layout.PORTRAIT
28 |             && aspectRatio == 0
29 |             && cleaner.getRotate() == TextCleaner.Rotation.COUNTERCLOCKWISE
30 |             )
31 |     {
32 |       return "-rotate -90";
33 |     }
34 |     else if(cleaner.getLayout() == TextCleaner.Layout.LANDSCAPE
35 |             && aspectRatio == 1
36 |             && cleaner.getRotate() == TextCleaner.Rotation.CLOCKWISE
37 |             )
38 |     {
39 |       return "-rotate 90";
40 |     }
41 |     else if(cleaner.getLayout() == TextCleaner.Layout.LANDSCAPE
42 |             && aspectRatio == 1
43 |             && cleaner.getRotate() == TextCleaner.Rotation.COUNTERCLOCKWISE
44 |             )
45 |     {
46 |       return "-rotate -90";
47 |     }
48 |     return "";
49 |   }
50 | }
51 | 


--------------------------------------------------------------------------------
/preprocessing/src/main/java/ocr/preprocessing/conversion/handler/SaturationHandler.java:
--------------------------------------------------------------------------------
 1 | package ocr.preprocessing.conversion.handler;
 2 | 
 3 | import ocr.preprocessing.conversion.CleaningOptions;
 4 | import ocr.preprocessing.conversion.Handler;
 5 | import ocr.preprocessing.conversion.TextCleaner;
 6 | import org.apache.commons.cli.CommandLine;
 7 | 
 8 | public class SaturationHandler implements Handler {
 9 |   @Override
10 |   public void set(CommandLine cli, TextCleaner cleaner) {
11 |      if(CleaningOptions.SATURATION.has(cli)) {
12 |       String saturation= CleaningOptions.SATURATION.get(cli);
13 |       cleaner.setSaturation((int)Double.parseDouble(saturation));
14 |     }
15 |     if(CleaningOptions.GREYSCALE.has(cli)) {
16 |       cleaner.setSaturation(100);
17 |     }
18 |   }
19 | 
20 |   @Override
21 |   public String getIMCommand(int aspectRatio, TextCleaner cleaner) {
22 |     if(cleaner.getSaturation() == 100) {
23 |       return "";
24 |     }
25 |     else {
26 |       return "-modulate 100," + cleaner.getSaturation() + ",100";
27 |     }
28 |   }
29 | }
30 | 


--------------------------------------------------------------------------------
/preprocessing/src/main/java/ocr/preprocessing/conversion/handler/SharpenHandler.java:
--------------------------------------------------------------------------------
 1 | package ocr.preprocessing.conversion.handler;
 2 | 
 3 | import ocr.preprocessing.conversion.CleaningOptions;
 4 | import ocr.preprocessing.conversion.Handler;
 5 | import ocr.preprocessing.conversion.TextCleaner;
 6 | import org.apache.commons.cli.CommandLine;
 7 | 
 8 | public class SharpenHandler implements Handler {
 9 |   @Override
10 |   public void set(CommandLine cli, TextCleaner cleaner) {
11 |     if(CleaningOptions.SHARPEN.has(cli)) {
12 |       String sharpenAmt = CleaningOptions.SHARPEN.get(cli);
13 |       cleaner.setSharpAmt((int)Double.parseDouble(sharpenAmt));
14 |     }
15 |   }
16 | 
17 |   @Override
18 |   public String getIMCommand(int aspectRatio, TextCleaner cleaner) {
19 |     if(cleaner.getSharpAmt() > 0) {
20 |       return "-sharpen 0x" + cleaner.getSharpAmt();
21 |     }
22 |     return "";
23 |   }
24 | }
25 | 


--------------------------------------------------------------------------------
/preprocessing/src/main/java/ocr/preprocessing/conversion/handler/SmoothingThresholdHandler.java:
--------------------------------------------------------------------------------
 1 | package ocr.preprocessing.conversion.handler;
 2 | 
 3 | import ocr.preprocessing.conversion.CleaningOptions;
 4 | import ocr.preprocessing.conversion.Handler;
 5 | import ocr.preprocessing.conversion.TextCleaner;
 6 | import org.apache.commons.cli.CommandLine;
 7 | 
 8 | import java.util.Optional;
 9 | 
10 | public class SmoothingThresholdHandler implements Handler {
11 | 
12 | 
13 | 
14 |   @Override
15 |   public void set(CommandLine cli, TextCleaner cleaner) {
16 |     if(CleaningOptions.SMOOTHING_THRESHOLD.has(cli)) {
17 |       cleaner.setThreshold(Optional.of(Integer.parseInt(CleaningOptions.SMOOTHING_THRESHOLD.get(cli))));
18 |     }
19 |   }
20 | 
21 |   @Override
22 |   public String getIMCommand(int aspectRatio, TextCleaner cleaner) {
23 |     if(cleaner.getThreshold().isPresent()) {
24 |       return "-blur 1x65535 -level " + cleaner.getThreshold().get() + "x100%";
25 |     }
26 |     return "";
27 |   }
28 | }
29 | 


--------------------------------------------------------------------------------
/preprocessing/src/main/java/ocr/preprocessing/conversion/handler/TrimHandler.java:
--------------------------------------------------------------------------------
 1 | package ocr.preprocessing.conversion.handler;
 2 | 
 3 | import ocr.preprocessing.conversion.CleaningOptions;
 4 | import ocr.preprocessing.conversion.Handler;
 5 | import ocr.preprocessing.conversion.TextCleaner;
 6 | import org.apache.commons.cli.CommandLine;
 7 | 
 8 | public class TrimHandler implements Handler {
 9 |   @Override
10 |   public void set(CommandLine cli, TextCleaner cleaner) {
11 |     cleaner.setTrim(CleaningOptions.TRIM.has(cli));
12 |   }
13 | 
14 |   @Override
15 |   public String getIMCommand(int aspectRatio, TextCleaner cleaner) {
16 |     if(cleaner.isTrim()) {
17 |       return "-trim +repage";
18 |     }
19 |     return "";
20 |   }
21 | }
22 | 


--------------------------------------------------------------------------------
/preprocessing/src/main/java/ocr/preprocessing/conversion/handler/UnrotateHandler.java:
--------------------------------------------------------------------------------
 1 | package ocr.preprocessing.conversion.handler;
 2 | 
 3 | import ocr.preprocessing.conversion.CleaningOptions;
 4 | import ocr.preprocessing.conversion.Handler;
 5 | import ocr.preprocessing.conversion.TextCleaner;
 6 | import org.apache.commons.cli.CommandLine;
 7 | 
 8 | public class UnrotateHandler implements Handler {
 9 |   @Override
10 |   public void set(CommandLine cli, TextCleaner cleaner) {
11 |     cleaner.setUnrotate(CleaningOptions.UNROTATE.has(cli));
12 |   }
13 | 
14 |   @Override
15 |   public String getIMCommand(int aspectRatio, TextCleaner cleaner) {
16 |     if(cleaner.isUnrotate()) {
17 |       return "-background white -deskew 40%";
18 |     }
19 |     return "";
20 |   }
21 | }
22 | 


--------------------------------------------------------------------------------
/preprocessing/src/main/resources/log4j.properties:
--------------------------------------------------------------------------------
 1 | # Set root logger level to INFO and its only appender to STDOUT.
 2 | log4j.rootLogger=INFO, STDOUT
 3 | 
 4 | # STDOUT is set to be a ConsoleAppender.
 5 | log4j.appender.STDOUT=org.apache.log4j.ConsoleAppender
 6 | 
 7 | # STDOUT uses PatternLayout.
 8 | log4j.appender.STDOUT.layout=org.apache.log4j.PatternLayout
 9 | log4j.appender.STDOUT.layout.ConversionPattern=%-4r [%t] %-5p %c %x - %m %C%n
10 | 


--------------------------------------------------------------------------------
/preprocessing/src/main/resources/logback.xml:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0" encoding="UTF-8"?>
 2 | 
 3 | <!-- For assistance related to logback-translator or configuration  -->
 4 | <!-- files in general, please contact the logback user mailing list -->
 5 | <!-- at http://www.qos.ch/mailman/listinfo/logback-user             -->
 6 | <!--                                                                -->
 7 | <!-- For professional support please see                            -->
 8 | <!--    http://www.qos.ch/shop/products/professionalSupport         -->
 9 | <!--                                                                -->
10 | <configuration>
11 |     <appender name="STDOUT" class="ch.qos.logback.core.ConsoleAppender">
12 |         <encoder>
13 |             <pattern>%d{HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n</pattern>
14 |         </encoder>
15 |     </appender>
16 |     <root level="INFO">
17 |         <appender-ref ref="STDOUT"/>
18 |     </root>
19 | </configuration>
20 | 


--------------------------------------------------------------------------------
/preprocessing/src/test/java/ocr/preprocessing/conversion/TextCleanerTest.java:
--------------------------------------------------------------------------------
 1 | package ocr.preprocessing.conversion;
 2 | 
 3 | import com.google.common.base.Joiner;
 4 | import org.apache.commons.cli.CommandLine;
 5 | import org.apache.commons.cli.CommandLineParser;
 6 | import org.apache.commons.cli.DefaultParser;
 7 | import org.apache.commons.cli.PosixParser;
 8 | import org.junit.Assert;
 9 | import org.junit.Test;
10 | 
11 | import java.awt.image.BufferedImage;
12 | import java.io.File;
13 | import java.io.IOException;
14 | 
15 | public class TextCleanerTest {
16 | 
17 |   @Test
18 |   public void happyPathTest() throws IOException, CommandFailedException {
19 |     String input = "src/test/resources/images/brscan_original_r90.jpg";
20 |     BufferedImage inputImage = ImageUtils.INSTANCE.readImage(new File(input));
21 |     Assert.assertEquals(1024, inputImage.getHeight());
22 |     Assert.assertEquals(768, inputImage.getWidth());
23 |     String output = "src/test/resources/images/brscan_original_r90-out.jpg";
24 |     String args = "-g -e normalize -f 15 -o 10 -u -s 2 -T -p 20";
25 |     DefaultParser parser = new DefaultParser();
26 |     CommandLine cli = CleaningOptions.parse(parser, CLIUtils.translateCommandline(args) );
27 |     TextCleaner cleaner = CleaningOptions.createTextCleaner(cli, null);
28 |     String commandLine = Joiner.on(" ").join(cleaner.getCommandLine(input, output));
29 |     Assert.assertNotNull(commandLine);
30 |     Assert.assertEquals("-respect-parenthesis ( src/test/resources/images/brscan_original_r90.jpg -colorspace gray -type grayscale ) ( -clone 0 -colorspace gray -negate -lat 15x15+10% -contrast-stretch 0 ) -compose copy_opacity -composite -fill white -opaque none -alpha off -background white -deskew 40% -sharpen 0x2 -trim +repage -compose over -bordercolor white -border 20 src/test/resources/images/brscan_original_r90-out.jpg"
31 |                        , commandLine
32 |                        );
33 | 
34 |     byte[] result = cleaner.convert(input, ".jpg");
35 |     Assert.assertTrue(result.length > 0);
36 |     BufferedImage outputImage = ImageUtils.INSTANCE.readImage(result);
37 |     Assert.assertEquals(1074, outputImage.getHeight());
38 |     Assert.assertEquals(812, outputImage.getWidth());
39 |   }
40 | }
41 | 


--------------------------------------------------------------------------------
/preprocessing/src/test/resources/images/abbott2.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mmiklavc/scalable-ocr/9c9e42c4844799c860a3cf344a2d0eb218a6d438/preprocessing/src/test/resources/images/abbott2.jpg


--------------------------------------------------------------------------------
/preprocessing/src/test/resources/images/brscan_original_r90-out.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mmiklavc/scalable-ocr/9c9e42c4844799c860a3cf344a2d0eb218a6d438/preprocessing/src/test/resources/images/brscan_original_r90-out.jpg


--------------------------------------------------------------------------------
/preprocessing/src/test/resources/images/brscan_original_r90.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mmiklavc/scalable-ocr/9c9e42c4844799c860a3cf344a2d0eb218a6d438/preprocessing/src/test/resources/images/brscan_original_r90.jpg


--------------------------------------------------------------------------------
/presentation/README.md:
--------------------------------------------------------------------------------
1 | Presentation files - LaTeX, ppt, images, etc.
2 | 


--------------------------------------------------------------------------------
/presentation/conference-rules.txt:
--------------------------------------------------------------------------------
 1 | PRESENTATION GUIDELINES
 2 | This guide has been designed to help you prepare for your Hadoop Summit presentation. 
 3 | 
 4 | PRESENTATION SUBMISSION REQUESTS
 5 | Your presentation must be uploaded to the speaker website for review by June 17, 2016
 6 | Presentations are to be submitted via uploading into the speaker portal using your unique speaker login credential
 7 | Pack presentations and any embedded video in a folder and deliver the entire folder in a compressed fashion
 8 | Label your file with the day you are scheduled to give your talk, time, company and last name
 9 | For example: June28-1:30pm-company-smith.pptx
10 | Acceptable File Formats Include: PowerPoint for Windows or MAC or Keynote for MAC
11 | 
12 | GENERAL ADVICE/ REQUESTS
13 | The success of your presentation will be a direct result of your preparation. A few notes to keep in mind:
14 | Planning - Structure your presentation, define what the most important messages are and clearly make your point
15 | Researching - Check that your information is accurate
16 | Practice - Rehearse the presentation on your own as well as in front of a friendly audience
17 | Presentations should be created in a 16:9 aspect ratio at a resolution of 1280 x 720
18 | Provide the smallest videos possible to avoid problems with playback
19 | Limit presentation size to 20mb or less to ensure quick loading and smooth playback
20 | Plan on approximately 30 minutes of presentation and 10 minutes of Q&A, for a total session time of (40) minutes
21 | 
22 | PRESENTATION SLIDES
23 | 
24 | TITLE SLIDE
25 | Clearly state your session title as submitted in your abstract
26 | Add the speaker(s) full name and company name
27 | 
28 | TEXT SLIDES
29 | Don't use a long list of first-level bullets
30 | Try to avoid bullets wrapping to next line
31 | Keep bullets succinct; They should be readable, not full sentences
32 | Use standard fonts no smaller than 24 pts.
33 | If your slide looks full of text, it probably is. Split it into two slides.
34 | If using non-standard fonts, you will need to provide those to us in advance so we can ensure your text displays properly. We suggest using standard fonts when and where possible.
35 | 
36 | GRAPHIC SLIDES
37 | Yes, a picture is worth 1,000 words
38 | Font size should be no smaller than 14pt
39 | Don't bold any text in font size smaller than 16pt
40 | Ensure colors have strong contrast as projectors' colors are less bright when projected
41 | If you use lesser known abbreviations or acronyms ensure they are explained
42 | 
43 | LAPTOP COMPUTER NOTES
44 | All presentations will be preloaded and run from the show computer at the AV Tech Table in each room. You will not be allowed to run your presentation from your own laptop, thus the request to submit presentations in advance.
45 | If you wish to run a demo, you must provide your own laptop, which will be connected to projectors from the podium.
46 | If you will be using a Mac laptop, plan on bringing your own dongle to connect to VGA
47 | 
48 | SUPPORT
49 | If you have any questions about your presentation or logistics, please contact Jennifer Beucler at Mosaic Event Management: Jbeucler@mosaicevents.com   
50 | 


--------------------------------------------------------------------------------
/presentation/scalable-ocr-hadoop-summit-2016.pptx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mmiklavc/scalable-ocr/9c9e42c4844799c860a3cf344a2d0eb218a6d438/presentation/scalable-ocr-hadoop-summit-2016.pptx


--------------------------------------------------------------------------------
/presentation/text-detection.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mmiklavc/scalable-ocr/9c9e42c4844799c860a3cf344a2d0eb218a6d438/presentation/text-detection.pdf


--------------------------------------------------------------------------------
/scripts/clinton_email_grabber.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/python
 2 | 
 3 | import requests
 4 | import sys
 5 | import json
 6 | import csv
 7 | 
 8 | 
 9 | def clean_request(req):
10 |     return ",".join([ line for line in req.split(',') if not(line.strip().startswith('\"docDate\"') or line.strip().startswith('\"postedDate\"'))])
11 | 
def total_num_pages(total_num_docs=30322, num_returned=50):
    """Return how many result pages are needed to cover every document.

    The count is rounded up so that a final, partially filled page is
    included, and it is always an ``int``.

    Args:
        total_num_docs: number of documents in the collection.
        num_returned: number of documents returned per page.

    Raises:
        ValueError: if ``num_returned`` is not positive.
    """
    if num_returned <= 0:
        raise ValueError("num_returned must be a positive integer")
    # ceiling division without going through floats
    return -(-total_num_docs // num_returned)
14 | 
def get_url(page_number, total_num_docs=30322, num_returned=50):
    """Fetch one page of search results and return its ``Results`` entry.

    Args:
        page_number: zero-based page index; the request itself uses
            ``page_number + 1`` as the page and
            ``num_returned * page_number`` as the start offset.
        total_num_docs: unused; kept for signature compatibility.
        num_returned: number of documents requested per page.

    Raises:
        requests.HTTPError: if the server answers with an error status.
    """
    start = num_returned*page_number
    url = 'https://foia.state.gov/searchapp/Search/SubmitSimpleQuery?_dc=1466446302048&searchText=*&beginDate=false&endDate=false&collectionMatch=Clinton_Email&postedBeginDate=false&postedEndDate=false&caseNumber=false&page=' + str(page_number+1) + '&start=' + str(start) + '&limit=' + str(num_returned)
    # a timeout keeps one stalled request from hanging the whole crawl
    r = requests.get(url, timeout=60)
    # fail on the HTTP error itself rather than on the unparsable body later
    r.raise_for_status()
    # drop every non-ASCII character before parsing
    req_text_stripped = r.text.encode('ascii', 'ignore').decode('ascii')
    return json.loads(clean_request(req_text_stripped))['Results']
22 | 
def main():
    """Write one CSV row per document to ``metadata.csv``.

    Each row holds the document's ``from`` field and its PDF link, prefixed
    with the search application's base URL.
    """
    # The csv module wants a binary file on Python 2 and a text file opened
    # with newline='' on Python 3.
    if sys.version_info[0] >= 3:
        csvfile = open('metadata.csv', 'w', newline='')
    else:
        csvfile = open('metadata.csv', 'wb')
    with csvfile:
        out = csv.writer(csvfile, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
        # int() keeps range() happy even if the page count comes back as a float
        for i in range(int(total_num_pages())):
            for doc in get_url(i):
                out.writerow([ doc['from'], 'https://foia.state.gov/searchapp/' + doc['pdfLink'] ])


if __name__ == '__main__':
    main()
32 | 


--------------------------------------------------------------------------------