├── .github
    └── workflows
    │   └── mvn-test.yml
├── .gitignore
├── LICENSE
├── README.adoc
├── build.gradle
├── pom.xml
├── settings.gradle
└── src
    ├── main
        └── java
        │   └── pdftable
        │       ├── PdfTableReader.java
        │       ├── PdfTableSettings.java
        │       ├── TableExtractor.java
        │       ├── Utils.java
        │       └── models
        │           └── ParsedTablePage.java
    └── test
        ├── java
            └── pdftable
            │   └── PdfTableReaderTest.java
        └── resources
            └── test_tables.pdf


/.github/workflows/mvn-test.yml:
--------------------------------------------------------------------------------
 1 | # This workflow will build a Java project with Maven
 2 | # For more information see: https://help.github.com/actions/language-and-framework-guides/building-and-testing-java-with-maven
 3 | 
 4 | name: Java CI with Maven
 5 | 
 6 | on:
 7 |   push:
 8 |     branches: [ release/*, master ]
 9 |   pull_request:
10 |     branches: [ master ]
11 | 
12 | jobs:
13 |   build:
14 | 
15 |     runs-on: windows-latest
16 | 
17 |     steps:
18 |     - uses: actions/checkout@v2
19 | 
20 |     - name: Download OpenCV 3.4.2
21 |       run: Invoke-WebRequest -Uri https://github.com/opencv/opencv/releases/download/3.4.2/opencv-3.4.2-vc14_vc15.exe -OutFile $HOME\opencv-3.4.2-vc14_vc15.exe
22 |       shell: powershell
23 | 
24 |     - name: Unpack OpenCV 3.4.2
25 |       run: C:\Users\runneradmin\opencv-3.4.2-vc14_vc15.exe -y -s; while (!(Test-Path "C:\Users\runneradmin\opencv\build\java\x64\opencv_java342.dll")) { Start-Sleep 10 }
26 |       shell: powershell
27 | 
28 |     - name: Add OpenCV to PATH
29 |       run: Write-Host "::add-path::C:\Users\runneradmin\opencv\build\java\x64\"
30 |       shell: powershell
31 | 
32 |     - name: Set up JDK 1.8
33 |       uses: actions/setup-java@v1
34 |       with:
35 |         java-version: 1.8
36 | 
37 |     - name: Build with Maven
38 |       run: mvn -B package --file pom.xml
39 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | target/
2 | build/
3 | *.iml
4 | .idea/
5 | gradle.properties
6 | .gradle/
7 | gradlew*
8 | gradle/


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2017 Rafal Ostrowski
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/README.adoc:
--------------------------------------------------------------------------------
  1 | = PDF-table
  2 | :toc:
  3 | 
  4 | == What is PDF-table?
  5 | PDF-table is a Java utility library that can be used for parsing tabular data in PDF documents. +
  6 | Core processing of PDF documents is performed using *Apache PDFBox* and *OpenCV*.
  7 | 
  8 | == Prerequisites
  9 | 
 10 | === JDK
 11 | 
 12 | Java 8 is required.
 13 | 
 14 | === External dependencies
 15 | 
 16 | pdf-table requires compiled *OpenCV 3.4.2* to work properly:
 17 | 
 18 | . Download OpenCV v3.4.2 from https://github.com/opencv/opencv/releases/tag/3.4.2
 19 | . Unpack it and add to your system PATH:
 20 |     * Windows: `<opencv dir>\build\java\x64`
 21 |     * Linux: `TODO`
 22 | 
 23 | == Installation
 24 | [source, xml]
 25 | ----
 26 | <dependency>
 27 |   <groupId>com.github.rostrovsky</groupId>
 28 |   <artifactId>pdf-table</artifactId>
 29 |   <version>1.0.0</version>
 30 | </dependency>
 31 | ----
 32 | 
 33 | == Usage
 34 | 
 35 | === Parsing PDFs
 36 | When a PDF document page is parsed, the following operations are performed:
 37 | 
 38 | . The page is converted to a grayscale image [OpenCV].
 39 | . A Binary Inverted Threshold (BIT) is applied to the grayscale image [OpenCV].
 40 | . Contours are detected on the BIT image and a contour mask is created (additional Canny filtering can be turned on in this step) [OpenCV].
 41 | . The contour mask is XORed with the BIT image [OpenCV].
 42 | . Contours are detected once again on the XORed image (additional Canny filtering can be turned on in this step) [OpenCV].
 43 | . The final contours are drawn [OpenCV].
 44 | . Bounding rectangles are detected from the final contours [OpenCV].
 45 | . The PDF page is parsed region by region using the bounding rectangles' coordinates [Apache PDFBox].
 46 | 
 47 | The above algorithm is mostly derived from http://stackoverflow.com/a/23106594.
 48 | 
 49 | For more information about the parsed output, refer to <<Output format>>.
 50 | 
 51 | ==== single-threaded example
 52 | [source, java]
 53 | ----
 54 | class SingleThreadParser {
 55 |     public static void main(String[] args) throws IOException {
 56 |         PDDocument pdfDoc = PDDocument.load(new File("some.pdf"));
 57 |         PdfTableReader reader = new PdfTableReader();
 58 |         List<ParsedTablePage> parsed = reader.parsePdfTablePages(pdfDoc, 1, pdfDoc.getNumberOfPages());
 59 |     }
 60 | }
 61 | ----
 62 | 
 63 | ==== multi-threaded example
 64 | [source, java]
 65 | ----
 66 | class MultiThreadParser {
 67 |     public static void main(String[] args) throws IOException {
 68 |         final int THREAD_COUNT = 8;
 69 |         PDDocument pdfDoc = PDDocument.load(new File("some.pdf"));
 70 |         PdfTableReader reader = new PdfTableReader();
 71 | 
 72 |         // parse pages simultaneously
 73 |         ExecutorService executor = Executors.newFixedThreadPool(THREAD_COUNT);
 74 |         List<Future<ParsedTablePage>> futures = new ArrayList<>();
 75 |         for (final int pageNum : IntStream.rangeClosed(1, pdfDoc.getNumberOfPages()).toArray()) {
 76 |             Callable<ParsedTablePage> callable = () -> {
 77 |                 ParsedTablePage page = reader.parsePdfTablePage(pdfDoc, pageNum);
 78 |                 return page;
 79 |             };
 80 |             futures.add(executor.submit(callable));
 81 |         }
 82 | 
 83 |         // collect parsed pages
 84 |         List<ParsedTablePage> unsortedParsedPages = new ArrayList<>(pdfDoc.getNumberOfPages());
 85 |         try {
 86 |             for (Future<ParsedTablePage> f : futures) {
 87 |                 ParsedTablePage page = f.get();
 88 |                 unsortedParsedPages.add(page.getPageNum() - 1, page);
 89 |             }
 90 |         } catch (Exception e) {
 91 |             throw new RuntimeException(e);
 92 |         }
 93 | 
 94 |         // sort pages by pageNum
 95 |         List<ParsedTablePage> sortedParsedPages = unsortedParsedPages.stream()
 96 |                 .sorted((p1, p2) -> Integer.compare(p1.getPageNum(), p2.getPageNum())).collect(Collectors.toList());
 97 |     }
 98 | }
 99 | ----
100 | 
101 | === Saving PDF pages as PNG images
102 | PDF-Table provides methods for saving PDF pages as PNG images. +
103 | Rendering DPI can be modified in `PdfTableSettings` (see: <<Parsing settings>>).
104 | 
105 | ==== single-threaded example
106 | [source, java]
107 | ----
108 | class SingleThreadPNGDump {
109 |     public static void main(String[] args) throws IOException {
110 |         PDDocument pdfDoc = PDDocument.load(new File("some.pdf"));
111 |         Path outputPath = Paths.get("C:", "some_directory");
112 |         PdfTableReader reader = new PdfTableReader();
113 |         reader.savePdfPagesAsPNG(pdfDoc, 1, pdfDoc.getNumberOfPages(), outputPath);
114 |     }
115 | }
116 | ----
117 | 
118 | ==== multi-threaded example
119 | [source, java]
120 | ----
121 | class MultiThreadPNGDump {
122 |     public static void main(String[] args) throws IOException {
123 |         final int THREAD_COUNT = 8;
124 |         Path outputPath = Paths.get("C:", "some_directory");
125 |         PDDocument pdfDoc = PDDocument.load(new File("some.pdf"));
126 |         PdfTableReader reader = new PdfTableReader();
127 | 
128 |         ExecutorService executor = Executors.newFixedThreadPool(THREAD_COUNT);
129 |         List<Future<Boolean>> futures = new ArrayList<>();
130 |         for (final int pageNum : IntStream.rangeClosed(1, pdfDoc.getNumberOfPages()).toArray()) {
131 |             Callable<Boolean> callable = () -> {
132 |                 reader.savePdfPageAsPNG(pdfDoc, pageNum, outputPath);
133 |                 return true;
134 |             };
135 |             futures.add(executor.submit(callable));
136 |         }
137 | 
138 |         try {
139 |             for (Future<Boolean> f : futures) {
140 |                 f.get();
141 |             }
142 |         } catch (Exception e) {
143 |             throw new RuntimeException(e);
144 |         }
145 |     }
146 | }
147 | ----
148 | 
149 | === Saving debug PNG images
150 | When tables in a PDF document cannot be parsed correctly with the default settings, the user can save debug images
151 | that show the page at various stages of processing. +
152 | Using these images, the user can adjust `PdfTableSettings` accordingly to achieve the desired results
153 | (see: <<Parsing settings>>).
154 | 
155 | ==== single-threaded example
156 | [source, java]
157 | ----
158 | class SingleThreadDebugImgsDump {
159 |     public static void main(String[] args) throws IOException {
160 |         PDDocument pdfDoc = PDDocument.load(new File("some.pdf"));
161 |         Path outputPath = Paths.get("C:", "some_directory");
162 |         PdfTableReader reader = new PdfTableReader();
163 |         reader.savePdfTablePagesDebugImages(pdfDoc, 1, pdfDoc.getNumberOfPages(), outputPath);
164 |     }
165 | }
166 | ----
167 | 
168 | ==== multi-threaded example
169 | [source, java]
170 | ----
171 | class MultiThreadDebugImgsDump {
172 |     public static void main(String[] args) throws IOException {
173 |         final int THREAD_COUNT = 8;
174 |         Path outputPath = Paths.get("C:", "some_directory");
175 |         PDDocument pdfDoc = PDDocument.load(new File("some.pdf"));
176 |         PdfTableReader reader = new PdfTableReader();
177 | 
178 |         ExecutorService executor = Executors.newFixedThreadPool(THREAD_COUNT);
179 |         List<Future<Boolean>> futures = new ArrayList<>();
180 |         for (final int pageNum : IntStream.rangeClosed(1, pdfDoc.getNumberOfPages()).toArray()) {
181 |             Callable<Boolean> callable = () -> {
182 |                 reader.savePdfTablePageDebugImage(pdfDoc, pageNum, outputPath);
183 |                 return true;
184 |             };
185 |             futures.add(executor.submit(callable));
186 |         }
187 | 
188 |         try {
189 |             for (Future<Boolean> f : futures) {
190 |                 f.get();
191 |             }
192 |         } catch (Exception e) {
193 |             throw new RuntimeException(e);
194 |         }
195 |     }
196 | }
197 | ----
198 | 
199 | === Parsing settings
200 | 
201 | PDF rendering and OpenCV filtering settings are stored in `PdfTableSettings` object.
202 | 
203 | A custom settings instance can be passed to the `PdfTableReader` constructor when non-default values are needed:
204 | 
205 | [source, java]
206 | ----
207 | (...)
208 | 
209 | // build settings object
210 | PdfTableSettings settings = PdfTableSettings.getBuilder()
211 |                 .setCannyFiltering(true)
212 |                 .setCannyApertureSize(5)
213 |                 .setCannyThreshold1(40)
214 |                 .setCannyThreshold2(190.5)
215 |                 .setPdfRenderingDpi(160)
216 |                 .build();
217 | 
218 | // pass settings to reader
219 | PdfTableReader reader = new PdfTableReader(settings);
220 | ----
221 | 
222 | 
223 | === Output format
224 | Each parsed PDF page is returned as a `ParsedTablePage` object:
225 | [source, java]
226 | ----
227 | (...)
228 | 
229 | PDDocument pdfDoc = PDDocument.load(new File("some.pdf"));
230 | PdfTableReader reader = new PdfTableReader();
231 | 
232 | // first page in document has index == 1, not 0 !
233 | ParsedTablePage firstPage = reader.parsePdfTablePage(pdfDoc, 1);
234 | 
235 | // getting page number
236 | assert firstPage.getPageNum() == 1;
237 | 
238 | // rows and cells are zero-indexed just like elements of the List
239 | // getting first row
240 | ParsedTablePage.ParsedTableRow firstRow = firstPage.getRow(0);
241 | 
242 | // getting third cell in second row
243 | String thirdCellContent = firstPage.getRow(1).getCell(2);
244 | 
245 | // cell content usually contains <CR><LF> characters,
246 | // so it is recommended to trim them before processing
247 | double thirdCellNumericValue = Double.valueOf(thirdCellContent.trim());
248 | ----
249 | 


--------------------------------------------------------------------------------
/build.gradle:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * This file was generated by the Gradle 'init' task.
  3 |  */
  4 | 
  5 | plugins {
  6 |     id 'java'
  7 |     id 'maven-publish'
  8 |     id 'maven'
  9 |     id 'signing'
 10 | }
 11 | 
 12 | repositories {
 13 |     mavenLocal()
 14 |     maven {
 15 |         url = uri('https://repo.maven.apache.org/maven2')
 16 |     }
 17 | }
 18 | 
 19 | dependencies {
 20 |     implementation 'org.apache.pdfbox:pdfbox:2.0.19'
 21 |     implementation 'org.apache.pdfbox:pdfbox-tools:2.0.19'
 22 |     implementation 'org.apache.commons:commons-lang3:3.5'
 23 |     implementation 'org.openpnp:opencv:3.4.2-2'
 24 |     testImplementation 'org.testng:testng:7.1.0'
 25 | }
 26 | 
 27 | group = 'com.github.rostrovsky'
 28 | archivesBaseName = "pdf-table"
 29 | version = '1.0.0'
 30 | sourceCompatibility = '1.8'
 31 | targetCompatibility = '1.8'
 32 | 
 33 | publishing {
 34 |     publications {
 35 |         maven(MavenPublication) {
 36 |             from(components.java)
 37 |         }
 38 |     }
 39 | }
 40 | 
 41 | test {
 42 |     useTestNG()
 43 | }
 44 | 
 45 | task javadocJar(type: Jar) {
 46 |     classifier = 'javadoc'
 47 |     from javadoc
 48 | }
 49 | 
 50 | task sourcesJar(type: Jar) {
 51 |     classifier = 'sources'
 52 |     from sourceSets.main.allSource
 53 | }
 54 | 
 55 | artifacts {
 56 |     archives javadocJar, sourcesJar
 57 | }
 58 | 
 59 | signing {
 60 |     sign configurations.archives
 61 | }
 62 | 
 63 | uploadArchives {
 64 |     repositories {
 65 |         mavenDeployer {
 66 |             beforeDeployment { MavenDeployment deployment -> signing.signPom(deployment) }
 67 | 
 68 |             repository(url: "https://oss.sonatype.org/service/local/staging/deploy/maven2/") {
 69 |                 authentication(userName: ossrhUsername, password: ossrhPassword)
 70 |             }
 71 | 
 72 |             snapshotRepository(url: "https://oss.sonatype.org/content/repositories/snapshots/") {
 73 |                 authentication(userName: ossrhUsername, password: ossrhPassword)
 74 |             }
 75 | 
 76 |             pom.project {
 77 |                 name 'PDF-Table'
 78 |                 packaging 'jar'
 79 |                 // optionally artifactId can be defined here
 80 |                 description 'PDF-table is Java utility library that can be used for parsing tabular data in PDF documents.\n' +
 81 |                         'Core processing of PDF documents is performed with utilization of Apache PDFBox and OpenCV.'
 82 |                 url 'https://github.com/rostrovsky/pdf-table'
 83 | 
 84 |                 scm {
 85 |                     connection 'scm:git:git://github.com/rostrovsky/pdf-table.git'
 86 |                     developerConnection 'scm:git:git@github.com:rostrovsky/pdf-table.git'
 87 |                     url 'https://github.com/rostrovsky/pdf-table'
 88 |                 }
 89 | 
 90 |                 licenses {
 91 |                     license {
 92 |                         name 'MIT License'
 93 |                         url 'https://github.com/rostrovsky/pdf-table/LICENSE'
 94 |                     }
 95 |                 }
 96 | 
 97 |                 developers {
 98 |                     developer {
 99 |                         id 'rostrovsky'
100 |                         name 'Rafal Ostrowski'
101 |                         email ''
102 |                     }
103 |                 }
104 |             }
105 |         }
106 |     }
107 | }


--------------------------------------------------------------------------------
/pom.xml:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0" encoding="UTF-8"?>
 2 | <project xmlns="http://maven.apache.org/POM/4.0.0"
 3 |          xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
 4 |          xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
 5 |     <modelVersion>4.0.0</modelVersion>
 6 | 
 7 |     <properties>
 8 |         <maven.compiler.source>1.8</maven.compiler.source>
 9 |         <maven.compiler.target>1.8</maven.compiler.target>
10 |     </properties>
11 | 
12 |     <groupId>com.github.rostrovsky</groupId>
13 |     <artifactId>pdf-table</artifactId>
14 |     <version>1.0.0</version>
15 |     <dependencies>
16 |         <dependency>
17 |             <groupId>org.apache.pdfbox</groupId>
18 |             <artifactId>pdfbox</artifactId>
19 |             <version>2.0.24</version>
20 |         </dependency>
21 |         <dependency>
22 |             <groupId>org.apache.pdfbox</groupId>
23 |             <artifactId>pdfbox-tools</artifactId>
24 |             <version>2.0.19</version>
25 |         </dependency>
26 |         <dependency>
27 |             <groupId>org.apache.commons</groupId>
28 |             <artifactId>commons-lang3</artifactId>
29 |             <version>3.5</version>
30 |         </dependency>
31 |         <dependency>
32 |             <groupId>org.openpnp</groupId>
33 |             <artifactId>opencv</artifactId>
34 |             <version>3.4.2-2</version>
35 |         </dependency>
36 |         <dependency>
37 |             <groupId>org.testng</groupId>
38 |             <artifactId>testng</artifactId>
39 |             <version>7.1.0</version>
40 |             <scope>test</scope>
41 |         </dependency>
42 |     </dependencies>
43 | </project>
44 | 


--------------------------------------------------------------------------------
/settings.gradle:
--------------------------------------------------------------------------------
1 | /*
2 |  * This file was generated by the Gradle 'init' task.
3 |  */
4 | 
5 | rootProject.name = 'pdf-table'
6 | 


--------------------------------------------------------------------------------
/src/main/java/pdftable/PdfTableReader.java:
--------------------------------------------------------------------------------
  1 | package pdftable;
  2 | 
  3 | 
  4 | import org.apache.pdfbox.pdmodel.PDDocument;
  5 | import org.apache.pdfbox.pdmodel.PDPage;
  6 | import org.apache.pdfbox.rendering.ImageType;
  7 | import org.apache.pdfbox.rendering.PDFRenderer;
  8 | import org.apache.pdfbox.text.PDFTextStripperByArea;
  9 | import org.apache.pdfbox.tools.imageio.ImageIOUtil;
 10 | import org.opencv.core.Core;
 11 | import org.opencv.core.Rect;
 12 | import pdftable.models.ParsedTablePage;
 13 | 
 14 | import java.awt.*;
 15 | import java.awt.image.BufferedImage;
 16 | import java.io.IOException;
 17 | import java.nio.file.Path;
 18 | import java.nio.file.Paths;
 19 | import java.util.ArrayList;
 20 | import java.util.List;
 21 | import java.util.stream.Collectors;
 22 | 
 23 | import static pdftable.Utils.bufferedImage2GrayscaleMat;
 24 | 
 25 | 
 26 | public class PdfTableReader {
 27 | 
 28 |     private TableExtractor extractor;
 29 |     private PdfTableSettings settings;
 30 | 
 31 |     static {
 32 |         System.loadLibrary(Core.NATIVE_LIBRARY_NAME);
 33 |     }
 34 | 
 35 |     public PdfTableReader(PdfTableSettings settings) {
 36 |         this.settings = settings;
 37 |         this.extractor = new TableExtractor(settings);
 38 |     }
 39 | 
 40 |     public PdfTableReader() {
 41 |         this(new PdfTableSettings());
 42 |     }
 43 | 
 44 |     /**
 45 |      * Renders PDF page with DPI specified in settings and saves it in specified directory.
 46 |      *
 47 |      * @param renderer  PDF renderer instance
 48 |      * @param page      page number
 49 |      * @param outputDir output directory
 50 |      * @throws IOException
 51 |      */
 52 |     private void savePdfPageAsPNG(PDFRenderer renderer, int page, Path outputDir) throws IOException {
 53 |         BufferedImage bim;
 54 |         synchronized (this) {
 55 |             bim = renderer.renderImageWithDPI(page, settings.getPdfRenderingDpi(), ImageType.RGB);
 56 |         }
 57 |         Path outPath = outputDir.resolve(Paths.get("page_" + (page + 1) + ".png"));
 58 |         ImageIOUtil.writeImage(bim, outPath.toString(), settings.getPdfRenderingDpi());
 59 | 
 60 |     }
 61 | 
 62 |     /**
 63 |      * Renders PDF pages range with DPI specified in settings and saves images in specified directory.
 64 |      *
 65 |      * @param document  PDF document instance
 66 |      * @param startPage first page in range (first page == 1)
 67 |      * @param endPage   last page in range
 68 |      * @param outputDir output directory
 69 |      * @throws IOException
 70 |      */
 71 |     public void savePdfPagesAsPNG(PDDocument document, int startPage, int endPage, Path outputDir) throws IOException {
 72 |         PDFRenderer pdfRenderer = new PDFRenderer(document);
 73 |         for (int page = startPage - 1; page < endPage; ++page) {
 74 |             savePdfPageAsPNG(pdfRenderer, page, outputDir);
 75 |         }
 76 |     }
 77 | 
 78 |     /**
 79 |      * Renders single PDF page with DPI specified in settings and saves image in specified directory.
 80 |      *
 81 |      * @param document  PDF document instance
 82 |      * @param page      page number (first page == 1)
 83 |      * @param outputDir output directory
 84 |      * @throws IOException
 85 |      */
 86 |     public void savePdfPageAsPNG(PDDocument document, int page, Path outputDir) throws IOException {
 87 |         savePdfPagesAsPNG(document, page, page, outputDir);
 88 |     }
 89 | 
 90 |     /**
 91 |      * Parses single PDF page and returns list of rows containing cell texts.
 92 |      *
 93 |      * @param bi     PDF page in image format
 94 |      * @param pdPage PDF page in PDPage format
 95 |      * @return parsed page
 96 |      * @throws IOException
 97 |      */
 98 |     private ParsedTablePage parsePdfTablePage(BufferedImage bi, PDPage pdPage, int pageNumber) throws IOException {
 99 |         List<Rect> rectangles = extractor.getTableBoundingRectangles(bufferedImage2GrayscaleMat(bi));
100 |         return parsePageByRectangles(pdPage, rectangles, pageNumber);
101 |     }
102 | 
103 |     /**
104 |      * Parses range of PDF pages and returns list of lists of rows containing cell texts.
105 |      *
106 |      * @param document  PDF document instance
107 |      * @param startPage first page in range to parse (first page == 1)
108 |      * @param endPage   last page in range
109 |      * @return List of pages
110 |      * @throws IOException
111 |      */
112 |     public List<ParsedTablePage> parsePdfTablePages(PDDocument document, int startPage, int endPage) throws IOException {
113 |         List<ParsedTablePage> out = new ArrayList<>();
114 |         PDFRenderer renderer = new PDFRenderer(document);
115 |         for (int page = startPage - 1; page < endPage; ++page) {
116 |             BufferedImage bi;
117 |             synchronized (this) {
118 |                 bi = renderer.renderImageWithDPI(page, settings.getPdfRenderingDpi(), ImageType.RGB);
119 |             }
120 |             ParsedTablePage parsedTablePage = parsePdfTablePage(bi, document.getPage(page), page + 1);
121 |             out.add(parsedTablePage);
122 |         }
123 |         return out;
124 |     }
125 | 
126 |     /**
127 |      * Parses single PDF page and returns list of rows containing cell texts.
128 |      *
129 |      * @param document PDF document instance
130 |      * @param page     number of page to parse (first page == 1)
131 |      * @return parsed page
132 |      * @throws IOException
133 |      */
134 |     public ParsedTablePage parsePdfTablePage(PDDocument document, int page) throws IOException {
135 |         return parsePdfTablePages(document, page, page).get(0);
136 |     }
137 | 
138 |     /**
139 |      * Saves debug images of PDF pages from specified range and saves them in specified directory.
140 |      *
141 |      * @param document  PDF document instance
142 |      * @param startPage first page in range to process (first page == 1)
143 |      * @param endPage   last page in range
144 |      * @param outputDir destination directory
145 |      * @throws IOException
146 |      */
147 |     public void savePdfTablePagesDebugImages(PDDocument document, int startPage, int endPage, Path outputDir) throws IOException {
148 |         TableExtractor debugExtractor = new TableExtractor(settings);
149 |         PDFRenderer renderer = new PDFRenderer(document);
150 |         for (int page = startPage - 1; page < endPage; ++page) {
151 |             PdfTableSettings debugSettings = PdfTableSettings.getBuilder()
152 |                     .setDebugImages(true)
153 |                     .setDebugFileOutputDir(outputDir)
154 |                     .setDebugFilename("page_" + (page + 1))
155 |                     .build();
156 |             debugExtractor.setSettings(debugSettings);
157 |             BufferedImage bi;
158 |             synchronized (this) {
159 |                 bi = renderer.renderImageWithDPI(page, settings.getPdfRenderingDpi(), ImageType.RGB);
160 |             }
161 |             debugExtractor.getTableBoundingRectangles(bufferedImage2GrayscaleMat(bi));
162 |         }
163 |     }
164 | 
165 |     /**
166 |      * Saves debug images of PDF page and saves them in specified directory.
167 |      *
168 |      * @param document  PDF document instance
169 |      * @param page      page to process (first page == 1)
170 |      * @param outputDir destination directory
171 |      * @throws IOException
172 |      */
173 |     public void savePdfTablePageDebugImage(PDDocument document, int page, Path outputDir) throws IOException {
174 |         savePdfTablePagesDebugImages(document, page, page, outputDir);
175 |     }
176 | 
177 |     /**
178 |      * Parses PDF page cell by cell using rectangles obtained from TableExtractor.
179 |      *
180 |      * @param page       PDF page
181 |      * @param rectangles list of OpenCV rectangles recognized by TableExtractor
182 |      * @return parsed page
183 |      * @throws IOException
184 |      */
185 |     private ParsedTablePage parsePageByRectangles(PDPage page, List<Rect> rectangles, int pageNumber) throws IOException {
186 |         List<List<Rect>> sortedRects = groupRectanglesByRow(rectangles);
187 |         ParsedTablePage out = new ParsedTablePage(pageNumber);
188 | 
189 |         PDFTextStripperByArea stripper = new PDFTextStripperByArea();
190 |         stripper.setSortByPosition(true);
191 | 
192 |         int iRow = 0;
193 |         int iCol = 0;
194 |         for (List<Rect> row : sortedRects) {
195 |             for (Rect col : row) {
196 |                 Rectangle r = new Rectangle(
197 |                         (int) (col.x * settings.getDpiRatio()),
198 |                         (int) (col.y * settings.getDpiRatio()),
199 |                         (int) (col.width * settings.getDpiRatio()),
200 |                         (int) (col.height * settings.getDpiRatio())
201 |                 );
202 |                 stripper.addRegion(getRegionId(iRow, iCol), r);
203 |                 iCol++;
204 |             }
205 |             iRow++;
206 |             iCol = 0;
207 |         }
208 | 
209 |         stripper.extractRegions(page);
210 | 
211 |         iRow = 0;
212 |         iCol = 0;
213 |         for (List<Rect> row : sortedRects) {
214 |             List<String> rowCells = new ArrayList<>();
215 |             for (Rect col : row) {
216 |                 String cellText = stripper.getTextForRegion(getRegionId(iRow, iCol));
217 |                 rowCells.add(cellText);
218 |                 iCol++;
219 |             }
220 |             out.addRow(rowCells);
221 |             iRow++;
222 |             iCol = 0;
223 |         }
224 | 
225 |         return out;
226 |     }
227 | 
228 |     /**
229 |      * Groups rectangles by y coordinate effectively grouping them into rows.
230 |      *
231 |      * @param rectangles list of OpenCV Rectangles
232 |      * @return list of Rectangle lists representing table rows.
233 |      */
234 |     private List<List<Rect>> groupRectanglesByRow(List<Rect> rectangles) {
235 |         List<List<Rect>> out = new ArrayList<>();
236 |         List<Integer> rowsCoords = rectangles.stream().map(r -> r.y).distinct().collect(Collectors.toList());
237 |         for (int rowCoords : rowsCoords) {
238 |             List<Rect> cols = rectangles.stream().filter(r -> r.y == rowCoords).collect(Collectors.toList());
239 |             out.add(cols);
240 |         }
241 |         return out;
242 |     }
243 | 
244 |     /**
245 |      * Static helper for creating row/column markers.
246 |      *
247 |      * @param row table row
248 |      * @param col table column
249 |      * @return marker with row & column number
250 |      */
251 |     private static String getRegionId(int row, int col) {
252 |         return String.format("r%dc%d", row, col);
253 |     }
254 | 
255 | }
256 | 


--------------------------------------------------------------------------------
/src/main/java/pdftable/PdfTableSettings.java:
--------------------------------------------------------------------------------
  1 | package pdftable;
  2 | 
  3 | 
  4 | import java.nio.file.Path;
  5 | 
  6 | /**
  7 |  * Image conversion settings.
  8 |  */
  9 | public class PdfTableSettings {
 10 | 
 11 |     public static class PdfTableSettingsBuilder {
 12 | 
 13 |         // --------------
 14 |         // DEFAULT VALUES
 15 |         // --------------
 16 | 
 17 |         // DPI SETTINGS
 18 |         private static final int DEFAULT_PDF_DPI = 72;
 19 |         private int pdfRenderingDpi = 120;
 20 | 
 21 |         // CANNY EDGE DETECTION FLAG
 22 |         private boolean cannyFiltering = false;
 23 | 
 24 |         // BINARY INVERTED THRESHOLD SETTINGS
 25 |         private double bitThreshold = 150;
 26 |         private double bitMaxVal = 255;
 27 | 
 28 |         // CANNY FILTER SETTINGS
 29 |         private double cannyThreshold1 = 50;
 30 |         private double cannyThreshold2 = 200;
 31 |         private int cannyApertureSize = 3;
 32 |         private boolean cannyL2Gradient = false;
 33 | 
 34 |         // BOUNDING RECT PARAMS
 35 |         private double approxDistScaleFactor = 0.02;
 36 | 
 37 |         // DEBUG IMAGES PARAMS
 38 |         private boolean debugImages = false;
 39 |         private Path debugFileOutputDir;
 40 |         private String debugFilename;
 41 | 
 42 |         public PdfTableSettingsBuilder setPdfRenderingDpi(int pdfRenderingDpi) {
 43 |             this.pdfRenderingDpi = pdfRenderingDpi;
 44 |             return this;
 45 |         }
 46 | 
 47 |         public PdfTableSettingsBuilder setCannyFiltering(boolean cannyFiltering) {
 48 |             this.cannyFiltering = cannyFiltering;
 49 |             return this;
 50 |         }
 51 | 
 52 |         public PdfTableSettingsBuilder setBitThreshold(double bitThreshold) {
 53 |             this.bitThreshold = bitThreshold;
 54 |             return this;
 55 |         }
 56 | 
 57 |         public PdfTableSettingsBuilder setBitMaxVal(double bitMaxVal) {
 58 |             this.bitMaxVal = bitMaxVal;
 59 |             return this;
 60 |         }
 61 | 
 62 |         public PdfTableSettingsBuilder setCannyThreshold1(double cannyThreshold1) {
 63 |             this.cannyThreshold1 = cannyThreshold1;
 64 |             return this;
 65 |         }
 66 | 
 67 |         public PdfTableSettingsBuilder setCannyThreshold2(double cannyThreshold2) {
 68 |             this.cannyThreshold2 = cannyThreshold2;
 69 |             return this;
 70 |         }
 71 | 
 72 |         public PdfTableSettingsBuilder setCannyApertureSize(int cannyApertureSize) {
 73 |             this.cannyApertureSize = cannyApertureSize;
 74 |             return this;
 75 |         }
 76 | 
 77 |         public PdfTableSettingsBuilder setCannyL2Gradient(boolean cannyL2Gradient) {
 78 |             this.cannyL2Gradient = cannyL2Gradient;
 79 |             return this;
 80 |         }
 81 | 
 82 |         public PdfTableSettingsBuilder setApproxDistScaleFactor(double approxDistScaleFactor) {
 83 |             this.approxDistScaleFactor = approxDistScaleFactor;
 84 |             return this;
 85 |         }
 86 | 
 87 |         public PdfTableSettingsBuilder setDebugImages(boolean debugImages) {
 88 |             this.debugImages = debugImages;
 89 |             return this;
 90 |         }
 91 | 
 92 |         public PdfTableSettingsBuilder setDebugFileOutputDir(Path debugFileOutputDir) {
 93 |             this.debugFileOutputDir = debugFileOutputDir;
 94 |             return this;
 95 |         }
 96 | 
 97 |         public PdfTableSettingsBuilder setDebugFilename(String debugFilename) {
 98 |             this.debugFilename = debugFilename;
 99 |             return this;
100 |         }
101 | 
102 |         public PdfTableSettings build() {
103 |             return new PdfTableSettings(this);
104 |         }
105 |     }
106 | 
107 |     // DPI SETTINGS
108 |     private int defaultPdfDpi;
109 |     private int pdfRenderingDpi;
110 | 
111 |     // CANNY EDGE DETECTION FLAG
112 |     private boolean cannyFiltering;
113 | 
114 |     // BINARY INVERTED THRESHOLD SETTINGS
115 |     private double bitThreshold;
116 |     private double bitMaxVal;
117 | 
118 |     // CANNY FILTER SETTINGS
119 |     private double cannyThreshold1;
120 |     private double cannyThreshold2;
121 |     private int cannyApertureSize;
122 |     private boolean cannyL2Gradient;
123 | 
124 |     // BOUNDING RECT PARAMS
125 |     private double approxDistScaleFactor;
126 | 
127 |     // DEBUG IMAGES PARAMS
128 |     private boolean debugImages;
129 |     private Path debugFileOutputDir;
130 |     private String debugFilename;
131 | 
132 |     private PdfTableSettings(PdfTableSettingsBuilder builder) {
133 |         this.defaultPdfDpi = PdfTableSettingsBuilder.DEFAULT_PDF_DPI;
134 |         this.pdfRenderingDpi = builder.pdfRenderingDpi;
135 |         this.cannyFiltering = builder.cannyFiltering;
136 |         this.bitThreshold = builder.bitThreshold;
137 |         this.bitMaxVal = builder.bitMaxVal;
138 |         this.cannyThreshold1 = builder.cannyThreshold1;
139 |         this.cannyThreshold2 = builder.cannyThreshold2;
140 |         this.cannyApertureSize = builder.cannyApertureSize;
141 |         this.cannyL2Gradient = builder.cannyL2Gradient;
142 |         this.approxDistScaleFactor = builder.approxDistScaleFactor;
143 |         this.debugImages = builder.debugImages;
144 |         this.debugFileOutputDir = builder.debugFileOutputDir;
145 |         this.debugFilename = builder.debugFilename;
146 |     }
147 | 
148 |     public PdfTableSettings() {
149 |         this(new PdfTableSettingsBuilder());
150 |     }
151 | 
152 |     public static PdfTableSettingsBuilder getBuilder() {
153 |         return new PdfTableSettingsBuilder();
154 |     }
155 | 
156 |     public int getDefaultPdfDpi() {
157 |         return defaultPdfDpi;
158 |     }
159 | 
160 |     public int getPdfRenderingDpi() {
161 |         return pdfRenderingDpi;
162 |     }
163 | 
164 |     public boolean hasCannyFiltering() {
165 |         return cannyFiltering;
166 |     }
167 | 
168 |     public double getBitThreshold() {
169 |         return bitThreshold;
170 |     }
171 | 
172 |     public double getBitMaxVal() {
173 |         return bitMaxVal;
174 |     }
175 | 
176 |     public double getCannyThreshold1() {
177 |         return cannyThreshold1;
178 |     }
179 | 
180 |     public double getCannyThreshold2() {
181 |         return cannyThreshold2;
182 |     }
183 | 
184 |     public int getCannyApertureSize() {
185 |         return cannyApertureSize;
186 |     }
187 | 
188 |     public boolean hasCannyL2Gradient() {
189 |         return cannyL2Gradient;
190 |     }
191 | 
192 |     public double getApproxDistScaleFactor() {
193 |         return approxDistScaleFactor;
194 |     }
195 | 
196 |     public boolean hasDebugImages() {
197 |         return debugImages;
198 |     }
199 | 
200 |     public Path getDebugFileOutputDir() {
201 |         return debugFileOutputDir;
202 |     }
203 | 
204 |     public String getDebugFilename() {
205 |         return debugFilename;
206 |     }
207 | 
208 |     public double getDpiRatio() {
209 |         return (double) defaultPdfDpi / pdfRenderingDpi;
210 |     }
211 | }
212 | 


--------------------------------------------------------------------------------
/src/main/java/pdftable/TableExtractor.java:
--------------------------------------------------------------------------------
  1 | package pdftable;
  2 | 
  3 | import org.opencv.core.*;
  4 | import org.opencv.imgcodecs.Imgcodecs;
  5 | import org.opencv.imgproc.Imgproc;
  6 | 
  7 | import java.util.ArrayList;
  8 | import java.util.Collections;
  9 | import java.util.List;
 10 | 
 11 | import static org.opencv.core.Core.bitwise_xor;
 12 | import static org.opencv.imgproc.Imgproc.*;
 13 | 
 14 | /**
 15 |  * Class responsible for determining table cells bounding boxes.
 16 |  * Should be used as static.
 17 |  */
 18 | class TableExtractor {
 19 | 
 20 |     private PdfTableSettings settings;
 21 | 
 22 |     static {
 23 |         System.loadLibrary(Core.NATIVE_LIBRARY_NAME);
 24 |     }
 25 | 
 26 |     public TableExtractor(PdfTableSettings settings) {
 27 |         this.settings = settings;
 28 |     }
 29 | 
 30 |     /**
 31 |      * Applies series of filters on page image and extracts table cells bounding rectangles.
 32 |      * Additionally dumps debug PNG images when settings.hasDebugImages() is true.
 33 |      *
 34 |      * @param inImage Input image
 35 |      * @return List of org.opencv.core.Rect objects representing cell bounding rectangles.
 36 |      */
 37 |     public List<Rect> getTableBoundingRectangles(Mat inImage) {
 38 |         List<Rect> out = new ArrayList<>();
 39 | 
 40 |         if (settings.hasDebugImages()) {
 41 |             Imgcodecs.imwrite(buildDebugFilename("original_grayscaled"), inImage);
 42 |         }
 43 | 
 44 |         // binary inverted threshold
 45 |         Mat bit = binaryInvertedThreshold(inImage);
 46 |         if (settings.hasDebugImages()) {
 47 |             Imgcodecs.imwrite(buildDebugFilename("binary_inverted_threshold"), bit);
 48 |         }
 49 | 
 50 |         // find contours
 51 |         List<MatOfPoint> contours = new ArrayList<>();
 52 |         if (settings.hasCannyFiltering()) {
 53 |             Mat canny = cannyFilter(inImage);
 54 |             findContours(canny, contours, new Mat(), RETR_EXTERNAL, CHAIN_APPROX_SIMPLE);
 55 |             if (settings.hasDebugImages()) {
 56 |                 Imgcodecs.imwrite(buildDebugFilename("canny1"), canny);
 57 |             }
 58 |         } else {
 59 |             findContours(bit, contours, new Mat(), RETR_EXTERNAL, CHAIN_APPROX_SIMPLE);
 60 |         }
 61 | 
 62 |         // draw contour
 63 |         Mat contourMask = bit.clone();
 64 |         drawContours(contourMask, contours, -1, new Scalar(255, 255, 255), Core.FILLED);
 65 |         if (settings.hasDebugImages()) {
 66 |             Imgcodecs.imwrite(buildDebugFilename("contour_mask"), contourMask);
 67 |         }
 68 | 
 69 |         // XOR threshold and mask
 70 |         Mat xored = new Mat();
 71 |         bitwise_xor(bit, contourMask, xored);
 72 |         if (settings.hasDebugImages()) {
 73 |             Imgcodecs.imwrite(buildDebugFilename("xored"), xored);
 74 |         }
 75 | 
 76 |         // find contours #2
 77 |         List<MatOfPoint> contours2 = new ArrayList<>();
 78 |         if (settings.hasCannyFiltering()) {
 79 |             Mat canny2 = cannyFilter(xored);
 80 |             findContours(canny2, contours2, new Mat(), RETR_EXTERNAL, CHAIN_APPROX_SIMPLE);
 81 |             if (settings.hasDebugImages()) {
 82 |                 Imgcodecs.imwrite(buildDebugFilename("canny2"), canny2);
 83 |             }
 84 |         } else {
 85 |             findContours(xored, contours2, new Mat(), RETR_EXTERNAL, CHAIN_APPROX_SIMPLE);
 86 |         }
 87 | 
 88 |         // draw contour #2
 89 |         if (settings.hasDebugImages()) {
 90 |             Mat contourMask2 = inImage.clone();
 91 |             drawContours(contourMask2, contours2, -1, new Scalar(255, 255, 255), Core.FILLED);
 92 |             Imgcodecs.imwrite(buildDebugFilename("final_contours"), contourMask2);
 93 |         }
 94 | 
 95 |         // find contours #2 bounding rectangles
 96 |         for (int i = 0; i < contours2.size(); i++) {
 97 |             MatOfPoint2f approxCurve = new MatOfPoint2f();
 98 |             MatOfPoint2f contour2f = new MatOfPoint2f(contours2.get(i).toArray());
 99 |             double approxDistance = Imgproc.arcLength(contour2f, true) * settings.getApproxDistScaleFactor();
100 |             Imgproc.approxPolyDP(contour2f, approxCurve, approxDistance, true);
101 |             MatOfPoint points = new MatOfPoint(approxCurve.toArray());
102 |             Rect rect = Imgproc.boundingRect(points);
103 |             out.add(rect);
104 |         }
105 | 
106 |         Collections.reverse(out);
107 | 
108 |         if (settings.hasDebugImages()) {
109 |             int ri = 0;
110 |             for (Rect rect : out) {
111 |                 Mat outImage = inImage.clone();
112 |                 Point p1 = new Point(rect.x, rect.y);
113 |                 Point p2 = new Point(rect.x + rect.width, rect.y + rect.height);
114 |                 rectangle(outImage, p1, p2, new Scalar(0, 0, 0, 255), 3);
115 |                 Imgcodecs.imwrite(buildDebugFilename(String.format("box_%03d", ri)), outImage);
116 |                 ri++;
117 |             }
118 |         }
119 | 
120 |         return out;
121 |     }
122 | 
123 |     public void setSettings(PdfTableSettings settings) {
124 |         this.settings = settings;
125 |     }
126 | 
127 |     /**
128 |      * Applies Binary Inverted Threshold (BIT) to Mat image.
129 |      *
130 |      * @param input Input image
131 |      * @return org.opencv.core.Mat image with applied BIT
132 |      */
133 |     private Mat binaryInvertedThreshold(Mat input) {
134 |         Mat out = new Mat();
135 |         threshold(input, out, settings.getBitThreshold(), settings.getBitMaxVal(), THRESH_BINARY_INV);
136 |         return out;
137 |     }
138 | 
139 |     /**
140 |      * Applies Canny filter to Mat image.
141 |      *
142 |      * @param input Input image
143 |      * @return org.opencv.core.Mat image with applied Canny filter
144 |      */
145 |     private Mat cannyFilter(Mat input) {
146 |         Mat out = new Mat();
147 |         Canny(input, out, settings.getCannyThreshold1(), settings.getCannyThreshold2(), settings.getCannyApertureSize(), settings.hasCannyL2Gradient());
148 |         return out;
149 |     }
150 | 
151 |     /**
152 |      * String helper used for constructing debug image output path.
153 |      *
154 |      * @param suffix Image filename suffix
155 |      * @return String representing image path
156 |      */
157 |     private String buildDebugFilename(String suffix) {
158 |         return settings.getDebugFileOutputDir().resolve(settings.getDebugFilename() + "_" + suffix + ".png").toString();
159 |     }
160 | 
161 | }
162 | 


--------------------------------------------------------------------------------
/src/main/java/pdftable/Utils.java:
--------------------------------------------------------------------------------
 1 | package pdftable;
 2 | 
 3 | import org.apache.pdfbox.io.IOUtils;
 4 | import org.opencv.core.Mat;
 5 | import org.opencv.core.MatOfByte;
 6 | import org.opencv.imgcodecs.Imgcodecs;
 7 | 
 8 | import javax.imageio.ImageIO;
 9 | import java.awt.image.BufferedImage;
10 | import java.io.ByteArrayInputStream;
11 | import java.io.ByteArrayOutputStream;
12 | import java.io.IOException;
13 | import java.io.InputStream;
14 | 
15 | /**
16 |  * Static utilities used for converting between image formats.
17 |  */
18 | public class Utils {
19 | 
20 |     private Utils() {
21 |     }
22 | 
23 |     /**
24 |      * Converts InputStream to OpenCV Mat
25 |      *
26 |      * @param stream Input stream
27 |      * @param flag   org.opencv.imgcodecs.Imgcodecs flag
28 |      * @return org.opencv.core.Mat
29 |      * @throws IOException
30 |      */
31 |     public static Mat inputStream2Mat(InputStream stream, int flag) throws IOException {
32 |         byte[] byteBuff = IOUtils.toByteArray(stream);
33 |         return Imgcodecs.imdecode(new MatOfByte(byteBuff), flag);
34 |     }
35 | 
36 |     /**
37 |      * Converts BufferedImage to InputStream.
38 |      *
39 |      * @param inImg Buffered Image
40 |      * @return java.io.InputStream
41 |      * @throws IOException
42 |      */
43 |     public static InputStream bufferedImage2InputStream(BufferedImage inImg) throws IOException {
44 |         ByteArrayOutputStream os = new ByteArrayOutputStream();
45 |         ImageIO.write(inImg, "png", os);
46 |         return new ByteArrayInputStream(os.toByteArray());
47 |     }
48 | 
49 |     /**
50 |      * Converts BufferedImage to OpenCV Mat using custom flag.
51 |      *
52 |      * @param inImg Buffered Image
53 |      * @param flag  org.opencv.imgcodecs.Imgcodecs flag
54 |      * @return org.opencv.core.Mat
55 |      * @throws IOException
56 |      */
57 |     public static Mat bufferedImage2Mat(BufferedImage inImg, int flag) throws IOException {
58 |         return inputStream2Mat(bufferedImage2InputStream(inImg), flag);
59 |     }
60 | 
61 |     /**
62 |      * Converts BufferedImage to grayscaled OpenCV Mat.
63 |      *
64 |      * @param inImg Buffered Image
65 |      * @return org.opencv.core.Mat
66 |      * @throws IOException
67 |      */
68 |     public static Mat bufferedImage2GrayscaleMat(BufferedImage inImg) throws IOException {
69 |         return bufferedImage2Mat(inImg, Imgcodecs.IMREAD_GRAYSCALE);
70 |     }
71 | 
72 | }
73 | 


--------------------------------------------------------------------------------
/src/main/java/pdftable/models/ParsedTablePage.java:
--------------------------------------------------------------------------------
 1 | package pdftable.models;
 2 | 
 3 | import org.apache.commons.lang3.StringEscapeUtils;
 4 | 
 5 | import java.util.ArrayList;
 6 | import java.util.Arrays;
 7 | import java.util.List;
 8 | import java.util.stream.Collectors;
 9 | 
10 | /**
11 |  * Parsed page model.
12 |  */
13 | public class ParsedTablePage {
14 | 
15 |     public class ParsedTableRow {
16 | 
17 |         private List<String> cells;
18 | 
19 |         public ParsedTableRow(List<String> cells) {
20 |             this.cells = cells;
21 |         }
22 | 
23 |         public List<String> getCells() {
24 |             return cells;
25 |         }
26 | 
27 |         public String getCell(int index) {
28 |             return cells.get(index);
29 |         }
30 | 
31 |         @Override
32 |         public String toString() {
33 |             List<String> escapedCells = cells.stream().map(c -> StringEscapeUtils.escapeJava(c)).collect(Collectors.toList());
34 |             return String.format("<%s@%s; cells:%s>",
35 |                     this.getClass().getSimpleName(), System.identityHashCode(this), Arrays.toString(escapedCells.toArray()));
36 |         }
37 |     }
38 | 
39 |     private List<ParsedTableRow> rows;
40 |     private int pageNum;
41 | 
42 |     private ParsedTablePage() {
43 |         rows = new ArrayList<>();
44 |     }
45 | 
46 |     public ParsedTablePage(int pageNumber) {
47 |         this();
48 |         pageNum = pageNumber;
49 |     }
50 | 
51 |     public List<ParsedTableRow> getRows() {
52 |         return rows;
53 |     }
54 | 
55 |     public void addRow(List<String> cells) {
56 |         rows.add(new ParsedTableRow(cells));
57 |     }
58 | 
59 |     public ParsedTableRow getRow(int index) {
60 |         return rows.get(index);
61 |     }
62 | 
63 |     public int getPageNum() {
64 |         return pageNum;
65 |     }
66 | 
67 |     @Override
68 |     public String toString() {
69 |         return String.format("<%s@%s; rows:%s>",
70 |                 this.getClass().getSimpleName(), System.identityHashCode(this), Arrays.toString(rows.toArray()));
71 |     }
72 | }
73 | 


--------------------------------------------------------------------------------
/src/test/java/pdftable/PdfTableReaderTest.java:
--------------------------------------------------------------------------------
  1 | package pdftable;
  2 | 
  3 | 
  4 | import org.apache.pdfbox.pdmodel.PDDocument;
  5 | import org.testng.Assert;
  6 | import org.testng.TestException;
  7 | import org.testng.annotations.AfterMethod;
  8 | import org.testng.annotations.BeforeMethod;
  9 | import org.testng.annotations.Test;
 10 | import pdftable.models.ParsedTablePage;
 11 | 
 12 | import java.io.File;
 13 | import java.io.IOException;
 14 | import java.nio.file.Path;
 15 | import java.nio.file.Paths;
 16 | import java.util.ArrayList;
 17 | import java.util.List;
 18 | import java.util.concurrent.Callable;
 19 | import java.util.concurrent.ExecutorService;
 20 | import java.util.concurrent.Executors;
 21 | import java.util.concurrent.Future;
 22 | import java.util.stream.Collectors;
 23 | import java.util.stream.IntStream;
 24 | 
 25 | 
 26 | public class PdfTableReaderTest {
 27 | 
    // Name of the PDF fixture looked up through the class loader in getTestPDF().
    private static final String TEST_FILENAME = "test_tables.pdf";
    // Output directory passed to the image-saving tests; getTestPDF() sets it to the fixture's parent directory.
    private static Path TEST_OUT_PATH = null;
    // Document under test; assigned in setUp() and closed in tearDown().
    private static PDDocument PDFdoc;
    // Size of the fixed thread pool created by the multi-threaded tests.
    private static final int THREAD_COUNT = 8;
    // Step between page indices that validatePdfContent() checks against the same expectations.
    private static final int PAGE_CYCLE = 4;
 33 | 
 34 |     @BeforeMethod
 35 |     private void setUp() {
 36 |         PDFdoc = getTestPDF();
 37 |     }
 38 | 
 39 |     @AfterMethod
 40 |     private void tearDown() {
 41 |         if (PDFdoc != null) {
 42 |             try {
 43 |                 PDFdoc.close();
 44 |             } catch (IOException ioe) {
 45 |                 throw new TestException(ioe);
 46 |             }
 47 |         }
 48 |     }
 49 | 
 50 |     @SuppressWarnings("ConstantConditions")
 51 |     private PDDocument getTestPDF() {
 52 |         try {
 53 |             ClassLoader classLoader = getClass().getClassLoader();
 54 |             File file = new File(classLoader.getResource(TEST_FILENAME).getFile());
 55 |             TEST_OUT_PATH = Paths.get(file.getParent());
 56 |             return PDDocument.load(file);
 57 |         } catch (Exception e) {
 58 |             e.printStackTrace();
 59 |             throw new RuntimeException(e.getCause());
 60 |         }
 61 |     }
 62 | 
 63 |     @Test
 64 |     public void savePdfPagesAsPNG() throws IOException {
 65 |         PdfTableReader reader = new PdfTableReader();
 66 |         reader.savePdfPagesAsPNG(PDFdoc, 1, 3, TEST_OUT_PATH);
 67 |     }
 68 | 
 69 |     @Test
 70 |     public void savePdfPageAsPNG() throws IOException {
 71 |         PdfTableReader reader = new PdfTableReader();
 72 |         reader.savePdfPageAsPNG(PDFdoc, 4, TEST_OUT_PATH);
 73 |     }
 74 | 
 75 |     @Test
 76 |     public void savePdfDebugImages() throws IOException {
 77 |         PdfTableReader reader = new PdfTableReader();
 78 |         reader.savePdfTablePagesDebugImages(PDFdoc, 1, 3, TEST_OUT_PATH);
 79 |     }
 80 | 
 81 |     @Test
 82 |     public void savePdfDebugImage() throws IOException {
 83 |         PdfTableReader reader = new PdfTableReader();
 84 |         reader.savePdfTablePageDebugImage(PDFdoc, 4, TEST_OUT_PATH);
 85 |     }
 86 | 
 87 |     @Test
 88 |     public void singleThreadedSavePdfPageAsPNG() throws IOException {
 89 |         long start = System.currentTimeMillis();
 90 |         PdfTableReader reader = new PdfTableReader();
 91 |         reader.savePdfPagesAsPNG(PDFdoc, 1, PDFdoc.getNumberOfPages(), TEST_OUT_PATH);
 92 |         long end = System.currentTimeMillis();
 93 |         System.out.println("save page image - Single thread: " + (end - start) / 1000.0);
 94 |     }
 95 | 
 96 |     @Test
 97 |     public void multiThreadedSavePdfPageAsPNG() throws IOException {
 98 |         long start = System.currentTimeMillis();
 99 |         PdfTableReader reader = new PdfTableReader();
100 |         ExecutorService executor = Executors.newFixedThreadPool(THREAD_COUNT);
101 | 
102 |         List<Future<Boolean>> futures = new ArrayList<>();
103 |         for (final int pageNum : IntStream.rangeClosed(1, PDFdoc.getNumberOfPages()).toArray()) {
104 |             Callable<Boolean> callable = () -> {
105 |                 reader.savePdfPageAsPNG(PDFdoc, pageNum, TEST_OUT_PATH);
106 |                 return true;
107 |             };
108 |             futures.add(executor.submit(callable));
109 |         }
110 | 
111 |         try {
112 |             for (Future<Boolean> f : futures) {
113 |                 f.get();
114 |             }
115 |         } catch (Exception e) {
116 |             throw new TestException(e);
117 |         }
118 | 
119 |         long end = System.currentTimeMillis();
120 |         System.out.println("save page image - multi thread: " + (end - start) / 1000.0);
121 |     }
122 | 
123 |     @Test
124 |     public void singleThreadedSavePdfTablePageDebugImage() throws IOException {
125 |         long start = System.currentTimeMillis();
126 |         PdfTableReader reader = new PdfTableReader();
127 |         reader.savePdfTablePagesDebugImages(PDFdoc, 1, PDFdoc.getNumberOfPages(), TEST_OUT_PATH);
128 |         long end = System.currentTimeMillis();
129 |         System.out.println("save debug images - single thread: " + (end - start) / 1000.0);
130 |     }
131 | 
132 |     @Test
133 |     public void multiThreadedSavePdfTablePageDebugImage() throws IOException {
134 |         long start = System.currentTimeMillis();
135 |         PdfTableReader reader = new PdfTableReader();
136 |         ExecutorService executor = Executors.newFixedThreadPool(THREAD_COUNT);
137 | 
138 |         List<Future<Boolean>> futures = new ArrayList<>();
139 |         for (final int pageNum : IntStream.rangeClosed(1, PDFdoc.getNumberOfPages()).toArray()) {
140 |             Callable<Boolean> callable = () -> {
141 |                 reader.savePdfTablePageDebugImage(PDFdoc, pageNum, TEST_OUT_PATH);
142 |                 return true;
143 |             };
144 |             futures.add(executor.submit(callable));
145 |         }
146 | 
147 |         try {
148 |             for (Future<Boolean> f : futures) {
149 |                 f.get();
150 |             }
151 |         } catch (Exception e) {
152 |             throw new TestException(e);
153 |         }
154 | 
155 |         long end = System.currentTimeMillis();
156 |         System.out.println("save debug images - multi thread: " + (end - start) / 1000.0);
157 |     }
158 | 
159 |     @Test
160 |     public void singleThreadedParsePdfTablePages() throws IOException {
161 |         long start = System.currentTimeMillis();
162 |         PdfTableReader reader = new PdfTableReader();
163 |         List<ParsedTablePage> parsed = reader.parsePdfTablePages(PDFdoc, 1, PDFdoc.getNumberOfPages());
164 |         long end = System.currentTimeMillis();
165 |         System.out.println("parse pages - single thread: " + (end - start) / 1000.0);
166 |         validatePdfContent(parsed);
167 |     }
168 | 
169 |     @Test
170 |     public void multiThreadedParsePdfTablePages() throws IOException {
171 |         long start = System.currentTimeMillis();
172 |         PdfTableReader reader = new PdfTableReader();
173 |         ExecutorService executor = Executors.newFixedThreadPool(THREAD_COUNT);
174 | 
175 |         List<Future<ParsedTablePage>> futures = new ArrayList<>();
176 |         for (final int pageNum : IntStream.rangeClosed(1, PDFdoc.getNumberOfPages()).toArray()) {
177 |             Callable<ParsedTablePage> callable = () -> {
178 |                 ParsedTablePage page = reader.parsePdfTablePage(PDFdoc, pageNum);
179 |                 return page;
180 |             };
181 |             futures.add(executor.submit(callable));
182 |         }
183 | 
184 |         List<ParsedTablePage> parsedPages = new ArrayList<>(PDFdoc.getNumberOfPages());
185 |         try {
186 |             for (Future<ParsedTablePage> f : futures) {
187 |                 ParsedTablePage page = f.get();
188 |                 parsedPages.add(page.getPageNum() - 1, page);
189 |             }
190 |         } catch (Exception e) {
191 |             throw new TestException(e);
192 |         }
193 | 
194 |         long end = System.currentTimeMillis();
195 |         System.out.println("parse pages - multi thread: " + (end - start) / 1000.0);
196 | 
197 |         List<ParsedTablePage> sortedParsedPages = parsedPages.stream()
198 |                 .sorted((p1, p2) -> Integer.compare(p1.getPageNum(), p2.getPageNum())).collect(Collectors.toList());
199 | 
200 |         validatePdfContent(sortedParsedPages);
201 |     }
202 | 
203 |     private static String normalizeWhitespaces(String input) {
204 |         return input.replaceAll("[\\s\\u00A0]+", " ").trim();
205 |     }
206 | 
207 |     private static void validatePdfContent(List<ParsedTablePage> parsedPdf) {
208 | 
209 |         // --------------
210 |         // PAGE 1, 5 etc.
211 |         // --------------
212 | 
213 |         for (int i : IntStream.iterate(0, x -> x + PAGE_CYCLE).limit(PDFdoc.getNumberOfPages() / PAGE_CYCLE).toArray()) {
214 |             Assert.assertEquals(normalizeWhitespaces(parsedPdf.get(i).getRow(0).getCell(0)), "Heading 1");
215 | 
216 |             Assert.assertEquals(normalizeWhitespaces(parsedPdf.get(i).getRow(1).getCell(0)), "First");
217 |             Assert.assertEquals(normalizeWhitespaces(parsedPdf.get(i).getRow(1).getCell(2)), "Third");
218 |             Assert.assertEquals(normalizeWhitespaces(parsedPdf.get(i).getRow(1).getCell(5)), "Sixth");
219 | 
220 |             Assert.assertEquals(normalizeWhitespaces(parsedPdf.get(i).getRow(4).getCell(0)), "Sed ut perspiciatis unde omnis iste natus " +
221 |                     "error sit voluptatem accusantium doloremque laudantium, totam rem aperiam, eaque ipsa quae ab illo " +
222 |                     "inventore veritatis et quasi architecto beatae vitae dicta sunt explicabo.");
223 |             Assert.assertEquals(normalizeWhitespaces(parsedPdf.get(i).getRow(4).getCell(1)), "Sed");
224 | 
225 |             Assert.assertEquals(normalizeWhitespaces(parsedPdf.get(i).getRow(5).getCell(0)), "But");
226 |             Assert.assertEquals(normalizeWhitespaces(parsedPdf.get(i).getRow(5).getCell(1)), "But I must explain to you how all this " +
227 |                     "mistaken idea of denouncing pleasure and praising pain was born and I will give you a complete account " +
228 |                     "of the system, and expound the actual teachings of the great explorer of the truth, the masterbuilder " +
229 |                     "of human happiness");
230 | 
231 |             Assert.assertEquals(normalizeWhitespaces(parsedPdf.get(i).getRow(6).getCell(0)), "Joined 1");
232 |             Assert.assertEquals(normalizeWhitespaces(parsedPdf.get(i).getRow(6).getCell(1)), "Rest 1");
233 |             Assert.assertEquals(normalizeWhitespaces(parsedPdf.get(i).getRow(6).getCell(2)), "Joined 2 Joined 2");
234 | 
235 |             Assert.assertEquals(normalizeWhitespaces(parsedPdf.get(i).getRow(9).getCell(0)), "AA");
236 |             Assert.assertEquals(normalizeWhitespaces(parsedPdf.get(i).getRow(9).getCell(1)), "BB");
237 |             Assert.assertEquals(normalizeWhitespaces(parsedPdf.get(i).getRow(9).getCell(2)), "CC");
238 | 
239 |             Assert.assertEquals(normalizeWhitespaces(parsedPdf.get(i).getRow(10).getCell(0)), "Joined 4 Joined 4 Joined 4 Joined 4");
240 |             Assert.assertEquals(normalizeWhitespaces(parsedPdf.get(i).getRow(10).getCell(1)), "Subheading 1");
241 | 
242 |             Assert.assertEquals(normalizeWhitespaces(parsedPdf.get(i).getRow(11).getCell(0)), "X");
243 |             Assert.assertEquals(normalizeWhitespaces(parsedPdf.get(i).getRow(11).getCell(1)), "XX");
244 | 
245 |             Assert.assertEquals(normalizeWhitespaces(parsedPdf.get(i).getRow(12).getCell(0)), "Y");
246 |             Assert.assertEquals(normalizeWhitespaces(parsedPdf.get(i).getRow(12).getCell(1)), "YY");
247 | 
248 |             Assert.assertEquals(normalizeWhitespaces(parsedPdf.get(i).getRow(13).getCell(0)), "Z");
249 |             Assert.assertEquals(normalizeWhitespaces(parsedPdf.get(i).getRow(13).getCell(1)), "ZZ");
250 |             Assert.assertEquals(normalizeWhitespaces(parsedPdf.get(i).getRow(13).getCell(2)), "ZZZ");
251 |         }
252 | 
253 |         // --------------
254 |         // PAGE 2, 6 etc.
255 |         // --------------
256 | 
257 |         for (int i : IntStream.iterate(1, x -> x + PAGE_CYCLE).limit(PDFdoc.getNumberOfPages() / PAGE_CYCLE).toArray()) {
258 |             Assert.assertEquals(normalizeWhitespaces(parsedPdf.get(i).getRow(0).getCell(0)), "Heading 1");
259 | 
260 |             Assert.assertEquals(normalizeWhitespaces(parsedPdf.get(i).getRow(1).getCell(0)), "First");
261 |             Assert.assertEquals(normalizeWhitespaces(parsedPdf.get(i).getRow(1).getCell(2)), "Third");
262 |             Assert.assertEquals(normalizeWhitespaces(parsedPdf.get(i).getRow(1).getCell(5)), "Sixth");
263 | 
264 |             Assert.assertEquals(normalizeWhitespaces(parsedPdf.get(i).getRow(4).getCell(0)), "Sed ut perspiciatis unde omnis iste natus " +
265 |                     "error sit voluptatem accusantium doloremque laudantium, totam rem aperiam, eaque ipsa quae ab illo " +
266 |                     "inventore veritatis et quasi architecto beatae vitae dicta sunt explicabo.");
267 |             Assert.assertEquals(normalizeWhitespaces(parsedPdf.get(i).getRow(4).getCell(1)), "Sed");
268 | 
269 |             Assert.assertEquals(normalizeWhitespaces(parsedPdf.get(i).getRow(5).getCell(0)), "But");
270 |             Assert.assertEquals(normalizeWhitespaces(parsedPdf.get(i).getRow(5).getCell(1)), "But I must explain to you how all this " +
271 |                     "mistaken idea of denouncing pleasure and praising pain was born and I will give you a complete account " +
272 |                     "of the system, and expound the actual teachings of the great explorer of the truth, the masterbuilder " +
273 |                     "of human happiness");
274 | 
275 |             Assert.assertEquals(normalizeWhitespaces(parsedPdf.get(i).getRow(6).getCell(0)), "Joined 1");
276 |             Assert.assertEquals(normalizeWhitespaces(parsedPdf.get(i).getRow(6).getCell(1)), "Rest 1");
277 |             Assert.assertEquals(normalizeWhitespaces(parsedPdf.get(i).getRow(6).getCell(2)), "Joined 2 Joined 2");
278 | 
279 |             Assert.assertEquals(normalizeWhitespaces(parsedPdf.get(i).getRow(9).getCell(0)), "AA");
280 |             Assert.assertEquals(normalizeWhitespaces(parsedPdf.get(i).getRow(9).getCell(1)), "BB");
281 |             Assert.assertEquals(normalizeWhitespaces(parsedPdf.get(i).getRow(9).getCell(2)), "CC");
282 | 
283 |             Assert.assertEquals(normalizeWhitespaces(parsedPdf.get(i).getRow(10).getCell(0)), "Joined 4 Joined 4 Joined 4 Joined 4");
284 |             Assert.assertEquals(normalizeWhitespaces(parsedPdf.get(i).getRow(10).getCell(1)), "Subheading 1");
285 | 
286 |             Assert.assertEquals(normalizeWhitespaces(parsedPdf.get(i).getRow(11).getCell(0)), "X");
287 |             Assert.assertEquals(normalizeWhitespaces(parsedPdf.get(i).getRow(11).getCell(1)), "XX");
288 | 
289 |             Assert.assertEquals(normalizeWhitespaces(parsedPdf.get(i).getRow(12).getCell(0)), "Y");
290 |             Assert.assertEquals(normalizeWhitespaces(parsedPdf.get(i).getRow(12).getCell(1)), "YY");
291 | 
292 |             Assert.assertEquals(normalizeWhitespaces(parsedPdf.get(i).getRow(13).getCell(0)), "Z");
293 |             Assert.assertEquals(normalizeWhitespaces(parsedPdf.get(i).getRow(13).getCell(1)), "ZZ");
294 |             Assert.assertEquals(normalizeWhitespaces(parsedPdf.get(i).getRow(13).getCell(2)), "ZZZ");
295 |         }
296 | 
297 |         for (int i : IntStream.iterate(2, x -> x + PAGE_CYCLE).limit(PDFdoc.getNumberOfPages() / PAGE_CYCLE).toArray()) {
298 |             Assert.assertEquals(normalizeWhitespaces(parsedPdf.get(i).getRow(0).getCell(0)), "A");
299 |             Assert.assertEquals(normalizeWhitespaces(parsedPdf.get(i).getRow(0).getCell(4)), "E");
300 | 
301 |             Assert.assertEquals(normalizeWhitespaces(parsedPdf.get(i).getRow(1).getCell(0)), "0.01");
302 |             Assert.assertEquals(normalizeWhitespaces(parsedPdf.get(i).getRow(1).getCell(4)), "0.05");
303 | 
304 |             Assert.assertEquals(normalizeWhitespaces(parsedPdf.get(i).getRow(37).getCell(0)), "0.37");
305 |             Assert.assertEquals(normalizeWhitespaces(parsedPdf.get(i).getRow(37).getCell(2)), "0.111");
306 |             Assert.assertEquals(normalizeWhitespaces(parsedPdf.get(i).getRow(37).getCell(4)), "0.185");
307 |         }
308 | 
309 |         for (int i : IntStream.iterate(3, x -> x + PAGE_CYCLE).limit(PDFdoc.getNumberOfPages() / PAGE_CYCLE).toArray()) {
310 |             Assert.assertEquals(normalizeWhitespaces(parsedPdf.get(i).getRow(0).getCell(0)), "Table 1 Heading");
311 | 
312 |             Assert.assertEquals(normalizeWhitespaces(parsedPdf.get(i).getRow(1).getCell(0)), "AAA");
313 |             Assert.assertEquals(normalizeWhitespaces(parsedPdf.get(i).getRow(1).getCell(1)), "111");
314 | 
315 |             Assert.assertEquals(normalizeWhitespaces(parsedPdf.get(i).getRow(5).getCell(0)), "GGG");
316 |             Assert.assertEquals(normalizeWhitespaces(parsedPdf.get(i).getRow(5).getCell(1)), "444");
317 | 
318 |             Assert.assertEquals(normalizeWhitespaces(parsedPdf.get(i).getRow(9).getCell(0)), "Table 3 Heading 2");
319 |             Assert.assertEquals(normalizeWhitespaces(parsedPdf.get(i).getRow(11).getCell(0)), "000");
320 |             Assert.assertEquals(normalizeWhitespaces(parsedPdf.get(i).getRow(11).getCell(1)), "QQQ QQQ QQQ");
321 |         }
322 |     }
323 | 
324 | }
325 | 


--------------------------------------------------------------------------------
/src/test/resources/test_tables.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rostrovsky/pdf-table/e9c372bafbbb9701f151b54672c8a60ee3eebe8d/src/test/resources/test_tables.pdf


--------------------------------------------------------------------------------