├── .github └── workflows │ └── mvn-test.yml ├── .gitignore ├── LICENSE ├── README.adoc ├── build.gradle ├── pom.xml ├── settings.gradle └── src ├── main └── java │ └── pdftable │ ├── PdfTableReader.java │ ├── PdfTableSettings.java │ ├── TableExtractor.java │ ├── Utils.java │ └── models │ └── ParsedTablePage.java └── test ├── java └── pdftable │ └── PdfTableReaderTest.java └── resources └── test_tables.pdf /.github/workflows/mvn-test.yml: -------------------------------------------------------------------------------- 1 | # This workflow will build a Java project with Maven 2 | # For more information see: https://help.github.com/actions/language-and-framework-guides/building-and-testing-java-with-maven 3 | 4 | name: Java CI with Maven 5 | 6 | on: 7 | push: 8 | branches: [ release/*, master ] 9 | pull_request: 10 | branches: [ master ] 11 | 12 | jobs: 13 | build: 14 | 15 | runs-on: windows-latest 16 | 17 | steps: 18 | - uses: actions/checkout@v2 19 | 20 | - name: Download OpenCV 3.4.2 21 | run: Invoke-WebRequest -Uri https://github.com/opencv/opencv/releases/download/3.4.2/opencv-3.4.2-vc14_vc15.exe -OutFile $HOME\opencv-3.4.2-vc14_vc15.exe 22 | shell: powershell 23 | 24 | - name: Unpack OpenCV 3.4.2 25 | run: C:\Users\runneradmin\opencv-3.4.2-vc14_vc15.exe -y -s; while (!(Test-Path "C:\Users\runneradmin\opencv\build\java\x64\opencv_java342.dll")) { Start-Sleep 10 } 26 | shell: powershell 27 | 28 | - name: Add OpenCV to PATH 29 | run: Write-Host "::add-path::C:\Users\runneradmin\opencv\build\java\x64\" 30 | shell: powershell 31 | 32 | - name: Set up JDK 1.8 33 | uses: actions/setup-java@v1 34 | with: 35 | java-version: 1.8 36 | 37 | - name: Build with Maven 38 | run: mvn -B package --file pom.xml 39 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | target/ 2 | build/ 3 | *.iml 4 | .idea/ 5 | gradle.properties 6 | .gradle/ 7 | gradlew* 8 
| gradle/ -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 Rafal Ostrowski 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.adoc: -------------------------------------------------------------------------------- 1 | = PDF-table 2 | :toc: 3 | 4 | == What is PDF-table? 5 | PDF-table is Java utility library that can be used for parsing tabular data in PDF documents. + 6 | Core processing of PDF documents is performed with utilization of *Apache PDFBox* and *OpenCV*. 7 | 8 | == Prerequisites 9 | 10 | === JDK 11 | 12 | JAVA 8 is required. 13 | 14 | === External dependencies 15 | 16 | pdf-table requires compiled *OpenCV 3.4.2* to work properly: 17 | 18 | . 
Download OpenCV v3.4.2 from https://github.com/opencv/opencv/releases/tag/3.4.2 19 | . Unpack it and add the following directory to your system PATH: 20 | * Windows: `<opencv dir>\build\java\x64` 21 | * Linux: `TODO` 22 | 23 | == Installation 24 | [source, xml] 25 | ---- 26 | <dependency> 27 | <groupId>com.github.rostrovsky</groupId> 28 | <artifactId>pdf-table</artifactId> 29 | <version>1.0.0</version> 30 | </dependency> 31 | ---- 32 | 33 | == Usage 34 | 35 | === Parsing PDFs 36 | When a PDF document page is parsed, the following operations are performed: 37 | 38 | . Page is converted to grayscale image [OpenCV]. 39 | . Binary Inverted Threshold (BIT) is applied to grayscaled image [OpenCV]. 40 | . Contours are detected on BIT image and contour mask is created (additional Canny filtering can be turned on in this step) [OpenCV]. 41 | . Contour mask is XORed with BIT image [OpenCV]. 42 | . Contours are detected once again on XORed image (additional Canny filtering can be turned on in this step) [OpenCV]. 43 | . Final contours are drawn [OpenCV]. 44 | . Bounding rectangles are detected from final contours [OpenCV]. 45 | . The PDF is parsed region by region using the bounding rectangles' coordinates [Apache PDFBox]. 46 | 47 | The above algorithm is mostly derived from http://stackoverflow.com/a/23106594. 
48 | 49 | For more information about parsed output, refer to <<Output format>> 50 | 51 | ==== single-threaded example 52 | [source, java] 53 | ---- 54 | class SingleThreadParser { 55 | public static void main(String[] args) throws IOException { 56 | PDDocument pdfDoc = PDDocument.load(new File("some.pdf")); 57 | PdfTableReader reader = new PdfTableReader(); 58 | List<ParsedTablePage> parsed = reader.parsePdfTablePages(pdfDoc, 1, pdfDoc.getNumberOfPages()); 59 | } 60 | } 61 | ---- 62 | 63 | ==== multi-threaded example 64 | [source, java] 65 | ---- 66 | class MultiThreadParser { 67 | public static void main(String[] args) throws IOException { 68 | final int THREAD_COUNT = 8; 69 | PDDocument pdfDoc = PDDocument.load(new File("some.pdf")); 70 | PdfTableReader reader = new PdfTableReader(); 71 | 72 | // parse pages simultaneously 73 | ExecutorService executor = Executors.newFixedThreadPool(THREAD_COUNT); 74 | List<Future<ParsedTablePage>> futures = new ArrayList<>(); 75 | for (final int pageNum : IntStream.rangeClosed(1, pdfDoc.getNumberOfPages()).toArray()) { 76 | Callable<ParsedTablePage> callable = () -> { 77 | ParsedTablePage page = reader.parsePdfTablePage(pdfDoc, pageNum); 78 | return page; 79 | }; 80 | futures.add(executor.submit(callable)); 81 | } 82 | 83 | // collect parsed pages 84 | List<ParsedTablePage> unsortedParsedPages = new ArrayList<>(pdfDoc.getNumberOfPages()); 85 | try { 86 | for (Future<ParsedTablePage> f : futures) { 87 | ParsedTablePage page = f.get(); 88 | unsortedParsedPages.add(page.getPageNum() - 1, page); 89 | } 90 | } catch (Exception e) { 91 | throw new RuntimeException(e); 92 | } 93 | 94 | // sort pages by pageNum 95 | List<ParsedTablePage> sortedParsedPages = unsortedParsedPages.stream() 96 | .sorted((p1, p2) -> Integer.compare(p1.getPageNum(), p2.getPageNum())).collect(Collectors.toList()); 97 | } 98 | } 99 | ---- 100 | 101 | === Saving PDF pages as PNG images 102 | PDF-Table provides methods for saving PDF pages as PNG images. + 103 | Rendering DPI can be modified in `PdfTableSettings` (see: <<Parsing settings>>). 
104 | 105 | ==== single-threaded example 106 | [source, java] 107 | ---- 108 | class SingleThreadPNGDump { 109 | public static void main(String[] args) throws IOException { 110 | PDDocument pdfDoc = PDDocument.load(new File("some.pdf")); 111 | Path outputPath = Paths.get("C:", "some_directory"); 112 | PdfTableReader reader = new PdfTableReader(); 113 | reader.savePdfPagesAsPNG(pdfDoc, 1, pdfDoc.getNumberOfPages(), outputPath); 114 | } 115 | } 116 | ---- 117 | 118 | ==== multi-threaded example 119 | [source, java] 120 | ---- 121 | class MultiThreadPNGDump { 122 | public static void main(String[] args) throws IOException { 123 | final int THREAD_COUNT = 8; 124 | Path outputPath = Paths.get("C:", "some_directory"); 125 | PDDocument pdfDoc = PDDocument.load(new File("some.pdf")); 126 | PdfTableReader reader = new PdfTableReader(); 127 | 128 | ExecutorService executor = Executors.newFixedThreadPool(THREAD_COUNT); 129 | List<Future<Boolean>> futures = new ArrayList<>(); 130 | for (final int pageNum : IntStream.rangeClosed(1, pdfDoc.getNumberOfPages()).toArray()) { 131 | Callable<Boolean> callable = () -> { 132 | reader.savePdfPageAsPNG(pdfDoc, pageNum, outputPath); 133 | return true; 134 | }; 135 | futures.add(executor.submit(callable)); 136 | } 137 | 138 | try { 139 | for (Future<Boolean> f : futures) { 140 | f.get(); 141 | } 142 | } catch (Exception e) { 143 | throw new RuntimeException(e); 144 | } 145 | } 146 | } 147 | ---- 148 | 149 | === Saving debug PNG images 150 | When tables in a PDF document cannot be parsed correctly with default settings, the user can save debug images that show the page 151 | at various stages of processing. + 152 | Using these images, the user can adjust `PdfTableSettings` accordingly to achieve the desired results 153 | (see: <<Parsing settings>>). 
154 | 155 | ==== single-threaded example 156 | [source, java] 157 | ---- 158 | class SingleThreadDebugImgsDump { 159 | public static void main(String[] args) throws IOException { 160 | PDDocument pdfDoc = PDDocument.load(new File("some.pdf")); 161 | Path outputPath = Paths.get("C:", "some_directory"); 162 | PdfTableReader reader = new PdfTableReader(); 163 | reader.savePdfTablePagesDebugImages(pdfDoc, 1, pdfDoc.getNumberOfPages(), outputPath); 164 | } 165 | } 166 | ---- 167 | 168 | ==== multi-threaded example 169 | [source, java] 170 | ---- 171 | class MultiThreadDebugImgsDump { 172 | public static void main(String[] args) throws IOException { 173 | final int THREAD_COUNT = 8; 174 | Path outputPath = Paths.get("C:", "some_directory"); 175 | PDDocument pdfDoc = PDDocument.load(new File("some.pdf")); 176 | PdfTableReader reader = new PdfTableReader(); 177 | 178 | ExecutorService executor = Executors.newFixedThreadPool(THREAD_COUNT); 179 | List<Future<Boolean>> futures = new ArrayList<>(); 180 | for (final int pageNum : IntStream.rangeClosed(1, pdfDoc.getNumberOfPages()).toArray()) { 181 | Callable<Boolean> callable = () -> { 182 | reader.savePdfTablePageDebugImage(pdfDoc, pageNum, outputPath); 183 | return true; 184 | }; 185 | futures.add(executor.submit(callable)); 186 | } 187 | 188 | try { 189 | for (Future<Boolean> f : futures) { 190 | f.get(); 191 | } 192 | } catch (Exception e) { 193 | throw new RuntimeException(e); 194 | } 195 | } 196 | } 197 | ---- 198 | 199 | === Parsing settings 200 | 201 | PDF rendering and OpenCV filtering settings are stored in a `PdfTableSettings` object. 202 | 203 | A custom settings instance can be passed to the `PdfTableReader` constructor when non-default values are needed: 204 | 205 | [source, java] 206 | ---- 207 | (...) 
208 | 209 | // build settings object 210 | PdfTableSettings settings = PdfTableSettings.getBuilder() 211 | .setCannyFiltering(true) 212 | .setCannyApertureSize(5) 213 | .setCannyThreshold1(40) 214 | .setCannyThreshold2(190.5) 215 | .setPdfRenderingDpi(160) 216 | .build(); 217 | 218 | // pass settings to reader 219 | PdfTableReader reader = new PdfTableReader(settings); 220 | ---- 221 | 222 | 223 | === Output format 224 | Each parsed PDF page is returned as a `ParsedTablePage` object: 225 | [source, java] 226 | ---- 227 | (...) 228 | 229 | PDDocument pdfDoc = PDDocument.load(new File("some.pdf")); 230 | PdfTableReader reader = new PdfTableReader(); 231 | 232 | // first page in document has index == 1, not 0 ! 233 | ParsedTablePage firstPage = reader.parsePdfTablePage(pdfDoc, 1); 234 | 235 | // getting page number 236 | assert firstPage.getPageNum() == 1; 237 | 238 | // rows and cells are zero-indexed just like elements of the List 239 | // getting first row 240 | ParsedTablePage.ParsedTableRow firstRow = firstPage.getRow(0); 241 | 242 | // getting third cell in second row 243 | String thirdCellContent = firstPage.getRow(1).getCell(2); 244 | 245 | // cell content usually contains line-break characters (e.g. <CR><LF>), 246 | // so it is recommended to trim it before processing 247 | double thirdCellNumericValue = Double.valueOf(thirdCellContent.trim()); 248 | ---- 249 | -------------------------------------------------------------------------------- /build.gradle: -------------------------------------------------------------------------------- 1 | /* 2 | * This file was generated by the Gradle 'init' task. 
3 | */ 4 | 5 | plugins { 6 | id 'java' 7 | id 'maven-publish' 8 | id 'maven' 9 | id 'signing' 10 | } 11 | 12 | repositories { 13 | mavenLocal() 14 | maven { 15 | url = uri('https://repo.maven.apache.org/maven2') 16 | } 17 | } 18 | 19 | dependencies { 20 | implementation 'org.apache.pdfbox:pdfbox:2.0.19' 21 | implementation 'org.apache.pdfbox:pdfbox-tools:2.0.19' 22 | implementation 'org.apache.commons:commons-lang3:3.5' 23 | implementation 'org.openpnp:opencv:3.4.2-2' 24 | testImplementation 'org.testng:testng:7.1.0' 25 | } 26 | 27 | group = 'com.github.rostrovsky' 28 | archivesBaseName = "pdf-table" 29 | version = '1.0.0' 30 | sourceCompatibility = '1.8' 31 | targetCompatibility = '1.8' 32 | 33 | publishing { 34 | publications { 35 | maven(MavenPublication) { 36 | from(components.java) 37 | } 38 | } 39 | } 40 | 41 | test { 42 | useTestNG() 43 | } 44 | 45 | task javadocJar(type: Jar) { 46 | classifier = 'javadoc' 47 | from javadoc 48 | } 49 | 50 | task sourcesJar(type: Jar) { 51 | classifier = 'sources' 52 | from sourceSets.main.allSource 53 | } 54 | 55 | artifacts { 56 | archives javadocJar, sourcesJar 57 | } 58 | 59 | signing { 60 | sign configurations.archives 61 | } 62 | 63 | uploadArchives { 64 | repositories { 65 | mavenDeployer { 66 | beforeDeployment { MavenDeployment deployment -> signing.signPom(deployment) } 67 | 68 | repository(url: "https://oss.sonatype.org/service/local/staging/deploy/maven2/") { 69 | authentication(userName: ossrhUsername, password: ossrhPassword) 70 | } 71 | 72 | snapshotRepository(url: "https://oss.sonatype.org/content/repositories/snapshots/") { 73 | authentication(userName: ossrhUsername, password: ossrhPassword) 74 | } 75 | 76 | pom.project { 77 | name 'PDF-Table' 78 | packaging 'jar' 79 | // optionally artifactId can be defined here 80 | description 'PDF-table is Java utility library that can be used for parsing tabular data in PDF documents.\n' + 81 | 'Core processing of PDF documents is performed with utilization of Apache 
PDFBox and OpenCV.' 82 | url 'https://github.com/rostrovsky/pdf-table' 83 | 84 | scm { 85 | connection 'scm:git:git://github.com/rostrovsky/pdf-table.git' 86 | developerConnection 'scm:git:git@github.com:rostrovsky/pdf-table.git' 87 | url 'https://github.com/rostrovsky/pdf-table' 88 | } 89 | 90 | licenses { 91 | license { 92 | name 'MIT License' 93 | url 'https://github.com/rostrovsky/pdf-table/LICENSE' 94 | } 95 | } 96 | 97 | developers { 98 | developer { 99 | id 'rostrovsky' 100 | name 'Rafal Ostrowski' 101 | email '' 102 | } 103 | } 104 | } 105 | } 106 | } 107 | } -------------------------------------------------------------------------------- /pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 5 | 4.0.0 6 | 7 | 8 | 1.8 9 | 1.8 10 | 11 | 12 | com.github.rostrovsky 13 | pdf-table 14 | 1.0.0 15 | 16 | 17 | org.apache.pdfbox 18 | pdfbox 19 | 2.0.24 20 | 21 | 22 | org.apache.pdfbox 23 | pdfbox-tools 24 | 2.0.19 25 | 26 | 27 | org.apache.commons 28 | commons-lang3 29 | 3.5 30 | 31 | 32 | org.openpnp 33 | opencv 34 | 3.4.2-2 35 | 36 | 37 | org.testng 38 | testng 39 | 7.1.0 40 | test 41 | 42 | 43 | 44 | -------------------------------------------------------------------------------- /settings.gradle: -------------------------------------------------------------------------------- 1 | /* 2 | * This file was generated by the Gradle 'init' task. 
3 | */ 4 | 5 | rootProject.name = 'pdf-table' 6 | -------------------------------------------------------------------------------- /src/main/java/pdftable/PdfTableReader.java: -------------------------------------------------------------------------------- 1 | package pdftable; 2 | 3 | 4 | import org.apache.pdfbox.pdmodel.PDDocument; 5 | import org.apache.pdfbox.pdmodel.PDPage; 6 | import org.apache.pdfbox.rendering.ImageType; 7 | import org.apache.pdfbox.rendering.PDFRenderer; 8 | import org.apache.pdfbox.text.PDFTextStripperByArea; 9 | import org.apache.pdfbox.tools.imageio.ImageIOUtil; 10 | import org.opencv.core.Core; 11 | import org.opencv.core.Rect; 12 | import pdftable.models.ParsedTablePage; 13 | 14 | import java.awt.*; 15 | import java.awt.image.BufferedImage; 16 | import java.io.IOException; 17 | import java.nio.file.Path; 18 | import java.nio.file.Paths; 19 | import java.util.ArrayList; 20 | import java.util.List; 21 | import java.util.stream.Collectors; 22 | 23 | import static pdftable.Utils.bufferedImage2GrayscaleMat; 24 | 25 | 26 | public class PdfTableReader { 27 | 28 | private TableExtractor extractor; 29 | private PdfTableSettings settings; 30 | 31 | static { 32 | System.loadLibrary(Core.NATIVE_LIBRARY_NAME); 33 | } 34 | 35 | public PdfTableReader(PdfTableSettings settings) { 36 | this.settings = settings; 37 | this.extractor = new TableExtractor(settings); 38 | } 39 | 40 | public PdfTableReader() { 41 | this(new PdfTableSettings()); 42 | } 43 | 44 | /** 45 | * Renders PDF page with DPI specified in settings and saves it in specified directory. 
46 | * 47 | * @param renderer PDF renderer instance 48 | * @param page page number 49 | * @param outputDir output directory 50 | * @throws IOException 51 | */ 52 | private void savePdfPageAsPNG(PDFRenderer renderer, int page, Path outputDir) throws IOException { 53 | BufferedImage bim; 54 | synchronized (this) { 55 | bim = renderer.renderImageWithDPI(page, settings.getPdfRenderingDpi(), ImageType.RGB); 56 | } 57 | Path outPath = outputDir.resolve(Paths.get("page_" + (page + 1) + ".png")); 58 | ImageIOUtil.writeImage(bim, outPath.toString(), settings.getPdfRenderingDpi()); 59 | 60 | } 61 | 62 | /** 63 | * Renders PDF pages range with DPI specified in settings and saves images in specified directory. 64 | * 65 | * @param document PDF document instance 66 | * @param startPage first page in range (first page == 1) 67 | * @param endPage last page in range 68 | * @param outputDir output directory 69 | * @throws IOException 70 | */ 71 | public void savePdfPagesAsPNG(PDDocument document, int startPage, int endPage, Path outputDir) throws IOException { 72 | PDFRenderer pdfRenderer = new PDFRenderer(document); 73 | for (int page = startPage - 1; page < endPage; ++page) { 74 | savePdfPageAsPNG(pdfRenderer, page, outputDir); 75 | } 76 | } 77 | 78 | /** 79 | * Renders single PDF page with DPI specified in settings and saves image in specified directory. 80 | * 81 | * @param document PDF document instance 82 | * @param page page number (first page == 1) 83 | * @param outputDir output directory 84 | * @throws IOException 85 | */ 86 | public void savePdfPageAsPNG(PDDocument document, int page, Path outputDir) throws IOException { 87 | savePdfPagesAsPNG(document, page, page, outputDir); 88 | } 89 | 90 | /** 91 | * Parses single PDF page and returns list of rows containing cell texts. 
92 | * 93 | * @param bi PDF page in image format 94 | * @param pdPage PDF page in PDPage format 95 | * @return parsed page 96 | * @throws IOException 97 | */ 98 | private ParsedTablePage parsePdfTablePage(BufferedImage bi, PDPage pdPage, int pageNumber) throws IOException { 99 | List rectangles = extractor.getTableBoundingRectangles(bufferedImage2GrayscaleMat(bi)); 100 | return parsePageByRectangles(pdPage, rectangles, pageNumber); 101 | } 102 | 103 | /** 104 | * Parses range of PDF pages and returns list of lists of rows containing cell texts. 105 | * 106 | * @param document PDF document instance 107 | * @param startPage first page in range to parse (first page == 1) 108 | * @param endPage last page in range 109 | * @return List of pages 110 | * @throws IOException 111 | */ 112 | public List parsePdfTablePages(PDDocument document, int startPage, int endPage) throws IOException { 113 | List out = new ArrayList<>(); 114 | PDFRenderer renderer = new PDFRenderer(document); 115 | for (int page = startPage - 1; page < endPage; ++page) { 116 | BufferedImage bi; 117 | synchronized (this) { 118 | bi = renderer.renderImageWithDPI(page, settings.getPdfRenderingDpi(), ImageType.RGB); 119 | } 120 | ParsedTablePage parsedTablePage = parsePdfTablePage(bi, document.getPage(page), page + 1); 121 | out.add(parsedTablePage); 122 | } 123 | return out; 124 | } 125 | 126 | /** 127 | * Parses single PDF page and returns list of rows containing cell texts. 128 | * 129 | * @param document PDF document instance 130 | * @param page number of page to parse (first page == 1) 131 | * @return parsed page 132 | * @throws IOException 133 | */ 134 | public ParsedTablePage parsePdfTablePage(PDDocument document, int page) throws IOException { 135 | return parsePdfTablePages(document, page, page).get(0); 136 | } 137 | 138 | /** 139 | * Saves debug images of PDF pages from specified range and saves them in specified directory. 
140 | * 141 | * @param document PDF document instance 142 | * @param startPage first page in range to process (first page == 1) 143 | * @param endPage last page in range 144 | * @param outputDir destination directory 145 | * @throws IOException 146 | */ 147 | public void savePdfTablePagesDebugImages(PDDocument document, int startPage, int endPage, Path outputDir) throws IOException { 148 | TableExtractor debugExtractor = new TableExtractor(settings); 149 | PDFRenderer renderer = new PDFRenderer(document); 150 | for (int page = startPage - 1; page < endPage; ++page) { 151 | PdfTableSettings debugSettings = PdfTableSettings.getBuilder() 152 | .setDebugImages(true) 153 | .setDebugFileOutputDir(outputDir) 154 | .setDebugFilename("page_" + (page + 1)) 155 | .build(); 156 | debugExtractor.setSettings(debugSettings); 157 | BufferedImage bi; 158 | synchronized (this) { 159 | bi = renderer.renderImageWithDPI(page, settings.getPdfRenderingDpi(), ImageType.RGB); 160 | } 161 | debugExtractor.getTableBoundingRectangles(bufferedImage2GrayscaleMat(bi)); 162 | } 163 | } 164 | 165 | /** 166 | * Saves debug images of PDF page and saves them in specified directory. 167 | * 168 | * @param document PDF document instance 169 | * @param page page to process (first page == 1) 170 | * @param outputDir destination directory 171 | * @throws IOException 172 | */ 173 | public void savePdfTablePageDebugImage(PDDocument document, int page, Path outputDir) throws IOException { 174 | savePdfTablePagesDebugImages(document, page, page, outputDir); 175 | } 176 | 177 | /** 178 | * Parses PDF page cell by cell using rectangles obtained from TableExtractor. 
179 | * 180 | * @param page PDF page 181 | * @param rectangles list of OpenCV rectangles recognized by TableExtractor 182 | * @return parsed page 183 | * @throws IOException 184 | */ 185 | private ParsedTablePage parsePageByRectangles(PDPage page, List rectangles, int pageNumber) throws IOException { 186 | List> sortedRects = groupRectanglesByRow(rectangles); 187 | ParsedTablePage out = new ParsedTablePage(pageNumber); 188 | 189 | PDFTextStripperByArea stripper = new PDFTextStripperByArea(); 190 | stripper.setSortByPosition(true); 191 | 192 | int iRow = 0; 193 | int iCol = 0; 194 | for (List row : sortedRects) { 195 | for (Rect col : row) { 196 | Rectangle r = new Rectangle( 197 | (int) (col.x * settings.getDpiRatio()), 198 | (int) (col.y * settings.getDpiRatio()), 199 | (int) (col.width * settings.getDpiRatio()), 200 | (int) (col.height * settings.getDpiRatio()) 201 | ); 202 | stripper.addRegion(getRegionId(iRow, iCol), r); 203 | iCol++; 204 | } 205 | iRow++; 206 | iCol = 0; 207 | } 208 | 209 | stripper.extractRegions(page); 210 | 211 | iRow = 0; 212 | iCol = 0; 213 | for (List row : sortedRects) { 214 | List rowCells = new ArrayList<>(); 215 | for (Rect col : row) { 216 | String cellText = stripper.getTextForRegion(getRegionId(iRow, iCol)); 217 | rowCells.add(cellText); 218 | iCol++; 219 | } 220 | out.addRow(rowCells); 221 | iRow++; 222 | iCol = 0; 223 | } 224 | 225 | return out; 226 | } 227 | 228 | /** 229 | * Groups rectangles by y coordinate effectively grouping them into rows. 230 | * 231 | * @param rectangles list of OpenCV Rectangles 232 | * @return list of Rectangle lists representing table rows. 
233 | */ 234 | private List> groupRectanglesByRow(List rectangles) { 235 | List> out = new ArrayList<>(); 236 | List rowsCoords = rectangles.stream().map(r -> r.y).distinct().collect(Collectors.toList()); 237 | for (int rowCoords : rowsCoords) { 238 | List cols = rectangles.stream().filter(r -> r.y == rowCoords).collect(Collectors.toList()); 239 | out.add(cols); 240 | } 241 | return out; 242 | } 243 | 244 | /** 245 | * Static helper for creating row/column markers. 246 | * 247 | * @param row table row 248 | * @param col table column 249 | * @return marker with row & column number 250 | */ 251 | private static String getRegionId(int row, int col) { 252 | return String.format("r%dc%d", row, col); 253 | } 254 | 255 | } 256 | -------------------------------------------------------------------------------- /src/main/java/pdftable/PdfTableSettings.java: -------------------------------------------------------------------------------- 1 | package pdftable; 2 | 3 | 4 | import java.nio.file.Path; 5 | 6 | /** 7 | * Image conversion settings. 
/**
 * Immutable holder of PDF rendering, OpenCV filtering and debug-output settings.
 * Instances are created with default values through the no-argument constructor
 * or with custom values through {@link #getBuilder()}.
 */
public class PdfTableSettings {

    /**
     * Fluent builder for {@link PdfTableSettings}. Every field starts at its
     * default value; setters return the builder so calls can be chained.
     */
    public static class PdfTableSettingsBuilder {

        // --------------
        // DEFAULT VALUES
        // --------------

        // DPI SETTINGS
        // NOTE(review): 72 is presumably the resolution of PDF user space - confirm.
        private static final int DEFAULT_PDF_DPI = 72;
        private int pdfRenderingDpi = 120;

        // CANNY EDGE DETECTION FLAG
        private boolean cannyFiltering = false;

        // BINARY INVERTED THRESHOLD SETTINGS
        private double bitThreshold = 150;
        private double bitMaxVal = 255;

        // CANNY FILTER SETTINGS
        private double cannyThreshold1 = 50;
        private double cannyThreshold2 = 200;
        private int cannyApertureSize = 3;
        private boolean cannyL2Gradient = false;

        // BOUNDING RECT PARAMS
        private double approxDistScaleFactor = 0.02;

        // DEBUG IMAGES PARAMS
        private boolean debugImages = false;
        private Path debugFileOutputDir;
        private String debugFilename;

        /**
         * Sets the DPI used when rendering PDF pages to images.
         *
         * @param pdfRenderingDpi rendering DPI, must be greater than zero
         * @return this builder
         * @throws IllegalArgumentException if {@code pdfRenderingDpi} is not positive
         */
        public PdfTableSettingsBuilder setPdfRenderingDpi(int pdfRenderingDpi) {
            // A zero or negative DPI would make getDpiRatio() infinite or negative.
            if (pdfRenderingDpi <= 0) {
                throw new IllegalArgumentException("pdfRenderingDpi must be positive, got: " + pdfRenderingDpi);
            }
            this.pdfRenderingDpi = pdfRenderingDpi;
            return this;
        }

        /** Enables or disables Canny filtering before contour detection. */
        public PdfTableSettingsBuilder setCannyFiltering(boolean cannyFiltering) {
            this.cannyFiltering = cannyFiltering;
            return this;
        }

        /** Sets the threshold value of the binary inverted threshold step. */
        public PdfTableSettingsBuilder setBitThreshold(double bitThreshold) {
            this.bitThreshold = bitThreshold;
            return this;
        }

        /** Sets the maximum value of the binary inverted threshold step. */
        public PdfTableSettingsBuilder setBitMaxVal(double bitMaxVal) {
            this.bitMaxVal = bitMaxVal;
            return this;
        }

        /** Sets the first Canny threshold. */
        public PdfTableSettingsBuilder setCannyThreshold1(double cannyThreshold1) {
            this.cannyThreshold1 = cannyThreshold1;
            return this;
        }

        /** Sets the second Canny threshold. */
        public PdfTableSettingsBuilder setCannyThreshold2(double cannyThreshold2) {
            this.cannyThreshold2 = cannyThreshold2;
            return this;
        }

        /** Sets the Canny aperture size. */
        public PdfTableSettingsBuilder setCannyApertureSize(int cannyApertureSize) {
            this.cannyApertureSize = cannyApertureSize;
            return this;
        }

        /** Sets the Canny L2-gradient flag. */
        public PdfTableSettingsBuilder setCannyL2Gradient(boolean cannyL2Gradient) {
            this.cannyL2Gradient = cannyL2Gradient;
            return this;
        }

        /** Sets the factor applied to a contour's arc length when approximating it. */
        public PdfTableSettingsBuilder setApproxDistScaleFactor(double approxDistScaleFactor) {
            this.approxDistScaleFactor = approxDistScaleFactor;
            return this;
        }

        /** Enables or disables dumping of debug images. */
        public PdfTableSettingsBuilder setDebugImages(boolean debugImages) {
            this.debugImages = debugImages;
            return this;
        }

        /** Sets the directory debug images are written to. */
        public PdfTableSettingsBuilder setDebugFileOutputDir(Path debugFileOutputDir) {
            this.debugFileOutputDir = debugFileOutputDir;
            return this;
        }

        /** Sets the base file name used for debug images. */
        public PdfTableSettingsBuilder setDebugFilename(String debugFilename) {
            this.debugFilename = debugFilename;
            return this;
        }

        /**
         * Creates the settings object from the current builder state.
         *
         * @return new settings instance
         */
        public PdfTableSettings build() {
            return new PdfTableSettings(this);
        }
    }

    // DPI SETTINGS
    private final int defaultPdfDpi;
    private final int pdfRenderingDpi;

    // CANNY EDGE DETECTION FLAG
    private final boolean cannyFiltering;

    // BINARY INVERTED THRESHOLD SETTINGS
    private final double bitThreshold;
    private final double bitMaxVal;

    // CANNY FILTER SETTINGS
    private final double cannyThreshold1;
    private final double cannyThreshold2;
    private final int cannyApertureSize;
    private final boolean cannyL2Gradient;

    // BOUNDING RECT PARAMS
    private final double approxDistScaleFactor;

    // DEBUG IMAGES PARAMS
    private final boolean debugImages;
    private final Path debugFileOutputDir;
    private final String debugFilename;

    private PdfTableSettings(PdfTableSettingsBuilder builder) {
        this.defaultPdfDpi = PdfTableSettingsBuilder.DEFAULT_PDF_DPI;
        this.pdfRenderingDpi = builder.pdfRenderingDpi;
        this.cannyFiltering = builder.cannyFiltering;
        this.bitThreshold = builder.bitThreshold;
        this.bitMaxVal = builder.bitMaxVal;
        this.cannyThreshold1 = builder.cannyThreshold1;
        this.cannyThreshold2 = builder.cannyThreshold2;
        this.cannyApertureSize = builder.cannyApertureSize;
        this.cannyL2Gradient = builder.cannyL2Gradient;
        this.approxDistScaleFactor = builder.approxDistScaleFactor;
        this.debugImages = builder.debugImages;
        this.debugFileOutputDir = builder.debugFileOutputDir;
        this.debugFilename = builder.debugFilename;
    }

    /**
     * Creates settings with all default values.
     */
    public PdfTableSettings() {
        this(new PdfTableSettingsBuilder());
    }

    /**
     * @return a new builder pre-populated with default values
     */
    public static PdfTableSettingsBuilder getBuilder() {
        return new PdfTableSettingsBuilder();
    }

    public int getDefaultPdfDpi() {
        return defaultPdfDpi;
    }

    public int getPdfRenderingDpi() {
        return pdfRenderingDpi;
    }

    public boolean hasCannyFiltering() {
        return cannyFiltering;
    }

    public double getBitThreshold() {
        return bitThreshold;
    }

    public double getBitMaxVal() {
        return bitMaxVal;
    }

    public double getCannyThreshold1() {
        return cannyThreshold1;
    }

    public double getCannyThreshold2() {
        return cannyThreshold2;
    }

    public int getCannyApertureSize() {
        return cannyApertureSize;
    }

    public boolean hasCannyL2Gradient() {
        return cannyL2Gradient;
    }

    public double getApproxDistScaleFactor() {
        return approxDistScaleFactor;
    }

    public boolean hasDebugImages() {
        return debugImages;
    }

    public Path getDebugFileOutputDir() {
        return debugFileOutputDir;
    }

    public String getDebugFilename() {
        return debugFilename;
    }

    /**
     * @return default PDF DPI divided by the rendering DPI; the factor that maps
     *         rendered-image pixel coordinates back to page coordinates
     */
    public double getDpiRatio() {
        return (double) defaultPdfDpi / pdfRenderingDpi;
    }
}
// --------------------------------------------------------------------------------
// /src/main/java/pdftable/TableExtractor.java:
-------------------------------------------------------------------------------- 1 | package pdftable; 2 | 3 | import org.opencv.core.*; 4 | import org.opencv.imgcodecs.Imgcodecs; 5 | import org.opencv.imgproc.Imgproc; 6 | 7 | import java.util.ArrayList; 8 | import java.util.Collections; 9 | import java.util.List; 10 | 11 | import static org.opencv.core.Core.bitwise_xor; 12 | import static org.opencv.imgproc.Imgproc.*; 13 | 14 | /** 15 | * Class responsible for determining table cells bounding boxes. 16 | * Should be used as static. 17 | */ 18 | class TableExtractor { 19 | 20 | private PdfTableSettings settings; 21 | 22 | static { 23 | System.loadLibrary(Core.NATIVE_LIBRARY_NAME); 24 | } 25 | 26 | public TableExtractor(PdfTableSettings settings) { 27 | this.settings = settings; 28 | } 29 | 30 | /** 31 | * Applies series of filters on page image and extracts table cells bounding rectangles. 32 | * Additionally dumps debug PNG images when settings.hasDebugImages() is true. 33 | * 34 | * @param inImage Input image 35 | * @return List of org.opencv.core.Rect objects representing cell bounding rectangles. 
36 | */ 37 | public List getTableBoundingRectangles(Mat inImage) { 38 | List out = new ArrayList<>(); 39 | 40 | if (settings.hasDebugImages()) { 41 | Imgcodecs.imwrite(buildDebugFilename("original_grayscaled"), inImage); 42 | } 43 | 44 | // binary inverted threshold 45 | Mat bit = binaryInvertedThreshold(inImage); 46 | if (settings.hasDebugImages()) { 47 | Imgcodecs.imwrite(buildDebugFilename("binary_inverted_threshold"), bit); 48 | } 49 | 50 | // find contours 51 | List contours = new ArrayList<>(); 52 | if (settings.hasCannyFiltering()) { 53 | Mat canny = cannyFilter(inImage); 54 | findContours(canny, contours, new Mat(), RETR_EXTERNAL, CHAIN_APPROX_SIMPLE); 55 | if (settings.hasDebugImages()) { 56 | Imgcodecs.imwrite(buildDebugFilename("canny1"), canny); 57 | } 58 | } else { 59 | findContours(bit, contours, new Mat(), RETR_EXTERNAL, CHAIN_APPROX_SIMPLE); 60 | } 61 | 62 | // draw contour 63 | Mat contourMask = bit.clone(); 64 | drawContours(contourMask, contours, -1, new Scalar(255, 255, 255), Core.FILLED); 65 | if (settings.hasDebugImages()) { 66 | Imgcodecs.imwrite(buildDebugFilename("contour_mask"), contourMask); 67 | } 68 | 69 | // XOR threshold and mask 70 | Mat xored = new Mat(); 71 | bitwise_xor(bit, contourMask, xored); 72 | if (settings.hasDebugImages()) { 73 | Imgcodecs.imwrite(buildDebugFilename("xored"), xored); 74 | } 75 | 76 | // find contours #2 77 | List contours2 = new ArrayList<>(); 78 | if (settings.hasCannyFiltering()) { 79 | Mat canny2 = cannyFilter(xored); 80 | findContours(canny2, contours2, new Mat(), RETR_EXTERNAL, CHAIN_APPROX_SIMPLE); 81 | if (settings.hasDebugImages()) { 82 | Imgcodecs.imwrite(buildDebugFilename("canny2"), canny2); 83 | } 84 | } else { 85 | findContours(xored, contours2, new Mat(), RETR_EXTERNAL, CHAIN_APPROX_SIMPLE); 86 | } 87 | 88 | // draw contour #2 89 | if (settings.hasDebugImages()) { 90 | Mat contourMask2 = inImage.clone(); 91 | drawContours(contourMask2, contours2, -1, new Scalar(255, 255, 255), Core.FILLED); 
92 | Imgcodecs.imwrite(buildDebugFilename("final_contours"), contourMask2); 93 | } 94 | 95 | // find contours #2 bounding rectangles 96 | for (int i = 0; i < contours2.size(); i++) { 97 | MatOfPoint2f approxCurve = new MatOfPoint2f(); 98 | MatOfPoint2f contour2f = new MatOfPoint2f(contours2.get(i).toArray()); 99 | double approxDistance = Imgproc.arcLength(contour2f, true) * settings.getApproxDistScaleFactor(); 100 | Imgproc.approxPolyDP(contour2f, approxCurve, approxDistance, true); 101 | MatOfPoint points = new MatOfPoint(approxCurve.toArray()); 102 | Rect rect = Imgproc.boundingRect(points); 103 | out.add(rect); 104 | } 105 | 106 | Collections.reverse(out); 107 | 108 | if (settings.hasDebugImages()) { 109 | int ri = 0; 110 | for (Rect rect : out) { 111 | Mat outImage = inImage.clone(); 112 | Point p1 = new Point(rect.x, rect.y); 113 | Point p2 = new Point(rect.x + rect.width, rect.y + rect.height); 114 | rectangle(outImage, p1, p2, new Scalar(0, 0, 0, 255), 3); 115 | Imgcodecs.imwrite(buildDebugFilename(String.format("box_%03d", ri)), outImage); 116 | ri++; 117 | } 118 | } 119 | 120 | return out; 121 | } 122 | 123 | public void setSettings(PdfTableSettings settings) { 124 | this.settings = settings; 125 | } 126 | 127 | /** 128 | * Applies Binary Inverted Threshold (BIT) to Mat image. 129 | * 130 | * @param input Input image 131 | * @return org.opencv.core.Mat image with applied BIT 132 | */ 133 | private Mat binaryInvertedThreshold(Mat input) { 134 | Mat out = new Mat(); 135 | threshold(input, out, settings.getBitThreshold(), settings.getBitMaxVal(), THRESH_BINARY_INV); 136 | return out; 137 | } 138 | 139 | /** 140 | * Applies Canny filter to Mat image. 
141 | * 142 | * @param input Input image 143 | * @return org.opencv.core.Mat image with applied Canny filter 144 | */ 145 | private Mat cannyFilter(Mat input) { 146 | Mat out = new Mat(); 147 | Canny(input, out, settings.getCannyThreshold1(), settings.getCannyThreshold2(), settings.getCannyApertureSize(), settings.hasCannyL2Gradient()); 148 | return out; 149 | } 150 | 151 | /** 152 | * String helper used for constructing debug image output path. 153 | * 154 | * @param suffix Image filename suffix 155 | * @return String representing image path 156 | */ 157 | private String buildDebugFilename(String suffix) { 158 | return settings.getDebugFileOutputDir().resolve(settings.getDebugFilename() + "_" + suffix + ".png").toString(); 159 | } 160 | 161 | } 162 | -------------------------------------------------------------------------------- /src/main/java/pdftable/Utils.java: -------------------------------------------------------------------------------- 1 | package pdftable; 2 | 3 | import org.apache.pdfbox.io.IOUtils; 4 | import org.opencv.core.Mat; 5 | import org.opencv.core.MatOfByte; 6 | import org.opencv.imgcodecs.Imgcodecs; 7 | 8 | import javax.imageio.ImageIO; 9 | import java.awt.image.BufferedImage; 10 | import java.io.ByteArrayInputStream; 11 | import java.io.ByteArrayOutputStream; 12 | import java.io.IOException; 13 | import java.io.InputStream; 14 | 15 | /** 16 | * Static utilities used for converting between image formats. 
17 | */ 18 | public class Utils { 19 | 20 | private Utils() { 21 | } 22 | 23 | /** 24 | * Converts InputStream to OpenCV Mat 25 | * 26 | * @param stream Input stream 27 | * @param flag org.opencv.imgcodecs.Imgcodecs flag 28 | * @return org.opencv.core.Mat 29 | * @throws IOException 30 | */ 31 | public static Mat inputStream2Mat(InputStream stream, int flag) throws IOException { 32 | byte[] byteBuff = IOUtils.toByteArray(stream); 33 | return Imgcodecs.imdecode(new MatOfByte(byteBuff), flag); 34 | } 35 | 36 | /** 37 | * Converts BufferedImage to InputStream. 38 | * 39 | * @param inImg Buffered Image 40 | * @return java.io.InputStream 41 | * @throws IOException 42 | */ 43 | public static InputStream bufferedImage2InputStream(BufferedImage inImg) throws IOException { 44 | ByteArrayOutputStream os = new ByteArrayOutputStream(); 45 | ImageIO.write(inImg, "png", os); 46 | return new ByteArrayInputStream(os.toByteArray()); 47 | } 48 | 49 | /** 50 | * Converts BufferedImage to OpenCV Mat using custom flag. 51 | * 52 | * @param inImg Buffered Image 53 | * @param flag org.opencv.imgcodecs.Imgcodecs flag 54 | * @return org.opencv.core.Mat 55 | * @throws IOException 56 | */ 57 | public static Mat bufferedImage2Mat(BufferedImage inImg, int flag) throws IOException { 58 | return inputStream2Mat(bufferedImage2InputStream(inImg), flag); 59 | } 60 | 61 | /** 62 | * Converts BufferedImage to grayscaled OpenCV Mat. 
63 | * 64 | * @param inImg Buffered Image 65 | * @return org.opencv.core.Mat 66 | * @throws IOException 67 | */ 68 | public static Mat bufferedImage2GrayscaleMat(BufferedImage inImg) throws IOException { 69 | return bufferedImage2Mat(inImg, Imgcodecs.IMREAD_GRAYSCALE); 70 | } 71 | 72 | } 73 | -------------------------------------------------------------------------------- /src/main/java/pdftable/models/ParsedTablePage.java: -------------------------------------------------------------------------------- 1 | package pdftable.models; 2 | 3 | import org.apache.commons.lang3.StringEscapeUtils; 4 | 5 | import java.util.ArrayList; 6 | import java.util.Arrays; 7 | import java.util.List; 8 | import java.util.stream.Collectors; 9 | 10 | /** 11 | * Parsed page model. 12 | */ 13 | public class ParsedTablePage { 14 | 15 | public class ParsedTableRow { 16 | 17 | private List cells; 18 | 19 | public ParsedTableRow(List cells) { 20 | this.cells = cells; 21 | } 22 | 23 | public List getCells() { 24 | return cells; 25 | } 26 | 27 | public String getCell(int index) { 28 | return cells.get(index); 29 | } 30 | 31 | @Override 32 | public String toString() { 33 | List escapedCells = cells.stream().map(c -> StringEscapeUtils.escapeJava(c)).collect(Collectors.toList()); 34 | return String.format("<%s@%s; cells:%s>", 35 | this.getClass().getSimpleName(), System.identityHashCode(this), Arrays.toString(escapedCells.toArray())); 36 | } 37 | } 38 | 39 | private List rows; 40 | private int pageNum; 41 | 42 | private ParsedTablePage() { 43 | rows = new ArrayList<>(); 44 | } 45 | 46 | public ParsedTablePage(int pageNumber) { 47 | this(); 48 | pageNum = pageNumber; 49 | } 50 | 51 | public List getRows() { 52 | return rows; 53 | } 54 | 55 | public void addRow(List cells) { 56 | rows.add(new ParsedTableRow(cells)); 57 | } 58 | 59 | public ParsedTableRow getRow(int index) { 60 | return rows.get(index); 61 | } 62 | 63 | public int getPageNum() { 64 | return pageNum; 65 | } 66 | 67 | @Override 68 | 
public String toString() { 69 | return String.format("<%s@%s; rows:%s>", 70 | this.getClass().getSimpleName(), System.identityHashCode(this), Arrays.toString(rows.toArray())); 71 | } 72 | } 73 | -------------------------------------------------------------------------------- /src/test/java/pdftable/PdfTableReaderTest.java: -------------------------------------------------------------------------------- 1 | package pdftable; 2 | 3 | 4 | import org.apache.pdfbox.pdmodel.PDDocument; 5 | import org.testng.Assert; 6 | import org.testng.TestException; 7 | import org.testng.annotations.AfterMethod; 8 | import org.testng.annotations.BeforeMethod; 9 | import org.testng.annotations.Test; 10 | import pdftable.models.ParsedTablePage; 11 | 12 | import java.io.File; 13 | import java.io.IOException; 14 | import java.nio.file.Path; 15 | import java.nio.file.Paths; 16 | import java.util.ArrayList; 17 | import java.util.List; 18 | import java.util.concurrent.Callable; 19 | import java.util.concurrent.ExecutorService; 20 | import java.util.concurrent.Executors; 21 | import java.util.concurrent.Future; 22 | import java.util.stream.Collectors; 23 | import java.util.stream.IntStream; 24 | 25 | 26 | public class PdfTableReaderTest { 27 | 28 | private static final String TEST_FILENAME = "test_tables.pdf"; 29 | private static Path TEST_OUT_PATH = null; 30 | private static PDDocument PDFdoc; 31 | private static final int THREAD_COUNT = 8; 32 | private static final int PAGE_CYCLE = 4; 33 | 34 | @BeforeMethod 35 | private void setUp() { 36 | PDFdoc = getTestPDF(); 37 | } 38 | 39 | @AfterMethod 40 | private void tearDown() { 41 | if (PDFdoc != null) { 42 | try { 43 | PDFdoc.close(); 44 | } catch (IOException ioe) { 45 | throw new TestException(ioe); 46 | } 47 | } 48 | } 49 | 50 | @SuppressWarnings("ConstantConditions") 51 | private PDDocument getTestPDF() { 52 | try { 53 | ClassLoader classLoader = getClass().getClassLoader(); 54 | File file = new 
File(classLoader.getResource(TEST_FILENAME).getFile()); 55 | TEST_OUT_PATH = Paths.get(file.getParent()); 56 | return PDDocument.load(file); 57 | } catch (Exception e) { 58 | e.printStackTrace(); 59 | throw new RuntimeException(e.getCause()); 60 | } 61 | } 62 | 63 | @Test 64 | public void savePdfPagesAsPNG() throws IOException { 65 | PdfTableReader reader = new PdfTableReader(); 66 | reader.savePdfPagesAsPNG(PDFdoc, 1, 3, TEST_OUT_PATH); 67 | } 68 | 69 | @Test 70 | public void savePdfPageAsPNG() throws IOException { 71 | PdfTableReader reader = new PdfTableReader(); 72 | reader.savePdfPageAsPNG(PDFdoc, 4, TEST_OUT_PATH); 73 | } 74 | 75 | @Test 76 | public void savePdfDebugImages() throws IOException { 77 | PdfTableReader reader = new PdfTableReader(); 78 | reader.savePdfTablePagesDebugImages(PDFdoc, 1, 3, TEST_OUT_PATH); 79 | } 80 | 81 | @Test 82 | public void savePdfDebugImage() throws IOException { 83 | PdfTableReader reader = new PdfTableReader(); 84 | reader.savePdfTablePageDebugImage(PDFdoc, 4, TEST_OUT_PATH); 85 | } 86 | 87 | @Test 88 | public void singleThreadedSavePdfPageAsPNG() throws IOException { 89 | long start = System.currentTimeMillis(); 90 | PdfTableReader reader = new PdfTableReader(); 91 | reader.savePdfPagesAsPNG(PDFdoc, 1, PDFdoc.getNumberOfPages(), TEST_OUT_PATH); 92 | long end = System.currentTimeMillis(); 93 | System.out.println("save page image - Single thread: " + (end - start) / 1000.0); 94 | } 95 | 96 | @Test 97 | public void multiThreadedSavePdfPageAsPNG() throws IOException { 98 | long start = System.currentTimeMillis(); 99 | PdfTableReader reader = new PdfTableReader(); 100 | ExecutorService executor = Executors.newFixedThreadPool(THREAD_COUNT); 101 | 102 | List> futures = new ArrayList<>(); 103 | for (final int pageNum : IntStream.rangeClosed(1, PDFdoc.getNumberOfPages()).toArray()) { 104 | Callable callable = () -> { 105 | reader.savePdfPageAsPNG(PDFdoc, pageNum, TEST_OUT_PATH); 106 | return true; 107 | }; 108 | 
futures.add(executor.submit(callable)); 109 | } 110 | 111 | try { 112 | for (Future f : futures) { 113 | f.get(); 114 | } 115 | } catch (Exception e) { 116 | throw new TestException(e); 117 | } 118 | 119 | long end = System.currentTimeMillis(); 120 | System.out.println("save page image - multi thread: " + (end - start) / 1000.0); 121 | } 122 | 123 | @Test 124 | public void singleThreadedSavePdfTablePageDebugImage() throws IOException { 125 | long start = System.currentTimeMillis(); 126 | PdfTableReader reader = new PdfTableReader(); 127 | reader.savePdfTablePagesDebugImages(PDFdoc, 1, PDFdoc.getNumberOfPages(), TEST_OUT_PATH); 128 | long end = System.currentTimeMillis(); 129 | System.out.println("save debug images - single thread: " + (end - start) / 1000.0); 130 | } 131 | 132 | @Test 133 | public void multiThreadedSavePdfTablePageDebugImage() throws IOException { 134 | long start = System.currentTimeMillis(); 135 | PdfTableReader reader = new PdfTableReader(); 136 | ExecutorService executor = Executors.newFixedThreadPool(THREAD_COUNT); 137 | 138 | List> futures = new ArrayList<>(); 139 | for (final int pageNum : IntStream.rangeClosed(1, PDFdoc.getNumberOfPages()).toArray()) { 140 | Callable callable = () -> { 141 | reader.savePdfTablePageDebugImage(PDFdoc, pageNum, TEST_OUT_PATH); 142 | return true; 143 | }; 144 | futures.add(executor.submit(callable)); 145 | } 146 | 147 | try { 148 | for (Future f : futures) { 149 | f.get(); 150 | } 151 | } catch (Exception e) { 152 | throw new TestException(e); 153 | } 154 | 155 | long end = System.currentTimeMillis(); 156 | System.out.println("save debug images - multi thread: " + (end - start) / 1000.0); 157 | } 158 | 159 | @Test 160 | public void singleThreadedParsePdfTablePages() throws IOException { 161 | long start = System.currentTimeMillis(); 162 | PdfTableReader reader = new PdfTableReader(); 163 | List parsed = reader.parsePdfTablePages(PDFdoc, 1, PDFdoc.getNumberOfPages()); 164 | long end = System.currentTimeMillis(); 
165 | System.out.println("parse pages - single thread: " + (end - start) / 1000.0); 166 | validatePdfContent(parsed); 167 | } 168 | 169 | @Test 170 | public void multiThreadedParsePdfTablePages() throws IOException { 171 | long start = System.currentTimeMillis(); 172 | PdfTableReader reader = new PdfTableReader(); 173 | ExecutorService executor = Executors.newFixedThreadPool(THREAD_COUNT); 174 | 175 | List> futures = new ArrayList<>(); 176 | for (final int pageNum : IntStream.rangeClosed(1, PDFdoc.getNumberOfPages()).toArray()) { 177 | Callable callable = () -> { 178 | ParsedTablePage page = reader.parsePdfTablePage(PDFdoc, pageNum); 179 | return page; 180 | }; 181 | futures.add(executor.submit(callable)); 182 | } 183 | 184 | List parsedPages = new ArrayList<>(PDFdoc.getNumberOfPages()); 185 | try { 186 | for (Future f : futures) { 187 | ParsedTablePage page = f.get(); 188 | parsedPages.add(page.getPageNum() - 1, page); 189 | } 190 | } catch (Exception e) { 191 | throw new TestException(e); 192 | } 193 | 194 | long end = System.currentTimeMillis(); 195 | System.out.println("parse pages - multi thread: " + (end - start) / 1000.0); 196 | 197 | List sortedParsedPages = parsedPages.stream() 198 | .sorted((p1, p2) -> Integer.compare(p1.getPageNum(), p2.getPageNum())).collect(Collectors.toList()); 199 | 200 | validatePdfContent(sortedParsedPages); 201 | } 202 | 203 | private static String normalizeWhitespaces(String input) { 204 | return input.replaceAll("[\\s\\u00A0]+", " ").trim(); 205 | } 206 | 207 | private static void validatePdfContent(List parsedPdf) { 208 | 209 | // -------------- 210 | // PAGE 1, 5 etc. 
211 | // -------------- 212 | 213 | for (int i : IntStream.iterate(0, x -> x + PAGE_CYCLE).limit(PDFdoc.getNumberOfPages() / PAGE_CYCLE).toArray()) { 214 | Assert.assertEquals(normalizeWhitespaces(parsedPdf.get(i).getRow(0).getCell(0)), "Heading 1"); 215 | 216 | Assert.assertEquals(normalizeWhitespaces(parsedPdf.get(i).getRow(1).getCell(0)), "First"); 217 | Assert.assertEquals(normalizeWhitespaces(parsedPdf.get(i).getRow(1).getCell(2)), "Third"); 218 | Assert.assertEquals(normalizeWhitespaces(parsedPdf.get(i).getRow(1).getCell(5)), "Sixth"); 219 | 220 | Assert.assertEquals(normalizeWhitespaces(parsedPdf.get(i).getRow(4).getCell(0)), "Sed ut perspiciatis unde omnis iste natus " + 221 | "error sit voluptatem accusantium doloremque laudantium, totam rem aperiam, eaque ipsa quae ab illo " + 222 | "inventore veritatis et quasi architecto beatae vitae dicta sunt explicabo."); 223 | Assert.assertEquals(normalizeWhitespaces(parsedPdf.get(i).getRow(4).getCell(1)), "Sed"); 224 | 225 | Assert.assertEquals(normalizeWhitespaces(parsedPdf.get(i).getRow(5).getCell(0)), "But"); 226 | Assert.assertEquals(normalizeWhitespaces(parsedPdf.get(i).getRow(5).getCell(1)), "But I must explain to you how all this " + 227 | "mistaken idea of denouncing pleasure and praising pain was born and I will give you a complete account " + 228 | "of the system, and expound the actual teachings of the great explorer of the truth, the masterbuilder " + 229 | "of human happiness"); 230 | 231 | Assert.assertEquals(normalizeWhitespaces(parsedPdf.get(i).getRow(6).getCell(0)), "Joined 1"); 232 | Assert.assertEquals(normalizeWhitespaces(parsedPdf.get(i).getRow(6).getCell(1)), "Rest 1"); 233 | Assert.assertEquals(normalizeWhitespaces(parsedPdf.get(i).getRow(6).getCell(2)), "Joined 2 Joined 2"); 234 | 235 | Assert.assertEquals(normalizeWhitespaces(parsedPdf.get(i).getRow(9).getCell(0)), "AA"); 236 | Assert.assertEquals(normalizeWhitespaces(parsedPdf.get(i).getRow(9).getCell(1)), "BB"); 237 | 
Assert.assertEquals(normalizeWhitespaces(parsedPdf.get(i).getRow(9).getCell(2)), "CC"); 238 | 239 | Assert.assertEquals(normalizeWhitespaces(parsedPdf.get(i).getRow(10).getCell(0)), "Joined 4 Joined 4 Joined 4 Joined 4"); 240 | Assert.assertEquals(normalizeWhitespaces(parsedPdf.get(i).getRow(10).getCell(1)), "Subheading 1"); 241 | 242 | Assert.assertEquals(normalizeWhitespaces(parsedPdf.get(i).getRow(11).getCell(0)), "X"); 243 | Assert.assertEquals(normalizeWhitespaces(parsedPdf.get(i).getRow(11).getCell(1)), "XX"); 244 | 245 | Assert.assertEquals(normalizeWhitespaces(parsedPdf.get(i).getRow(12).getCell(0)), "Y"); 246 | Assert.assertEquals(normalizeWhitespaces(parsedPdf.get(i).getRow(12).getCell(1)), "YY"); 247 | 248 | Assert.assertEquals(normalizeWhitespaces(parsedPdf.get(i).getRow(13).getCell(0)), "Z"); 249 | Assert.assertEquals(normalizeWhitespaces(parsedPdf.get(i).getRow(13).getCell(1)), "ZZ"); 250 | Assert.assertEquals(normalizeWhitespaces(parsedPdf.get(i).getRow(13).getCell(2)), "ZZZ"); 251 | } 252 | 253 | // -------------- 254 | // PAGE 2, 6 etc. 
255 | // -------------- 256 | 257 | for (int i : IntStream.iterate(1, x -> x + PAGE_CYCLE).limit(PDFdoc.getNumberOfPages() / PAGE_CYCLE).toArray()) { 258 | Assert.assertEquals(normalizeWhitespaces(parsedPdf.get(i).getRow(0).getCell(0)), "Heading 1"); 259 | 260 | Assert.assertEquals(normalizeWhitespaces(parsedPdf.get(i).getRow(1).getCell(0)), "First"); 261 | Assert.assertEquals(normalizeWhitespaces(parsedPdf.get(i).getRow(1).getCell(2)), "Third"); 262 | Assert.assertEquals(normalizeWhitespaces(parsedPdf.get(i).getRow(1).getCell(5)), "Sixth"); 263 | 264 | Assert.assertEquals(normalizeWhitespaces(parsedPdf.get(i).getRow(4).getCell(0)), "Sed ut perspiciatis unde omnis iste natus " + 265 | "error sit voluptatem accusantium doloremque laudantium, totam rem aperiam, eaque ipsa quae ab illo " + 266 | "inventore veritatis et quasi architecto beatae vitae dicta sunt explicabo."); 267 | Assert.assertEquals(normalizeWhitespaces(parsedPdf.get(i).getRow(4).getCell(1)), "Sed"); 268 | 269 | Assert.assertEquals(normalizeWhitespaces(parsedPdf.get(i).getRow(5).getCell(0)), "But"); 270 | Assert.assertEquals(normalizeWhitespaces(parsedPdf.get(i).getRow(5).getCell(1)), "But I must explain to you how all this " + 271 | "mistaken idea of denouncing pleasure and praising pain was born and I will give you a complete account " + 272 | "of the system, and expound the actual teachings of the great explorer of the truth, the masterbuilder " + 273 | "of human happiness"); 274 | 275 | Assert.assertEquals(normalizeWhitespaces(parsedPdf.get(i).getRow(6).getCell(0)), "Joined 1"); 276 | Assert.assertEquals(normalizeWhitespaces(parsedPdf.get(i).getRow(6).getCell(1)), "Rest 1"); 277 | Assert.assertEquals(normalizeWhitespaces(parsedPdf.get(i).getRow(6).getCell(2)), "Joined 2 Joined 2"); 278 | 279 | Assert.assertEquals(normalizeWhitespaces(parsedPdf.get(i).getRow(9).getCell(0)), "AA"); 280 | Assert.assertEquals(normalizeWhitespaces(parsedPdf.get(i).getRow(9).getCell(1)), "BB"); 281 | 
Assert.assertEquals(normalizeWhitespaces(parsedPdf.get(i).getRow(9).getCell(2)), "CC"); 282 | 283 | Assert.assertEquals(normalizeWhitespaces(parsedPdf.get(i).getRow(10).getCell(0)), "Joined 4 Joined 4 Joined 4 Joined 4"); 284 | Assert.assertEquals(normalizeWhitespaces(parsedPdf.get(i).getRow(10).getCell(1)), "Subheading 1"); 285 | 286 | Assert.assertEquals(normalizeWhitespaces(parsedPdf.get(i).getRow(11).getCell(0)), "X"); 287 | Assert.assertEquals(normalizeWhitespaces(parsedPdf.get(i).getRow(11).getCell(1)), "XX"); 288 | 289 | Assert.assertEquals(normalizeWhitespaces(parsedPdf.get(i).getRow(12).getCell(0)), "Y"); 290 | Assert.assertEquals(normalizeWhitespaces(parsedPdf.get(i).getRow(12).getCell(1)), "YY"); 291 | 292 | Assert.assertEquals(normalizeWhitespaces(parsedPdf.get(i).getRow(13).getCell(0)), "Z"); 293 | Assert.assertEquals(normalizeWhitespaces(parsedPdf.get(i).getRow(13).getCell(1)), "ZZ"); 294 | Assert.assertEquals(normalizeWhitespaces(parsedPdf.get(i).getRow(13).getCell(2)), "ZZZ"); 295 | } 296 | 297 | for (int i : IntStream.iterate(2, x -> x + PAGE_CYCLE).limit(PDFdoc.getNumberOfPages() / PAGE_CYCLE).toArray()) { 298 | Assert.assertEquals(normalizeWhitespaces(parsedPdf.get(i).getRow(0).getCell(0)), "A"); 299 | Assert.assertEquals(normalizeWhitespaces(parsedPdf.get(i).getRow(0).getCell(4)), "E"); 300 | 301 | Assert.assertEquals(normalizeWhitespaces(parsedPdf.get(i).getRow(1).getCell(0)), "0.01"); 302 | Assert.assertEquals(normalizeWhitespaces(parsedPdf.get(i).getRow(1).getCell(4)), "0.05"); 303 | 304 | Assert.assertEquals(normalizeWhitespaces(parsedPdf.get(i).getRow(37).getCell(0)), "0.37"); 305 | Assert.assertEquals(normalizeWhitespaces(parsedPdf.get(i).getRow(37).getCell(2)), "0.111"); 306 | Assert.assertEquals(normalizeWhitespaces(parsedPdf.get(i).getRow(37).getCell(4)), "0.185"); 307 | } 308 | 309 | for (int i : IntStream.iterate(3, x -> x + PAGE_CYCLE).limit(PDFdoc.getNumberOfPages() / PAGE_CYCLE).toArray()) { 310 | 
Assert.assertEquals(normalizeWhitespaces(parsedPdf.get(i).getRow(0).getCell(0)), "Table 1 Heading"); 311 | 312 | Assert.assertEquals(normalizeWhitespaces(parsedPdf.get(i).getRow(1).getCell(0)), "AAA"); 313 | Assert.assertEquals(normalizeWhitespaces(parsedPdf.get(i).getRow(1).getCell(1)), "111"); 314 | 315 | Assert.assertEquals(normalizeWhitespaces(parsedPdf.get(i).getRow(5).getCell(0)), "GGG"); 316 | Assert.assertEquals(normalizeWhitespaces(parsedPdf.get(i).getRow(5).getCell(1)), "444"); 317 | 318 | Assert.assertEquals(normalizeWhitespaces(parsedPdf.get(i).getRow(9).getCell(0)), "Table 3 Heading 2"); 319 | Assert.assertEquals(normalizeWhitespaces(parsedPdf.get(i).getRow(11).getCell(0)), "000"); 320 | Assert.assertEquals(normalizeWhitespaces(parsedPdf.get(i).getRow(11).getCell(1)), "QQQ QQQ QQQ"); 321 | } 322 | } 323 | 324 | } 325 | -------------------------------------------------------------------------------- /src/test/resources/test_tables.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rostrovsky/pdf-table/e9c372bafbbb9701f151b54672c8a60ee3eebe8d/src/test/resources/test_tables.pdf --------------------------------------------------------------------------------