├── src ├── test │ ├── resources │ │ └── technology │ │ │ └── tabula │ │ │ ├── 20.pdf │ │ │ ├── m27.pdf │ │ │ ├── 12s0324.pdf │ │ │ ├── arabic.pdf │ │ │ ├── china.pdf │ │ │ ├── eu-002.pdf │ │ │ ├── eu-017.pdf │ │ │ ├── labor.pdf │ │ │ ├── mednine.pdf │ │ │ ├── offense.pdf │ │ │ ├── schools.pdf │ │ │ ├── us-007.pdf │ │ │ ├── us-017.pdf │ │ │ ├── us-020.pdf │ │ │ ├── us-024.pdf │ │ │ ├── encrypted.pdf │ │ │ ├── jpeg2000.pdf │ │ │ ├── puertos1.pdf │ │ │ ├── twotables.pdf │ │ │ ├── AnimalSounds.pdf │ │ │ ├── MultiColumn.pdf │ │ │ ├── failing_sort.pdf │ │ │ ├── indictb1h_14.pdf │ │ │ ├── rotated_page.pdf │ │ │ ├── AnimalSounds1.pdf │ │ │ ├── cs-en-us-pbms.pdf │ │ │ ├── npe_issue_206.pdf │ │ │ ├── sort_exception.pdf │ │ │ ├── spanning_cells.pdf │ │ │ ├── S2MNCEbirdisland.pdf │ │ │ ├── campaign_donors.pdf │ │ │ ├── frx_2012_disclosure.pdf │ │ │ ├── should_detect_rulings.pdf │ │ │ ├── sydney_disclosure_contract.pdf │ │ │ ├── spreadsheet_no_bounding_frame.pdf │ │ │ ├── icdar2013-dataset │ │ │ ├── competition-dataset-eu │ │ │ │ ├── eu-001.json │ │ │ │ ├── eu-002.json │ │ │ │ ├── eu-003.json │ │ │ │ ├── eu-004.json │ │ │ │ ├── eu-005.json │ │ │ │ ├── eu-006.json │ │ │ │ ├── eu-007.json │ │ │ │ ├── eu-008.json │ │ │ │ ├── eu-009a.json │ │ │ │ ├── eu-010.json │ │ │ │ ├── eu-011.json │ │ │ │ ├── eu-012.json │ │ │ │ ├── eu-013.json │ │ │ │ ├── eu-014.json │ │ │ │ ├── eu-015.json │ │ │ │ ├── eu-016.json │ │ │ │ ├── eu-017.json │ │ │ │ ├── eu-018.json │ │ │ │ ├── eu-019.json │ │ │ │ ├── eu-020.json │ │ │ │ ├── eu-021.json │ │ │ │ ├── eu-022.json │ │ │ │ ├── eu-023.json │ │ │ │ ├── eu-024.json │ │ │ │ ├── eu-025.json │ │ │ │ ├── eu-026.json │ │ │ │ ├── eu-027.json │ │ │ │ ├── eu-001.pdf │ │ │ │ ├── eu-002.pdf │ │ │ │ ├── eu-003.pdf │ │ │ │ ├── eu-004.pdf │ │ │ │ ├── eu-005.pdf │ │ │ │ ├── eu-006.pdf │ │ │ │ ├── eu-007.pdf │ │ │ │ ├── eu-008.pdf │ │ │ │ ├── eu-010.pdf │ │ │ │ ├── eu-011.pdf │ │ │ │ ├── eu-012.pdf │ │ │ │ ├── eu-013.pdf │ │ │ │ ├── eu-014.pdf │ │ │ │ ├── eu-015.pdf │ 
│ │ │ ├── eu-016.pdf │ │ │ │ ├── eu-017.pdf │ │ │ │ ├── eu-018.pdf │ │ │ │ ├── eu-019.pdf │ │ │ │ ├── eu-020.pdf │ │ │ │ ├── eu-021.pdf │ │ │ │ ├── eu-022.pdf │ │ │ │ ├── eu-023.pdf │ │ │ │ ├── eu-024.pdf │ │ │ │ ├── eu-025.pdf │ │ │ │ ├── eu-026.pdf │ │ │ │ ├── eu-027.pdf │ │ │ │ ├── eu-009a.pdf │ │ │ │ ├── eu-010-reg.xml │ │ │ │ ├── eu-014-reg.xml │ │ │ │ ├── eu-011-reg.xml │ │ │ │ ├── eu-008-reg.xml │ │ │ │ ├── eu-026-reg.xml │ │ │ │ ├── eu-002-reg.xml │ │ │ │ └── eu-009b-reg.xml │ │ │ └── competition-dataset-us │ │ │ │ ├── us-001.json │ │ │ │ ├── us-002.json │ │ │ │ ├── us-003.json │ │ │ │ ├── us-004.json │ │ │ │ ├── us-005.json │ │ │ │ ├── us-006.json │ │ │ │ ├── us-007.json │ │ │ │ ├── us-008.json │ │ │ │ ├── us-009.json │ │ │ │ ├── us-010.json │ │ │ │ ├── us-011a.json │ │ │ │ ├── us-012.json │ │ │ │ ├── us-013.json │ │ │ │ ├── us-014.json │ │ │ │ ├── us-015.json │ │ │ │ ├── us-016.json │ │ │ │ ├── us-017.json │ │ │ │ ├── us-018.json │ │ │ │ ├── us-019.json │ │ │ │ ├── us-020.json │ │ │ │ ├── us-021.json │ │ │ │ ├── us-022.json │ │ │ │ ├── us-023.json │ │ │ │ ├── us-024.json │ │ │ │ ├── us-025.json │ │ │ │ ├── us-026.json │ │ │ │ ├── us-027.json │ │ │ │ ├── us-028.json │ │ │ │ ├── us-029.json │ │ │ │ ├── us-030.json │ │ │ │ ├── us-031a.json │ │ │ │ ├── us-032.json │ │ │ │ ├── us-033.json │ │ │ │ ├── us-034.json │ │ │ │ ├── us-035a.json │ │ │ │ ├── us-036.json │ │ │ │ ├── us-037.json │ │ │ │ ├── us-038.json │ │ │ │ ├── us-039.json │ │ │ │ ├── us-040.json │ │ │ │ ├── us-001.pdf │ │ │ │ ├── us-002.pdf │ │ │ │ ├── us-003.pdf │ │ │ │ ├── us-004.pdf │ │ │ │ ├── us-005.pdf │ │ │ │ ├── us-006.pdf │ │ │ │ ├── us-007.pdf │ │ │ │ ├── us-008.pdf │ │ │ │ ├── us-009.pdf │ │ │ │ ├── us-010.pdf │ │ │ │ ├── us-012.pdf │ │ │ │ ├── us-013.pdf │ │ │ │ ├── us-014.pdf │ │ │ │ ├── us-015.pdf │ │ │ │ ├── us-016.pdf │ │ │ │ ├── us-017.pdf │ │ │ │ ├── us-018.pdf │ │ │ │ ├── us-019.pdf │ │ │ │ ├── us-020.pdf │ │ │ │ ├── us-021.pdf │ │ │ │ ├── us-022.pdf │ │ │ │ ├── us-023.pdf │ │ │ │ 
├── us-024.pdf │ │ │ │ ├── us-025.pdf │ │ │ │ ├── us-026.pdf │ │ │ │ ├── us-027.pdf │ │ │ │ ├── us-028.pdf │ │ │ │ ├── us-029.pdf │ │ │ │ ├── us-030.pdf │ │ │ │ ├── us-032.pdf │ │ │ │ ├── us-033.pdf │ │ │ │ ├── us-034.pdf │ │ │ │ ├── us-036.pdf │ │ │ │ ├── us-037.pdf │ │ │ │ ├── us-038.pdf │ │ │ │ ├── us-039.pdf │ │ │ │ ├── us-040.pdf │ │ │ │ ├── us-011a.pdf │ │ │ │ ├── us-031a.pdf │ │ │ │ ├── us-035a.pdf │ │ │ │ ├── us-039-reg.xml │ │ │ │ ├── us-005-reg.xml │ │ │ │ ├── us-038-reg.xml │ │ │ │ ├── us-010-reg.xml │ │ │ │ ├── us-040-reg.xml │ │ │ │ ├── us-034-reg.xml │ │ │ │ ├── us-011b-reg.xml │ │ │ │ ├── us-011a-reg.xml │ │ │ │ ├── us-005-str.xml │ │ │ │ ├── us-006-reg.xml │ │ │ │ ├── us-039-str.xml │ │ │ │ ├── us-003-reg.xml │ │ │ │ └── us-038-str.xml │ │ │ ├── argentina_diputados_voting_record.pdf │ │ │ ├── Publication_of_award_of_Bids_for_Transport_Sector__August_2016.pdf │ │ │ ├── csv │ │ │ ├── AnimalSounds.csv │ │ │ ├── MultiColumn.csv │ │ │ ├── twotables.csv │ │ │ ├── TestCommandLineApp_testGuessOption_with_guessing.csv │ │ │ ├── spanning_cells.csv │ │ │ ├── spreadsheet_no_bounding_frame.csv │ │ │ ├── indictb1h_14.csv │ │ │ ├── us-020.csv │ │ │ ├── TestCommandLineApp_testGuessOption_no_guessing.csv │ │ │ ├── argentina_diputados_voting_record.csv │ │ │ ├── Publication_of_award_of_Bids_for_Transport_Sector__August_2016.csv │ │ │ ├── TestSpreadsheetExtractor-CELLS.csv │ │ │ ├── frx_2012_disclosure.csv │ │ │ └── schools.csv │ │ │ └── json │ │ │ └── AnimalSounds1.json │ └── java │ │ └── technology │ │ └── tabula │ │ ├── TestRectangleSpatialIndex.java │ │ ├── TestDebug.java │ │ ├── TableTest.java │ │ ├── TestCell.java │ │ ├── TestLine.java │ │ ├── TestRuling.java │ │ ├── UtilsForTesting.java │ │ ├── TestProjectionProfile.java │ │ ├── TestCohenSutherland.java │ │ └── TestUtils.java └── main │ └── java │ └── technology │ └── tabula │ ├── HasText.java │ ├── writers │ ├── TSVWriter.java │ ├── Writer.java │ ├── CSVWriter.java │ └── JSONWriter.java │ ├── extractors │ └── 
ExtractionAlgorithm.java │ ├── detectors │ ├── DetectionAlgorithm.java │ └── SpreadsheetDetectionAlgorithm.java │ ├── Pair.java │ ├── PageDims.java │ ├── json │ ├── RectangularTextContainerSerializer.java │ └── TableSerializer.java │ ├── PageIterator.java │ ├── RectangleSpatialIndex.java │ ├── RectangularTextContainer.java │ ├── Cell.java │ ├── Line.java │ ├── ObjectExtractor.java │ ├── TableWithRulingLines.java │ ├── Table.java │ ├── QuickSort.java │ └── CohenSutherlandClipping.java ├── .github ├── dependabot.yml └── workflows │ ├── tests.yml │ └── tests-windows.yml ├── .gitignore ├── jbang-catalog.json └── LICENSE /src/test/resources/technology/tabula/20.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tabulapdf/tabula-java/HEAD/src/test/resources/technology/tabula/20.pdf -------------------------------------------------------------------------------- /src/test/resources/technology/tabula/m27.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tabulapdf/tabula-java/HEAD/src/test/resources/technology/tabula/m27.pdf -------------------------------------------------------------------------------- /src/test/resources/technology/tabula/12s0324.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tabulapdf/tabula-java/HEAD/src/test/resources/technology/tabula/12s0324.pdf -------------------------------------------------------------------------------- /src/test/resources/technology/tabula/arabic.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tabulapdf/tabula-java/HEAD/src/test/resources/technology/tabula/arabic.pdf -------------------------------------------------------------------------------- /src/test/resources/technology/tabula/china.pdf: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/tabulapdf/tabula-java/HEAD/src/test/resources/technology/tabula/china.pdf -------------------------------------------------------------------------------- /src/test/resources/technology/tabula/eu-002.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tabulapdf/tabula-java/HEAD/src/test/resources/technology/tabula/eu-002.pdf -------------------------------------------------------------------------------- /src/test/resources/technology/tabula/eu-017.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tabulapdf/tabula-java/HEAD/src/test/resources/technology/tabula/eu-017.pdf -------------------------------------------------------------------------------- /src/test/resources/technology/tabula/labor.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tabulapdf/tabula-java/HEAD/src/test/resources/technology/tabula/labor.pdf -------------------------------------------------------------------------------- /src/test/resources/technology/tabula/mednine.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tabulapdf/tabula-java/HEAD/src/test/resources/technology/tabula/mednine.pdf -------------------------------------------------------------------------------- /src/test/resources/technology/tabula/offense.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tabulapdf/tabula-java/HEAD/src/test/resources/technology/tabula/offense.pdf -------------------------------------------------------------------------------- /src/test/resources/technology/tabula/schools.pdf: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/tabulapdf/tabula-java/HEAD/src/test/resources/technology/tabula/schools.pdf -------------------------------------------------------------------------------- /src/test/resources/technology/tabula/us-007.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tabulapdf/tabula-java/HEAD/src/test/resources/technology/tabula/us-007.pdf -------------------------------------------------------------------------------- /src/test/resources/technology/tabula/us-017.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tabulapdf/tabula-java/HEAD/src/test/resources/technology/tabula/us-017.pdf -------------------------------------------------------------------------------- /src/test/resources/technology/tabula/us-020.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tabulapdf/tabula-java/HEAD/src/test/resources/technology/tabula/us-020.pdf -------------------------------------------------------------------------------- /src/test/resources/technology/tabula/us-024.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tabulapdf/tabula-java/HEAD/src/test/resources/technology/tabula/us-024.pdf -------------------------------------------------------------------------------- /src/test/resources/technology/tabula/encrypted.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tabulapdf/tabula-java/HEAD/src/test/resources/technology/tabula/encrypted.pdf -------------------------------------------------------------------------------- /src/test/resources/technology/tabula/jpeg2000.pdf: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/tabulapdf/tabula-java/HEAD/src/test/resources/technology/tabula/jpeg2000.pdf -------------------------------------------------------------------------------- /src/test/resources/technology/tabula/puertos1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tabulapdf/tabula-java/HEAD/src/test/resources/technology/tabula/puertos1.pdf -------------------------------------------------------------------------------- /src/test/resources/technology/tabula/twotables.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tabulapdf/tabula-java/HEAD/src/test/resources/technology/tabula/twotables.pdf -------------------------------------------------------------------------------- /src/test/resources/technology/tabula/AnimalSounds.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tabulapdf/tabula-java/HEAD/src/test/resources/technology/tabula/AnimalSounds.pdf -------------------------------------------------------------------------------- /src/test/resources/technology/tabula/MultiColumn.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tabulapdf/tabula-java/HEAD/src/test/resources/technology/tabula/MultiColumn.pdf -------------------------------------------------------------------------------- /src/test/resources/technology/tabula/failing_sort.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tabulapdf/tabula-java/HEAD/src/test/resources/technology/tabula/failing_sort.pdf -------------------------------------------------------------------------------- /src/test/resources/technology/tabula/indictb1h_14.pdf: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/tabulapdf/tabula-java/HEAD/src/test/resources/technology/tabula/indictb1h_14.pdf -------------------------------------------------------------------------------- /src/test/resources/technology/tabula/rotated_page.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tabulapdf/tabula-java/HEAD/src/test/resources/technology/tabula/rotated_page.pdf -------------------------------------------------------------------------------- /src/test/resources/technology/tabula/AnimalSounds1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tabulapdf/tabula-java/HEAD/src/test/resources/technology/tabula/AnimalSounds1.pdf -------------------------------------------------------------------------------- /src/test/resources/technology/tabula/cs-en-us-pbms.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tabulapdf/tabula-java/HEAD/src/test/resources/technology/tabula/cs-en-us-pbms.pdf -------------------------------------------------------------------------------- /src/test/resources/technology/tabula/npe_issue_206.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tabulapdf/tabula-java/HEAD/src/test/resources/technology/tabula/npe_issue_206.pdf -------------------------------------------------------------------------------- /src/test/resources/technology/tabula/sort_exception.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tabulapdf/tabula-java/HEAD/src/test/resources/technology/tabula/sort_exception.pdf -------------------------------------------------------------------------------- 
/src/test/resources/technology/tabula/spanning_cells.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tabulapdf/tabula-java/HEAD/src/test/resources/technology/tabula/spanning_cells.pdf -------------------------------------------------------------------------------- /src/test/resources/technology/tabula/S2MNCEbirdisland.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tabulapdf/tabula-java/HEAD/src/test/resources/technology/tabula/S2MNCEbirdisland.pdf -------------------------------------------------------------------------------- /src/test/resources/technology/tabula/campaign_donors.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tabulapdf/tabula-java/HEAD/src/test/resources/technology/tabula/campaign_donors.pdf -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | updates: 3 | - package-ecosystem: maven 4 | directory: "/" 5 | schedule: 6 | interval: daily 7 | open-pull-requests-limit: 10 8 | -------------------------------------------------------------------------------- /src/test/resources/technology/tabula/frx_2012_disclosure.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tabulapdf/tabula-java/HEAD/src/test/resources/technology/tabula/frx_2012_disclosure.pdf -------------------------------------------------------------------------------- /src/test/resources/technology/tabula/should_detect_rulings.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tabulapdf/tabula-java/HEAD/src/test/resources/technology/tabula/should_detect_rulings.pdf 
-------------------------------------------------------------------------------- /src/test/resources/technology/tabula/sydney_disclosure_contract.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tabulapdf/tabula-java/HEAD/src/test/resources/technology/tabula/sydney_disclosure_contract.pdf -------------------------------------------------------------------------------- /src/main/java/technology/tabula/HasText.java: -------------------------------------------------------------------------------- 1 | package technology.tabula; 2 | 3 | public interface HasText { 4 | 5 | String getText(); 6 | String getText(boolean useLineReturns); 7 | 8 | } 9 | -------------------------------------------------------------------------------- /src/test/resources/technology/tabula/spreadsheet_no_bounding_frame.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tabulapdf/tabula-java/HEAD/src/test/resources/technology/tabula/spreadsheet_no_bounding_frame.pdf -------------------------------------------------------------------------------- /src/test/resources/technology/tabula/icdar2013-dataset/competition-dataset-eu/eu-001.json: -------------------------------------------------------------------------------- 1 | {"numExpectedTables":7,"numCorrectlyDetectedTables":7,"numErroneouslyDetectedTables":0,"expectedFailure":false} -------------------------------------------------------------------------------- /src/test/resources/technology/tabula/icdar2013-dataset/competition-dataset-eu/eu-002.json: -------------------------------------------------------------------------------- 1 | {"numExpectedTables":1,"numCorrectlyDetectedTables":1,"numErroneouslyDetectedTables":0,"expectedFailure":false} -------------------------------------------------------------------------------- /src/test/resources/technology/tabula/icdar2013-dataset/competition-dataset-eu/eu-003.json: 
-------------------------------------------------------------------------------- 1 | {"numExpectedTables":3,"numCorrectlyDetectedTables":3,"numErroneouslyDetectedTables":0,"expectedFailure":false} -------------------------------------------------------------------------------- /src/test/resources/technology/tabula/icdar2013-dataset/competition-dataset-eu/eu-004.json: -------------------------------------------------------------------------------- 1 | {"numExpectedTables":12,"numCorrectlyDetectedTables":12,"numErroneouslyDetectedTables":0,"expectedFailure":false} -------------------------------------------------------------------------------- /src/test/resources/technology/tabula/icdar2013-dataset/competition-dataset-eu/eu-005.json: -------------------------------------------------------------------------------- 1 | {"numExpectedTables":2,"numCorrectlyDetectedTables":2,"numErroneouslyDetectedTables":0,"expectedFailure":false} -------------------------------------------------------------------------------- /src/test/resources/technology/tabula/icdar2013-dataset/competition-dataset-eu/eu-006.json: -------------------------------------------------------------------------------- 1 | {"numExpectedTables":4,"numCorrectlyDetectedTables":4,"numErroneouslyDetectedTables":0,"expectedFailure":false} -------------------------------------------------------------------------------- /src/test/resources/technology/tabula/icdar2013-dataset/competition-dataset-eu/eu-007.json: -------------------------------------------------------------------------------- 1 | {"numExpectedTables":6,"numCorrectlyDetectedTables":6,"numErroneouslyDetectedTables":0,"expectedFailure":false} -------------------------------------------------------------------------------- /src/test/resources/technology/tabula/icdar2013-dataset/competition-dataset-eu/eu-008.json: -------------------------------------------------------------------------------- 1 | 
{"numExpectedTables":1,"numCorrectlyDetectedTables":1,"numErroneouslyDetectedTables":0,"expectedFailure":false} -------------------------------------------------------------------------------- /src/test/resources/technology/tabula/icdar2013-dataset/competition-dataset-eu/eu-009a.json: -------------------------------------------------------------------------------- 1 | {"numExpectedTables":1,"numCorrectlyDetectedTables":1,"numErroneouslyDetectedTables":0,"expectedFailure":false} -------------------------------------------------------------------------------- /src/test/resources/technology/tabula/icdar2013-dataset/competition-dataset-eu/eu-010.json: -------------------------------------------------------------------------------- 1 | {"numExpectedTables":1,"numCorrectlyDetectedTables":1,"numErroneouslyDetectedTables":0,"expectedFailure":false} -------------------------------------------------------------------------------- /src/test/resources/technology/tabula/icdar2013-dataset/competition-dataset-eu/eu-011.json: -------------------------------------------------------------------------------- 1 | {"numExpectedTables":1,"numCorrectlyDetectedTables":1,"numErroneouslyDetectedTables":3,"expectedFailure":true} -------------------------------------------------------------------------------- /src/test/resources/technology/tabula/icdar2013-dataset/competition-dataset-eu/eu-012.json: -------------------------------------------------------------------------------- 1 | {"numExpectedTables":5,"numCorrectlyDetectedTables":5,"numErroneouslyDetectedTables":1,"expectedFailure":true} -------------------------------------------------------------------------------- /src/test/resources/technology/tabula/icdar2013-dataset/competition-dataset-eu/eu-013.json: -------------------------------------------------------------------------------- 1 | {"numExpectedTables":4,"numCorrectlyDetectedTables":4,"numErroneouslyDetectedTables":1,"expectedFailure":true} 
-------------------------------------------------------------------------------- /src/test/resources/technology/tabula/icdar2013-dataset/competition-dataset-eu/eu-014.json: -------------------------------------------------------------------------------- 1 | {"numExpectedTables":1,"numCorrectlyDetectedTables":0,"numErroneouslyDetectedTables":1,"expectedFailure":true} -------------------------------------------------------------------------------- /src/test/resources/technology/tabula/icdar2013-dataset/competition-dataset-eu/eu-015.json: -------------------------------------------------------------------------------- 1 | {"numExpectedTables":5,"numCorrectlyDetectedTables":2,"numErroneouslyDetectedTables":2,"expectedFailure":true} -------------------------------------------------------------------------------- /src/test/resources/technology/tabula/icdar2013-dataset/competition-dataset-eu/eu-016.json: -------------------------------------------------------------------------------- 1 | {"numExpectedTables":1,"numCorrectlyDetectedTables":1,"numErroneouslyDetectedTables":0,"expectedFailure":false} -------------------------------------------------------------------------------- /src/test/resources/technology/tabula/icdar2013-dataset/competition-dataset-eu/eu-017.json: -------------------------------------------------------------------------------- 1 | {"numExpectedTables":1,"numCorrectlyDetectedTables":1,"numErroneouslyDetectedTables":1,"expectedFailure":true} -------------------------------------------------------------------------------- /src/test/resources/technology/tabula/icdar2013-dataset/competition-dataset-eu/eu-018.json: -------------------------------------------------------------------------------- 1 | {"numExpectedTables":2,"numCorrectlyDetectedTables":2,"numErroneouslyDetectedTables":0,"expectedFailure":false} -------------------------------------------------------------------------------- 
/src/test/resources/technology/tabula/icdar2013-dataset/competition-dataset-eu/eu-019.json: -------------------------------------------------------------------------------- 1 | {"numExpectedTables":1,"numCorrectlyDetectedTables":1,"numErroneouslyDetectedTables":0,"expectedFailure":false} -------------------------------------------------------------------------------- /src/test/resources/technology/tabula/icdar2013-dataset/competition-dataset-eu/eu-020.json: -------------------------------------------------------------------------------- 1 | {"numExpectedTables":3,"numCorrectlyDetectedTables":3,"numErroneouslyDetectedTables":0,"expectedFailure":false} -------------------------------------------------------------------------------- /src/test/resources/technology/tabula/icdar2013-dataset/competition-dataset-eu/eu-021.json: -------------------------------------------------------------------------------- 1 | {"numExpectedTables":2,"numCorrectlyDetectedTables":2,"numErroneouslyDetectedTables":0,"expectedFailure":false} -------------------------------------------------------------------------------- /src/test/resources/technology/tabula/icdar2013-dataset/competition-dataset-eu/eu-022.json: -------------------------------------------------------------------------------- 1 | {"numExpectedTables":1,"numCorrectlyDetectedTables":1,"numErroneouslyDetectedTables":0,"expectedFailure":false} -------------------------------------------------------------------------------- /src/test/resources/technology/tabula/icdar2013-dataset/competition-dataset-eu/eu-023.json: -------------------------------------------------------------------------------- 1 | {"numExpectedTables":1,"numCorrectlyDetectedTables":1,"numErroneouslyDetectedTables":0,"expectedFailure":false} -------------------------------------------------------------------------------- /src/test/resources/technology/tabula/icdar2013-dataset/competition-dataset-eu/eu-024.json: 
-------------------------------------------------------------------------------- 1 | {"numExpectedTables":1,"numCorrectlyDetectedTables":1,"numErroneouslyDetectedTables":0,"expectedFailure":false} -------------------------------------------------------------------------------- /src/test/resources/technology/tabula/icdar2013-dataset/competition-dataset-eu/eu-025.json: -------------------------------------------------------------------------------- 1 | {"numExpectedTables":5,"numCorrectlyDetectedTables":5,"numErroneouslyDetectedTables":0,"expectedFailure":false} -------------------------------------------------------------------------------- /src/test/resources/technology/tabula/icdar2013-dataset/competition-dataset-eu/eu-026.json: -------------------------------------------------------------------------------- 1 | {"numExpectedTables":3,"numCorrectlyDetectedTables":3,"numErroneouslyDetectedTables":0,"expectedFailure":false} -------------------------------------------------------------------------------- /src/test/resources/technology/tabula/icdar2013-dataset/competition-dataset-eu/eu-027.json: -------------------------------------------------------------------------------- 1 | {"numExpectedTables":1,"numCorrectlyDetectedTables":1,"numErroneouslyDetectedTables":0,"expectedFailure":false} -------------------------------------------------------------------------------- /src/test/resources/technology/tabula/icdar2013-dataset/competition-dataset-us/us-001.json: -------------------------------------------------------------------------------- 1 | {"numExpectedTables":2,"numCorrectlyDetectedTables":2,"numErroneouslyDetectedTables":2,"expectedFailure":true} -------------------------------------------------------------------------------- /src/test/resources/technology/tabula/icdar2013-dataset/competition-dataset-us/us-002.json: -------------------------------------------------------------------------------- 1 | 
{"numExpectedTables":2,"numCorrectlyDetectedTables":2,"numErroneouslyDetectedTables":0,"expectedFailure":false} -------------------------------------------------------------------------------- /src/test/resources/technology/tabula/icdar2013-dataset/competition-dataset-us/us-003.json: -------------------------------------------------------------------------------- 1 | {"numExpectedTables":1,"numCorrectlyDetectedTables":1,"numErroneouslyDetectedTables":0,"expectedFailure":false} -------------------------------------------------------------------------------- /src/test/resources/technology/tabula/icdar2013-dataset/competition-dataset-us/us-004.json: -------------------------------------------------------------------------------- 1 | {"numExpectedTables":1,"numCorrectlyDetectedTables":1,"numErroneouslyDetectedTables":0,"expectedFailure":false} -------------------------------------------------------------------------------- /src/test/resources/technology/tabula/icdar2013-dataset/competition-dataset-us/us-005.json: -------------------------------------------------------------------------------- 1 | {"numExpectedTables":1,"numCorrectlyDetectedTables":1,"numErroneouslyDetectedTables":0,"expectedFailure":false} -------------------------------------------------------------------------------- /src/test/resources/technology/tabula/icdar2013-dataset/competition-dataset-us/us-006.json: -------------------------------------------------------------------------------- 1 | {"numExpectedTables":1,"numCorrectlyDetectedTables":1,"numErroneouslyDetectedTables":0,"expectedFailure":false} -------------------------------------------------------------------------------- /src/test/resources/technology/tabula/icdar2013-dataset/competition-dataset-us/us-007.json: -------------------------------------------------------------------------------- 1 | {"numExpectedTables":2,"numCorrectlyDetectedTables":2,"numErroneouslyDetectedTables":0,"expectedFailure":false} 
-------------------------------------------------------------------------------- /src/test/resources/technology/tabula/icdar2013-dataset/competition-dataset-us/us-008.json: -------------------------------------------------------------------------------- 1 | {"numExpectedTables":2,"numCorrectlyDetectedTables":2,"numErroneouslyDetectedTables":0,"expectedFailure":false} -------------------------------------------------------------------------------- /src/test/resources/technology/tabula/icdar2013-dataset/competition-dataset-us/us-009.json: -------------------------------------------------------------------------------- 1 | {"numExpectedTables":1,"numCorrectlyDetectedTables":1,"numErroneouslyDetectedTables":0,"expectedFailure":false} -------------------------------------------------------------------------------- /src/test/resources/technology/tabula/icdar2013-dataset/competition-dataset-us/us-010.json: -------------------------------------------------------------------------------- 1 | {"numExpectedTables":1,"numCorrectlyDetectedTables":1,"numErroneouslyDetectedTables":0,"expectedFailure":false} -------------------------------------------------------------------------------- /src/test/resources/technology/tabula/icdar2013-dataset/competition-dataset-us/us-011a.json: -------------------------------------------------------------------------------- 1 | {"numExpectedTables":2,"numCorrectlyDetectedTables":2,"numErroneouslyDetectedTables":0,"expectedFailure":false} -------------------------------------------------------------------------------- /src/test/resources/technology/tabula/icdar2013-dataset/competition-dataset-us/us-012.json: -------------------------------------------------------------------------------- 1 | {"numExpectedTables":1,"numCorrectlyDetectedTables":1,"numErroneouslyDetectedTables":0,"expectedFailure":false} -------------------------------------------------------------------------------- 
/src/test/resources/technology/tabula/icdar2013-dataset/competition-dataset-us/us-013.json: -------------------------------------------------------------------------------- 1 | {"numExpectedTables":1,"numCorrectlyDetectedTables":1,"numErroneouslyDetectedTables":0,"expectedFailure":false} -------------------------------------------------------------------------------- /src/test/resources/technology/tabula/icdar2013-dataset/competition-dataset-us/us-014.json: -------------------------------------------------------------------------------- 1 | {"numExpectedTables":2,"numCorrectlyDetectedTables":2,"numErroneouslyDetectedTables":0,"expectedFailure":false} -------------------------------------------------------------------------------- /src/test/resources/technology/tabula/icdar2013-dataset/competition-dataset-us/us-015.json: -------------------------------------------------------------------------------- 1 | {"numExpectedTables":2,"numCorrectlyDetectedTables":2,"numErroneouslyDetectedTables":0,"expectedFailure":false} -------------------------------------------------------------------------------- /src/test/resources/technology/tabula/icdar2013-dataset/competition-dataset-us/us-016.json: -------------------------------------------------------------------------------- 1 | {"numExpectedTables":1,"numCorrectlyDetectedTables":1,"numErroneouslyDetectedTables":0,"expectedFailure":false} -------------------------------------------------------------------------------- /src/test/resources/technology/tabula/icdar2013-dataset/competition-dataset-us/us-017.json: -------------------------------------------------------------------------------- 1 | {"numExpectedTables":6,"numCorrectlyDetectedTables":6,"numErroneouslyDetectedTables":0,"expectedFailure":false} -------------------------------------------------------------------------------- /src/test/resources/technology/tabula/icdar2013-dataset/competition-dataset-us/us-018.json: 
-------------------------------------------------------------------------------- 1 | {"numExpectedTables":7,"numCorrectlyDetectedTables":6,"numErroneouslyDetectedTables":1,"expectedFailure":true} -------------------------------------------------------------------------------- /src/test/resources/technology/tabula/icdar2013-dataset/competition-dataset-us/us-019.json: -------------------------------------------------------------------------------- 1 | {"numExpectedTables":4,"numCorrectlyDetectedTables":2,"numErroneouslyDetectedTables":2,"expectedFailure":true} -------------------------------------------------------------------------------- /src/test/resources/technology/tabula/icdar2013-dataset/competition-dataset-us/us-020.json: -------------------------------------------------------------------------------- 1 | {"numExpectedTables":4,"numCorrectlyDetectedTables":4,"numErroneouslyDetectedTables":0,"expectedFailure":false} -------------------------------------------------------------------------------- /src/test/resources/technology/tabula/icdar2013-dataset/competition-dataset-us/us-021.json: -------------------------------------------------------------------------------- 1 | {"numExpectedTables":2,"numCorrectlyDetectedTables":2,"numErroneouslyDetectedTables":0,"expectedFailure":false} -------------------------------------------------------------------------------- /src/test/resources/technology/tabula/icdar2013-dataset/competition-dataset-us/us-022.json: -------------------------------------------------------------------------------- 1 | {"numExpectedTables":1,"numCorrectlyDetectedTables":1,"numErroneouslyDetectedTables":0,"expectedFailure":false} -------------------------------------------------------------------------------- /src/test/resources/technology/tabula/icdar2013-dataset/competition-dataset-us/us-023.json: -------------------------------------------------------------------------------- 1 | 
{"numExpectedTables":1,"numCorrectlyDetectedTables":1,"numErroneouslyDetectedTables":4,"expectedFailure":true} -------------------------------------------------------------------------------- /src/test/resources/technology/tabula/icdar2013-dataset/competition-dataset-us/us-024.json: -------------------------------------------------------------------------------- 1 | {"numExpectedTables":4,"numCorrectlyDetectedTables":4,"numErroneouslyDetectedTables":2,"expectedFailure":true} -------------------------------------------------------------------------------- /src/test/resources/technology/tabula/icdar2013-dataset/competition-dataset-us/us-025.json: -------------------------------------------------------------------------------- 1 | {"numExpectedTables":6,"numCorrectlyDetectedTables":2,"numErroneouslyDetectedTables":3,"expectedFailure":true} -------------------------------------------------------------------------------- /src/test/resources/technology/tabula/icdar2013-dataset/competition-dataset-us/us-026.json: -------------------------------------------------------------------------------- 1 | {"numExpectedTables":1,"numCorrectlyDetectedTables":1,"numErroneouslyDetectedTables":0,"expectedFailure":false} -------------------------------------------------------------------------------- /src/test/resources/technology/tabula/icdar2013-dataset/competition-dataset-us/us-027.json: -------------------------------------------------------------------------------- 1 | {"numExpectedTables":2,"numCorrectlyDetectedTables":2,"numErroneouslyDetectedTables":0,"expectedFailure":false} -------------------------------------------------------------------------------- /src/test/resources/technology/tabula/icdar2013-dataset/competition-dataset-us/us-028.json: -------------------------------------------------------------------------------- 1 | {"numExpectedTables":2,"numCorrectlyDetectedTables":2,"numErroneouslyDetectedTables":2,"expectedFailure":true} 
-------------------------------------------------------------------------------- /src/test/resources/technology/tabula/icdar2013-dataset/competition-dataset-us/us-029.json: -------------------------------------------------------------------------------- 1 | {"numExpectedTables":1,"numCorrectlyDetectedTables":1,"numErroneouslyDetectedTables":0,"expectedFailure":false} -------------------------------------------------------------------------------- /src/test/resources/technology/tabula/icdar2013-dataset/competition-dataset-us/us-030.json: -------------------------------------------------------------------------------- 1 | {"numExpectedTables":1,"numCorrectlyDetectedTables":1,"numErroneouslyDetectedTables":0,"expectedFailure":false} -------------------------------------------------------------------------------- /src/test/resources/technology/tabula/icdar2013-dataset/competition-dataset-us/us-031a.json: -------------------------------------------------------------------------------- 1 | {"numExpectedTables":1,"numCorrectlyDetectedTables":1,"numErroneouslyDetectedTables":0,"expectedFailure":false} -------------------------------------------------------------------------------- /src/test/resources/technology/tabula/icdar2013-dataset/competition-dataset-us/us-032.json: -------------------------------------------------------------------------------- 1 | {"numExpectedTables":1,"numCorrectlyDetectedTables":1,"numErroneouslyDetectedTables":0,"expectedFailure":false} -------------------------------------------------------------------------------- /src/test/resources/technology/tabula/icdar2013-dataset/competition-dataset-us/us-033.json: -------------------------------------------------------------------------------- 1 | {"numExpectedTables":3,"numCorrectlyDetectedTables":1,"numErroneouslyDetectedTables":2,"expectedFailure":true} -------------------------------------------------------------------------------- 
/src/test/resources/technology/tabula/icdar2013-dataset/competition-dataset-us/us-034.json: -------------------------------------------------------------------------------- 1 | {"numExpectedTables":2,"numCorrectlyDetectedTables":0,"numErroneouslyDetectedTables":1,"expectedFailure":true} -------------------------------------------------------------------------------- /src/test/resources/technology/tabula/icdar2013-dataset/competition-dataset-us/us-035a.json: -------------------------------------------------------------------------------- 1 | {"numExpectedTables":3,"numCorrectlyDetectedTables":3,"numErroneouslyDetectedTables":0,"expectedFailure":false} -------------------------------------------------------------------------------- /src/test/resources/technology/tabula/icdar2013-dataset/competition-dataset-us/us-036.json: -------------------------------------------------------------------------------- 1 | {"numExpectedTables":1,"numCorrectlyDetectedTables":1,"numErroneouslyDetectedTables":0,"expectedFailure":false} -------------------------------------------------------------------------------- /src/test/resources/technology/tabula/icdar2013-dataset/competition-dataset-us/us-037.json: -------------------------------------------------------------------------------- 1 | {"numExpectedTables":1,"numCorrectlyDetectedTables":0,"numErroneouslyDetectedTables":3,"expectedFailure":true} -------------------------------------------------------------------------------- /src/test/resources/technology/tabula/icdar2013-dataset/competition-dataset-us/us-038.json: -------------------------------------------------------------------------------- 1 | {"numExpectedTables":1,"numCorrectlyDetectedTables":1,"numErroneouslyDetectedTables":0,"expectedFailure":false} -------------------------------------------------------------------------------- /src/test/resources/technology/tabula/icdar2013-dataset/competition-dataset-us/us-039.json: 
-------------------------------------------------------------------------------- 1 | {"numExpectedTables":1,"numCorrectlyDetectedTables":1,"numErroneouslyDetectedTables":0,"expectedFailure":false} -------------------------------------------------------------------------------- /src/test/resources/technology/tabula/icdar2013-dataset/competition-dataset-us/us-040.json: -------------------------------------------------------------------------------- 1 | {"numExpectedTables":1,"numCorrectlyDetectedTables":1,"numErroneouslyDetectedTables":0,"expectedFailure":false} -------------------------------------------------------------------------------- /src/test/resources/technology/tabula/argentina_diputados_voting_record.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tabulapdf/tabula-java/HEAD/src/test/resources/technology/tabula/argentina_diputados_voting_record.pdf -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .settings/ 2 | .idea/ 3 | .project 4 | .classpath 5 | /bin/ 6 | /src/test/**/*.jpg 7 | /src/test/resources/technology/tabula/icdar2013-dataset/test-statistics.json 8 | /target/ 9 | *.iml 10 | -------------------------------------------------------------------------------- /src/test/resources/technology/tabula/icdar2013-dataset/competition-dataset-eu/eu-001.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tabulapdf/tabula-java/HEAD/src/test/resources/technology/tabula/icdar2013-dataset/competition-dataset-eu/eu-001.pdf -------------------------------------------------------------------------------- /src/test/resources/technology/tabula/icdar2013-dataset/competition-dataset-eu/eu-002.pdf: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/tabulapdf/tabula-java/HEAD/src/test/resources/technology/tabula/icdar2013-dataset/competition-dataset-eu/eu-002.pdf -------------------------------------------------------------------------------- /src/test/resources/technology/tabula/icdar2013-dataset/competition-dataset-eu/eu-003.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tabulapdf/tabula-java/HEAD/src/test/resources/technology/tabula/icdar2013-dataset/competition-dataset-eu/eu-003.pdf -------------------------------------------------------------------------------- /src/test/resources/technology/tabula/icdar2013-dataset/competition-dataset-eu/eu-004.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tabulapdf/tabula-java/HEAD/src/test/resources/technology/tabula/icdar2013-dataset/competition-dataset-eu/eu-004.pdf -------------------------------------------------------------------------------- /src/test/resources/technology/tabula/icdar2013-dataset/competition-dataset-eu/eu-005.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tabulapdf/tabula-java/HEAD/src/test/resources/technology/tabula/icdar2013-dataset/competition-dataset-eu/eu-005.pdf -------------------------------------------------------------------------------- /src/test/resources/technology/tabula/icdar2013-dataset/competition-dataset-eu/eu-006.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tabulapdf/tabula-java/HEAD/src/test/resources/technology/tabula/icdar2013-dataset/competition-dataset-eu/eu-006.pdf -------------------------------------------------------------------------------- /src/test/resources/technology/tabula/icdar2013-dataset/competition-dataset-eu/eu-007.pdf: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/tabulapdf/tabula-java/HEAD/src/test/resources/technology/tabula/icdar2013-dataset/competition-dataset-eu/eu-007.pdf -------------------------------------------------------------------------------- /src/test/resources/technology/tabula/icdar2013-dataset/competition-dataset-eu/eu-008.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tabulapdf/tabula-java/HEAD/src/test/resources/technology/tabula/icdar2013-dataset/competition-dataset-eu/eu-008.pdf -------------------------------------------------------------------------------- /src/test/resources/technology/tabula/icdar2013-dataset/competition-dataset-eu/eu-010.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tabulapdf/tabula-java/HEAD/src/test/resources/technology/tabula/icdar2013-dataset/competition-dataset-eu/eu-010.pdf -------------------------------------------------------------------------------- /src/test/resources/technology/tabula/icdar2013-dataset/competition-dataset-eu/eu-011.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tabulapdf/tabula-java/HEAD/src/test/resources/technology/tabula/icdar2013-dataset/competition-dataset-eu/eu-011.pdf -------------------------------------------------------------------------------- /src/test/resources/technology/tabula/icdar2013-dataset/competition-dataset-eu/eu-012.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tabulapdf/tabula-java/HEAD/src/test/resources/technology/tabula/icdar2013-dataset/competition-dataset-eu/eu-012.pdf -------------------------------------------------------------------------------- 
/src/test/resources/technology/tabula/icdar2013-dataset/competition-dataset-eu/eu-013.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tabulapdf/tabula-java/HEAD/src/test/resources/technology/tabula/icdar2013-dataset/competition-dataset-eu/eu-013.pdf -------------------------------------------------------------------------------- /src/test/resources/technology/tabula/icdar2013-dataset/competition-dataset-eu/eu-014.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tabulapdf/tabula-java/HEAD/src/test/resources/technology/tabula/icdar2013-dataset/competition-dataset-eu/eu-014.pdf -------------------------------------------------------------------------------- /src/test/resources/technology/tabula/icdar2013-dataset/competition-dataset-eu/eu-015.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tabulapdf/tabula-java/HEAD/src/test/resources/technology/tabula/icdar2013-dataset/competition-dataset-eu/eu-015.pdf -------------------------------------------------------------------------------- /src/test/resources/technology/tabula/icdar2013-dataset/competition-dataset-eu/eu-016.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tabulapdf/tabula-java/HEAD/src/test/resources/technology/tabula/icdar2013-dataset/competition-dataset-eu/eu-016.pdf -------------------------------------------------------------------------------- /src/test/resources/technology/tabula/icdar2013-dataset/competition-dataset-eu/eu-017.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tabulapdf/tabula-java/HEAD/src/test/resources/technology/tabula/icdar2013-dataset/competition-dataset-eu/eu-017.pdf 
-------------------------------------------------------------------------------- /src/test/resources/technology/tabula/icdar2013-dataset/competition-dataset-eu/eu-018.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tabulapdf/tabula-java/HEAD/src/test/resources/technology/tabula/icdar2013-dataset/competition-dataset-eu/eu-018.pdf -------------------------------------------------------------------------------- /src/test/resources/technology/tabula/icdar2013-dataset/competition-dataset-eu/eu-019.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tabulapdf/tabula-java/HEAD/src/test/resources/technology/tabula/icdar2013-dataset/competition-dataset-eu/eu-019.pdf -------------------------------------------------------------------------------- /src/test/resources/technology/tabula/icdar2013-dataset/competition-dataset-eu/eu-020.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tabulapdf/tabula-java/HEAD/src/test/resources/technology/tabula/icdar2013-dataset/competition-dataset-eu/eu-020.pdf -------------------------------------------------------------------------------- /src/test/resources/technology/tabula/icdar2013-dataset/competition-dataset-eu/eu-021.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tabulapdf/tabula-java/HEAD/src/test/resources/technology/tabula/icdar2013-dataset/competition-dataset-eu/eu-021.pdf -------------------------------------------------------------------------------- /src/test/resources/technology/tabula/icdar2013-dataset/competition-dataset-eu/eu-022.pdf: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/tabulapdf/tabula-java/HEAD/src/test/resources/technology/tabula/icdar2013-dataset/competition-dataset-eu/eu-022.pdf -------------------------------------------------------------------------------- /src/test/resources/technology/tabula/icdar2013-dataset/competition-dataset-eu/eu-023.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tabulapdf/tabula-java/HEAD/src/test/resources/technology/tabula/icdar2013-dataset/competition-dataset-eu/eu-023.pdf -------------------------------------------------------------------------------- /src/test/resources/technology/tabula/icdar2013-dataset/competition-dataset-eu/eu-024.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tabulapdf/tabula-java/HEAD/src/test/resources/technology/tabula/icdar2013-dataset/competition-dataset-eu/eu-024.pdf -------------------------------------------------------------------------------- /src/test/resources/technology/tabula/icdar2013-dataset/competition-dataset-eu/eu-025.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tabulapdf/tabula-java/HEAD/src/test/resources/technology/tabula/icdar2013-dataset/competition-dataset-eu/eu-025.pdf -------------------------------------------------------------------------------- /src/test/resources/technology/tabula/icdar2013-dataset/competition-dataset-eu/eu-026.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tabulapdf/tabula-java/HEAD/src/test/resources/technology/tabula/icdar2013-dataset/competition-dataset-eu/eu-026.pdf -------------------------------------------------------------------------------- /src/test/resources/technology/tabula/icdar2013-dataset/competition-dataset-eu/eu-027.pdf: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/tabulapdf/tabula-java/HEAD/src/test/resources/technology/tabula/icdar2013-dataset/competition-dataset-eu/eu-027.pdf -------------------------------------------------------------------------------- /src/test/resources/technology/tabula/icdar2013-dataset/competition-dataset-us/us-001.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tabulapdf/tabula-java/HEAD/src/test/resources/technology/tabula/icdar2013-dataset/competition-dataset-us/us-001.pdf -------------------------------------------------------------------------------- /src/test/resources/technology/tabula/icdar2013-dataset/competition-dataset-us/us-002.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tabulapdf/tabula-java/HEAD/src/test/resources/technology/tabula/icdar2013-dataset/competition-dataset-us/us-002.pdf -------------------------------------------------------------------------------- /src/test/resources/technology/tabula/icdar2013-dataset/competition-dataset-us/us-003.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tabulapdf/tabula-java/HEAD/src/test/resources/technology/tabula/icdar2013-dataset/competition-dataset-us/us-003.pdf -------------------------------------------------------------------------------- /src/test/resources/technology/tabula/icdar2013-dataset/competition-dataset-us/us-004.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tabulapdf/tabula-java/HEAD/src/test/resources/technology/tabula/icdar2013-dataset/competition-dataset-us/us-004.pdf -------------------------------------------------------------------------------- 
/src/test/resources/technology/tabula/icdar2013-dataset/competition-dataset-us/us-005.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tabulapdf/tabula-java/HEAD/src/test/resources/technology/tabula/icdar2013-dataset/competition-dataset-us/us-005.pdf -------------------------------------------------------------------------------- /src/test/resources/technology/tabula/icdar2013-dataset/competition-dataset-us/us-006.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tabulapdf/tabula-java/HEAD/src/test/resources/technology/tabula/icdar2013-dataset/competition-dataset-us/us-006.pdf -------------------------------------------------------------------------------- /src/test/resources/technology/tabula/icdar2013-dataset/competition-dataset-us/us-007.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tabulapdf/tabula-java/HEAD/src/test/resources/technology/tabula/icdar2013-dataset/competition-dataset-us/us-007.pdf -------------------------------------------------------------------------------- /src/test/resources/technology/tabula/icdar2013-dataset/competition-dataset-us/us-008.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tabulapdf/tabula-java/HEAD/src/test/resources/technology/tabula/icdar2013-dataset/competition-dataset-us/us-008.pdf -------------------------------------------------------------------------------- /src/test/resources/technology/tabula/icdar2013-dataset/competition-dataset-us/us-009.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tabulapdf/tabula-java/HEAD/src/test/resources/technology/tabula/icdar2013-dataset/competition-dataset-us/us-009.pdf 
-------------------------------------------------------------------------------- /src/test/resources/technology/tabula/icdar2013-dataset/competition-dataset-us/us-010.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tabulapdf/tabula-java/HEAD/src/test/resources/technology/tabula/icdar2013-dataset/competition-dataset-us/us-010.pdf -------------------------------------------------------------------------------- /src/test/resources/technology/tabula/icdar2013-dataset/competition-dataset-us/us-012.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tabulapdf/tabula-java/HEAD/src/test/resources/technology/tabula/icdar2013-dataset/competition-dataset-us/us-012.pdf -------------------------------------------------------------------------------- /src/test/resources/technology/tabula/icdar2013-dataset/competition-dataset-us/us-013.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tabulapdf/tabula-java/HEAD/src/test/resources/technology/tabula/icdar2013-dataset/competition-dataset-us/us-013.pdf -------------------------------------------------------------------------------- /src/test/resources/technology/tabula/icdar2013-dataset/competition-dataset-us/us-014.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tabulapdf/tabula-java/HEAD/src/test/resources/technology/tabula/icdar2013-dataset/competition-dataset-us/us-014.pdf -------------------------------------------------------------------------------- /src/test/resources/technology/tabula/icdar2013-dataset/competition-dataset-us/us-015.pdf: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/tabulapdf/tabula-java/HEAD/src/test/resources/technology/tabula/icdar2013-dataset/competition-dataset-us/us-015.pdf -------------------------------------------------------------------------------- /src/test/resources/technology/tabula/icdar2013-dataset/competition-dataset-us/us-016.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tabulapdf/tabula-java/HEAD/src/test/resources/technology/tabula/icdar2013-dataset/competition-dataset-us/us-016.pdf -------------------------------------------------------------------------------- /src/test/resources/technology/tabula/icdar2013-dataset/competition-dataset-us/us-017.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tabulapdf/tabula-java/HEAD/src/test/resources/technology/tabula/icdar2013-dataset/competition-dataset-us/us-017.pdf -------------------------------------------------------------------------------- /src/test/resources/technology/tabula/icdar2013-dataset/competition-dataset-us/us-018.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tabulapdf/tabula-java/HEAD/src/test/resources/technology/tabula/icdar2013-dataset/competition-dataset-us/us-018.pdf -------------------------------------------------------------------------------- /src/test/resources/technology/tabula/icdar2013-dataset/competition-dataset-us/us-019.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tabulapdf/tabula-java/HEAD/src/test/resources/technology/tabula/icdar2013-dataset/competition-dataset-us/us-019.pdf -------------------------------------------------------------------------------- /src/test/resources/technology/tabula/icdar2013-dataset/competition-dataset-us/us-020.pdf: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/tabulapdf/tabula-java/HEAD/src/test/resources/technology/tabula/icdar2013-dataset/competition-dataset-us/us-020.pdf -------------------------------------------------------------------------------- /src/test/resources/technology/tabula/icdar2013-dataset/competition-dataset-us/us-021.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tabulapdf/tabula-java/HEAD/src/test/resources/technology/tabula/icdar2013-dataset/competition-dataset-us/us-021.pdf -------------------------------------------------------------------------------- /src/test/resources/technology/tabula/icdar2013-dataset/competition-dataset-us/us-022.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tabulapdf/tabula-java/HEAD/src/test/resources/technology/tabula/icdar2013-dataset/competition-dataset-us/us-022.pdf -------------------------------------------------------------------------------- /src/test/resources/technology/tabula/icdar2013-dataset/competition-dataset-us/us-023.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tabulapdf/tabula-java/HEAD/src/test/resources/technology/tabula/icdar2013-dataset/competition-dataset-us/us-023.pdf -------------------------------------------------------------------------------- /src/test/resources/technology/tabula/icdar2013-dataset/competition-dataset-us/us-024.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tabulapdf/tabula-java/HEAD/src/test/resources/technology/tabula/icdar2013-dataset/competition-dataset-us/us-024.pdf -------------------------------------------------------------------------------- 
/src/test/resources/technology/tabula/icdar2013-dataset/competition-dataset-us/us-025.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tabulapdf/tabula-java/HEAD/src/test/resources/technology/tabula/icdar2013-dataset/competition-dataset-us/us-025.pdf -------------------------------------------------------------------------------- /src/test/resources/technology/tabula/icdar2013-dataset/competition-dataset-us/us-026.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tabulapdf/tabula-java/HEAD/src/test/resources/technology/tabula/icdar2013-dataset/competition-dataset-us/us-026.pdf -------------------------------------------------------------------------------- /src/test/resources/technology/tabula/icdar2013-dataset/competition-dataset-us/us-027.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tabulapdf/tabula-java/HEAD/src/test/resources/technology/tabula/icdar2013-dataset/competition-dataset-us/us-027.pdf -------------------------------------------------------------------------------- /src/test/resources/technology/tabula/icdar2013-dataset/competition-dataset-us/us-028.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tabulapdf/tabula-java/HEAD/src/test/resources/technology/tabula/icdar2013-dataset/competition-dataset-us/us-028.pdf -------------------------------------------------------------------------------- /src/test/resources/technology/tabula/icdar2013-dataset/competition-dataset-us/us-029.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tabulapdf/tabula-java/HEAD/src/test/resources/technology/tabula/icdar2013-dataset/competition-dataset-us/us-029.pdf 
-------------------------------------------------------------------------------- /src/test/resources/technology/tabula/icdar2013-dataset/competition-dataset-us/us-030.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tabulapdf/tabula-java/HEAD/src/test/resources/technology/tabula/icdar2013-dataset/competition-dataset-us/us-030.pdf -------------------------------------------------------------------------------- /src/test/resources/technology/tabula/icdar2013-dataset/competition-dataset-us/us-032.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tabulapdf/tabula-java/HEAD/src/test/resources/technology/tabula/icdar2013-dataset/competition-dataset-us/us-032.pdf -------------------------------------------------------------------------------- /src/test/resources/technology/tabula/icdar2013-dataset/competition-dataset-us/us-033.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tabulapdf/tabula-java/HEAD/src/test/resources/technology/tabula/icdar2013-dataset/competition-dataset-us/us-033.pdf -------------------------------------------------------------------------------- /src/test/resources/technology/tabula/icdar2013-dataset/competition-dataset-us/us-034.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tabulapdf/tabula-java/HEAD/src/test/resources/technology/tabula/icdar2013-dataset/competition-dataset-us/us-034.pdf -------------------------------------------------------------------------------- /src/test/resources/technology/tabula/icdar2013-dataset/competition-dataset-us/us-036.pdf: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/tabulapdf/tabula-java/HEAD/src/test/resources/technology/tabula/icdar2013-dataset/competition-dataset-us/us-036.pdf -------------------------------------------------------------------------------- /src/test/resources/technology/tabula/icdar2013-dataset/competition-dataset-us/us-037.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tabulapdf/tabula-java/HEAD/src/test/resources/technology/tabula/icdar2013-dataset/competition-dataset-us/us-037.pdf -------------------------------------------------------------------------------- /src/test/resources/technology/tabula/icdar2013-dataset/competition-dataset-us/us-038.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tabulapdf/tabula-java/HEAD/src/test/resources/technology/tabula/icdar2013-dataset/competition-dataset-us/us-038.pdf -------------------------------------------------------------------------------- /src/test/resources/technology/tabula/icdar2013-dataset/competition-dataset-us/us-039.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tabulapdf/tabula-java/HEAD/src/test/resources/technology/tabula/icdar2013-dataset/competition-dataset-us/us-039.pdf -------------------------------------------------------------------------------- /src/test/resources/technology/tabula/icdar2013-dataset/competition-dataset-us/us-040.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tabulapdf/tabula-java/HEAD/src/test/resources/technology/tabula/icdar2013-dataset/competition-dataset-us/us-040.pdf -------------------------------------------------------------------------------- /jbang-catalog.json: -------------------------------------------------------------------------------- 1 | { 2 | "catalogs": {}, 3 | "aliases": { 4 | 
"tabula": { 5 | "script-ref": "https://github.com/tabulapdf/tabula-java/releases/download/v1.0.4/tabula-1.0.4-jar-with-dependencies.jar" 6 | } 7 | } 8 | } -------------------------------------------------------------------------------- /src/test/resources/technology/tabula/icdar2013-dataset/competition-dataset-eu/eu-009a.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tabulapdf/tabula-java/HEAD/src/test/resources/technology/tabula/icdar2013-dataset/competition-dataset-eu/eu-009a.pdf -------------------------------------------------------------------------------- /src/test/resources/technology/tabula/icdar2013-dataset/competition-dataset-us/us-011a.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tabulapdf/tabula-java/HEAD/src/test/resources/technology/tabula/icdar2013-dataset/competition-dataset-us/us-011a.pdf -------------------------------------------------------------------------------- /src/test/resources/technology/tabula/icdar2013-dataset/competition-dataset-us/us-031a.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tabulapdf/tabula-java/HEAD/src/test/resources/technology/tabula/icdar2013-dataset/competition-dataset-us/us-031a.pdf -------------------------------------------------------------------------------- /src/test/resources/technology/tabula/icdar2013-dataset/competition-dataset-us/us-035a.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tabulapdf/tabula-java/HEAD/src/test/resources/technology/tabula/icdar2013-dataset/competition-dataset-us/us-035a.pdf -------------------------------------------------------------------------------- /src/test/resources/technology/tabula/Publication_of_award_of_Bids_for_Transport_Sector__August_2016.pdf: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/tabulapdf/tabula-java/HEAD/src/test/resources/technology/tabula/Publication_of_award_of_Bids_for_Transport_Sector__August_2016.pdf -------------------------------------------------------------------------------- /src/test/resources/technology/tabula/csv/AnimalSounds.csv: -------------------------------------------------------------------------------- 1 | Cat,Says,Meow 2 | "Parastratiosphecomyiastratiosph 3 | ecomyioides",Says,bzzzzzzz 4 | Fox,Says,"Ring- 5 | dingdingdingdingeringedingGer 6 | ing- 7 | dingdingdingdingeringedingGer 8 | ing-dingdingdingdingeringeding" -------------------------------------------------------------------------------- /src/main/java/technology/tabula/writers/TSVWriter.java: -------------------------------------------------------------------------------- 1 | package technology.tabula.writers; 2 | 3 | import org.apache.commons.csv.CSVFormat; 4 | 5 | public class TSVWriter extends CSVWriter { 6 | 7 | public TSVWriter() { 8 | super(CSVFormat.TDF); 9 | } 10 | 11 | } 12 | -------------------------------------------------------------------------------- /src/main/java/technology/tabula/extractors/ExtractionAlgorithm.java: -------------------------------------------------------------------------------- 1 | package technology.tabula.extractors; 2 | 3 | import java.util.List; 4 | 5 | import technology.tabula.Page; 6 | import technology.tabula.Table; 7 | 8 | public interface ExtractionAlgorithm { 9 | 10 | List extract(Page page); 11 | String toString(); 12 | 13 | } 14 | -------------------------------------------------------------------------------- /src/main/java/technology/tabula/detectors/DetectionAlgorithm.java: -------------------------------------------------------------------------------- 1 | package technology.tabula.detectors; 2 | 3 | import technology.tabula.Page; 4 | import technology.tabula.Rectangle; 5 | 6 | import 
java.util.List; 7 | 8 | /** 9 | * Created by matt on 2015-12-14. 10 | */ 11 | public interface DetectionAlgorithm { 12 | List detect(Page page); 13 | } 14 | -------------------------------------------------------------------------------- /src/main/java/technology/tabula/writers/Writer.java: -------------------------------------------------------------------------------- 1 | package technology.tabula.writers; 2 | 3 | import java.io.IOException; 4 | import java.util.List; 5 | 6 | import technology.tabula.Table; 7 | 8 | public interface Writer { 9 | 10 | void write(Appendable out, Table table) throws IOException; 11 | 12 | void write(Appendable out, List tables) throws IOException; 13 | 14 | } 15 | -------------------------------------------------------------------------------- /src/main/java/technology/tabula/Pair.java: -------------------------------------------------------------------------------- 1 | package technology.tabula; 2 | 3 | public class Pair { 4 | private final L left; 5 | private final R right; 6 | 7 | public Pair(L left, R right) { 8 | this.left = left; 9 | this.right = right; 10 | } 11 | 12 | public L getLeft() { 13 | return this.left; 14 | } 15 | 16 | public R getRight() { 17 | return this.right; 18 | } 19 | } 20 | -------------------------------------------------------------------------------- /.github/workflows/tests.yml: -------------------------------------------------------------------------------- 1 | name: Java CI 2 | 3 | on: [push, pull_request] 4 | 5 | jobs: 6 | build: 7 | runs-on: ubuntu-latest 8 | 9 | steps: 10 | - uses: actions/checkout@v3 11 | - name: Set up JDK 11 12 | uses: actions/setup-java@v3 13 | with: 14 | java-version: '11' 15 | distribution: 'adopt' 16 | cache: maven 17 | - name: Build with Maven 18 | run: mvn --batch-mode test 19 | -------------------------------------------------------------------------------- /src/test/java/technology/tabula/TestRectangleSpatialIndex.java: 
-------------------------------------------------------------------------------- 1 | package technology.tabula; 2 | 3 | import static org.junit.Assert.*; 4 | 5 | import org.junit.Test; 6 | 7 | public class TestRectangleSpatialIndex { 8 | 9 | @Test 10 | public void testIntersects() { 11 | 12 | Rectangle r = new Rectangle(0, 0, 0, 0); 13 | 14 | RectangleSpatialIndex rSpatialIndex = new RectangleSpatialIndex<>(); 15 | rSpatialIndex.add(r); 16 | 17 | assertTrue(rSpatialIndex.intersects(r).size() > 0); 18 | 19 | } 20 | 21 | } 22 | -------------------------------------------------------------------------------- /src/test/java/technology/tabula/TestDebug.java: -------------------------------------------------------------------------------- 1 | package technology.tabula; 2 | 3 | public class TestDebug { 4 | 5 | private final static String PATH = "src/test/resources/technology/tabula/spanning_cells.pdf"; 6 | 7 | // @Test 8 | // public void test() throws IOException { 9 | // File outFile = new File(new File(System.getProperty("java.io.tmpdir")), "/rendered_page.jpg"); 10 | // Debug.renderPage(PATH, outFile.getAbsolutePath(), 0, null, true, false, false, false, false, false, false, false, false, false); 11 | // assertTrue(outFile.exists()); 12 | // System.out.println(outFile.getAbsolutePath()); 13 | // } 14 | 15 | } 16 | -------------------------------------------------------------------------------- /.github/workflows/tests-windows.yml: -------------------------------------------------------------------------------- 1 | name: Java CI (Windows) 2 | 3 | on: [push] 4 | 5 | jobs: 6 | build: 7 | runs-on: windows-latest 8 | 9 | steps: 10 | # https://github.com/actions/checkout/issues/135#issuecomment-602171132 11 | - name: Set git to use LF 12 | run: | 13 | git config --global core.autocrlf false 14 | git config --global core.eol lf 15 | - uses: actions/checkout@v3 16 | - name: Set up JDK 11 17 | uses: actions/setup-java@v3 18 | with: 19 | java-version: '11' 20 | distribution: 
'adopt' 21 | cache: maven 22 | - name: Build with Maven 23 | run: mvn --batch-mode test 24 | -------------------------------------------------------------------------------- /src/test/resources/technology/tabula/csv/MultiColumn.csv: -------------------------------------------------------------------------------- 1 | 1,100,200 2 | 2,101,201 3 | 3,102,202 4 | 4,103,203 5 | 5,104,204 6 | 6,105,205 7 | 7,106,206 8 | 8,107,207 9 | 9,108,208 10 | 10,109,209 11 | 11,110,210 12 | 12,111,211 13 | 13,112,212 14 | 14,113,213 15 | 15,114,214 16 | 16,115,215 17 | 17,116,216 18 | 18,117,217 19 | 19,118,218 20 | 20,119,219 21 | 21,120,220 22 | 22,121,221 23 | 23,122,222 24 | 24,123,223 25 | 25,124,224 26 | 26,125,225 27 | 27,126,226 28 | 28,127,227 29 | 29,128,228 30 | 30,129,229 31 | 31,130,230 32 | 32,131,231 33 | 33,132,232 34 | 34,133,233 35 | 35,134,234 36 | 36,135,235 37 | 37,136,236 38 | 38,137,237 39 | 39,138,238 40 | 40,139,239 41 | 41,140,240 42 | 42,141,241 43 | 43,142,242 44 | 44,143,243 -------------------------------------------------------------------------------- /src/test/resources/technology/tabula/csv/twotables.csv: -------------------------------------------------------------------------------- 1 | "",株主資本,,,, 2 | "",資本金,資本剰余金,利益剰余金,自己株式,株主資本合計 3 | "当期首残高","5,664",749,"12,017",△747,"17,683" 4 | "当期変動額",,,,, 5 | "剰余金の配当",,,△525,,△525 6 | "当期純利益",,,"1,269",,"1,269" 7 | "自己株式の取得",,,,△0,△0 8 | "持分法の適用範囲 9 | の変動",,,85,,85 10 | "株主資本以外の項目 11 | の当期変動額(純額)",,,,, 12 | "当期変動額合計",―,―,829,△0,829 13 | "当期末残高","5,664",749,"12,846",△747,"18,512" 14 | "",その他の包括利益累計額,少数株主持分,純資産合計,,,, 15 | "","その他有価証券 16 | 評価差額金","繰延ヘッジ 17 | 損益","為替換算 18 | 調整勘定","退職給付に係る 19 | 調整累計額","その他の 20 | 包括利益 21 | 累計額合計",, 22 | "当期首残高",△669,61,△109,―,△717,246,"17,212" 23 | "当期変動額",,,,,,, 24 | "剰余金の配当",,,,,,,△525 25 | "当期純利益",,,,,,,"1,269" 26 | "自己株式の取得",,,,,,,△0 27 | "持分法の適用範囲 28 | の変動",,,,,,,85 29 | "株主資本以外の項目 30 | の当期変動額(純額)",556,80,5,―,642,△0,642 31 | "当期変動額合計",556,80,5,―,642,△0,"1,471" 32 | 
"当期末残高",△113,142,△104,―,△75,245,"18,683" -------------------------------------------------------------------------------- /src/main/java/technology/tabula/PageDims.java: -------------------------------------------------------------------------------- 1 | package technology.tabula; 2 | 3 | public class PageDims { 4 | private final float top; 5 | private final float left; 6 | private final float width; 7 | private final float height; 8 | 9 | private PageDims(final float top, final float left, final float width, final float height) { 10 | this.top = top; 11 | this.left = left; 12 | this.width = width; 13 | this.height = height; 14 | } 15 | 16 | public static PageDims of(final float top, final float left, final float width, final float height) { 17 | return new PageDims(top, left, width, height); 18 | } 19 | 20 | public float getTop() { 21 | return top; 22 | } 23 | 24 | public float getLeft() { 25 | return left; 26 | } 27 | 28 | public float getWidth() { 29 | return width; 30 | } 31 | 32 | public float getHeight() { 33 | return height; 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /src/test/resources/technology/tabula/icdar2013-dataset/competition-dataset-us/us-039-reg.xml: -------------------------------------------------------------------------------- 1 | 2 | 4 |
5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 |
23 | 24 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2014-2016 Manuel Aristarán 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /src/main/java/technology/tabula/detectors/SpreadsheetDetectionAlgorithm.java: -------------------------------------------------------------------------------- 1 | package technology.tabula.detectors; 2 | 3 | import technology.tabula.Cell; 4 | import technology.tabula.Page; 5 | import technology.tabula.Rectangle; 6 | import technology.tabula.extractors.SpreadsheetExtractionAlgorithm; 7 | 8 | import java.util.Collections; 9 | import java.util.List; 10 | 11 | /** 12 | * Created by matt on 2015-12-14. 
13 | * 14 | * This is the basic spreadsheet table detection algorithm currently implemented in tabula (web). 15 | * 16 | * It uses intersecting ruling lines to find tables. 17 | */ 18 | public class SpreadsheetDetectionAlgorithm implements DetectionAlgorithm { 19 | @Override 20 | public List detect(Page page) { 21 | List cells = SpreadsheetExtractionAlgorithm.findCells(page.getHorizontalRulings(), page.getVerticalRulings()); 22 | 23 | List tables = SpreadsheetExtractionAlgorithm.findSpreadsheetsFromCells(cells); 24 | 25 | // we want tables to be returned from top to bottom on the page 26 | Collections.sort(tables, Rectangle.ILL_DEFINED_ORDER); 27 | 28 | return tables; 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /src/test/resources/technology/tabula/icdar2013-dataset/competition-dataset-us/us-005-reg.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 |
24 |
25 | -------------------------------------------------------------------------------- /src/main/java/technology/tabula/json/RectangularTextContainerSerializer.java: -------------------------------------------------------------------------------- 1 | package technology.tabula.json; 2 | 3 | import java.lang.reflect.Type; 4 | 5 | import com.google.gson.JsonElement; 6 | import com.google.gson.JsonObject; 7 | import com.google.gson.JsonSerializationContext; 8 | import com.google.gson.JsonSerializer; 9 | 10 | import technology.tabula.RectangularTextContainer; 11 | 12 | public final class RectangularTextContainerSerializer implements JsonSerializer> { 13 | 14 | public static final RectangularTextContainerSerializer INSTANCE = new RectangularTextContainerSerializer(); 15 | 16 | private RectangularTextContainerSerializer() {} 17 | 18 | @Override 19 | public JsonElement serialize(RectangularTextContainer textContainer, Type type, JsonSerializationContext context) { 20 | JsonObject json = new JsonObject(); 21 | json.addProperty("top", textContainer.getTop()); 22 | json.addProperty("left", textContainer.getLeft()); 23 | json.addProperty("width", textContainer.getWidth()); 24 | json.addProperty("height", textContainer.getHeight()); 25 | json.addProperty("text", textContainer.getText()); 26 | return json; 27 | } 28 | 29 | } -------------------------------------------------------------------------------- /src/test/java/technology/tabula/TableTest.java: -------------------------------------------------------------------------------- 1 | package technology.tabula; 2 | 3 | import static org.junit.Assert.*; 4 | 5 | import org.junit.Test; 6 | 7 | public class TableTest { 8 | 9 | @Test public void testEmpty() { 10 | Table empty = Table.empty(); 11 | 12 | assertEquals(TextChunk.EMPTY, empty.getCell(0, 0)); 13 | assertEquals(TextChunk.EMPTY, empty.getCell(1, 1)); 14 | 15 | assertEquals(0, empty.getRowCount()); 16 | assertEquals(0, empty.getColCount()); 17 | 18 | assertEquals("", 
empty.getExtractionMethod()); 19 | 20 | assertEquals(0, empty.getTop(), 0); 21 | assertEquals(0, empty.getRight(), 0); 22 | assertEquals(0, empty.getBottom(), 0); 23 | assertEquals(0, empty.getLeft(), 0); 24 | 25 | assertEquals(0, empty.getArea(), 0); 26 | } 27 | 28 | @Test public void testRowColCounts() { 29 | Table table = Table.empty(); 30 | 31 | assertEquals(0, table.getRowCount()); 32 | assertEquals(0, table.getColCount()); 33 | 34 | table.add(TextChunk.EMPTY, 0, 0); 35 | 36 | assertEquals(1, table.getRowCount()); 37 | assertEquals(1, table.getColCount()); 38 | 39 | table.add(TextChunk.EMPTY, 9, 9); 40 | 41 | assertEquals(10, table.getRowCount()); 42 | assertEquals(10, table.getColCount()); 43 | } 44 | 45 | } 46 | -------------------------------------------------------------------------------- /src/test/resources/technology/tabula/icdar2013-dataset/competition-dataset-us/us-038-reg.xml: -------------------------------------------------------------------------------- 1 | 2 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 |
26 |
27 | -------------------------------------------------------------------------------- /src/test/java/technology/tabula/TestCell.java: -------------------------------------------------------------------------------- 1 | package technology.tabula; 2 | 3 | import static org.junit.Assert.*; 4 | 5 | import java.util.List; 6 | import java.util.ArrayList; 7 | 8 | import org.apache.pdfbox.pdmodel.font.PDType1Font; 9 | import org.apache.pdfbox.pdmodel.font.Standard14Fonts; 10 | import org.junit.Test; 11 | 12 | public class TestCell { 13 | 14 | @Test 15 | public void testIsSpanning() { 16 | Cell cell = new Cell(0, 0, 0, 0); 17 | assertFalse(cell.isSpanning()); 18 | cell.setSpanning(true); 19 | assertTrue(cell.isSpanning()); 20 | } 21 | 22 | @Test 23 | public void testIsPlaceholder() { 24 | Cell cell = new Cell(0, 0, 0, 0); 25 | assertFalse(cell.isPlaceholder()); 26 | cell.setPlaceholder(true); 27 | assertTrue(cell.isPlaceholder()); 28 | } 29 | 30 | @Test 31 | public void testGetTextElements() { 32 | Cell cell = new Cell(0, 0, 0, 0); 33 | assertTrue(cell.getTextElements().isEmpty()); 34 | 35 | TextElement tElement = new TextElement(0, 0, 0, 0, new PDType1Font(Standard14Fonts.FontName.HELVETICA_BOLD), 10, "test", 5); 36 | TextChunk tChunk = new TextChunk(tElement); 37 | List tList = new ArrayList<>(); 38 | tList.add(tChunk); 39 | cell.setTextElements(tList); 40 | 41 | assertEquals("test", cell.getTextElements().get(0).getText()); 42 | 43 | 44 | } 45 | 46 | } 47 | -------------------------------------------------------------------------------- /src/main/java/technology/tabula/PageIterator.java: -------------------------------------------------------------------------------- 1 | package technology.tabula; 2 | 3 | import java.io.IOException; 4 | import java.util.Iterator; 5 | 6 | public class PageIterator implements Iterator { 7 | 8 | private ObjectExtractor objectExtractor; 9 | private Iterator pageIndexIterator; 10 | 11 | public PageIterator(ObjectExtractor objectExtractor, 
Iterable pages) { 12 | super(); 13 | this.objectExtractor = objectExtractor; 14 | this.pageIndexIterator = pages.iterator(); 15 | } 16 | 17 | // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - // 18 | @Override 19 | public boolean hasNext() { 20 | return pageIndexIterator.hasNext(); 21 | } 22 | 23 | /** 24 | * Extracts and returns the next requested page. 25 | * 26 | * @return the page extracted for the next index supplied by the page iterable 27 | * @throws IllegalStateException if there are no more pages to iterate over 28 | * @throws java.io.UncheckedIOException if the extractor fails to read the page 29 | */ 30 | @Override 31 | public Page next() { 32 | if (!this.hasNext()) { 33 | throw new IllegalStateException(); 34 | } 35 | try { 36 | return objectExtractor.extractPage(pageIndexIterator.next()); 37 | } catch (IOException e) { 38 | // Propagate the failure instead of printing it and handing the caller a null page. 39 | // The original IOException is preserved as the cause. 40 | throw new java.io.UncheckedIOException(e); 41 | } 42 | } 43 | 44 | // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - // 45 | @Override 46 | public void remove() { 47 | throw new UnsupportedOperationException(); 48 | } 49 | 50 | } 51 | -------------------------------------------------------------------------------- /src/test/resources/technology/tabula/csv/TestCommandLineApp_testGuessOption_with_guessing.csv: -------------------------------------------------------------------------------- 1 | "",,THRESHOLD FOR RELEASES, 2 | "",to air,to water,to land 3 | "",kg/year,kg/year,kg/year 4 | Carbon dioxide (CO2),100 million,-,- 5 | Hydro-fluorocarbons (HFCs),100,-,- 6 | Methane (CH4),100 000,-,- 7 | Nitrous oxide (N2O),10 000,-,- 8 | Perfluorocarbons (PFCs),100,-,- 9 | Sulphur hexafluoride (SF6),50,-,- 10 | "",THRESHOLD FOR RELEASES 11 | "",to air to water to land 12 | "",kg/year kg/year kg/year 13 | Ammonia (NH3),10 000 - - 14 | Carbon monoxide (CO),500 000 - - 15 | Chlorine and inorganic compounds, 16 | "",10 000 - - 17 | (as HCl), 18 | Chlorofluorocarbons (CFCs),1 - - 19 | Flourine and inorganic compounds, 20 | "",5 000 - - 21 | (as HF), 22 | Halons,1 - - 23 | Hydrochlorofluorocarbons (HCFCs),1 - - 24 | Hydrogen Cyanide (HCN),200 - - 25 | Nitrogen oxides (NOx/NO2),100 000 - - 26 | Non-methane volatile organic, 27 | "",100 000 - - 28 | compounds (NMVOC), 29 | Sulphur 
oxides (SOx/SO2),150 000 - - 30 | "",THRESHOLD FOR RELEASES 31 | "",to air to water to land 32 | "",kg/year kg/year kg/year 33 | Arsenic and compounds (as As),20 5 5 34 | Cadmium and compounds (as Cd),10 5 5 35 | Chromium and compounds (as Cr),100 50 50 36 | Copper and compounds (as Cu),100 50 50 37 | Lead and compounds (as Pb),200 20 20 38 | Mercury and compounds (as Hg),10 1 1 39 | Nickel and compounds (as Ni),50 20 20 40 | Zinc and compounds (as Zn),200 100 100 41 | -------------------------------------------------------------------------------- /src/test/resources/technology/tabula/csv/spanning_cells.csv: -------------------------------------------------------------------------------- 1 | Improved operation scenario,,,,, 2 | Volume servers in:,2007,2008,2009,2010,2011 3 | Server closets,"1,505","1,580","1,643","1,673","1,689" 4 | Server rooms,"1,512","1,586","1,646","1,677","1,693" 5 | Localized data centers,"1,512","1,586","1,646","1,677","1,693" 6 | Mid-tier data centers,"1,512","1,586","1,646","1,677","1,693" 7 | Enterprise-class data centers,"1,512","1,586","1,646","1,677","1,693" 8 | Best practice scenario,,,,, 9 | Volume servers in:,2007,2008,2009,2010,2011 10 | Server closets,"1,456","1,439","1,386","1,296","1,326" 11 | Server rooms,"1,465","1,472","1,427","1,334","1,371" 12 | Localized data centers,"1,465","1,471","1,426","1,334","1,371" 13 | Mid-tier data centers,"1,465","1,471","1,426","1,334","1,371" 14 | Enterprise-class data centers,"1,465","1,471","1,426","1,334","1,371" 15 | State-of-the-art scenario,,,,, 16 | Volume servers in:,2007,2008,2009,2010,2011 17 | Server closets,"1,485","1,471","1,424","1,315","1,349" 18 | Server rooms,"1,495","1,573","1,586","1,424","1,485" 19 | Localized data centers,"1,495","1,572","1,585","1,424","1,485" 20 | Mid-tier data centers,"1,495","1,572","1,585","1,424","1,485" 21 | Enterprise-class data centers,"1,495","1,572","1,585","1,424","1,485" 22 | All alternative scenarios,,,,, 23 | Server 
class,2007,2008,2009,2010,2011 24 | Mid-range,"4,921","5,467","6,152","6,649","7,185" 25 | High-end,"76,295","81,624","86,849","92,662","98,864" -------------------------------------------------------------------------------- /src/main/java/technology/tabula/RectangleSpatialIndex.java: -------------------------------------------------------------------------------- 1 | package technology.tabula; 2 | 3 | import java.util.ArrayList; 4 | import java.util.List; 5 | 6 | import org.locationtech.jts.geom.Envelope; 7 | import org.locationtech.jts.index.strtree.STRtree; 8 | 9 | public class RectangleSpatialIndex { 10 | 11 | 12 | private final STRtree si = new STRtree(); 13 | private final List rectangles = new ArrayList<>(); 14 | 15 | public void add(T te) { 16 | rectangles.add(te); 17 | si.insert(new Envelope(te.getLeft(), te.getRight(), te.getBottom(), te.getTop()), te); 18 | } 19 | 20 | public List contains(Rectangle r) { 21 | List intersection = si.query(new Envelope(r.getLeft(), r.getRight(), r.getTop(), r.getBottom())); 22 | List rv = new ArrayList(); 23 | 24 | for (T ir: intersection) { 25 | if (r.contains(ir)) { 26 | rv.add(ir); 27 | } 28 | } 29 | 30 | Utils.sort(rv, Rectangle.ILL_DEFINED_ORDER); 31 | return rv; 32 | } 33 | 34 | public List intersects(Rectangle r) { 35 | return si.query(new Envelope(r.getLeft(), r.getRight(), r.getTop(), r.getBottom())); 36 | } 37 | 38 | /** 39 | * Minimum bounding box of all the Rectangles contained on this RectangleSpatialIndex 40 | * 41 | * @return a Rectangle 42 | */ 43 | public Rectangle getBounds() { 44 | return Rectangle.boundingBoxOf(rectangles); 45 | } 46 | 47 | } 48 | -------------------------------------------------------------------------------- /src/main/java/technology/tabula/writers/CSVWriter.java: -------------------------------------------------------------------------------- 1 | package technology.tabula.writers; 2 | 3 | import java.io.IOException; 4 | import java.util.ArrayList; 5 | import java.util.Collections; 6 
| import java.util.List; 7 | 8 | import org.apache.commons.csv.CSVPrinter; 9 | import org.apache.commons.csv.CSVFormat; 10 | 11 | import technology.tabula.RectangularTextContainer; 12 | import technology.tabula.Table; 13 | 14 | public class CSVWriter implements Writer { 15 | 16 | private final CSVFormat format; 17 | 18 | public CSVWriter() { 19 | this(CSVFormat.EXCEL); 20 | } 21 | 22 | protected CSVWriter(CSVFormat format) { 23 | this.format = format; 24 | } 25 | 26 | // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - // 27 | @Override 28 | public void write(Appendable out, Table table) throws IOException { 29 | write(out, Collections.singletonList(table)); 30 | } 31 | 32 | @Override 33 | public void write(Appendable out, List tables) throws IOException { 34 | try (CSVPrinter printer = new CSVPrinter(out, format)) { 35 | for (Table table : tables) { 36 | for (List row : table.getRows()) { 37 | List cells = new ArrayList<>(row.size()); 38 | for (RectangularTextContainer cell : row) 39 | cells.add(cell.getText()); 40 | printer.printRecord(cells); 41 | } 42 | } 43 | printer.flush(); 44 | } 45 | } 46 | 47 | } 48 | -------------------------------------------------------------------------------- /src/main/java/technology/tabula/RectangularTextContainer.java: -------------------------------------------------------------------------------- 1 | package technology.tabula; 2 | 3 | import java.util.ArrayList; 4 | import java.util.List; 5 | 6 | @SuppressWarnings("serial") 7 | public class RectangularTextContainer extends Rectangle implements HasText { 8 | 9 | protected List textElements = new ArrayList<>(); 10 | 11 | protected RectangularTextContainer(float top, float left, float width, float height) { 12 | super(top, left, width, height); 13 | } 14 | 15 | public RectangularTextContainer merge(RectangularTextContainer other) { 16 | if (compareTo(other) < 0) { 17 | this.getTextElements().addAll(other.getTextElements()); 18 | } else { 19 | 
this.getTextElements().addAll(0, other.getTextElements()); 20 | } 21 | super.merge(other); 22 | return this; 23 | } 24 | 25 | public List getTextElements() { 26 | return textElements; 27 | } 28 | 29 | public void setTextElements(List textElements) { 30 | this.textElements = textElements; 31 | } 32 | 33 | @Override 34 | public String getText() { 35 | throw new UnsupportedOperationException(); 36 | } 37 | 38 | @Override 39 | public String getText(boolean useLineReturns) { 40 | throw new UnsupportedOperationException(); 41 | } 42 | 43 | @Override public String toString() { 44 | StringBuilder sb = new StringBuilder(); 45 | String s = super.toString(); 46 | sb.append(s.substring(0, s.length() - 1)); 47 | sb.append(String.format(",text=%s]", this.getText() == null ? "null" : "\"" + this.getText() + "\"")); 48 | return sb.toString(); 49 | } 50 | 51 | } 52 | -------------------------------------------------------------------------------- /src/test/resources/technology/tabula/csv/spreadsheet_no_bounding_frame.csv: -------------------------------------------------------------------------------- 1 | "",HARVEST,VARIATION,,, 2 | "","11/12 (a)",12/13,Percentage,Absolute, 3 | "","May/2013 (b)","Jun/2013 (c)",(c/a),(c-a), 4 | COTTON,"1.393,4","886,7","894,9","(35,8)","( 498,5)" 5 | TOTAL PEANUT,"93,9","100,6","100,2","6,7","6,3" 6 | PEANUT 1ST CROP,"82,1","86,3","86,2","5,0","4,1" 7 | PEANUT 2ND CROP,"11,8","14,3","14,0","18,6","2,2" 8 | RICE,"2.426,7","2.389,7","2.396,0","(1,3)","( 30,7)" 9 | TOTAL BEANS,"3.262,1","2.952,7","3.026,9","(7,2)","( 235,2)" 10 | BEANS 1ST CROP,"1.241,4","1.122,6","1.122,9","(9,5)","( 118,5)" 11 | BEANS 2ND CROP,"1.394,6","1.275,4","1.271,7","(8,8)","( 122,9)" 12 | BEANS 3RD CROP,"626,1","554,7","632,3","1,0","6,3" 13 | SUNFLOWER,"74,5","60,4","68,9","(7,5)","( 5,6)" 14 | CASTOR BEAN,"128,2","87,5","87,4","(31,8)","( 40,8)" 15 | TOTAL CORN,"15.178,1","15.686,2","15.817,4","4,2","639,3" 16 | CORN 1ST CROP,"7.558,5","6.879,2","6.864,7","(9,2)","( 693,8)" 
17 | CORN 2ND CROP,"7.619,6","8.807,0","8.952,7","17,5","1.333,1" 18 | SOYBEAN,"25.042,2","27.715,2","27.715,5","10,7","2.673,3" 19 | SORGHUM,"786,9","836,4","836,4","6,3","49,5" 20 | SUBTOTAL,"48.386,0","50.715,4","50.943,6","5,3","2.557,7" 21 | OAT,"153,0","168,7","168,7","10,3","15,7" 22 | CANOLA,"42,4","43,8","43,8","3,3","1,4" 23 | RYE,"2,3","2,3","2,3",-,- 24 | BARLEY,"88,4","102,8","102,8","16,3","14,4" 25 | WHEAT,"2.166,2","1.895,4","1.895,4","(12,5)","( 270,8)" 26 | TRITICALE,"46,9","48,0","48,0","2,3","1,1" 27 | SUBTOTAL,"2.499,2","2.261,0","2.261,0","(9,5)","( 238,2)" 28 | BRAZIL,"50.885,2","5 2.976,4","5 3.204,6","4,6","2.319,5" 29 | -------------------------------------------------------------------------------- /src/test/resources/technology/tabula/csv/indictb1h_14.csv: -------------------------------------------------------------------------------- 1 | 2014,2013,2012,2011,2010,2009,,, 2 | "",,,,,,,,תוצר מקומי גולמי 3 | 2.6,3.2,3.0,4.2,5.8,1.9,,,סך הכל 4 | 0.7,1.3,1.1,2.3,3.8,0.1,,,סך הכל - לנפש 5 | 2.5,3.4,2.9,4.5,6.8,1.5,,,הסקטור העסקי 6 | 0.3,3.2,2.9,0.0,12.0,-4.3,,,תעשייה )ללא יהלומים( 7 | "-1.7",1.3,6.8,11.6,11.2,3.4,,,בינוי 8 | 2.2,2.4,2.9,5.8,7.9,-2.5,,,שימושים במקורות 9 | "",,,,,,,,צריכה 10 | "",,,,,,,:ההוצאה לצריכה ציבורית, 11 | 3.8,3.5,3.6,2.7,2.8,2.9,,,סך הכל 12 | 2.5,3.1,3.8,3.6,2.9,4.8,,,אזרחית 13 | 7.8,4.7,3.0,0.1,2.8,-1.7,,,ביטחונית 14 | 8.6,2.3,1.4,1.9,0.4,1.8,,הוצאה מקומית לביטחון, 15 | 3.4,6.8,11.5,6.4,13.3,-20.6,,,יבוא ביטחוני 16 | "",,,,,,:ההוצאה לצריכה פרטית לנפש,, 17 | 1.8,1.4,1.2,1.0,2.7,0.4,,,סך הכל 18 | 1.4,1.4,1.5,0.8,2.0,1.2,,למעט מוצרים בני-קיימא, 19 | 7.0,1.9,-2.0,3.8,9.3,-6.3,,,מוצרים בני-קיימא 20 | 1.6,1.4,1.3,1.2,2.5,0.8,ההוצאה לצריכה אינדיבידואלית 1)( לנפש,, 21 | 3.4,2.0,1.5,-0.1,0.5,-0.4,ההוצאה לצריכה קולקטיבית לנפש,, 22 | 1.8,1.5,1.4,1.0,2.2,0.6,סך כל ההוצאה לצריכה סופית לנפש(2),, 23 | "",,,,,,,,ההשקעה בנכסים קבועים 24 | "-2.7",1.1,3.2,14.5,10.2,-2.6,,,סך הכל 25 | "-1.2",1.2,8.6,12.1,13.0,8.0,,,בתי מגורים:סך הכל 26 | 
"-2.4",0.0,8.5,11.3,13.7,8.6,בנייה פרטית למגורים,, 27 | 14.8,19.1,9.4,25.6,2.5,0.0,בנייה ציבורית למגורים,, 28 | "-3.4",1.1,0.9,15.6,9.1,-6.3,,,ענפי המשק : סך הכל 29 | "-8.1",12.1,-1.7,7.7,5.7,1.0,בניינים שלא למגורים,, 30 | "-8.7",4.0,13.6,13.9,9.5,-5.1,עבודות בנייה אחרות,, 31 | 3.2,-12.1,-3.1,41.8,5.3,-20.0,,מכונות וציוד, 32 | "-2.4",16.3,-20.2,-2.0,27.6,-6.8,כלי תחבורה יבשתיים,, 33 | 0.6,1.5,0.9,6.5,15.1,-11.9,,,יצוא סחורות ושירותים 34 | "",,,,,,,,יבוא סחורות ושירותים 35 | 0.9,-0.1,2.5,10.7,15.1,-13.8,,,סך הכל 36 | 0.9,-0.3,2.2,10.8,15.1,-13.6,,,ללא יבוא ביטחוני -------------------------------------------------------------------------------- /src/main/java/technology/tabula/json/TableSerializer.java: -------------------------------------------------------------------------------- 
package technology.tabula.json;

import java.lang.reflect.Type;
import java.util.List;

import technology.tabula.RectangularTextContainer;
import technology.tabula.Table;

import com.google.gson.JsonArray;
import com.google.gson.JsonElement;
import com.google.gson.JsonObject;
import com.google.gson.JsonSerializationContext;
import com.google.gson.JsonSerializer;

/**
 * Gson serializer that renders a {@link Table} as a JSON object containing the
 * extraction method, the page number, the table's bounding box and a
 * two-dimensional {@code data} array with one inner array per table row.
 *
 * NOTE(review): the generic type arguments were missing from this copy of the
 * file and have been restored from how the values are used below - confirm
 * against the repository.
 */
public final class TableSerializer implements JsonSerializer<Table> {

    /** Shared instance; the private constructor prevents creating others. */
    public static final TableSerializer INSTANCE = new TableSerializer();

    private TableSerializer() {}

    /**
     * Builds the JSON representation of {@code table}.
     *
     * @param table   table to serialize
     * @param type    declared type of the source object (not used here)
     * @param context context used to serialize every cell of the table
     * @return a JSON object whose properties are added in the order
     *         extraction_method, page_number, top, left, width, height,
     *         right, bottom, data
     */
    @Override
    public JsonElement serialize(Table table, Type type, JsonSerializationContext context) {
        JsonObject json = new JsonObject();
        JsonArray data = new JsonArray();

        json.addProperty("extraction_method", table.getExtractionMethod());
        json.addProperty("page_number", table.getPageNumber());
        json.addProperty("top", table.getTop());
        json.addProperty("left", table.getLeft());
        json.addProperty("width", table.getWidth());
        json.addProperty("height", table.getHeight());
        json.addProperty("right", table.getRight());
        json.addProperty("bottom", table.getBottom());
        // "data" is attached while still empty; the rows are appended below.
        json.add("data", data);

        for (List<RectangularTextContainer> tableRow : table.getRows()) {
            JsonArray jsonRow = new JsonArray();
            for (RectangularTextContainer textChunk : tableRow) {
                // Cell serialization is delegated to the context.
                jsonRow.add(context.serialize(textChunk));
            }
            data.add(jsonRow);
        }

        return json;
    }

}
-------------------------------------------------------------------------------- /src/main/java/technology/tabula/Cell.java: -------------------------------------------------------------------------------- 
package technology.tabula;

import java.awt.geom.Point2D;
import java.util.Collections;

/**
 * A rectangular table cell holding text chunks, with two boolean markers
 * ({@code spanning} and {@code placeholder}) that both start out {@code false}.
 *
 * NOTE(review): the type argument of the superclass was missing from this copy
 * of the file; {@code TextChunk} is restored because {@link #getText(boolean)}
 * iterates the inherited {@code textElements} as {@code TextChunk} - confirm
 * against the repository.
 */
@SuppressWarnings("serial")
public class Cell extends RectangularTextContainer<TextChunk> {

    /**
     * Creates a cell from its top/left corner and its size.
     */
    public Cell(float top, float left, float width, float height) {
        super(top, left, width, height);
        this.setPlaceholder(false);
        this.setSpanning(false);
    }

    /**
     * Creates a cell from two opposite corners; width and height are the
     * coordinate differences between {@code bottomRight} and {@code topLeft}.
     */
    public Cell(Point2D topLeft, Point2D bottomRight) {
        super((float) topLeft.getY(), (float) topLeft.getX(), (float) (bottomRight.getX() - topLeft.getX()), (float) (bottomRight.getY() - topLeft.getY()));
        this.setPlaceholder(false);
        this.setSpanning(false);
    }

    private boolean spanning;
    private boolean placeholder;

    /**
     * Returns the concatenated text of the chunks in this cell.
     *
     * The chunks are first sorted in place with
     * {@code Rectangle.ILL_DEFINED_ORDER}, so calling this method reorders the
     * backing list. The result is trimmed.
     *
     * @param useLineReturns when {@code true}, a {@code '\r'} is inserted
     *                       before every chunk whose top is greater than the
     *                       top of the chunk before it
     * @return the cell text, or an empty string when the cell has no chunks
     */
    @Override
    public String getText(boolean useLineReturns) {
        if (this.textElements.isEmpty()) {
            return "";
        }
        StringBuilder sb = new StringBuilder();
        this.textElements.sort(Rectangle.ILL_DEFINED_ORDER);
        double curTop = this.textElements.get(0).getTop();
        for (TextChunk tc : this.textElements) {
            if (useLineReturns && tc.getTop() > curTop) {
                sb.append('\r');
            }
            sb.append(tc.getText());
            curTop = tc.getTop();
        }
        return sb.toString().trim();
    }

    /** Same as {@code getText(true)}. */
    @Override
    public String getText() {
        return getText(true);
    }

    public boolean isSpanning() {
        return spanning;
    }

    public void setSpanning(boolean spanning) {
        this.spanning = spanning;
    }

    public boolean isPlaceholder() {
        return placeholder;
    }

    public void setPlaceholder(boolean placeholder) {
        this.placeholder = placeholder;
    }
}
-------------------------------------------------------------------------------- /src/test/resources/technology/tabula/csv/us-020.csv: -------------------------------------------------------------------------------- 1 | "",,,,,,Combined 2 | "",Percentage of,,,,,weighted school 3 | "",international,National desired,Weighted school,Weighted school,,participation 4 | "",desired population,population overall,participation rate,participation rate,Weighted student,and student 5 | Education system,coverage,exclusion rate,before substitution,after substitution,response rate,response rate 6 | Australia,100,4,96,98,95,93 7 | Austria,100,5,100,100,98,98 8 | Azerbaijan,100,7,84,100,100,100 9 | Belgium (French)-BEL,100,6,77,85,97,82 10 | Bulgaria,100,3,97,100,95,95 11 | Canada,100,10,98,98,96,94 12 | Chinese Taipei-CHN,100,1,100,100,99,99 13 | Colombia,100,2,89,99,97,95 14 | Croatia,100,8,99,100,95,95 15 | Czech Republic,100,5,90,99,94,94 16 | Denmark,100,7,87,98,97,95 17 | England-GBR,100,2,73,87,94,82 18 | Finland,100,3,97,99,96,95 19 
| France,100,5,98,100,98,97 20 | Georgia,92,5,97,98,98,96 21 | Germany,100,2,96,99,96,95 22 | Hong Kong-CHN,100,12,86,88,94,83 23 | Hungary,100,4,98,99,97,96 24 | Indonesia,100,3,100,100,97,97 25 | "Iran, Islamic Rep. Of",100,5,100,100,99,99 26 | Ireland,100,3,98,100,95,95 27 | Israel,100,25,98,99,94,93 28 | Italy,100,4,81,98,96,95 29 | Lithuania,93,6,94,100,94,94 30 | Malta,100,4,100,100,95,95 31 | Morocco,100,2,99,99,96,95 32 | Netherlands,100,4,68,92,97,89 33 | New Zealand,100,3,93,99,94,93 34 | Northern Ireland-GBR,100,4,62,85,93,79 35 | Norway,100,4,57,83,86,71 36 | Oman,100,2,98,98,98,96 37 | Poland,100,4,100,100,96,96 38 | Portugal,100,3,87,99,95,93 39 | Qatar,100,6,100,100,99,99 40 | Romania,100,4,99,100,97,97 41 | Russian Federation,100,5,100,100,98,98 42 | Saudi Arabia,100,2,95,100,98,98 43 | Singapore,100,6,100,100,96,96 44 | Slovak Republic,100,5,95,99,97,96 45 | Slovenia,100,3,96,97,97,95 46 | Spain,100,5,96,99,97,96 47 | Sweden,100,4,97,99,92,91 48 | Trinidad and Tobago,100,1,99,99,96,95 49 | United Arab Emirates,100,3,100,100,97,97 50 | United States,100,7,80,85,96,81 -------------------------------------------------------------------------------- /src/test/resources/technology/tabula/icdar2013-dataset/competition-dataset-eu/eu-010-reg.xml: -------------------------------------------------------------------------------- 1 | 2 | 4 |
5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 |
42 | 43 | -------------------------------------------------------------------------------- /src/test/resources/technology/tabula/icdar2013-dataset/competition-dataset-us/us-010-reg.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 |
50 |
51 | -------------------------------------------------------------------------------- /src/test/resources/technology/tabula/icdar2013-dataset/competition-dataset-us/us-040-reg.xml: -------------------------------------------------------------------------------- 1 | 2 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 |
49 |
50 | -------------------------------------------------------------------------------- /src/main/java/technology/tabula/Line.java: -------------------------------------------------------------------------------- 1 | package technology.tabula; 2 | 3 | import java.util.ArrayList; 4 | import java.util.List; 5 | 6 | // TODO this class seems superfluous - get rid of it 7 | 8 | @SuppressWarnings("serial") 9 | public class Line extends Rectangle { 10 | 11 | List textChunks = new ArrayList<>(); 12 | public static final Character[] WHITE_SPACE_CHARS = { ' ', '\t', '\r', '\n', '\f' }; 13 | 14 | 15 | public List getTextElements() { 16 | return textChunks; 17 | } 18 | 19 | public void setTextElements(List textChunks) { 20 | this.textChunks = textChunks; 21 | } 22 | 23 | public void addTextChunk(int i, TextChunk textChunk) { 24 | if (i < 0) { 25 | throw new IllegalArgumentException("i can't be less than 0"); 26 | } 27 | 28 | int s = this.textChunks.size(); 29 | if (s < i + 1) { 30 | for (; s <= i; s++) { 31 | this.textChunks.add(null); 32 | } 33 | this.textChunks.set(i, textChunk); 34 | } 35 | else { 36 | this.textChunks.set(i, this.textChunks.get(i).merge(textChunk)); 37 | } 38 | this.merge(textChunk); 39 | } 40 | 41 | public void addTextChunk(TextChunk textChunk) { 42 | if (this.textChunks.isEmpty()) { 43 | this.setRect(textChunk); 44 | } 45 | else { 46 | this.merge(textChunk); 47 | } 48 | this.textChunks.add(textChunk); 49 | } 50 | 51 | @Override 52 | public String toString() { 53 | StringBuilder sb = new StringBuilder(); 54 | String s = super.toString(); 55 | sb.append(s, 0, s.length() - 1); 56 | sb.append(",chunks="); 57 | for (TextChunk te: this.textChunks) { 58 | sb.append("'" + te.getText() + "', "); 59 | } 60 | sb.append(']'); 61 | return sb.toString(); 62 | } 63 | 64 | static Line removeRepeatedCharacters(Line line, Character c, int minRunLength) { 65 | 66 | Line rv = new Line(); 67 | 68 | for(TextChunk t: line.getTextElements()) { 69 | for (TextChunk r: t.squeeze(c, 
minRunLength)) { 70 | rv.addTextChunk(r); 71 | } 72 | } 73 | 74 | return rv; 75 | } 76 | } 77 | -------------------------------------------------------------------------------- /src/test/resources/technology/tabula/csv/TestCommandLineApp_testGuessOption_no_guessing.csv: -------------------------------------------------------------------------------- 1 | E-PRTR pollutants and their thresholds 2 | "" 3 | A facility has to report data under E-PRTR if it fulfils the following criteria: 4 | • the facility falls under at least one of the 65 E-PRTR economic activities. The 5 | activities are also reported using a statistical classification of economic activities 6 | (NACE rev 2) 7 | • the facility has a capacity exceeding at least one of the E-PRTR capacity 8 | thresholds 9 | • the facility releases pollutants or transfers waste off-site which exceed specific 10 | thresholds set out in Article 5 of the E-PRTR Regulation. These thresholds for 11 | "releases of pollutants are specified for each media - air, water and land - in Annex" 12 | II of the E-PRTR Regulation. 13 | "" 14 | In the following tables you will find the 91 E-PRTR pollutants and their thresholds broken 15 | down by the 7 groups used in all the searches of the E-PRTR website. 
16 | "" 17 | "" 18 | Greenhouse gases 19 | "" 20 | THRESHOLD FOR RELEASES 21 | to air to water to land 22 | kg/year kg/year kg/year 23 | Carbon dioxide (CO2) 100 million - - 24 | Hydro-fluorocarbons (HFCs) 100 - - 25 | Methane (CH4) 100 000 - - 26 | Nitrous oxide (N2O) 10 000 - - 27 | Perfluorocarbons (PFCs) 100 - - 28 | Sulphur hexafluoride (SF6) 50 - - 29 | "" 30 | Other gases 31 | "" 32 | THRESHOLD FOR RELEASES 33 | to air to water to land 34 | kg/year kg/year kg/year 35 | Ammonia (NH3) 10 000 - - 36 | Carbon monoxide (CO) 500 000 - - 37 | Chlorine and inorganic compounds 38 | 10 000 - - 39 | (as HCl) 40 | Chlorofluorocarbons (CFCs) 1 - - 41 | Flourine and inorganic compounds 42 | 5 000 - - 43 | (as HF) 44 | Halons 1 - - 45 | Hydrochlorofluorocarbons (HCFCs) 1 - - 46 | Hydrogen Cyanide (HCN) 200 - - 47 | Nitrogen oxides (NOx/NO2) 100 000 - - 48 | Non-methane volatile organic 49 | 100 000 - - 50 | compounds (NMVOC) 51 | Sulphur oxides (SOx/SO2) 150 000 - - 52 | "" 53 | Heavy metals 54 | "" 55 | THRESHOLD FOR RELEASES 56 | to air to water to land 57 | kg/year kg/year kg/year 58 | Arsenic and compounds (as As) 20 5 5 59 | Cadmium and compounds (as Cd) 10 5 5 60 | Chromium and compounds (as Cr) 100 50 50 61 | Copper and compounds (as Cu) 100 50 50 62 | Lead and compounds (as Pb) 200 20 20 63 | Mercury and compounds (as Hg) 10 1 1 64 | Nickel and compounds (as Ni) 50 20 20 65 | Zinc and compounds (as Zn) 200 100 100 66 | -------------------------------------------------------------------------------- /src/test/resources/technology/tabula/icdar2013-dataset/competition-dataset-us/us-034-reg.xml: -------------------------------------------------------------------------------- 1 | 2 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 |
29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 |
54 |
55 | -------------------------------------------------------------------------------- /src/test/resources/technology/tabula/icdar2013-dataset/competition-dataset-eu/eu-014-reg.xml: -------------------------------------------------------------------------------- 1 | 2 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 |
46 |
47 | -------------------------------------------------------------------------------- /src/test/resources/technology/tabula/csv/argentina_diputados_voting_record.csv: -------------------------------------------------------------------------------- 1 | "ABDALA de MATARAZZO, Norma Amanda",Frente Cívico por Santiago,Santiago del Estero,AFIRMATIVO 2 | "ALBRIEU, Oscar Edmundo Nicolas",Frente para la Victoria - PJ,Rio Negro,AFIRMATIVO 3 | "ALONSO, María Luz",Frente para la Victoria - PJ,La Pampa,AFIRMATIVO 4 | "ARENA, Celia Isabel",Frente para la Victoria - PJ,Santa Fe,AFIRMATIVO 5 | "ARREGUI, Andrés Roberto",Frente para la Victoria - PJ,Buenos Aires,AFIRMATIVO 6 | "AVOSCAN, Herman Horacio",Frente para la Victoria - PJ,Rio Negro,AFIRMATIVO 7 | "BALCEDO, María Ester",Frente para la Victoria - PJ,Buenos Aires,AFIRMATIVO 8 | "BARRANDEGUY, Raúl Enrique",Frente para la Victoria - PJ,Entre Ríos,AFIRMATIVO 9 | "BASTERRA, Luis Eugenio",Frente para la Victoria - PJ,Formosa,AFIRMATIVO 10 | "BEDANO, Nora Esther",Frente para la Victoria - PJ,Córdoba,AFIRMATIVO 11 | "BERNAL, María Eugenia",Frente para la Victoria - PJ,Jujuy,AFIRMATIVO 12 | "BERTONE, Rosana Andrea",Frente para la Victoria - PJ,Tierra del Fuego,AFIRMATIVO 13 | "BIANCHI, María del Carmen",Frente para la Victoria - PJ,Cdad. Aut. Bs. As.,AFIRMATIVO 14 | "BIDEGAIN, Gloria Mercedes",Frente para la Victoria - PJ,Buenos Aires,AFIRMATIVO 15 | "BRAWER, Mara",Frente para la Victoria - PJ,Cdad. Aut. Bs. 
As.,AFIRMATIVO 16 | "BRILLO, José Ricardo",Movimiento Popular Neuquino,Neuquén,AFIRMATIVO 17 | "BROMBERG, Isaac Benjamín",Frente para la Victoria - PJ,Tucumán,AFIRMATIVO 18 | "BRUE, Daniel Agustín",Frente Cívico por Santiago,Santiago del Estero,AFIRMATIVO 19 | "CALCAGNO, Eric",Frente para la Victoria - PJ,Buenos Aires,AFIRMATIVO 20 | "CARLOTTO, Remo Gerardo",Frente para la Victoria - PJ,Buenos Aires,AFIRMATIVO 21 | "CARMONA, Guillermo Ramón",Frente para la Victoria - PJ,Mendoza,AFIRMATIVO 22 | "CATALAN MAGNI, Julio César",Frente para la Victoria - PJ,Tierra del Fuego,AFIRMATIVO 23 | "CEJAS, Jorge Alberto",Frente para la Victoria - PJ,Rio Negro,AFIRMATIVO 24 | "CHIENO, María Elena",Frente para la Victoria - PJ,Corrientes,AFIRMATIVO 25 | "CIAMPINI, José Alberto",Frente para la Victoria - PJ,Neuquén,AFIRMATIVO 26 | "CIGOGNA, Luis Francisco Jorge",Frente para la Victoria - PJ,Buenos Aires,AFIRMATIVO 27 | "CLERI, Marcos",Frente para la Victoria - PJ,Santa Fe,AFIRMATIVO 28 | "COMELLI, Alicia Marcela",Movimiento Popular Neuquino,Neuquén,AFIRMATIVO 29 | "CONTI, Diana Beatriz",Frente para la Victoria - PJ,Buenos Aires,AFIRMATIVO 30 | "CORDOBA, Stella Maris",Frente para la Victoria - PJ,Tucumán,AFIRMATIVO 31 | "CURRILEN, Oscar Rubén",Frente para la Victoria - PJ,Chubut,AFIRMATIVO -------------------------------------------------------------------------------- /src/main/java/technology/tabula/writers/JSONWriter.java: -------------------------------------------------------------------------------- 
package technology.tabula.writers;

import com.google.gson.ExclusionStrategy;
import com.google.gson.FieldAttributes;
import com.google.gson.Gson;
import com.google.gson.GsonBuilder;
import com.google.gson.JsonArray;

import technology.tabula.Cell;
import technology.tabula.RectangularTextContainer;
import technology.tabula.Table;
import technology.tabula.TextChunk;
import technology.tabula.json.RectangularTextContainerSerializer;
import technology.tabula.json.TableSerializer;

import java.io.IOException;
import java.util.List;

import static java.lang.reflect.Modifier.PUBLIC;

/**
 * Writes one table, or a list of tables, as JSON to an {@link Appendable}.
 *
 * NOTE(review): generic type arguments were missing from this copy of the file
 * ({@code Class<?>}, {@code List<Table>}); they are restored from the Gson
 * interface being implemented and from the for-each loop below - confirm
 * against the repository.
 */
public class JSONWriter implements Writer {

    /** Skips no class, and skips every field that is not {@code public}. */
    private static final ExclusionStrategy ALL_CLASSES_SKIPPING_NON_PUBLIC_FIELDS = new ExclusionStrategy() {
        @Override
        public boolean shouldSkipClass(Class<?> c) {
            return false;
        }

        @Override
        public boolean shouldSkipField(FieldAttributes fieldAttributes) {
            return !fieldAttributes.hasModifier(PUBLIC);
        }
    };

    // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - //
    /**
     * Appends the JSON form of a single table to {@code out}.
     *
     * @throws IOException if appending to {@code out} fails
     */
    @Override
    public void write(Appendable out, Table table) throws IOException {
        out.append(gson().toJson(table, Table.class));
    }

    /**
     * Appends a JSON array with one element per table to {@code out}.
     *
     * @throws IOException if appending to {@code out} fails
     */
    @Override
    public void write(Appendable out, List<Table> tables) throws IOException {
        Gson gson = gson();
        JsonArray jsonElements = new JsonArray();
        for (Table table : tables) {
            jsonElements.add(gson.toJsonTree(table, Table.class));
        }
        out.append(gson.toJson(jsonElements));
    }

    // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - //
    /**
     * Creates a Gson instance with the exclusion strategy above and the
     * project serializers registered for {@code Table},
     * {@code RectangularTextContainer}, {@code Cell} and {@code TextChunk}.
     */
    private static Gson gson() {
        return new GsonBuilder()
                .addSerializationExclusionStrategy(ALL_CLASSES_SKIPPING_NON_PUBLIC_FIELDS)
                .registerTypeAdapter(Table.class, TableSerializer.INSTANCE)
                .registerTypeAdapter(RectangularTextContainer.class, RectangularTextContainerSerializer.INSTANCE)
                .registerTypeAdapter(Cell.class, RectangularTextContainerSerializer.INSTANCE)
                .registerTypeAdapter(TextChunk.class, RectangularTextContainerSerializer.INSTANCE)
                .create();
    }

}
-------------------------------------------------------------------------------- /src/test/java/technology/tabula/TestLine.java: -------------------------------------------------------------------------------- 
package technology.tabula;

import static org.junit.Assert.*;

import java.util.ArrayList;
import java.util.List;

import org.apache.pdfbox.pdmodel.font.PDType1Font;
import org.apache.pdfbox.pdmodel.font.Standard14Fonts;
import org.junit.Test;

/**
 * Unit tests for {@link Line}. Every test uses the same fixture: a chunk with
 * the text "test" whose four leading numeric constructor arguments are all 0.
 */
public class TestLine {

    /** Builds the "test" chunk shared by all tests in this class. */
    private static TextChunk newTestChunk() {
        TextElement tElement = new TextElement(0, 0, 0, 0, new PDType1Font(Standard14Fonts.FontName.HELVETICA_BOLD), 10, "test", 5);
        return new TextChunk(tElement);
    }

    @Test
    public void testSetTextElements() {
        Line line = new Line();

        TextChunk tChunk = newTestChunk();
        List<TextChunk> tList = new ArrayList<>();
        tList.add(tChunk);
        line.setTextElements(tList);

        assertEquals("test", line.getTextElements().get(0).getText());

    }

    @Test
    public void testAddTextChunkIntTextChunk() {
        Line line = new Line();

        TextChunk tChunk = newTestChunk();
        line.addTextChunk(3, tChunk);

        assertEquals("test", line.getTextElements().get(3).getText());
    }

    @Test
    public void testLessThanAddTextChunkIntTextChunk() {
        Line line = new Line();

        // The same chunk instance is added twice on purpose: the second call
        // merges into the slot filled by the first one.
        TextChunk tChunk = newTestChunk();
        line.addTextChunk(0, tChunk);
        line.addTextChunk(0, tChunk);

        assertEquals("testtest", line.getTextElements().get(0).getText());
    }

    @Test(expected = IllegalArgumentException.class)
    public void testErrorAddTextChunkIntTextChunk() {
        Line line = new Line();

        TextChunk tChunk = newTestChunk();
        line.addTextChunk(-1, tChunk);
    }

    @Test
    public void testToString() {
        Line line = new Line();

        TextChunk tChunk = newTestChunk();
        line.addTextChunk(0, tChunk);
        line.addTextChunk(0, tChunk);

        assertEquals("technology.tabula.Line[x=0.0,y=0.0,w=0.0,h=0.0,bottom=0.000000,right=0.000000,chunks='testtest', ]", line.toString());
    }

}
-------------------------------------------------------------------------------- /src/test/resources/technology/tabula/icdar2013-dataset/competition-dataset-us/us-011b-reg.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 |
4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 |
41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 |
65 | 66 | -------------------------------------------------------------------------------- /src/main/java/technology/tabula/ObjectExtractor.java: -------------------------------------------------------------------------------- 
package technology.tabula;

import java.io.IOException;

import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;

/**
 * Builds {@link Page} objects from the pages of a PDFBox document. Closing the
 * extractor closes the document it was created with.
 */
public class ObjectExtractor implements java.io.Closeable {

    private final PDDocument pdfDocument;

    public ObjectExtractor(PDDocument pdfDocument) {
        this.pdfDocument = pdfDocument;
    }

    // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - //
    /**
     * Extracts the page with the given one-based number.
     *
     * @param pageNumber one-based page number
     * @return the page with its rulings, text elements, minimum character
     *         size and spatial index filled in
     * @throws IndexOutOfBoundsException if {@code pageNumber} is below 1 or
     *                                   above the document's page count
     * @throws IOException               declared for the page processing calls
     */
    protected Page extractPage(Integer pageNumber) throws IOException {
        boolean outOfRange = pageNumber > pdfDocument.getNumberOfPages() || pageNumber < 1;
        if (outOfRange) {
            throw new java.lang.IndexOutOfBoundsException("Page number does not exist.");
        }

        // PDFBox pages are addressed zero-based.
        PDPage pdPage = pdfDocument.getPage(pageNumber - 1);

        ObjectExtractorStreamEngine engine = new ObjectExtractorStreamEngine(pdPage);
        engine.processPage(pdPage);

        TextStripper stripper = new TextStripper(pdfDocument, pageNumber);
        stripper.process();

        Utils.sort(stripper.getTextElements(), Rectangle.ILL_DEFINED_ORDER);

        // For a rotation of 90 or 270 degrees (either sign) the crop box
        // width and height swap roles.
        int rotation = pdPage.getRotation();
        int absRotation = Math.abs(rotation);
        boolean sideways = absRotation == 90 || absRotation == 270;
        float boxWidth = pdPage.getCropBox().getWidth();
        float boxHeight = pdPage.getCropBox().getHeight();
        float width = sideways ? boxHeight : boxWidth;
        float height = sideways ? boxWidth : boxHeight;

        return Page.Builder.newInstance()
                .withPageDims(PageDims.of(0, 0, width, height))
                .withRotation(rotation)
                .withNumber(pageNumber)
                .withPdPage(pdPage)
                .withPdDocument(pdfDocument)
                .withRulings(engine.rulings)
                .withTextElements(stripper.getTextElements())
                .withMinCharWidth(stripper.getMinCharWidth())
                .withMinCharHeight(stripper.getMinCharHeight())
                .withIndex(stripper.getSpatialIndex())
                .build();
    }

    // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - //
    /**
     * Returns an iterator over the given pages.
     *
     * NOTE(review): the parameter is a raw {@code Iterable} in this copy of
     * the file; presumably it carries one-based {@code Integer} page numbers -
     * confirm against the repository.
     */
    public PageIterator extract(Iterable pages) {
        return new PageIterator(this, pages);
    }

    /** Returns an iterator over every page, from 1 to the page count. */
    public PageIterator extract() {
        int pageCount = pdfDocument.getNumberOfPages();
        return extract(Utils.range(1, pageCount + 1));
    }

    /** Extracts the single page with the given one-based number. */
    public Page extract(int pageNumber) {
        PageIterator singlePage = extract(Utils.range(pageNumber, pageNumber + 1));
        return singlePage.next();
    }

    // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - //
    /** Closes the underlying PDF document. */
    public void close() throws IOException {
        pdfDocument.close();
    }

}
-------------------------------------------------------------------------------- /src/test/resources/technology/tabula/icdar2013-dataset/competition-dataset-eu/eu-011-reg.xml: -------------------------------------------------------------------------------- 1 | 2 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 |
60 |
61 | -------------------------------------------------------------------------------- /src/test/java/technology/tabula/TestRuling.java: -------------------------------------------------------------------------------- 
package technology.tabula;

import static org.junit.Assert.*;

import org.junit.Before;
import org.junit.Test;

/**
 * Unit tests for {@link Ruling}. Unless a test builds its own instance, it
 * runs against the fixture created as {@code new Ruling(0, 0, 10, 10)}.
 */
public class TestRuling {

    Ruling ruling;

    @Before
    public void setUpRuling() {
        ruling = new Ruling(0, 0, 10, 10);
    }

    @Test
    public void testGetWidth() {
        assertEquals(10f, ruling.getWidth(), 1e-5);
    }

    @Test
    public void testGetHeight() {
        assertEquals(10f, ruling.getHeight(), 1e-5);
    }

    @Test
    public void testToString() {
        assertEquals("class technology.tabula.Ruling[x1=0.000000 y1=0.000000 x2=10.000000 y2=10.000000]",ruling.toString());
    }

    @Test
    public void testEqualsOther() {
        Ruling other = new Ruling(0, 0, 11, 10);
        // Reflexive case, as before.
        assertTrue(ruling.equals(ruling));
        // 'other' used to be created but never compared. It is built from
        // different arguments than the fixture, so it is presumably not equal
        // to it - confirm against Ruling.equals.
        assertFalse(ruling.equals(other));
    }

    @Test
    public void testEqualsDifferentInstance() {
        assertFalse(ruling.equals("test"));
    }

    @Test
    public void testNearlyIntersects(){
        Ruling another = new Ruling(0, 0, 11, 10);

        assertTrue(ruling.nearlyIntersects(another));
    }

    // In the tests below the call is expected to throw, so the trailing
    // fail() is only reached when no exception was raised.

    @Test(expected = UnsupportedOperationException.class)
    public void testGetPositionError(){
        Ruling other = new Ruling(0, 0, 1, 1);
        other.getPosition();
        fail();
    }

    @Test(expected = UnsupportedOperationException.class)
    public void testSetPositionError(){
        Ruling other = new Ruling(0, 0, 1, 1);
        other.setPosition(5f);
        fail();
    }

    @Test(expected = UnsupportedOperationException.class)
    public void testsetPosition(){
        ruling.setPosition(0);
    }

    @Test(expected = UnsupportedOperationException.class)
    public void testGetStartError(){
        Ruling other = new Ruling(0, 0, 1, 1);
        other.getStart();
        fail();
    }

    @Test(expected = UnsupportedOperationException.class)
    public void testGetEndError(){
        Ruling other = new Ruling(0, 0, 1, 1);
        other.getEnd();
        fail();
    }

    @Test(expected = UnsupportedOperationException.class)
    public void testSetEndError(){
        Ruling other = new Ruling(0, 0, 1, 1);
        other.setEnd(5f);
        fail();
    }


    @Test
    public void testColinear(){
//      Ruling another = new Ruling(0, 0, 500, 5);
        java.awt.geom.Point2D.Float float1 = new java.awt.geom.Point2D.Float(20, 20);
        java.awt.geom.Point2D.Float float2 = new java.awt.geom.Point2D.Float(0, 0);
        java.awt.geom.Point2D.Float float3 = new java.awt.geom.Point2D.Float(20, 0);
        java.awt.geom.Point2D.Float float4 = new java.awt.geom.Point2D.Float(0, 20);

        assertFalse(ruling.colinear(float1));
        assertTrue(ruling.colinear(float2));
        assertFalse(ruling.colinear(float3));
        assertFalse(ruling.colinear(float4));


    }

}
-------------------------------------------------------------------------------- /src/test/resources/technology/tabula/json/AnimalSounds1.json: -------------------------------------------------------------------------------- 1 | 
[{"extraction_method":"lattice","page_number":1,"top":0.006499578,"left":56.8,"width":241.1999969482422,"height":315.36407470703125,"right":298.0,"bottom":315.37057,"data":[[{"top":0.006499578,"left":56.8,"width":79.19999694824219,"height":95.31405639648438,"text":"Animal"},{"top":0.006499578,"left":136.0,"width":61.0,"height":95.31405639648438,"text":"Action"},{"top":0.006499578,"left":197.0,"width":101.0,"height":95.31405639648438,"text":"Result"}],[{"top":95.32056,"left":56.8,"width":79.19999694824219,"height":23.050010681152344,"text":"Cat"},{"top":95.32056,"left":136.0,"width":61.0,"height":23.050010681152344,"text":"Says"},{"top":95.32056,"left":197.0,"width":101.0,"height":23.050010681152344,"text":"Meow"}],[{"top":118.37057,"left":56.8,"width":79.19999694824219,"height":63.99999237060547,"text":"Parastratiosph\recomyiastratio\rsphecomyioid\res"},{"top":118.37057,"left":136.0,"width":61.0,"height":63.99999237060547,"text":"Says"},{"top":118.37057,"left":197.0,"width":101.0,"height":63.99999237060547,"text":"bzzzzzzz"}],[{"top":182.37056,"left":56.8,"width":79.19999694824219,"height":133.00001525878906,"text":"Fox"},{"top":182.37056,"left":136.0,"width":61.0,"height":133.00001525878906,"text":"Says"},{"top":182.37056,"left":197.0,"width":101.0,"height":133.00001525878906,"text":"Ring-\rdingdingdingd\ringeringeding\rGering-\rdingdingdingd\ringeringeding\rGering-\rdingdingdingd\ringeringeding"}]]},{"extraction_method":"lattice","page_number":1,"top":0.006499578,"left":313.35715,"width":241.55941772460938,"height":259.2640380859375,"right":554.91656,"bottom":259.27054,"data":[[{"top":0.006499578,"left":313.35715,"width":77.64285278320312,"height":72.26405334472656,"text":""},{"top":0.006499578,"left":391.0,"width":66.0,"height":72.26405334472656,"text":""},{"top":0.006499578,"left":457.0,"width":97.91656494140625,"height":72.26405334472656,"text":""}],[{"top":72.27055,"left":313.35715,"width":77.64285278320312,"height":23.050003051757812,"text":"Animal"},{"top":7
2.27055,"left":391.0,"width":66.0,"height":23.050003051757812,"text":"Action"},{"top":72.27055,"left":457.0,"width":97.91656494140625,"height":23.050003051757812,"text":"Result"}],[{"top":95.32056,"left":313.35715,"width":77.64285278320312,"height":35.94999694824219,"text":"Dogs/wolves/\rMore dogs"},{"top":95.32056,"left":391.0,"width":66.0,"height":35.94999694824219,"text":"Says"},{"top":95.32056,"left":457.0,"width":97.91656494140625,"height":35.94999694824219,"text":"Bow-wow/\rruff-ruff"}],[{"top":131.27055,"left":313.35715,"width":77.64285278320312,"height":36.40000915527344,"text":"Donkey"},{"top":131.27055,"left":391.0,"width":66.0,"height":36.40000915527344,"text":"Says"},{"top":131.27055,"left":457.0,"width":97.91656494140625,"height":36.40000915527344,"text":"Hee-Haw Hee-\rHaw"}],[{"top":167.67056,"left":313.35715,"width":77.64285278320312,"height":91.5999755859375,"text":"Fox"},{"top":167.67056,"left":391.0,"width":66.0,"height":91.5999755859375,"text":"Says"},{"top":167.67056,"left":457.0,"width":97.91656494140625,"height":91.5999755859375,"text":"Wa-pa-pa-pa-\rpa-pa-pow\rWa-pa-pa-pa-\rpa-pow\rWa-pa-pa-pa-\rpa-pa-pow"}]]}] 2 | -------------------------------------------------------------------------------- /src/test/resources/technology/tabula/icdar2013-dataset/competition-dataset-us/us-011a-reg.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 |
46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 |
70 |
71 | -------------------------------------------------------------------------------- /src/test/java/technology/tabula/UtilsForTesting.java: -------------------------------------------------------------------------------- 1 | package technology.tabula; 2 | 3 | import java.io.*; 4 | import java.nio.charset.Charset; 5 | import java.util.List; 6 | 7 | import org.apache.commons.csv.CSVFormat; 8 | import org.apache.commons.csv.CSVParser; 9 | import org.apache.commons.csv.CSVPrinter; 10 | import org.apache.pdfbox.Loader; 11 | import org.apache.pdfbox.pdmodel.PDDocument; 12 | import org.junit.Assert; 13 | 14 | public class UtilsForTesting { 15 | 16 | public static Page getAreaFromFirstPage(String path, float top, float left, float bottom, float right) throws IOException { 17 | return getAreaFromPage(path, 1, top, left, bottom, right); 18 | } 19 | 20 | public static Page getAreaFromPage(String path, int page, float top, float left, float bottom, float right) throws IOException { 21 | return getPage(path, page).getArea(top, left, bottom, right); 22 | } 23 | 24 | public static Page getPage(String path, int pageNumber) throws IOException { 25 | ObjectExtractor oe = null; 26 | try { 27 | PDDocument document = Loader.loadPDF(new File(path)); 28 | oe = new ObjectExtractor(document); 29 | return oe.extract(pageNumber); 30 | } finally { 31 | if (oe != null) 32 | oe.close(); 33 | } 34 | } 35 | 36 | public static String[][] tableToArrayOfRows(Table table) { 37 | List> tableRows = table.getRows(); 38 | 39 | int maxColCount = 0; 40 | 41 | for (int i = 0; i < tableRows.size(); i++) { 42 | List row = tableRows.get(i); 43 | if (maxColCount < row.size()) { 44 | maxColCount = row.size(); 45 | } 46 | } 47 | 48 | Assert.assertEquals(maxColCount, table.getColCount()); 49 | 50 | String[][] rv = new String[tableRows.size()][maxColCount]; 51 | 52 | for (int i = 0; i < tableRows.size(); i++) { 53 | List row = tableRows.get(i); 54 | for (int j = 0; j < row.size(); j++) { 55 | rv[i][j] = 
table.getCell(i, j).getText(); 56 | } 57 | } 58 | 59 | return rv; 60 | } 61 | 62 | public static String loadJson(String path) throws IOException { 63 | 64 | StringBuilder stringBuilder = new StringBuilder(); 65 | try (BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(path), "UTF-8"))) { 66 | String line = null; 67 | while ((line = reader.readLine()) != null) { 68 | stringBuilder.append(line); 69 | } 70 | } 71 | 72 | return stringBuilder.toString(); 73 | 74 | } 75 | 76 | public static String loadCsv(String path) throws IOException { 77 | 78 | StringBuilder out = new StringBuilder(); 79 | CSVParser parse = org.apache.commons.csv.CSVParser.parse(new File(path), Charset.forName("utf-8"), CSVFormat.EXCEL); 80 | 81 | CSVPrinter printer = new CSVPrinter(out, CSVFormat.EXCEL); 82 | printer.printRecords(parse); 83 | printer.close(); 84 | 85 | String csv = out.toString().replaceAll("(? 2 | 3 | 4 | 5 | 6 | 7 | Income level of individual or geography 8 | 9 | 10 | 11 | 12 | 13 | 14 | % of the area median income 15 | 16 | 17 | 18 | 19 | 20 | Low-income 21 | 22 | 23 | 24 | 25 | 26 | Less than 50 27 | 28 | 29 | 30 | 31 | Moderate-income 32 | 33 | 34 | 35 | 36 | At least 50 and less than 80 37 | 38 | 39 | 40 | 41 | Middle-income 42 | 43 | 44 | 45 | 46 | 47 | 48 | At least 80 and less than 120 49 | 50 | 51 | 52 | 53 | Upper-income 54 | 55 | 56 | 57 | 58 | 120 or more 59 | 60 | 61 | 62 |
63 |
64 | -------------------------------------------------------------------------------- /src/test/resources/technology/tabula/csv/TestSpreadsheetExtractor-CELLS.csv: -------------------------------------------------------------------------------- 1 | 40.0,18.0,208.0,4.0 2 | 44.0,18.0,52.0,6.0 3 | 50.0,18.0,52.0,4.0 4 | 54.0,18.0,52.0,6.0 5 | 60.0,18.0,52.0,4.0 6 | 64.0,18.0,52.0,6.0 7 | 70.0,18.0,52.0,4.0 8 | 74.0,18.0,52.0,6.0 9 | 90.0,18.0,52.0,4.0 10 | 94.0,18.0,52.0,6.0 11 | 100.0,18.0,52.0,28.0 12 | 128.0,18.0,52.0,4.0 13 | 132.0,18.0,52.0,64.0 14 | 196.0,18.0,52.0,66.0 15 | 262.0,18.0,52.0,4.0 16 | 266.0,18.0,52.0,84.0 17 | 350.0,18.0,52.0,4.0 18 | 354.0,18.0,52.0,32.0 19 | 386.0,18.0,52.0,38.0 20 | 424.0,18.0,52.0,18.0 21 | 442.0,18.0,52.0,74.0 22 | 516.0,18.0,52.0,28.0 23 | 544.0,18.0,52.0,4.0 24 | 44.0,70.0,156.0,6.0 25 | 50.0,70.0,156.0,4.0 26 | 54.0,70.0,156.0,6.0 27 | 60.0,70.0,156.0,4.0 28 | 64.0,70.0,156.0,6.0 29 | 70.0,70.0,156.0,4.0 30 | 74.0,70.0,156.0,6.0 31 | 84.0,70.0,2.0,6.0 32 | 90.0,70.0,156.0,4.0 33 | 94.0,70.0,156.0,6.0 34 | 100.0,70.0,156.0,28.0 35 | 128.0,70.0,156.0,4.0 36 | 132.0,70.0,156.0,64.0 37 | 196.0,70.0,156.0,66.0 38 | 262.0,70.0,156.0,4.0 39 | 266.0,70.0,156.0,84.0 40 | 350.0,70.0,156.0,4.0 41 | 354.0,70.0,156.0,32.0 42 | 386.0,70.0,156.0,38.0 43 | 424.0,70.0,156.0,18.0 44 | 442.0,70.0,156.0,74.0 45 | 516.0,70.0,156.0,28.0 46 | 544.0,70.0,156.0,4.0 47 | 84.0,72.0,446.0,6.0 48 | 90.0,226.0,176.0,4.0 49 | 94.0,226.0,176.0,6.0 50 | 100.0,226.0,176.0,28.0 51 | 128.0,226.0,176.0,4.0 52 | 132.0,226.0,176.0,64.0 53 | 196.0,226.0,176.0,66.0 54 | 262.0,226.0,176.0,4.0 55 | 266.0,226.0,176.0,84.0 56 | 350.0,226.0,176.0,4.0 57 | 354.0,226.0,176.0,32.0 58 | 386.0,226.0,176.0,38.0 59 | 424.0,226.0,176.0,18.0 60 | 442.0,226.0,176.0,74.0 61 | 516.0,226.0,176.0,28.0 62 | 544.0,226.0,176.0,4.0 63 | 90.0,402.0,116.0,4.0 64 | 94.0,402.0,116.0,6.0 65 | 100.0,402.0,116.0,28.0 66 | 128.0,402.0,116.0,4.0 67 | 132.0,402.0,116.0,64.0 68 | 
196.0,402.0,116.0,66.0 69 | 262.0,402.0,116.0,4.0 70 | 266.0,402.0,116.0,84.0 71 | 350.0,402.0,116.0,4.0 72 | 354.0,402.0,116.0,32.0 73 | 386.0,402.0,116.0,38.0 74 | 424.0,402.0,116.0,18.0 75 | 442.0,402.0,116.0,74.0 76 | 516.0,402.0,116.0,28.0 77 | 544.0,402.0,116.0,4.0 78 | 84.0,518.0,246.0,6.0 79 | 90.0,518.0,186.0,4.0 80 | 94.0,518.0,186.0,6.0 81 | 100.0,518.0,186.0,28.0 82 | 128.0,518.0,186.0,4.0 83 | 132.0,518.0,186.0,64.0 84 | 196.0,518.0,186.0,66.0 85 | 262.0,518.0,186.0,4.0 86 | 266.0,518.0,186.0,84.0 87 | 350.0,518.0,186.0,4.0 88 | 354.0,518.0,186.0,32.0 89 | 386.0,518.0,186.0,38.0 90 | 424.0,518.0,186.0,18.0 91 | 442.0,518.0,186.0,74.0 92 | 516.0,518.0,186.0,28.0 93 | 544.0,518.0,186.0,4.0 94 | 90.0,704.0,60.0,4.0 95 | 94.0,704.0,60.0,6.0 96 | 100.0,704.0,60.0,28.0 97 | 128.0,704.0,60.0,4.0 98 | 132.0,704.0,60.0,64.0 99 | 196.0,704.0,60.0,66.0 100 | 262.0,704.0,60.0,4.0 101 | 266.0,704.0,60.0,84.0 102 | 350.0,704.0,60.0,4.0 103 | 354.0,704.0,60.0,32.0 104 | 386.0,704.0,60.0,38.0 105 | 424.0,704.0,60.0,18.0 106 | 442.0,704.0,60.0,74.0 107 | 516.0,704.0,60.0,28.0 108 | 544.0,704.0,60.0,4.0 109 | 84.0,764.0,216.0,6.0 110 | 90.0,764.0,216.0,4.0 111 | 94.0,764.0,216.0,6.0 112 | 100.0,764.0,216.0,28.0 113 | 128.0,764.0,216.0,4.0 114 | 132.0,764.0,216.0,64.0 115 | 196.0,764.0,216.0,66.0 116 | 262.0,764.0,216.0,4.0 117 | 266.0,764.0,216.0,84.0 118 | 350.0,764.0,216.0,4.0 119 | 354.0,764.0,216.0,32.0 120 | 386.0,764.0,216.0,38.0 121 | 424.0,764.0,216.0,18.0 122 | 442.0,764.0,216.0,74.0 123 | 516.0,764.0,216.0,28.0 124 | 544.0,764.0,216.0,4.0f 125 | -------------------------------------------------------------------------------- /src/main/java/technology/tabula/TableWithRulingLines.java: -------------------------------------------------------------------------------- 1 | package technology.tabula; 2 | 3 | import java.util.ArrayList; 4 | import java.util.Collections; 5 | import java.util.Comparator; 6 | import java.util.Iterator; 7 | import java.util.List; 8 | 9 | 
import technology.tabula.extractors.ExtractionAlgorithm; 10 | 11 | @SuppressWarnings("serial") 12 | public class TableWithRulingLines extends Table { 13 | 14 | List verticalRulings, horizontalRulings; 15 | RectangleSpatialIndex si = new RectangleSpatialIndex<>(); 16 | 17 | public TableWithRulingLines(Rectangle area, List cells, List horizontalRulings, List verticalRulings, ExtractionAlgorithm extractionAlgorithm, int pageNumber) { 18 | super(extractionAlgorithm); 19 | this.setRect(area); 20 | this.verticalRulings = verticalRulings; 21 | this.horizontalRulings = horizontalRulings; 22 | this.addCells(cells); 23 | this.setPageNumber(pageNumber); 24 | } 25 | 26 | private void addCells(List cells) { 27 | 28 | if (cells.isEmpty()) { 29 | return; 30 | } 31 | 32 | for (Cell ce: cells) { 33 | si.add(ce); 34 | } 35 | 36 | List> rowsOfCells = rowsOfCells(cells); 37 | for (int i = 0; i < rowsOfCells.size(); i++) { 38 | List row = rowsOfCells.get(i); 39 | Iterator rowCells = row.iterator(); 40 | Cell cell = rowCells.next(); 41 | List> others = rowsOfCells( 42 | si.contains( 43 | new Rectangle(cell.getBottom(), si.getBounds().getLeft(), cell.getLeft() - si.getBounds().getLeft(), 44 | si.getBounds().getBottom() - cell.getBottom()) 45 | )); 46 | int startColumn = 0; 47 | for (List r: others) { 48 | startColumn = Math.max(startColumn, r.size()); 49 | } 50 | this.add(cell, i, startColumn++); 51 | while (rowCells.hasNext()) { 52 | this.add(rowCells.next(), i, startColumn++); 53 | } 54 | } 55 | } 56 | 57 | private static List> rowsOfCells(List cells) { 58 | Cell c; 59 | float lastTop; 60 | List> rv = new ArrayList<>(); 61 | List lastRow; 62 | 63 | if (cells.isEmpty()) { 64 | return rv; 65 | } 66 | 67 | Collections.sort(cells, new Comparator() { 68 | @Override 69 | public int compare(Cell arg0, Cell arg1) { 70 | return java.lang.Double.compare(arg0.getTop(), arg1.getTop()); 71 | } 72 | }); 73 | 74 | 75 | Iterator iter = cells.iterator(); 76 | c = iter.next(); 77 | lastTop = c.getTop(); 
78 | lastRow = new ArrayList<>(); 79 | lastRow.add(c); 80 | rv.add(lastRow); 81 | 82 | while (iter.hasNext()) { 83 | c = iter.next(); 84 | if (!Utils.feq(c.getTop(), lastTop)) { 85 | lastRow = new ArrayList<>(); 86 | rv.add(lastRow); 87 | } 88 | lastRow.add(c); 89 | lastTop = c.getTop(); 90 | } 91 | return rv; 92 | } 93 | 94 | } 95 | -------------------------------------------------------------------------------- /src/main/java/technology/tabula/Table.java: -------------------------------------------------------------------------------- 1 | package technology.tabula; 2 | 3 | import java.util.ArrayList; 4 | import java.util.List; 5 | import java.util.TreeMap; 6 | 7 | import technology.tabula.extractors.ExtractionAlgorithm; 8 | 9 | @SuppressWarnings("serial") 10 | public class Table extends Rectangle { 11 | 12 | public static final Table empty() { return new Table(""); } 13 | 14 | private Table(String extractionMethod) { 15 | this.extractionMethod = extractionMethod; 16 | } 17 | 18 | public Table(ExtractionAlgorithm extractionAlgorithm) { 19 | this(extractionAlgorithm.toString()); 20 | } 21 | 22 | private final String extractionMethod; 23 | 24 | private int rowCount = 0; 25 | private int colCount = 0; 26 | private int pageNumber = 0; 27 | 28 | /* visible for testing */ final TreeMap cells = new TreeMap<>(); 29 | 30 | public int getRowCount() { return rowCount; } 31 | public int getColCount() { return colCount; } 32 | public int getPageNumber() { return pageNumber; } 33 | public void setPageNumber(int pageNumber) { this.pageNumber = pageNumber; } 34 | 35 | public String getExtractionMethod() { return extractionMethod; } 36 | 37 | public void add(RectangularTextContainer chunk, int row, int col) { 38 | this.merge(chunk); 39 | 40 | rowCount = Math.max(rowCount, row + 1); 41 | colCount = Math.max(colCount, col + 1); 42 | 43 | CellPosition cp = new CellPosition(row, col); 44 | 45 | RectangularTextContainer old = cells.get(cp); 46 | if (old != null) chunk.merge(old); 47 | 
cells.put(cp, chunk); 48 | 49 | this.memoizedRows = null; 50 | } 51 | 52 | private List> memoizedRows = null; 53 | 54 | public List> getRows() { 55 | if (this.memoizedRows == null) this.memoizedRows = computeRows(); 56 | return this.memoizedRows; 57 | } 58 | 59 | private List> computeRows() { 60 | List> rows = new ArrayList<>(); 61 | for (int i = 0; i < rowCount; i++) { 62 | List lastRow = new ArrayList<>(); 63 | rows.add(lastRow); 64 | for (int j = 0; j < colCount; j++) { 65 | RectangularTextContainer cell = cells.get(new CellPosition(i,j)); // JAVA_8 use getOrDefault() 66 | lastRow.add(cell != null ? cell : TextChunk.EMPTY); 67 | } 68 | } 69 | return rows; 70 | } 71 | 72 | public RectangularTextContainer getCell(int i, int j) { 73 | RectangularTextContainer cell = cells.get(new CellPosition(i,j)); // JAVA_8 use getOrDefault() 74 | return cell != null ? cell : TextChunk.EMPTY; 75 | } 76 | 77 | } 78 | 79 | class CellPosition implements Comparable { 80 | 81 | CellPosition(int row, int col) { 82 | this.row = row; 83 | this.col = col; 84 | } 85 | 86 | final int row, col; 87 | 88 | @Override public int hashCode() { 89 | return row + 101 * col; 90 | } 91 | 92 | @Override public boolean equals(Object obj) { 93 | if (this == obj) return true; 94 | if (obj == null) return false; 95 | if (getClass() != obj.getClass()) return false; 96 | CellPosition other = (CellPosition) obj; 97 | return row == other.row && col == other.col; 98 | } 99 | 100 | @Override public int compareTo(CellPosition other) { 101 | int rowdiff = row - other.row; 102 | return rowdiff != 0 ? 
rowdiff : col - other.col; 103 | } 104 | 105 | } 106 | -------------------------------------------------------------------------------- /src/test/java/technology/tabula/TestProjectionProfile.java: -------------------------------------------------------------------------------- 1 | package technology.tabula; 2 | 3 | import static org.junit.Assert.*; 4 | 5 | import java.util.ArrayList; 6 | import java.util.List; 7 | 8 | import org.apache.pdfbox.pdmodel.PDDocument; 9 | import org.apache.pdfbox.pdmodel.PDPage; 10 | import org.apache.pdfbox.pdmodel.font.PDType1Font; 11 | import org.apache.pdfbox.pdmodel.font.Standard14Fonts; 12 | import org.junit.Before; 13 | import org.junit.Test; 14 | 15 | public class TestProjectionProfile { 16 | 17 | ProjectionProfile pProfile; 18 | Page page; 19 | 20 | @Before 21 | public void setUpProjectionProfile() { 22 | PDPage pdPage = new PDPage(); 23 | PDDocument pdDocument = new PDDocument(); 24 | 25 | PDType1Font font = new PDType1Font(Standard14Fonts.FontName.HELVETICA); 26 | TextElement textElement = new TextElement(5f, 15f, 10f, 20f, font, 1f, "test", 1f); 27 | TextElement textElement2 = new TextElement(5f, 15f, 10f, 20f, font, 1f, "test", 1f); 28 | List textList = new ArrayList<>(); 29 | textList.add(textElement); 30 | textList.add(textElement2); 31 | 32 | Ruling ruling = new Ruling(0, 0, 10, 10); 33 | List rulingList = new ArrayList<>(); 34 | rulingList.add(ruling); 35 | 36 | page = Page.Builder.newInstance() 37 | .withPageDims(PageDims.of(0, 0, 1, 1)) 38 | .withRotation(0) 39 | .withNumber(1) 40 | .withPdPage(pdPage) 41 | .withPdDocument(pdDocument) 42 | .withTextElements(textList) 43 | .withRulings(rulingList) 44 | .build(); 45 | 46 | List rectangles = new ArrayList<>(); 47 | rectangles.add(new Rectangle(0f, 0f, 500f, 5f)); 48 | 49 | pProfile = new ProjectionProfile(page, rectangles, 5, 5); 50 | } 51 | 52 | @Test 53 | public void testGetVerticalProjection() { 54 | float[] projection = pProfile.getVerticalProjection(); 55 | 
assertTrue(projection.length == 10); 56 | } 57 | 58 | @Test 59 | public void testGetHorizontalProjection() { 60 | float[] projection = pProfile.getHorizontalProjection(); 61 | assertTrue(projection.length == 10); 62 | } 63 | 64 | @Test 65 | public void testFindVerticalSeparators() { 66 | float[] seperators = pProfile.findVerticalSeparators(page.getText().size() * 2.5f); 67 | assertTrue(seperators.length == 0); 68 | } 69 | 70 | @Test 71 | public void testFindHorizontalSeparators() { 72 | float[] seperators = pProfile.findHorizontalSeparators(page.getText().size() * 2.5f); 73 | assertTrue(seperators.length == 0); 74 | } 75 | 76 | @Test 77 | public void testSmooth() { 78 | float[] data = {0, 1, 2}; 79 | float[] rv = ProjectionProfile.smooth(data, 3); 80 | 81 | assertEquals(1f, rv[2], 1e-5); 82 | } 83 | 84 | @Test 85 | public void testFilter() { 86 | float[] data = {0, 1, 2}; 87 | float[] rv = ProjectionProfile.filter(data, 3); 88 | 89 | assertEquals(3f, rv[1], 1e-5); 90 | } 91 | 92 | @Test 93 | public void testGetAutocorrelation() { 94 | float[] projection = {0, 1, 2}; 95 | float[] rv = ProjectionProfile.getAutocorrelation(projection); 96 | 97 | assertEquals(0f, rv[0], 1e-5); 98 | assertTrue(rv.length == 2); 99 | 100 | } 101 | 102 | @Test 103 | public void testGetFirstDeriv() { 104 | // float[] 105 | // float[] projection = pProfile.getFirstDeriv(new float[]{0.0, 0.0) 106 | // System.out.println(Arrays.toString(projection)); 107 | // assertEquals(10, projection[0], 1e-15); 108 | } 109 | 110 | } 111 | -------------------------------------------------------------------------------- /src/test/resources/technology/tabula/csv/frx_2012_disclosure.csv: -------------------------------------------------------------------------------- 1 | "FOREST LABORATORIES, INC. 
DISCLOSURE REPORT",,,, 2 | Calendar Year - 2012,,,, 3 | Physician,Related Entity (if applicable),City / State,Purpose of Payment,Amount ($USD) * ** 4 | "AALAEI, BEHZAD",,"HIGHLAND, IN",MEALS,$51.24 5 | TOTAL,,,,$51.24 6 | "AAMODT, DENISE, E",,"ALBUQUERQUE, NM",MEALS,$66.12 7 | TOTAL,,,,$66.12 8 | "AANONSEN, DEBORAH, A",,"STATEN ISLAND, NY",MEALS,$85.00 9 | TOTAL,,,,$85.00 10 | "AARON, CAREN, T",,"RICHMOND, VA",EDUCATIONAL ITEMS,$78.80 11 | "AARON, CAREN, T",,"RICHMOND, VA",MEALS,$392.45 12 | TOTAL,,,,$471.25 13 | "AARON, JOHN",,"CLARKSVILLE, TN",MEALS,$20.39 14 | TOTAL,,,,$20.39 15 | "AARON, JOSHUA, N",,"WEST GROVE, PA",MEALS,$310.33 16 | "AARON, JOSHUA, N","REGIONAL PULMONARY & SLEEP 17 | MEDICINE","WEST GROVE, PA",SPEAKING FEES,"$4,700.00" 18 | TOTAL,,,,"$5,010.33" 19 | "AARON, MAUREEN, M",,"MARTINSVILLE, VA",MEALS,$193.67 20 | TOTAL,,,,$193.67 21 | "AARON, MICHAEL, L",,"WEST ISLIP, NY",MEALS,$19.50 22 | TOTAL,,,,$19.50 23 | "AARON, MICHAEL, R",,"BROOKLYN, NY",MEALS,$65.92 24 | TOTAL,,,,$65.92 25 | "AARONS, MARK, G",,"PINEHURST, NC",MEALS,$154.19 26 | TOTAL,,,,$154.19 27 | "AARONSON, GARY, A",,"PHILADELPHIA, PA",MEALS,$205.17 28 | TOTAL,,,,$205.17 29 | "AARONSON, ROBERT, M",,"TUCSON, AZ",MEALS,$24.38 30 | TOTAL,,,,$24.38 31 | "AASHEIM, RICHARD, J",,"GREENEVILLE, TN",EDUCATIONAL ITEMS,$2.27 32 | "AASHEIM, RICHARD, J",,"GREENEVILLE, TN",MEALS,$100.76 33 | TOTAL,,,,$103.03 34 | "AASMAA, SIRIKE, T",,"MONTVILLE, NJ",MEALS,$53.33 35 | TOTAL,,,,$53.33 36 | "AAZAMI, HESSAM",,"GRANADA HILLS, CA",MEALS,$402.90 37 | TOTAL,,,,$402.90 38 | "ABAABA, ABIEDU, C",,"JACKSONVILLE, FL",MEALS,$13.49 39 | TOTAL,,,,$13.49 40 | "ABABNEH, ALAELDIN, A",,"KANSAS CITY, KS",MEALS,$10.31 41 | TOTAL,,,,$10.31 42 | "ABAD, ANTONIO, A",,"CORAL SPRINGS, FL",MEALS,$516.29 43 | TOTAL,,,,$516.29 44 | "ABADEER, PETER, S",,"NORMAL, IL",MEALS,$200.38 45 | TOTAL,,,,$200.38 46 | "ABAD, ENZO, L",,"MIAMI, FL",MEALS,$67.61 47 | TOTAL,,,,$67.61 48 | "ABADIAN SHARIFABAD, 49 | MANOOCHEHR",,"GRANADA HILLS, 
CA",MEALS,$12.37 50 | TOTAL,,,,$12.37 51 | "ABADI, CHRISTOPHER, A",,"WARWICK, RI",MEALS,$157.42 52 | TOTAL,,,,$157.42 53 | "ABADIE, MARCUS, G",,"ATHENS, TX",MEALS,$361.89 54 | TOTAL,,,,$361.89 55 | "ABADI, JAMSHEED, S",,"BROOKLYN, NY",MEALS,$363.40 56 | TOTAL,,,,$363.40 57 | "ABADILLA, JUNE, E",,"JACKSON, KY",MEALS,$105.33 58 | TOTAL,,,,$105.33 59 | "ABAD, JOHN, P",,"NEWARK, OH",MEALS,$347.64 60 | TOTAL,,,,$347.64 61 | "ABAD, JOSE, F",,"FOLSOM, CA",MEALS,$30.28 62 | TOTAL,,,,$30.28 63 | "ABAD, REMEDIOS, D",,"WILNINGTON, DE",MEALS,$26.85 64 | TOTAL,,,,$26.85 65 | "ABAD, SO KIM, F",,"WICHITA FALLS, TX",MEALS,$136.52 66 | TOTAL,,,,$136.52 67 | "ABAD, ZOILO, R",,"MIAMI, FL",MEALS,$93.83 68 | TOTAL,,,,$93.83 69 | "ABALIHI, CAROL, N",,"EL PASO, TX",MEALS,$88.48 70 | TOTAL,,,,$88.48 71 | "ABALOS, ANNA, T",,"ROSEVILLE, CA",MEALS,$178.60 72 | TOTAL,,,,$178.60 73 | "ABALOS, ARTURO, Z",,"DELANO, CA",MEALS,$48.06 74 | TOTAL,,,,$48.06 75 | "ABALOS, JOSEPH, M",,"SENECA, PA",MEALS,$39.03 76 | TOTAL,,,,$39.03 77 | "ABANDO, JOSE, R",,"DAYTONA BEACH, FL",MEALS,$83.44 78 | TOTAL,,,,$83.44 79 | "ABANG, ANTHONY, E",,"ELIZABETHTOWN, KY",MEALS,$12.62 80 | TOTAL,,,,$12.62 81 | "ABAN, KENRIC, T",,"SAN DIEGO, CA",MEALS,$11.91 82 | TOTAL,,,,$11.91 83 | "ABAQUETA, ALVIN, Y",,"CHARLOTTE, NC",MEALS,$233.71 84 | TOTAL,,,,$233.71 85 | "ABARCA, SERGIO, O",,"TOOELE, UT",MEALS,$159.58 86 | TOTAL,,,,$159.58 87 | "ABARIKWU, CONSTANTIA, A",,"PHOENIX, AZ",MEALS,$153.57 88 | TOTAL,,,,$153.57 89 | "ABASHIDZE, TEAH, A",,"CLEVELAND, OH",MEALS,$153.59 90 | TOTAL,,,,$153.59 -------------------------------------------------------------------------------- /src/test/resources/technology/tabula/icdar2013-dataset/competition-dataset-us/us-006-reg.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 
40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 |
71 |
// -------------------------------------------------------------------------------- /src/main/java/technology/tabula/QuickSort.java: --------------------------------------------------------------------------------
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package technology.tabula;

import java.util.ArrayDeque;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.Deque;
import java.util.List;
import java.util.RandomAccess;
import java.util.Stack;

/**
 * An iterative (non-recursive) implementation of Quicksort that sorts a
 * {@link List} in place.
 *
 * @see <a href="https://en.wikipedia.org/wiki/Quicksort">Quicksort (Wikipedia)</a>
 *
 * @author UWe Pachler
 */
public final class QuickSort {

    private QuickSort() {
        // utility
    }

    /**
     * Sorts the given list according to natural order.
     *
     * @param list the list to sort; it is modified in place
     */
    public static <T extends Comparable<? super T>> void sort(List<T> list) {
        sort(list, QuickSort.<T>naturalOrder()); // JAVA_8 replace with Comparator.naturalOrder() (and cleanup)
    }

    /**
     * Sorts the given list using the given comparator.
     *
     * <p>Lists that implement {@link RandomAccess} are sorted directly. Any
     * other list is copied into an {@link ArrayList}, the copy is sorted, and
     * the original list is then cleared and refilled from the copy.
     *
     * @param list       the list to sort; it is modified in place
     * @param comparator the ordering to apply
     */
    public static <T> void sort(List<T> list, Comparator<? super T> comparator) {
        if (list instanceof RandomAccess) {
            quicksort(list, comparator);
        } else {
            // Index-based access would be slow here, so sort a random-access copy.
            List<T> copy = new ArrayList<>(list);
            quicksort(copy, comparator);
            list.clear();
            list.addAll(copy);
        }
    }

    /**
     * Sorts {@code list} using an explicit stack of pending half-open ranges
     * {@code [left, right)} instead of recursion.
     */
    private static <T> void quicksort(List<T> list, Comparator<? super T> cmp) {
        // ArrayDeque gives the same LIFO push/pop behaviour as the legacy,
        // synchronized java.util.Stack. Bounds are pushed as (left, right)
        // pairs and therefore popped in the reverse order: right, then left.
        Deque<Integer> pending = new ArrayDeque<>();
        pending.push(0);
        pending.push(list.size());

        while (!pending.isEmpty()) {
            int right = pending.pop();
            int left = pending.pop();

            // Ranges with fewer than two elements are already sorted.
            if (right - left < 2) {
                continue;
            }

            int pivotIndex = left + ((right - left) / 2);
            pivotIndex = partition(list, cmp, pivotIndex, left, right);

            // Upper part is pushed first, so the lower part is processed next.
            pending.push(pivotIndex + 1);
            pending.push(right);

            pending.push(left);
            pending.push(pivotIndex);
        }
    }

    /**
     * Partitions the half-open range {@code [start, end)} around the element
     * currently at index {@code p}.
     *
     * @return the index at which the pivot element ends up
     */
    private static <T> int partition(List<T> list, Comparator<? super T> cmp, int p, int start, int end) {
        int l = start;
        int h = end - 2;
        T piv = list.get(p);
        // Park the pivot in the last slot while the rest of the range is scanned.
        swap(list, p, end - 1);

        while (l < h) {
            if (cmp.compare(list.get(l), piv) <= 0) {
                l++;
            } else if (cmp.compare(piv, list.get(h)) <= 0) {
                h--;
            } else {
                swap(list, l, h);
            }
        }

        // The element at h has not been compared with the pivot yet; it decides
        // whether the pivot lands on h or directly after it.
        int idx = h;
        if (cmp.compare(list.get(h), piv) < 0) {
            idx++;
        }
        swap(list, end - 1, idx);
        return idx;
    }

    /** Exchanges the elements at indices {@code i} and {@code j}. */
    private static <T> void swap(List<T> list, int i, int j) {
        T tmp = list.get(i);
        list.set(i, list.get(j));
        list.set(j, tmp);
    }

    /** Shared comparator that delegates to {@link Comparable#compareTo(Object)}. */
    @SuppressWarnings({ "rawtypes", "unchecked" })
    private static final Comparator NATURAL_ORDER = new Comparator() {
        @Override public int compare(Object l, Object r) { return ((Comparable) l).compareTo(r); }
    };

    /** Returns {@link #NATURAL_ORDER} typed for the caller's element type. */
    @SuppressWarnings("unchecked")
    private static <T extends Comparable<? super T>> Comparator<T> naturalOrder() {
        return NATURAL_ORDER;
    }

}
// -------------------------------------------------------------------------------- /src/test/java/technology/tabula/TestCohenSutherland.java:
-------------------------------------------------------------------------------- 1 | package technology.tabula; 2 | 3 | import org.junit.Before; 4 | import org.junit.Test; 5 | 6 | import java.awt.geom.Line2D; 7 | import java.awt.geom.Rectangle2D; 8 | 9 | import static org.junit.Assert.*; 10 | 11 | public class TestCohenSutherland { 12 | 13 | private Rectangle2D clipWindow; 14 | private CohenSutherlandClipping algorithm; 15 | private static final double DELTA = 0.001; 16 | 17 | @Before 18 | public void set() { 19 | clipWindow = new Rectangle(10, 10, 50, 50); 20 | algorithm = new CohenSutherlandClipping(clipWindow); 21 | } 22 | 23 | // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - // 24 | // TODO: How to parameterize the tests? 25 | @Test 26 | public void theLineIsCompletelyInside() { 27 | Line2D.Float line = new Line2D.Float(20, 20, 30, 30); 28 | assertTrue(algorithm.clip(line)); 29 | assertEquals(20, line.x1, DELTA); 30 | assertEquals(20, line.y1, DELTA); 31 | assertEquals(30, line.x2, DELTA); 32 | assertEquals(30, line.y2, DELTA); 33 | } 34 | 35 | @Test 36 | public void theLineIsCompletelyOnTheLeft() { 37 | float x1 = 3, y1 = 13, x2 = 6, y2 = 16; 38 | Line2D.Float line = new Line2D.Float(x1, y1, x2, y2); 39 | assertFalse(algorithm.clip(line)); 40 | assertEquals(x1, line.x1, DELTA); 41 | assertEquals(y1, line.y1, DELTA); 42 | assertEquals(x2, line.x2, DELTA); 43 | assertEquals(y2, line.y2, DELTA); 44 | } 45 | 46 | @Test 47 | public void theLineIsCompletelyOnTheUp() { 48 | float x1 = 15, y1 = 5, x2 = 25, y2 = 2; 49 | Line2D.Float line = new Line2D.Float(x1, y1, x2, y2); 50 | assertFalse(algorithm.clip(line)); 51 | assertEquals(x1, line.x1, DELTA); 52 | assertEquals(y1, line.y1, DELTA); 53 | assertEquals(x2, line.x2, DELTA); 54 | assertEquals(y2, line.y2, DELTA); 55 | } 56 | 57 | @Test 58 | public void theLineIsCompletelyOnTheRight() { 59 | float x1 = 65, y1 = 15, x2 = 70, y2 = 20; 60 | Line2D.Float line = new Line2D.Float(x1, y1, x2, y2); 
61 | assertFalse(algorithm.clip(line)); 62 | assertEquals(x1, line.x1, DELTA); 63 | assertEquals(y1, line.y1, DELTA); 64 | assertEquals(x2, line.x2, DELTA); 65 | assertEquals(y2, line.y2, DELTA); 66 | } 67 | 68 | @Test 69 | public void theLineIsCompletelyOnTheBottom() { 70 | float x1 = 15, y1 = 65, x2 = 25, y2 = 70; 71 | Line2D.Float line = new Line2D.Float(x1, y1, x2, y2); 72 | assertFalse(algorithm.clip(line)); 73 | assertEquals(x1, line.x1, DELTA); 74 | assertEquals(y1, line.y1, DELTA); 75 | assertEquals(x2, line.x2, DELTA); 76 | assertEquals(y2, line.y2, DELTA); 77 | } 78 | 79 | // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - // 80 | @Test 81 | public void lineCrossesTopLeftCorner() { 82 | float x1 = 5, y1 = 25, x2 = 25, y2 = 5; 83 | Line2D.Float line = new Line2D.Float(x1, y1, x2, y2); 84 | assertTrue(algorithm.clip(line)); 85 | assertEquals(10, line.x1, DELTA); 86 | assertEquals(20, line.y1, DELTA); 87 | assertEquals(20, line.x2, DELTA); 88 | assertEquals(10, line.y2, DELTA); 89 | } 90 | 91 | @Test 92 | public void lineCrossesPartiallyTopLeftCorner() { 93 | float x1 = 15, y1 = 15, x2 = 25, y2 = 5; 94 | Line2D.Float line = new Line2D.Float(x1, y1, x2, y2); 95 | assertTrue(algorithm.clip(line)); 96 | assertEquals(x1, line.x1, DELTA); 97 | assertEquals(y1, line.y1, DELTA); 98 | assertEquals(20, line.x2, DELTA); 99 | assertEquals(10, line.y2, DELTA); 100 | } 101 | 102 | } 103 | -------------------------------------------------------------------------------- /src/test/resources/technology/tabula/icdar2013-dataset/competition-dataset-us/us-039-str.xml: -------------------------------------------------------------------------------- 1 | 2 | 4 | 5 | 6 | 7 | 8 | Organism 9 | 10 | 11 | 12 | 13 | Wildlife Criterion (pg/L) 14 | 15 | 16 | 17 | 18 | Mink 19 | 20 | 21 | 22 | 23 | 57 24 | 25 | 26 | 27 | 28 | River otter 29 | 30 | 31 | 32 | 33 | 42 34 | 35 | 36 | 37 | 38 | Kingfisher 39 | 40 | 41 | 42 | 43 | 33 44 | 45 | 46 | 47 | 48 | Loon 49 | 
50 | 51 | 52 | 53 | 82 54 | 55 | 56 | 57 | 58 | Osprey 59 | 60 | 61 | 62 | 63 | 82 64 | 65 | 66 | 67 | 68 | Bald eagle 69 | 70 | 71 | 72 | 73 | 100 74 | 75 | 76 | 77 |
78 |
79 | -------------------------------------------------------------------------------- /src/test/resources/technology/tabula/icdar2013-dataset/competition-dataset-us/us-003-reg.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 |
76 |
77 | -------------------------------------------------------------------------------- /src/test/resources/technology/tabula/csv/schools.csv: -------------------------------------------------------------------------------- 1 | "",Last Name,First Name,Address,City,State,Zip,Occupation,Employer,Date,Amount 2 | "",Lidstad,Dick & Peg,62 Mississippi River Blvd N,Saint Paul,MN,55104,retired,,10/12/2012,60.00 3 | "",Strom,Pam,1229 Hague Ave,St. Paul,MN,55104,,,9/12/2012,60.00 4 | "",Seeba,Louise & Paul,1399 Sheldon St,Saint Paul,MN,55108,BOE,City of Saint Paul,10/12/2012,60.00 5 | "",Schumacher / Bales,Douglas L. / Patricia,948 County Rd. D W,Saint Paul,MN,55126,,,10/13/2012,60.00 6 | "",Abrams,Marjorie,238 8th St east,St Paul,MN,55101,Retired,Retired,8/8/2012,75.00 7 | "",Crouse / Schroeder,Abigail / Jonathan,1545 Branston St.,Saint Paul,MN,55108,,,10/6/2012,75.00 8 | "",O'Connell,Jean,888 Ivy Ave W.,Saint Paul,MN,55117,,,9/30/2012,75.00 9 | "",Reese,Cheri A,981 Davern St.,Saint Paul,MN,55116,Public Relations,Far North Spirits,10/3/2012,75.00 10 | "",Serrano,Luz Maria,5559 Park Place Drive,Shoreview,MN,55126,retired,SPPS,9/20/2012,75.00 11 | "",Alsiddiqui,Jaber,13056 Euclid Ave,Apple Valley,MN,55124,SPPS budget analyst,SPPS,9/20/2012,100.00 12 | "",Andrastek,John,774 Ashland Ave,St Paul,MN,55104,asst principal,SPPS,8/22/2012,100.00 13 | "",Anfang,Heather L. & Matt,1635 Bayard Ave,Saint Paul,MN,55116,Ex Director,BOMA,10/6/2012,100.00 14 | "",Aronson,Roger J.,4852 Emerson Ave. S.,Minneapolis,MN,55419,Attorney at Law,,9/20/2012,100.00 15 | "",Aronson,Roger J.,4852 Emerson Ave. 
S.,Minneapolis,MN,55419,Attorney at Law,,9/20/2012,100.00 16 | "",Banks,Michael or Patricia,1664 Van Buren Ave.,Saint Paul,MN,55104,retired social worker,,9/20/2012,100.00 17 | "",Bennett,David,25 Birchwood Rd.,Mahtomedi,MN,55110,retired,retired,9/3/2012,100.00 18 | "",Berry,Margaret,1267 Pike Lake Dr,New Brighton,MN,55112,Asst Principal,Saint Paul Public Schools,8/8/2012,100.00 19 | "",Boyle,Matthew C. & Eliza,2165 Princeton Ave,Saint Paul,MN,55105,,,10/6/2012,100.00 20 | "",Brodrick,John,1007 Charles,St. Paul,MN,55104,BOE,retired,8/26/2012,100.00 21 | "",Brodrick,John F.,1007 Charles Ave,Saint Paul,MN,55104,BoE,SPPS,10/4/2012,100.00 22 | "",Brown,Mike,1440 Goodrich Ave,Saint Paul,MN,55105,,,9/23/2012,100.00 23 | "",Cacy,Jill,1409 Smith Ave So,West St. Paul,MN,55118,Administrator,SPPS,9/12/2012,100.00 24 | "",Cardwell,Eileen,4172 Bridgewater Ct,Vadnais Height,MN,55127,Retired,Retired,8/3/2012,100.00 25 | "",Carlstrom,Cheryl,4244 Oakmede Ln,St Paul,MN,55110,Satff,Saint Paul Public Schools,8/8/2012,100.00 26 | "",Carter,Melvin W. & Willet,405 Western Ave N,Saint Paul,MN,55103,Cty Commissioner,Ramsey Cty,10/8/2012,100.00 27 | "",Caruso,Sarah,2615 Newton Ave S,Minneapolis,MN,55405,CEO,United Way,9/12/2012,100.00 28 | "",Casey /Trewartha,Kerry F. 
/ Kelly A.,2266 Goodrich Ave,Saint Paul,MN,55105,,,10/4/2012,100.00 29 | "",Cassidy,Paul D.,1990 Dayton Ave,Saint Paul,MN,55104,,,10/6/2012,100.00 30 | "",Causey,Christopher,2181 Doswell Avenue,Saint Paul,MN,55108,finance,,9/3/2012,100.00 31 | "",Christiansen,Lena,2453 Como Ave,St Paul,MN,55108,Principal,Saint Paul Public Schools,8/8/2012,100.00 32 | "",Clapp,Agustus (Bill),757 Osceola Ave #1,Saint Paul,MN,55105,retired,,10/2/2012,100.00 33 | "",Cohen,Brad A.,1460 Raymond Ave,Saint Paul,MN,55108,academic technology,UMN,10/17/2012,100.00 34 | "",Commers,Beth,2294 Commonwealth Ave,St Paul,MN,55108,Self Employed,Homemaker,8/1/2012,100.00 35 | "",Couture,Steven,7950 Victoria Way,Saint Louise Park,MN,55426,Principal,Saint Paul Public Schools,8/10/2012,100.00 36 | "",Crawford,Lydia P.,1140 Edgcumbe Rd,Saint Paul,MN,55105,,,9/15/2012,100.00 37 | "",Cudahy / Ricker,Robert & Mary C,616 Cherokee Ave.,Saint Paul,MN,55107,Teacher/Union Presid,SPPS/SPFT,9/18/2012,100.00 38 | "",Cudahy / Ricker,Robert & Mary C,616 Cherokee Ave.,Saint Paul,MN,55107,Teacher/Union Presid,SPPS/SPFT,10/6/2012,100.00 39 | "",Currie,Elisabeth,2274 Hillside Ave,St. Paul,MN,55108,,,9/12/2012,100.00 40 | "",Doane,Paul V & Helen R,444 Portland Ave,Sant Paul,MN,55102,Ex director,St Paul Teachers' Retirement A,10/3/2012,100.00 41 | "",Dougherty,Richards S & Patrici,812 Goodrich Ave,Saint Paul,MN,55105,,,10/4/2012,100.00 42 | "",Driscoll,Joe & Jill,2383 Bourne Ave,Saint Paul,MN,55108,HR Manager,LexisNexis,10/18/2012,100.00 43 | "",Dubaille / Haugee,Florence M. 
/Eric,1009 Edmund Ave,Saint Paul,MN,55104,Florence‐teacher,SPPS,10/3/2012,100.00 44 | "",Eaton,Jim,2133 Berkeley Ave,St Paul,MN,55105,Principal,Saint Paul Public Schools,8/23/2012,100.00 45 | "",Eaves /Alger,Patricia / Stuart,1143 Portladn Ave.,Saint Paul,MN,55104,,,10/3/2012,100.00 -------------------------------------------------------------------------------- /src/test/java/technology/tabula/TestUtils.java: -------------------------------------------------------------------------------- 1 | package technology.tabula; 2 | 3 | import static org.junit.Assert.assertArrayEquals; 4 | import static org.junit.Assert.assertEquals; 5 | import static org.junit.Assert.assertNull; 6 | 7 | import java.awt.geom.Point2D; 8 | import java.io.File; 9 | import java.io.IOException; 10 | import java.util.ArrayList; 11 | import java.util.Arrays; 12 | import java.util.Collections; 13 | import java.util.List; 14 | 15 | import org.apache.pdfbox.Loader; 16 | import org.apache.pdfbox.rendering.ImageType; 17 | import org.apache.commons.cli.ParseException; 18 | import org.apache.pdfbox.pdmodel.PDDocument; 19 | import org.apache.pdfbox.pdmodel.PDPage; 20 | import org.junit.Test; 21 | 22 | public class TestUtils { 23 | 24 | public static final Ruling[] RULINGS = { 25 | new Ruling(new Point2D.Float(0, 0), new Point2D.Float(1,1)), 26 | new Ruling(new Point2D.Float(2, 2), new Point2D.Float(3,3)) 27 | }; 28 | 29 | public static final Rectangle[] RECTANGLES = { 30 | new Rectangle(), 31 | new Rectangle(0, 0, 2, 4) 32 | }; 33 | 34 | 35 | @Test 36 | public void testBoundsOfTwoRulings() { 37 | Rectangle r = Utils.bounds(Arrays.asList(RULINGS)); 38 | assertEquals(0, r.getMinX(), 0); 39 | assertEquals(0, r.getMinY(), 0); 40 | assertEquals(3, r.getWidth(), 0); 41 | assertEquals(3, r.getHeight(), 0); 42 | } 43 | 44 | @Test 45 | public void testBoundsOfOneEmptyRectangleAndAnotherNonEmpty() { 46 | Rectangle r = Utils.bounds(Arrays.asList(RECTANGLES)); 47 | assertEquals(r, RECTANGLES[1]); 48 | } 49 | 50 | 
@Test 51 | public void testBoundsOfOneRectangle() { 52 | ArrayList shapes = new ArrayList<>(); 53 | shapes.add(new Rectangle(0, 0, 20, 40)); 54 | Rectangle r = Utils.bounds(shapes); 55 | assertEquals(r, shapes.get(0)); 56 | } 57 | 58 | @Test 59 | public void testParsePagesOption() throws ParseException { 60 | 61 | List rv = Utils.parsePagesOption("1"); 62 | assertArrayEquals(new Integer[] { 1 }, rv.toArray()); 63 | 64 | rv = Utils.parsePagesOption("1-4"); 65 | assertArrayEquals(new Integer[] { 1,2,3,4 }, rv.toArray()); 66 | 67 | rv = Utils.parsePagesOption("1-4,20-24"); 68 | assertArrayEquals(new Integer[] { 1,2,3,4,20,21,22,23,24 }, rv.toArray()); 69 | 70 | rv = Utils.parsePagesOption("all"); 71 | assertNull(rv); 72 | } 73 | 74 | @Test(expected=ParseException.class) 75 | public void testExceptionInParsePages() throws ParseException { 76 | Utils.parsePagesOption("1-4,24-22"); 77 | } 78 | 79 | @Test(expected=ParseException.class) 80 | public void testAnotherExceptionInParsePages() throws ParseException { 81 | Utils.parsePagesOption("quuxor"); 82 | } 83 | 84 | @Test 85 | public void testQuickSortEmptyList() { 86 | List numbers = new ArrayList<>(); 87 | QuickSort.sort(numbers); 88 | 89 | assertEquals(Collections.emptyList(), numbers); 90 | } 91 | 92 | @Test 93 | public void testQuickSortOneElementList() { 94 | List numbers = Arrays.asList(5); 95 | QuickSort.sort(numbers); 96 | 97 | assertEquals(Arrays.asList(5), numbers); 98 | } 99 | 100 | @Test 101 | public void testQuickSortShortList() { 102 | List numbers = Arrays.asList(4, 5, 6, 8, 7, 1, 2, 3); 103 | QuickSort.sort(numbers); 104 | 105 | assertEquals(Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8), numbers); 106 | } 107 | 108 | @Test 109 | public void testQuickSortLongList() { 110 | 111 | List numbers = new ArrayList<>(); 112 | List expectedNumbers = new ArrayList<>(); 113 | 114 | for(int i = 0; i <= 12000; i++){ 115 | numbers.add(12000 - i); 116 | expectedNumbers.add(i); 117 | } 118 | 119 | QuickSort.sort(numbers); 120 | 
121 | assertEquals(expectedNumbers, numbers); 122 | } 123 | 124 | @Test 125 | public void testJPEG2000DoesNotRaise() throws IOException { 126 | PDDocument pdf_document = Loader.loadPDF(new File("src/test/resources/technology/tabula/jpeg2000.pdf")); 127 | PDPage page = pdf_document.getPage(0); 128 | Utils.pageConvertToImage(pdf_document, page, 360, ImageType.RGB); 129 | } 130 | 131 | } 132 | -------------------------------------------------------------------------------- /src/test/resources/technology/tabula/icdar2013-dataset/competition-dataset-eu/eu-008-reg.xml: -------------------------------------------------------------------------------- 1 | 2 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 |
94 |
95 | -------------------------------------------------------------------------------- /src/test/resources/technology/tabula/icdar2013-dataset/competition-dataset-us/us-038-str.xml: -------------------------------------------------------------------------------- 1 | 2 | 4 | 5 | 6 | 7 | 8 | Species 9 | 10 | 11 | 12 | 13 | Percent of Range 14 | Impacted 15 | 16 | 17 | 18 | 19 | 20 | Kingfisher 21 | 22 | 23 | 24 | 25 | 29% 26 | 27 | 28 | 29 | 30 | Bald Eagle 31 | 32 | 33 | 34 | 35 | 34% 36 | 37 | 38 | 39 | 40 | Osprey 41 | 42 | 43 | 44 | 45 | 20% 46 | 47 | 48 | 49 | 50 | Common Loon 51 | 52 | 53 | 54 | 55 | 40% 56 | 57 | 58 | 59 | 60 | Florida Panther 61 | 62 | 63 | 64 | 65 | 100% 66 | 67 | 68 | 69 | 70 | Mink 71 | 72 | 73 | 74 | 75 | 35% 76 | 77 | 78 | 79 | 80 | River Otter 81 | 82 | 83 | 84 | 85 | 38% 86 | 87 | 88 | 89 |
90 |
91 | -------------------------------------------------------------------------------- /src/main/java/technology/tabula/CohenSutherlandClipping.java: -------------------------------------------------------------------------------- 1 | /* 2 | * CohenSutherland.java 3 | * -------------------- 4 | * (c) 2007 by Intevation GmbH 5 | * 6 | * @author Sascha L. Teichmann (teichmann@intevation.de) 7 | * @author Ludwig Reiter (ludwig@intevation.de) 8 | * 9 | * This program is free software under the LGPL (>=v2.1) 10 | * Read the file LICENSE.txt coming with the sources for details. 11 | */ 12 | package technology.tabula; 13 | 14 | import java.awt.geom.Rectangle2D; 15 | import java.awt.geom.Line2D; 16 | 17 | /** 18 | * Implements the well known Cohen Sutherland line 19 | * clipping algorithm (line against clip rectangle). 20 | */ 21 | public final class CohenSutherlandClipping { 22 | 23 | private double xMin; 24 | private double yMin; 25 | private double xMax; 26 | private double yMax; 27 | 28 | private static final int INSIDE = 0; 29 | private static final int LEFT = 1; 30 | private static final int RIGHT = 2; 31 | private static final int BOTTOM = 4; 32 | private static final int TOP = 8; 33 | 34 | private final static float MINIMUM_DELTA = 0.01f; 35 | 36 | /** 37 | * Creates a Cohen Sutherland clipper with clip window (0, 0, 0, 0). 38 | */ 39 | public CohenSutherlandClipping() {} 40 | 41 | /** 42 | * Creates a Cohen Sutherland clipper with the given clip window. 43 | * @param clipWindow the clip window to use. 44 | */ 45 | public CohenSutherlandClipping(Rectangle2D clipWindow) { 46 | setClip(clipWindow); 47 | } 48 | 49 | /** 50 | * Sets the clip rectangle. 51 | * @param clipWindow the clip window. 
52 | */ 53 | public void setClip(Rectangle2D clipWindow) { 54 | xMin = clipWindow.getX(); 55 | xMax = xMin + clipWindow.getWidth(); 56 | yMin = clipWindow.getY(); 57 | yMax = yMin + clipWindow.getHeight(); 58 | } 59 | 60 | // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - // 61 | /** 62 | * Clips a given line against the clip window. 63 | * The modification (if needed) is done in place. 64 | * @param line the line to clip. 65 | * @return true if line is clipped, false if line is 66 | * totally outside the clip window. 67 | */ 68 | public boolean clip(Line2D.Float line) { 69 | Point point1 = new Point(line.getX1(), line.getY1()); 70 | Point point2 = new Point(line.getX2(), line.getY2()); 71 | Point outsidePoint = new Point(0d, 0d); 72 | 73 | boolean lineIsVertical = (point1.x == point2.x); 74 | double lineSlope = lineIsVertical ? 0d : (point2.y-point1.y)/(point2.x-point1.x); 75 | 76 | while (point1.region != INSIDE || point2.region != INSIDE) { 77 | if ((point1.region & point2.region) != 0) return false; 78 | 79 | outsidePoint.region = (point1.region == INSIDE) ? point2.region : point1.region; 80 | 81 | if ((outsidePoint.region & LEFT) != 0) { 82 | outsidePoint.x = xMin; 83 | outsidePoint.y = delta(outsidePoint.x, point1.x)*lineSlope + point1.y; 84 | } 85 | else if ((outsidePoint.region & RIGHT) != 0) { 86 | outsidePoint.x = xMax; 87 | outsidePoint.y = delta(outsidePoint.x, point1.x)*lineSlope + point1.y; 88 | } 89 | else if ((outsidePoint.region & BOTTOM) != 0) { 90 | outsidePoint.y = yMin; 91 | outsidePoint.x = lineIsVertical 92 | ? point1.x 93 | : delta(outsidePoint.y, point1.y)/lineSlope + point1.x; 94 | } 95 | else if ((outsidePoint.region & TOP) != 0) { 96 | outsidePoint.y = yMax; 97 | outsidePoint.x = lineIsVertical 98 | ? 
point1.x 99 | : delta(outsidePoint.y, point1.y)/lineSlope + point1.x; 100 | } 101 | 102 | if (outsidePoint.isInTheSameRegionAs(point1)) { 103 | point1.setPositionAndRegion(outsidePoint.x, outsidePoint.y); 104 | } 105 | else { 106 | point2.setPositionAndRegion(outsidePoint.x, outsidePoint.y); 107 | } 108 | } 109 | line.setLine(point1.x, point1.y, point2.x, point2.y); 110 | return true; 111 | } 112 | 113 | // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - // 114 | private static double delta(double value1, double value2) { 115 | return (Math.abs(value1 - value2) < MINIMUM_DELTA) ? 0 : (value1 - value2); 116 | } 117 | 118 | // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - // 119 | class Point { 120 | double x, y; 121 | int region; 122 | 123 | Point(double x, double y) { 124 | setPositionAndRegion(x, y); 125 | } 126 | 127 | void setPositionAndRegion(double x, double y) { 128 | this.x = x; this.y = y; 129 | region = (x < xMin) ? LEFT : (x > xMax) ? RIGHT : INSIDE; 130 | if (y < yMin) 131 | region |= BOTTOM; 132 | else if (y > yMax) 133 | region |= TOP; 134 | } 135 | 136 | boolean isInTheSameRegionAs(Point otherPoint) { 137 | return this.region == otherPoint.region; 138 | } 139 | } 140 | 141 | } 142 | -------------------------------------------------------------------------------- /src/test/resources/technology/tabula/icdar2013-dataset/competition-dataset-eu/eu-026-reg.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 |
61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 |
93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | 124 |
125 |
126 | -------------------------------------------------------------------------------- /src/test/resources/technology/tabula/icdar2013-dataset/competition-dataset-eu/eu-002-reg.xml: -------------------------------------------------------------------------------- 1 | 2 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 |
101 |
102 | -------------------------------------------------------------------------------- /src/test/resources/technology/tabula/icdar2013-dataset/competition-dataset-eu/eu-009b-reg.xml: -------------------------------------------------------------------------------- 1 | 2 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 |
107 |
108 | --------------------------------------------------------------------------------