├── project ├── build.properties └── plugins.sbt ├── .travis.yml ├── src ├── test │ ├── resources │ │ ├── simpleCSVs │ │ │ ├── simple3x3Digits.csv │ │ │ ├── SimpleTest1Table.csv │ │ │ ├── SimpleTest2Tables1TitlePage1.csv │ │ │ └── SimpleTest2Tables1TitlePage2.csv │ │ ├── textFiles │ │ │ └── quickBrown.txt │ │ ├── expenseReports │ │ │ ├── P34_LUNDY_Kate.pdf │ │ │ ├── P34_ABBOTT_Tony.pdf │ │ │ ├── P34_LEIGH_Andrew.pdf │ │ │ ├── P34_SESELJA_Zed.pdf │ │ │ ├── P34_BRODTMANN_Gai.pdf │ │ │ ├── P34_HUMPHRIES_Gary.pdf │ │ │ └── LICENSE │ │ └── simplePDFs │ │ │ ├── SimpleTest1Table.pdf │ │ │ ├── SimpleTest2Tables1Title.pdf │ │ │ └── TwoPagedBlankDocument.pdf │ └── scala │ │ └── tools │ │ └── ambitious │ │ └── pdfextractiontoolkit │ │ ├── AmbitiousToolsSpec.scala │ │ ├── library │ │ ├── util │ │ │ ├── TabulaConverterSpec.scala │ │ │ └── CSVUtilSpec.scala │ │ ├── model │ │ │ ├── geometry │ │ │ │ ├── PointSpec.scala │ │ │ │ ├── PositivePointSpec.scala │ │ │ │ ├── SizeSpec.scala │ │ │ │ └── RectangleSpec.scala │ │ │ ├── CellSpec.scala │ │ │ ├── PageSpec.scala │ │ │ ├── DocumentSpec.scala │ │ │ ├── RowSpec.scala │ │ │ └── TableSpec.scala │ │ ├── SimpleExtractSpec.scala │ │ ├── extraction │ │ │ ├── ExtractionUtilsSpec.scala │ │ │ ├── extractionconstraints │ │ │ │ ├── FirstOccurrenceOfStringExtractionConstraintSpec.scala │ │ │ │ └── PageNumberExtractionConstraintSpec.scala │ │ │ ├── tablemergers │ │ │ │ └── SimpleTableMergerSpec.scala │ │ │ └── ExtractorSpec.scala │ │ └── expensereports │ │ │ └── SummaryOfParliamentaryExpenditureByPeriodExtractionSpec.scala │ │ ├── Resources.scala │ │ ├── webapp │ │ ├── services │ │ │ └── documentstorage │ │ │ │ ├── DocumentIdentifierSpec.scala │ │ │ │ ├── DocumentDescriptionSpec.scala │ │ │ │ ├── DocumentLibraryImplSpec.scala │ │ │ │ └── DocumentFileStoreImplSpec.scala │ │ └── data │ │ │ ├── DAOTestUtils.scala │ │ │ ├── RootDAOImplSpec.scala │ │ │ └── DocumentInformationDaoSpec.scala │ │ └── utils │ │ └── AmbitiousIoUtilsSpec.scala └── main │ ├── 
scala │ └── tools │ │ └── ambitious │ │ └── pdfextractiontoolkit │ │ ├── library │ │ ├── model │ │ │ ├── geometry │ │ │ │ ├── Point.scala │ │ │ │ ├── PositivePoint.scala │ │ │ │ ├── Size.scala │ │ │ │ └── Rectangle.scala │ │ │ ├── Cell.scala │ │ │ ├── Page.scala │ │ │ ├── Row.scala │ │ │ ├── Table.scala │ │ │ └── Document.scala │ │ ├── extraction │ │ │ ├── StateBundle.scala │ │ │ ├── tableextractors │ │ │ │ ├── TableExtractor.scala │ │ │ │ └── RegionBasedTableExtractor.scala │ │ │ ├── tablemergers │ │ │ │ ├── TableMerger.scala │ │ │ │ └── SimpleTableMerger.scala │ │ │ ├── extractionconstraints │ │ │ │ ├── ExtractionConstraint.scala │ │ │ │ ├── SimpleExtractionConstraint.scala │ │ │ │ ├── MergingSimpleExtractionConstraint.scala │ │ │ │ ├── PageNumberExtractionConstraint.scala │ │ │ │ └── FirstOccurrenceOfStringExtractionConstraint.scala │ │ │ ├── ExtractionResult.scala │ │ │ ├── ExtractionUtils.scala │ │ │ ├── Extractor.scala │ │ │ └── DocumentWalker.scala │ │ └── util │ │ │ ├── CSVUtil.scala │ │ │ └── TabulaConverter.scala │ │ ├── webapp │ │ ├── data │ │ │ ├── RootDAO.scala │ │ │ ├── DocumentInformationDao.scala │ │ │ ├── RootDAOImpl.scala │ │ │ ├── model │ │ │ │ └── Documents.scala │ │ │ └── DocumentInformationDaoImpl.scala │ │ └── services │ │ │ └── documentstorage │ │ │ ├── DocumentFileStore.scala │ │ │ ├── DocumentLibrary.scala │ │ │ ├── DocumentIdentifier.scala │ │ │ ├── DocumentFileStoreImpl.scala │ │ │ ├── DocumentDescription.scala │ │ │ └── DocumentLibraryImpl.scala │ │ └── utils │ │ └── AmbitiousIoUtils.scala │ └── resources │ ├── log4j.properties │ └── application.conf ├── TODO.md ├── .gitignore └── README.md /project/build.properties: -------------------------------------------------------------------------------- 1 | sbt.version = 0.13.8 -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: scala 2 | scala: 3 | - 2.11.7 
-------------------------------------------------------------------------------- /src/test/resources/simpleCSVs/simple3x3Digits.csv: -------------------------------------------------------------------------------- 1 | 1,2,3 2 | 4,5,6 3 | 7,8,9 -------------------------------------------------------------------------------- /src/test/resources/textFiles/quickBrown.txt: -------------------------------------------------------------------------------- 1 | The quick brown fox jumped over the lazy dogs -------------------------------------------------------------------------------- /project/plugins.sbt: -------------------------------------------------------------------------------- 1 | logLevel := Level.Warn 2 | addSbtPlugin("io.spray" % "sbt-revolver" % "0.7.2") -------------------------------------------------------------------------------- /TODO.md: -------------------------------------------------------------------------------- 1 | # Things To Do 2 | 1. Close PDF documents correctly 3 | 2. 
Improve the tools.ambitious.pdfextractiontoolkit.library.extraction.Extractor.extractTables method 4 | -------------------------------------------------------------------------------- /src/test/resources/expenseReports/P34_LUNDY_Kate.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gavia/PDFExtractionToolkit/master/src/test/resources/expenseReports/P34_LUNDY_Kate.pdf -------------------------------------------------------------------------------- /src/test/resources/simplePDFs/SimpleTest1Table.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gavia/PDFExtractionToolkit/master/src/test/resources/simplePDFs/SimpleTest1Table.pdf -------------------------------------------------------------------------------- /src/test/resources/expenseReports/P34_ABBOTT_Tony.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gavia/PDFExtractionToolkit/master/src/test/resources/expenseReports/P34_ABBOTT_Tony.pdf -------------------------------------------------------------------------------- /src/test/resources/expenseReports/P34_LEIGH_Andrew.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gavia/PDFExtractionToolkit/master/src/test/resources/expenseReports/P34_LEIGH_Andrew.pdf -------------------------------------------------------------------------------- /src/test/resources/expenseReports/P34_SESELJA_Zed.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gavia/PDFExtractionToolkit/master/src/test/resources/expenseReports/P34_SESELJA_Zed.pdf -------------------------------------------------------------------------------- /src/test/resources/expenseReports/P34_BRODTMANN_Gai.pdf: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/gavia/PDFExtractionToolkit/master/src/test/resources/expenseReports/P34_BRODTMANN_Gai.pdf -------------------------------------------------------------------------------- /src/test/resources/expenseReports/P34_HUMPHRIES_Gary.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gavia/PDFExtractionToolkit/master/src/test/resources/expenseReports/P34_HUMPHRIES_Gary.pdf -------------------------------------------------------------------------------- /src/test/resources/simplePDFs/SimpleTest2Tables1Title.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gavia/PDFExtractionToolkit/master/src/test/resources/simplePDFs/SimpleTest2Tables1Title.pdf -------------------------------------------------------------------------------- /src/test/resources/simplePDFs/TwoPagedBlankDocument.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gavia/PDFExtractionToolkit/master/src/test/resources/simplePDFs/TwoPagedBlankDocument.pdf -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /target/ 2 | /project/target/ 3 | *.class 4 | *.iml 5 | /lib_managed/ 6 | /src_managed/ 7 | *.ipr 8 | *.iws 9 | .idea 10 | out 11 | .DS_Store 12 | /working/ 13 | /test_working/ -------------------------------------------------------------------------------- /src/test/resources/simpleCSVs/SimpleTest1Table.csv: -------------------------------------------------------------------------------- 1 | 10,9,8,2 2 | 2,4,8,5 3 | 6,5,6,9 4 | 3,4,4,5 5 | 10,3,5,7 6 | 4,1,4,5 7 | 10,9,2,8 8 | 1,1,8,4 9 | 7,4,9,1 10 | 8,3,4,4 11 | 9,7,5,8 12 | 8,4,2,9 13 | 3,8,7,9 14 | 5,6,3,5 15 | 
/**
 * A location in two-dimensional space.
 *
 * The primary constructor is protected; [[Point.at]] is the named factory.
 *
 * @param x the horizontal coordinate
 * @param y the vertical coordinate
 */
case class Point protected (x: Double, y: Double)

/** Factory methods for [[Point]]. */
object Point {

  /**
   * Builds the point located at the given coordinates.
   *
   * @param x the horizontal coordinate
   * @param y the vertical coordinate
   * @return a point carrying exactly those coordinates
   */
  def at(x: Double, y: Double): Point = new Point(x, y)
}
/src/test/resources/simpleCSVs/SimpleTest2Tables1TitlePage1.csv: -------------------------------------------------------------------------------- 1 | 1,6,4,1 2 | 10,2,5,6 3 | 1,8,2,10 4 | 4,3,6,3 5 | 7,3,6,5 6 | 4,3,4,6 7 | 2,3,3,6 8 | 2,10,5,1 9 | 10,9,6,8 10 | 8,8,9,1 11 | 1,1,1,5 12 | 7,6,7,8 13 | 6,8,2,8 14 | 8,8,8,4 15 | 7,1,4,1 16 | 9,2,2,3 17 | 7,5,5,3 18 | 1,1,10,7 19 | 1,3,8,5 20 | 1,4,5,9 -------------------------------------------------------------------------------- /src/test/resources/simpleCSVs/SimpleTest2Tables1TitlePage2.csv: -------------------------------------------------------------------------------- 1 | 4,6,7,2 2 | 3,3,1,8 3 | 5,9,8,4 4 | 5,6,10,10 5 | 10,5,4,4 6 | 10,5,7,7 7 | 4,4,4,7 8 | 4,7,5,3 9 | 2,8,2,2 10 | 8,7,2,7 11 | 7,2,6,2 12 | 10,1,8,7 13 | 2,7,4,7 14 | 5,2,7,3 15 | 7,2,10,3 16 | 9,5,3,8 17 | 2,1,8,6 18 | 2,3,6,8 19 | 3,1,6,7 20 | 10,2,6,2 -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # The PDF Extraction Toolkit 2 | 3 | [![Build Status](https://travis-ci.org/AmbitiousTools/PDFExtractionToolkit.svg?branch=master)](https://travis-ci.org/AmbitiousTools/PDFExtractionToolkit) 4 | 5 | A project to provide a set of user-friendly utilities for extracting tabulated data from large numbers of similar PDF 6 | files. 
/**
 * A single table cell holding a piece of text.
 *
 * Two cells are considered equal when their text is equal, and equal cells
 * report the same hash code.
 *
 * @param text the textual content of the cell; defaults to the empty string
 */
class Cell(val text: String = "") {

  /** Whether this cell's text is the empty string. */
  def isEmpty: Boolean = text == ""

  /**
   * Compares this cell with another object by text.
   *
   * @param obj the object to compare against
   * @return true only when `obj` is a [[Cell]] whose text equals this cell's text
   */
  override def equals(obj: Any): Boolean = obj match {
    case other: Cell => text == other.text
    case _ => false
  }

  /**
   * Hash code derived from the text, so that it agrees with [[equals]].
   *
   * Without this, equal cells would land in different buckets of a `Set` or `Map`.
   * `##` is used instead of `hashCode` so that a null text does not throw.
   */
  override def hashCode: Int = text.##
}
/**
 * Package-private access point to the web application's database.
 */
private[data] trait RootDAO {

  /** The database handle exposed to the other DAOs in this package. */
  def database: Database

  /**
   * Asynchronously prepares the database for use.
   *
   * NOTE(review): what counts as "initialised" is decided by the implementation.
   *
   * @return a future that completes once the check (and any set-up) has finished
   */
  def initialiseIfNeeded(): Future[Unit]
}

/** Factory for [[RootDAO]] instances. */
private[data] object RootDAO {

  /**
   * Creates a [[RootDAO]] backed by a `RootDAOImpl` for the named configuration.
   *
   * @param configName name of the database configuration entry handed to `RootDAOImpl`
   */
  def forConfigName(configName: String): RootDAO =
    new RootDAOImpl(configName)
}
/**
 * A [[Point]] whose coordinates are both guaranteed to be non-negative.
 *
 * Construction fails with an `IllegalArgumentException` when either coordinate
 * is below zero; `x` is checked before `y`.
 *
 * @param x the horizontal coordinate, must not be negative
 * @param y the vertical coordinate, must not be negative
 */
class PositivePoint protected (override val x: Double, override val y: Double) extends Point(x, y) {
  PositivePoint.rejectNegative(x, "x must not be negative")
  PositivePoint.rejectNegative(y, "y must not be negative")
}

/** Factory for [[PositivePoint]] instances. */
object PositivePoint {

  /**
   * Builds the positive point located at the given coordinates.
   *
   * @throws IllegalArgumentException if `x` or `y` is negative
   */
  def at(x: Double, y: Double): PositivePoint = new PositivePoint(x, y)

  /** Throws an `IllegalArgumentException` carrying `complaint` when `coordinate` is below zero. */
  private def rejectNegative(coordinate: Double, complaint: String): Unit =
    if (coordinate < 0) throw new IllegalArgumentException(complaint)
}
/**
 * The dimensions of a two-dimensional region.
 *
 * Construction fails with an `IllegalArgumentException` when either dimension
 * is below zero; the width is checked before the height.
 *
 * @param width  the horizontal extent, must not be negative
 * @param height the vertical extent, must not be negative
 */
case class Size protected (width: Double, height: Double) {
  Size.ensureNotNegative(width, "Width must not be negative")
  Size.ensureNotNegative(height, "Height must not be negative")

  /** The absolute value of width multiplied by height. */
  val area: Double = math.abs(width * height)
}

/** Factory for [[Size]] instances. */
object Size {

  /**
   * Builds a size from its two dimensions.
   *
   * @throws IllegalArgumentException if `width` or `height` is negative
   */
  def fromWidthAndHeight(width: Double, height: Double): Size = new Size(width, height)

  /** Throws an `IllegalArgumentException` carrying `complaint` when `dimension` is below zero. */
  private def ensureNotNegative(dimension: Double, complaint: String): Unit =
    if (dimension < 0) throw new IllegalArgumentException(complaint)
}
/**
 * Helpers for loading CSV data into [[Table]] instances.
 */
object CSVUtil {

  /**
   * Reads every line of the given CSV file and turns it into a [[Table]],
   * one [[Row]] per CSV line.
   *
   * The underlying reader is always closed, even when reading fails, so the
   * file handle is not leaked.
   *
   * @param file the CSV file to read
   * @return a table built from all rows of the file
   */
  def tableFromFile(file: File): Table = {
    val reader: CSVReader = CSVReader.open(file)
    val lines: List[List[String]] =
      try reader.all()
      finally reader.close()

    Table.fromRows(lines.map(line => Row.fromStrings(line)))
  }

  /**
   * Reads the CSV file addressed by the given URL.
   *
   * The URL is converted with `new File(url.toURI)`, so it has to denote a
   * local file.
   *
   * @param url location of the CSV file
   * @return a table built from all rows of the file
   */
  def tableFromURL(url: URL): Table =
    tableFromFile(new File(url.toURI))
}
/**
 * Data-access interface for the document identifiers known to the web application.
 *
 * NOTE(review): the storage semantics are defined by `DocumentInformationDaoImpl`.
 */
private[webapp] trait DocumentInformationDao {

  /**
   * Asynchronously stores the given identifier.
   *
   * @param docID the identifier to store
   */
  def storeDocumentID(docID: DocumentIdentifier): Future[Unit]

  /**
   * Asynchronously deletes the given identifier.
   *
   * @param docID the identifier to delete
   */
  def deleteDocumentID(docID: DocumentIdentifier): Future[Unit]

  /** Asynchronously retrieves all identifiers. */
  def retrieveAllIDs(): Future[Seq[DocumentIdentifier]]
}

/** Factory for [[DocumentInformationDao]] instances. */
private[webapp] object DocumentInformationDao {

  /**
   * Creates a DAO backed by a `DocumentInformationDaoImpl` on top of the given root DAO.
   *
   * @param rootDao the root DAO handed to the implementation
   */
  def forRootDao(rootDao: RootDAO): DocumentInformationDao =
    new DocumentInformationDaoImpl(rootDao)
}
/**
 * An [[ExtractionConstraint]] that delegates the actual table extraction to a
 * single [[TableExtractor]].
 */
trait SimpleExtractionConstraint extends ExtractionConstraint {

  /** The extractor used by [[performExtraction]]. */
  protected val tableExtractor: TableExtractor

  // Starts out empty. NOTE(review): presumably filled in by implementations — confirm.
  protected var table: Option[Table] = None

  /**
   * Decides whether a table should be extracted from the given page.
   *
   * @param page        the page under consideration
   * @param document    the document the page is passed along with
   * @param stateBundle the state bundle passed along with the page
   * @return true if extraction should happen on this page
   */
  def shouldExtractOnPage(page: Page, document: Document, stateBundle: StateBundle): Boolean

  /**
   * Runs [[tableExtractor]] against the given page.
   *
   * @param page the page to extract from
   * @return whatever table the extractor produces for that page
   */
  protected def performExtraction(page: Page): Table = tableExtractor.getTable(page)
}
/**
 * Identifies a stored document by a hash string together with its description.
 *
 * @param hash        the hash string of the document
 * @param description the descriptive metadata of the document
 */
case class DocumentIdentifier protected (hash: String, description: DocumentDescription)

/** Factory methods for [[DocumentIdentifier]]. */
object DocumentIdentifier {

  /**
   * Derives an identifier from a document's raw bytes.
   *
   * The hash is whatever `computeHashAsHex` yields for `rawBytes`.
   *
   * @param documentDescription the description to attach to the identifier
   * @param rawBytes            the full content of the document
   */
  def computeFor(documentDescription: DocumentDescription, rawBytes: Array[Byte]): DocumentIdentifier =
    withHashAndDescription(rawBytes.computeHashAsHex, documentDescription)

  /**
   * Builds an identifier from an already known hash.
   *
   * @param hash        the hash string to use as-is
   * @param description the description to attach to the identifier
   */
  def withHashAndDescription(hash: String, description: DocumentDescription): DocumentIdentifier =
    new DocumentIdentifier(hash, description)
}
/**
 * The outcome of an extraction run: for each document, the table produced by
 * each extraction constraint.
 *
 * @param resultsMap tables keyed first by document, then by extraction constraint
 */
class ExtractionResult protected (private val resultsMap: Map[Document, Map[ExtractionConstraint, Table]]) {

  /**
   * Safe lookup of the table extracted from a document by a constraint.
   *
   * Returns `None` both when the constraint has no table for the document and
   * when the document itself is unknown. Previously an unknown document
   * escaped as a `NoSuchElementException` despite the `Option` return type.
   *
   * @param document             the document to look up
   * @param extractionConstraint the constraint whose table is wanted
   * @return the table if one is recorded for this pair, otherwise `None`
   */
  def getResults(document: Document)(extractionConstraint: ExtractionConstraint): Option[Table] =
    resultsMap.get(document).flatMap(_.get(extractionConstraint))

  /**
   * Direct lookup of the table extracted from a document by a constraint.
   *
   * @param document             the document to look up
   * @param extractionConstraint the constraint whose table is wanted
   * @return the recorded table
   * @throws NoSuchElementException if the document or the constraint is not in the results
   */
  def apply(document: Document)(extractionConstraint: ExtractionConstraint): Table =
    resultsMap(document)(extractionConstraint)

}

/** Factory for [[ExtractionResult]] instances. */
object ExtractionResult {

  /**
   * Wraps an existing results map.
   *
   * @param resultsMap tables keyed first by document, then by extraction constraint
   */
  def withResultsMap(resultsMap: Map[Document, Map[ExtractionConstraint, Table]]): ExtractionResult =
    new ExtractionResult(resultsMap)

}
/**
 * A [[TableExtractor]] that always reads the table from one fixed rectangular
 * region of a page.
 *
 * @param region the area of the page to extract from
 */
class RegionBasedTableExtractor protected (val region: Rectangle) extends TableExtractor {

  /**
   * Extracts the Tabula table found inside [[region]] on the given page and
   * converts it into this library's [[Table]] model.
   *
   * @param page the page to extract from
   * @return the converted table
   */
  override def getTable(page: Page): Table =
    TabulaConverter.tableFromTabulaTable(
      ExtractionUtils.extractTabulaTableFromPage(page, region))
}

/** Factory for [[RegionBasedTableExtractor]] instances. */
object RegionBasedTableExtractor {

  /**
   * Creates an extractor bound to the given region.
   *
   * @param region the area of the page to extract from
   */
  def forRegion(region: Rectangle): RegionBasedTableExtractor =
    new RegionBasedTableExtractor(region)
}
/**
 * Small I/O conveniences exposed as extension methods on `URL` and `Array[Byte]`.
 */
object AmbitiousIoUtils {

  // Name of the hash algorithm used by computeHashAsHex.
  private val HashAlgorithm: String = "SHA-256"

  /** Adds byte-level reading to `URL`. */
  implicit class URLUtils(url: URL) {

    /**
     * Reads the complete content behind the URL into memory.
     *
     * The stream is closed whether or not reading succeeds.
     *
     * @return all bytes served by the URL
     */
    def toBytes: Array[Byte] = {
      val inputStream: InputStream = url.openStream()
      try IOUtils.toByteArray(inputStream)
      finally inputStream.close()
    }
  }

  /** Adds hashing to `Array[Byte]`. */
  implicit class ByteArrayUtils(bytes: Array[Byte]) {

    /**
     * Computes the SHA-256 hash of the bytes.
     *
     * A fresh `MessageDigest` is obtained for every call. A digest instance is
     * stateful, so sharing one across concurrent callers could mix their input
     * and produce wrong hashes.
     *
     * @return the hash as upper-case hexadecimal, two characters per byte
     */
    def computeHashAsHex: String =
      MessageDigest.getInstance(HashAlgorithm)
        .digest(bytes)
        .map("%02X" format _)
        .mkString
  }

}
/**
 * [[RootDAO]] implementation that opens the database described by a named
 * configuration entry.
 *
 * @param databaseConfigName the configuration name passed to `Database.forConfig`
 */
private[data] class RootDAOImpl(databaseConfigName: String) extends RootDAO {
  val database = Database.forConfig(databaseConfigName)

  /** True when the database already contains at least one table. */
  private def isInitialised: Future[Boolean] =
    database.run(MTable.getTables).map(tables => tables.nonEmpty)

  /**
   * Creates the schema when the database has no tables yet; otherwise does nothing.
   *
   * @return a future that completes once the check (and any schema creation) is done
   */
  def initialiseIfNeeded(): Future[Unit] =
    isInitialised.flatMap { alreadyInitialised =>
      // Future.successful(()) yields the unit value without scheduling a task;
      // the former Future(Unit) passed the Unit companion object instead.
      if (alreadyInitialised) Future.successful(()) else initialise()
    }

  /** Runs the schema-creation action against the database. */
  private def initialise(): Future[Unit] =
    database.run(createTablesAction)

  // Schema-creation action for the Documents table.
  private lazy val createTablesAction: DBIO[Unit] =
    TableQuery[Documents].schema.create
}
def rowFromTabulaRow(row: util.List[RectangularTextContainer[_ <: HasText]]): Row = 19 | Row.fromStrings(tabulaRowAsListOfTrimmedStrings(row)) 20 | 21 | def tabulaRowAsListOfTrimmedStrings(row: util.List[RectangularTextContainer[_ <: HasText]]): List[String] = 22 | row.asScala.map(container => container.getText.trim).toList 23 | } 24 | -------------------------------------------------------------------------------- /src/test/scala/tools/ambitious/pdfextractiontoolkit/utils/AmbitiousIoUtilsSpec.scala: -------------------------------------------------------------------------------- 1 | package tools.ambitious.pdfextractiontoolkit.utils 2 | 3 | import org.scalatest.FreeSpec 4 | import tools.ambitious.pdfextractiontoolkit.Resources 5 | import tools.ambitious.pdfextractiontoolkit.utils.AmbitiousIoUtils.{ByteArrayUtils, URLUtils} 6 | 7 | class AmbitiousIoUtilsSpec extends FreeSpec { 8 | "the toBytes method will return the bytes of the quick brown fox text file" in { 9 | val actualBytes: Array[Byte] = Resources.quickBrownFoxTxt.toBytes 10 | val expectedBytes: Array[Byte] = "The quick brown fox jumped over the lazy dogs".getBytes("UTF-8") 11 | 12 | assert(expectedBytes sameElements actualBytes) 13 | } 14 | 15 | "the compute hex method should return the correct hash of an array of bytes" in { 16 | val input = Array(42, 16, 43).map(_.toByte) 17 | 18 | val actualHash = input.computeHashAsHex 19 | val expectedHash = "220ED007F88E894C0AA52A193C826EED7B37AD84A2C4EA69FD538991550F8C46" 20 | 21 | assert(expectedHash == actualHash) 22 | } 23 | } 24 | -------------------------------------------------------------------------------- /src/main/scala/tools/ambitious/pdfextractiontoolkit/library/extraction/ExtractionUtils.scala: -------------------------------------------------------------------------------- 1 | package tools.ambitious.pdfextractiontoolkit.library.extraction 2 | 3 | import java.util.NoSuchElementException 4 | 5 | import technology.tabula 6 | import 
technology.tabula.ObjectExtractor
import technology.tabula.extractors.BasicExtractionAlgorithm
import tools.ambitious.pdfextractiontoolkit.library.model.Page
import tools.ambitious.pdfextractiontoolkit.library.model.geometry.Rectangle

/** Helpers bridging this library's page model and tabula's extraction API. */
object ExtractionUtils {

  /**
   * Extracts the first table tabula finds inside `portal` on `page`.
   *
   * @param page   the single-page document to read from
   * @param portal the region of the page to extract, in page coordinates
   * @return the extracted table, or an empty tabula table when nothing could be extracted
   */
  def extractTabulaTableFromPage(page: Page, portal: Rectangle): tabula.Table = {
    val objectExtractor = new ObjectExtractor(page.asPDDocument)
    // tabula pages are 1-based; the factory on Page only accepts one-page documents.
    val wholePage: tabula.Page = objectExtractor.extract(1)

    try {
      val tablePageArea = wholePage.getArea(
        portal.top.toFloat,
        portal.left.toFloat,
        portal.bottom.toFloat,
        portal.right.toFloat)

      val tables = (new BasicExtractionAlgorithm).extract(tablePageArea)
      // get(0) on an empty java.util.List throws IndexOutOfBoundsException, which the
      // handler below does not cover, so the empty case is answered explicitly.
      if (tables.isEmpty) new tabula.Table else tables.get(0)
    } catch {
      case _: NoSuchElementException => new tabula.Table
    }
  }
}

--------------------------------------------------------------------------------
/src/main/scala/tools/ambitious/pdfextractiontoolkit/library/model/geometry/Rectangle.scala:
--------------------------------------------------------------------------------
package tools.ambitious.pdfextractiontoolkit.library.model.geometry

/**
 * An axis-aligned rectangle described by two corner points. The derived edges and size are
 * normalised with min/max/abs, so the corners may be supplied in either order.
 */
case class Rectangle protected (topLeft: PositivePoint, bottomRight: PositivePoint) {

  lazy val size:Size = Size.fromWidthAndHeight(math.abs(bottomRight.x - topLeft.x), math.abs(bottomRight.y - topLeft.y))

  lazy val left   :Double = math.min(topLeft.x, bottomRight.x)
  lazy val right  :Double = math.max(topLeft.x, bottomRight.x)
  lazy val top    :Double = math.min(topLeft.y, bottomRight.y)
  lazy val bottom :Double = math.max(topLeft.y, bottomRight.y)
}

object Rectangle {
  def fromCornerCoords(x1:Double, y1:Double, x2:Double, y2:Double): Rectangle = {
    new Rectangle(PositivePoint.at(x1, y1), PositivePoint.at(x2, y2))
  }

  def fromCorners(topLeft: PositivePoint, bottomRight: PositivePoint): Rectangle = {
    new Rectangle(topLeft, bottomRight)
  }

  /** Rectangle whose second corner is `corner` offset by the width and height of `size`. */
  def fromCornerAndSize(corner: PositivePoint, size: Size): Rectangle = {
    new Rectangle(corner, PositivePoint.at(corner.x + size.width, corner.y + size.height))
  }
}

--------------------------------------------------------------------------------
/src/main/scala/tools/ambitious/pdfextractiontoolkit/library/model/Page.scala:
--------------------------------------------------------------------------------
package tools.ambitious.pdfextractiontoolkit.library.model

import org.apache.pdfbox.pdmodel.{PDDocument, PDPage}
import tools.ambitious.pdfextractiontoolkit.library.model.geometry.Size

/** A single PDF page, held as a PDDocument; the companion factory only accepts one-page documents. */
class Page protected (private val document: PDDocument) {

  val asPDPage: PDPage = this.asPDDocument.getDocumentCatalog.getAllPages.get(0).asInstanceOf[PDPage]

  private val mediaBox = asPDPage.getMediaBox
  /** Width and height of the page's media box. */
  val size: Size = Size.fromWidthAndHeight(mediaBox.getWidth, mediaBox.getHeight)

  def asPDDocument: PDDocument = document
}

object Page {
  /**
   * Wraps a one-page PDDocument.
   *
   * @throws IllegalArgumentException when the document does not contain exactly one page
   */
  def fromPDDocument(document: PDDocument): Page = {
    if (numberOfPagesInPDDocument(document) != 1)
      throw new IllegalArgumentException("Page constructor fromPDDocument must supply a PDDocument with one page only.")

    new Page(document)
  }

  def listFromSinglePagePDDocuments(documents: List[PDDocument]): List[Page] =
    documents.map(document => Page.fromPDDocument(document))

  private def numberOfPagesInPDDocument(document: PDDocument): Int =
    document.getDocumentCatalog.getAllPages.size
}
--------------------------------------------------------------------------------
/src/main/scala/tools/ambitious/pdfextractiontoolkit/webapp/services/documentstorage/DocumentFileStoreImpl.scala:
--------------------------------------------------------------------------------
package tools.ambitious.pdfextractiontoolkit.webapp.services.documentstorage

import java.net.URL
import java.nio.file.{Files, Path}

import
org.apache.commons.io.FileUtils

import scala.concurrent.ExecutionContext.Implicits.global
import scala.concurrent.Future

/** DocumentFileStore that keeps each document as a file named after its hash inside `workingDirectory`. */
private class DocumentFileStoreImpl(val workingDirectory: Path) extends DocumentFileStore {

  /** Path inside the working directory of the file named after the identifier's hash. */
  private def computeExpectedPath(docID: DocumentIdentifier): Path =
    workingDirectory.resolve(docID.hash)

  /** Asynchronously copies the content behind `source` to the identifier's expected path. */
  override def storeFileFor(docID: DocumentIdentifier, source: URL): Future[Unit] =
    Future {
      val destination = computeExpectedPath(docID).toFile
      FileUtils.copyURLToFile(source, destination)
    }

  /** Asynchronously removes the identifier's file if it is present. */
  override def deleteFileFor(docID: DocumentIdentifier): Future[Unit] =
    Future {
      val target = computeExpectedPath(docID)
      Files.deleteIfExists(target)
    }

  override def retrieveFileFor(docID: DocumentIdentifier): Future[URL] = ???
}
--------------------------------------------------------------------------------
/src/main/scala/tools/ambitious/pdfextractiontoolkit/library/extraction/extractionconstraints/MergingSimpleExtractionConstraint.scala:
--------------------------------------------------------------------------------
package tools.ambitious.pdfextractiontoolkit.library.extraction.extractionconstraints

import tools.ambitious.pdfextractiontoolkit.library.extraction.StateBundle
import tools.ambitious.pdfextractiontoolkit.library.extraction.tablemergers.TableMerger
import tools.ambitious.pdfextractiontoolkit.library.model.{Document, Page, Table}

/**
 * A constraint that accumulates one table per matching page in the state bundle and
 * hands the accumulated list to `tableMerger` once the result is requested.
 */
trait MergingSimpleExtractionConstraint extends SimpleExtractionConstraint {

  val tableMerger: TableMerger

  /** Starts every walk with an empty list of collected tables. */
  override def onStart(stateBundle: StateBundle): Unit = {
    stateBundle.state = Option(List())
  }

  /** Appends the table extracted from `page` when this constraint applies to it. */
  override def onPage(page: Page, document: Document, stateBundle: StateBundle): Unit =
    if (shouldExtractOnPage(page, document, stateBundle)) {
      // Read the collected tables before extracting, as the state must already hold a list.
      val collectedSoFar: List[Table] = collectedTables(stateBundle)
      val withNewTable: List[Table] = collectedSoFar :+ performExtraction(page)

      stateBundle.state = Option(withNewTable)
    }

  override def tableFromState(stateBundle: StateBundle): Option[Table] = {
    val allTables: List[Table] = collectedTables(stateBundle)
    tableMerger.mergeTables(allTables)
  }

  /** The list stored by onStart/onPage; fails if the state has not been set. */
  private def collectedTables(stateBundle: StateBundle): List[Table] =
    stateBundle.state.asInstanceOf[Option[List[Table]]].get
}

--------------------------------------------------------------------------------
/src/main/scala/tools/ambitious/pdfextractiontoolkit/webapp/services/documentstorage/DocumentDescription.scala:
--------------------------------------------------------------------------------
package tools.ambitious.pdfextractiontoolkit.webapp.services.documentstorage

import spray.http.MediaType

/**
 * User-facing details of a document held in the [[DocumentLibrary]]: a title, an optional
 * free-text description, and a media type that defaults to `application/octet-stream`.
 */
case class DocumentDescription protected (title: String,
                                          description: Option[String],
                                          mediaType: MediaType = MediaType.custom("application/octet-stream"))

object DocumentDescription {
  def withTitleAndDescriptionAndMediaType(title: String, description: String, mediaType: MediaType): DocumentDescription =
    new DocumentDescription(title = title, description = Some(description), mediaType = mediaType)

  def withTitleAndMediaType(title: String, mediaType: MediaType): DocumentDescription =
    new DocumentDescription(title = title, description = None, mediaType = mediaType)

  def withTitleAndDescription(title: String, description: String): DocumentDescription =
    new DocumentDescription(title = title, description = Some(description))

  def withTitle(title: String): DocumentDescription =
    new DocumentDescription(title = title, description = None)
}

--------------------------------------------------------------------------------
/src/main/scala/tools/ambitious/pdfextractiontoolkit/webapp/data/model/Documents.scala:
--------------------------------------------------------------------------------
package
tools.ambitious.pdfextractiontoolkit.webapp.data.model

import slick.driver.SQLiteDriver.api._
import spray.http.MediaType
import tools.ambitious.pdfextractiontoolkit.webapp.services.documentstorage.{DocumentDescription, DocumentIdentifier}

/**
 * One row of the Documents table.
 *
 * @param documentID  auto-incremented key; None for a row that has not been inserted yet
 * @param description free-text description; the empty string stands for "no description"
 */
case class Document(documentID: Option[Long], hash: String, title: String, description: String, mediaType: String) {

  /** The identifier (hash plus description) this row was stored from. */
  lazy val asDocumentIdentifier: DocumentIdentifier = {
    val parsedMediaType: MediaType = MediaType.custom(mediaType)

    // DocumentInformationDaoImpl writes an absent description as "", so an empty column
    // is mapped back to "no description" rather than Some("") to keep the round trip symmetric.
    val documentDescription = Option(description).filter(_.nonEmpty) match {
      case Some(text) => DocumentDescription.withTitleAndDescriptionAndMediaType(title, text, parsedMediaType)
      case None       => DocumentDescription.withTitleAndMediaType(title, parsedMediaType)
    }

    DocumentIdentifier.withHashAndDescription(hash, documentDescription)
  }
}

/** Slick mapping of the "Documents" table onto [[Document]]. */
class Documents(tag: Tag) extends Table[Document](tag, "Documents") {
  def documentID = column[Long]("documentID", O.PrimaryKey, O.AutoInc)
  def hash = column[String]("hash")
  def title = column[String]("title")
  def description = column[String]("description")
  def mediaType = column[String]("mediaType")

  def * = (documentID.?, hash, title, description, mediaType) <> (Document.tupled, Document.unapply)
}

--------------------------------------------------------------------------------
/src/test/scala/tools/ambitious/pdfextractiontoolkit/library/extraction/ExtractionUtilsSpec.scala:
--------------------------------------------------------------------------------
package tools.ambitious.pdfextractiontoolkit.library.extraction

import org.scalatest.FreeSpec
import technology.tabula
import tools.ambitious.pdfextractiontoolkit.Resources
import tools.ambitious.pdfextractiontoolkit.library.model.geometry.Rectangle
import tools.ambitious.pdfextractiontoolkit.library.model.{Document, Page, Table}
import tools.ambitious.pdfextractiontoolkit.library.util.{CSVUtil, TabulaConverter}

class ExtractionUtilsSpec extends FreeSpec {
  // The two halves of the test name previously ran together as "correspondingcsv".
  "The method extractTabulaTableFromPage() should be able to extract the same values as those in the corresponding " +
    "csv file for simpleTest1Table.pdf file" in {
    val window: Rectangle = Rectangle.fromCornerCoords(108, 81, 312, 305)
    val page: Page = Document.fromPDFPath(Resources.simpleTest1TableURL).getPage(1)

    val extractedTabulaTable: tabula.Table = ExtractionUtils.extractTabulaTableFromPage(page, window)
    val extractedTable: Table = TabulaConverter.tableFromTabulaTable(extractedTabulaTable)
    val tableFromCSV: Table = CSVUtil.tableFromURL(Resources.simpleTest1TableCSVURL)

    assert(extractedTable == tableFromCSV)
  }
}

--------------------------------------------------------------------------------
/src/test/scala/tools/ambitious/pdfextractiontoolkit/library/model/geometry/SizeSpec.scala:
--------------------------------------------------------------------------------
package tools.ambitious.pdfextractiontoolkit.library.model.geometry

import org.scalatest.FreeSpec

class SizeSpec extends FreeSpec {

  "A Size instantiated with width equal to 100.0 and height equal to 50.0" - {
    val size: Size = Size.fromWidthAndHeight(100.0, 50.0)

    "should have width equal to 100.0" in {
      assert(size.width == 100.0)
    }

    "should have height equal to 50.0" in {
      assert(size.height == 50.0)
    }

    "should have area equal to 5000.0" in {
      assert(size.area == 5000.0)
    }
  }

  "A Size instantiated with negative width and positive height should throw an IllegalArgumentException" in {
    val instantiateSize = intercept[IllegalArgumentException] {
      Size.fromWidthAndHeight(-100, 50)
    }
    assert(instantiateSize.getMessage === "Width must not be negative")
  }

  "A Size instantiated with negative height and positive width should throw an IllegalArgumentException" in {
    val instantiateSize = intercept[IllegalArgumentException] {
      Size.fromWidthAndHeight(100, -50)
    }
    assert(instantiateSize.getMessage === "Height must not be negative")
  }

}

--------------------------------------------------------------------------------
/src/test/scala/tools/ambitious/pdfextractiontoolkit/library/model/CellSpec.scala:
--------------------------------------------------------------------------------
package tools.ambitious.pdfextractiontoolkit.library.model

import org.scalatest.FreeSpec

class CellSpec extends FreeSpec {
  "A cell that is instantiated without any arguments" - {
    val cell: Cell = new Cell

    "should have text set to an empty string" in {
      assert(cell.text == "")
    }

    "should return true when asked if empty" in {
      assert(cell.isEmpty)
    }
  }

  "A cell that is instantiated with the string 'blah'" - {
    val cell: Cell = new Cell("blah")

    "should have text set to 'blah'" in {
      assert(cell.text == "blah")
    }

    "should return false when asked if empty" in {
      assert(!cell.isEmpty)
    }
  }

  "Two cells with the same text" - {
    val cellA: Cell = new Cell("test")
    val cellB: Cell = new Cell("test")

    "should be equal" in {
      assert(cellA == cellB)
    }

    "should not have the same reference" in {
      assert(cellA ne cellB)
    }
  }

  "Two cells with different text" - {
    val cellA: Cell = new Cell("test")
    val cellB: Cell = new Cell("test2")

    "should not be equal" in {
      assert(cellA != cellB)
    }
  }
}

--------------------------------------------------------------------------------
/src/test/scala/tools/ambitious/pdfextractiontoolkit/library/util/CSVUtilSpec.scala:
--------------------------------------------------------------------------------
package tools.ambitious.pdfextractiontoolkit.library.util

import java.io.File

import org.scalatest.FreeSpec
import tools.ambitious.pdfextractiontoolkit.library.model.{Row,
Table}

class CSVUtilSpec extends FreeSpec {
  val simple3x3DigitsURL = getClass.getResource("/simpleCSVs/simple3x3Digits.csv")

  "A Table from CSV File" - {
    val table: Table = CSVUtil.tableFromFile(new File(simple3x3DigitsURL.toURI))

    "should be equal to table: 1,2,3\n4,5,6\n7,8,9" in {
      val rowA: Row = Row.fromStrings(List("1", "2", "3"))
      val rowB: Row = Row.fromStrings(List("4", "5", "6"))
      val rowC: Row = Row.fromStrings(List("7", "8", "9"))

      val tableB: Table = Table.fromRows(List(rowA, rowB, rowC))

      assert(table == tableB)
    }
  }

  "A Table from CSV URL" - {
    val table: Table = CSVUtil.tableFromURL(simple3x3DigitsURL)

    "should be equal to table: 1,2,3\n4,5,6\n7,8,9" in {
      val rowA: Row = Row.fromStrings(List("1", "2", "3"))
      val rowB: Row = Row.fromStrings(List("4", "5", "6"))
      val rowC: Row = Row.fromStrings(List("7", "8", "9"))

      val tableB: Table = Table.fromRows(List(rowA, rowB, rowC))

      assert(table == tableB)
    }
  }
}

--------------------------------------------------------------------------------
/src/test/scala/tools/ambitious/pdfextractiontoolkit/webapp/data/DAOTestUtils.scala:
--------------------------------------------------------------------------------
package tools.ambitious.pdfextractiontoolkit.webapp.data

import java.nio.file.{Files, Path, Paths}

import com.typesafe.config.{Config, ConfigFactory}
import org.apache.commons.io.FileUtils

import scala.concurrent.Await
import scala.concurrent.duration._

object DAOTestUtils {
  private val applicationConfig: Config = ConfigFactory.load()

  private val workingDirPathFromConfig: String = applicationConfig.getString("testWorkingDir")

  private val workingDir: Path = Paths.get(workingDirPathFromConfig)

  def createAndGetWorkingDirectory(): Path = {
    workingDir.toFile.mkdirs()
    workingDir
  }

  def cleanWorkingDirectory(): Unit = {
    FileUtils.deleteQuietly(workingDir.toFile)
  }

  private val testDBPathFromConfig: String = applicationConfig.getString("testDB.path")

  val testDBFile: Path = Paths.get(testDBPathFromConfig)

  val testDBConfigName: String = "testDB"

  def constructCleanRootDAO: RootDAO = {
    Files.deleteIfExists(testDBFile)
    DAOTestUtils.testDBFile.getParent.toFile.mkdirs()

    val rootDAO: RootDAO = RootDAO.forConfigName(testDBConfigName)

    Await.result(rootDAO.initialiseIfNeeded(), 30.seconds)

    rootDAO
  }
}

--------------------------------------------------------------------------------
/src/main/scala/tools/ambitious/pdfextractiontoolkit/webapp/services/documentstorage/DocumentLibraryImpl.scala:
--------------------------------------------------------------------------------
package tools.ambitious.pdfextractiontoolkit.webapp.services.documentstorage

import java.net.URL

import tools.ambitious.pdfextractiontoolkit.utils.AmbitiousIoUtils._
import tools.ambitious.pdfextractiontoolkit.webapp.data.DocumentInformationDao

import scala.concurrent.ExecutionContext.Implicits.global
import scala.concurrent.Future

/**
 * DocumentLibrary that keeps document bytes in a [[DocumentFileStore]] and document
 * metadata in a [[DocumentInformationDao]].
 */
private class DocumentLibraryImpl (val fileStore: DocumentFileStore,
                                   val dao: DocumentInformationDao) extends DocumentLibrary {

  /**
   * Stores the file and then its metadata. The returned future completes with the new
   * identifier only after both steps succeed, and fails if either step fails.
   */
  override def store(description: DocumentDescription, documentSource: URL): Future[DocumentIdentifier] = {
    val bytes: Array[Byte] = documentSource.toBytes

    val docID: DocumentIdentifier = DocumentIdentifier.computeFor(description, bytes)

    // The file-store future used to be discarded, so a failed copy went unnoticed and the
    // caller could observe "stored" before the file existed. Chaining makes both visible.
    for {
      _ <- fileStore.storeFileFor(docID, documentSource)
      _ <- dao.storeDocumentID(docID)
    } yield docID
  }

  override def retrieve(docID: DocumentIdentifier): Future[URL] = fileStore.retrieveFileFor(docID)

  /** Deletes the file and then the metadata; fails if either step fails. */
  override def delete(docID: DocumentIdentifier): Future[Unit] =
    fileStore.deleteFileFor(docID).flatMap(_ => dao.deleteDocumentID(docID))

  override def list(): Future[Seq[DocumentIdentifier]] = dao.retrieveAllIDs()

}

object DocumentLibraryImpl {
}
--------------------------------------------------------------------------------
/src/main/scala/tools/ambitious/pdfextractiontoolkit/library/model/Row.scala:
--------------------------------------------------------------------------------
package tools.ambitious.pdfextractiontoolkit.library.model

/** An ordered list of cells; cells are addressed with 1-based numbers. */
class Row(private val cells: List[Cell] = Nil) {

  /**
   * @param number 1-based position of the cell
   * @throws IllegalArgumentException when `number` is outside 1..numberOfCells
   */
  def getCell(number: Int): Cell = {
    if (number <= numberOfCells && number > 0)
      cells(number-1)
    else
      throw new IllegalArgumentException(s"Invalid cell number $number")
  }

  def numberOfCells: Int = cells.length

  def isEmpty: Boolean = numberOfCells == 0

  override def toString: String =
    cells.map(cell => cell.text).mkString(",")

  /** Rows are equal when they hold the same number of cells and the cells are pairwise equal. */
  override def equals(obj: Any): Boolean =
    obj match {
      case row: Row => this.cells == row.cells
      case _ => false
    }

  // Kept consistent with equals. Hashing the cell texts avoids relying on Cell.hashCode,
  // which is not visible here. NOTE(review): assumes equal cells always have equal text.
  override def hashCode: Int = cells.map(cell => cell.text).hashCode
}

object Row {
  def fromCells(cells: List[Cell]): Row =
    new Row(cells)

  def fromCell(cell: Cell): Row =
    fromCells(List(cell))

  def fromStrings(strings: List[String]): Row =
    fromCells(strings.map(string => new Cell(string)))

  def fromString(string: String): Row =
    fromStrings(List(string))
}
--------------------------------------------------------------------------------
/src/main/scala/tools/ambitious/pdfextractiontoolkit/library/extraction/extractionconstraints/PageNumberExtractionConstraint.scala:
--------------------------------------------------------------------------------
package tools.ambitious.pdfextractiontoolkit.library.extraction.extractionconstraints

import tools.ambitious.pdfextractiontoolkit.library.extraction.StateBundle
import tools.ambitious.pdfextractiontoolkit.library.extraction.tableextractors.TableExtractor
import tools.ambitious.pdfextractiontoolkit.library.extraction.tablemergers.{SimpleTableMerger, TableMerger}
import tools.ambitious.pdfextractiontoolkit.library.model.{Document, Page}

import scala.collection.immutable.Range.Inclusive

/**
 * Extracts from every page whose number within the document is contained in `pages`.
 * Construction fails with an IllegalArgumentException if any page number is zero or negative.
 */
case class PageNumberExtractionConstraint protected (pages: Set[Int], tableExtractor: TableExtractor) extends MergingSimpleExtractionConstraint {
  if (!pages.forall(_ > 0))
    throw new IllegalArgumentException("Page numbers can only be positive numbers.")

  /** True when the document reports a page number for `page` and that number was requested. */
  override def shouldExtractOnPage(page: Page, document: Document, stateBundle: StateBundle): Boolean =
    document.pageNumberOf(page).exists(pageNumber => pages.contains(pageNumber))

  override val tableMerger: TableMerger = SimpleTableMerger.create
}
object PageNumberExtractionConstraint {
  /** Constraint matching exactly one page. */
  def withPageNumberAndTableExtractor(pageNumber: Int, tableExtractor: TableExtractor) = {
    val singlePage: Set[Int] = Set(pageNumber)
    new PageNumberExtractionConstraint(singlePage, tableExtractor)
  }

  /** Constraint matching every page in the inclusive range. */
  def withPageRangeAndTableExtractor(range: Inclusive, tableExtractor: TableExtractor) = {
    val pagesInRange: Set[Int] = range.toSet
    new PageNumberExtractionConstraint(pagesInRange, tableExtractor)
  }
}


--------------------------------------------------------------------------------
/src/main/scala/tools/ambitious/pdfextractiontoolkit/webapp/data/DocumentInformationDaoImpl.scala:
--------------------------------------------------------------------------------
package tools.ambitious.pdfextractiontoolkit.webapp.data

import slick.driver.SQLiteDriver.api._
import slick.lifted.{Query, TableQuery}
import
tools.ambitious.pdfextractiontoolkit.webapp.data.model.{Document, Documents}
import tools.ambitious.pdfextractiontoolkit.webapp.services.documentstorage.DocumentIdentifier

import scala.concurrent.ExecutionContext.Implicits.global
import scala.concurrent.Future

/** DocumentInformationDao that persists document identifiers as rows of the Documents table. */
private class DocumentInformationDaoImpl(val rootDAO: RootDAO) extends DocumentInformationDao {

  /** Inserts one row for `docID`; an absent description is written as the empty string. */
  override def storeDocumentID(docID: DocumentIdentifier): Future[Unit] = {
    val newRow = Document(None,
      docID.hash,
      docID.description.title,
      docID.description.description.getOrElse(""),
      docID.description.mediaType.toString())

    val tableQuery: TableQuery[Documents] = new TableQuery(new Documents(_))
    val insertAction = tableQuery += newRow

    // () is the unit value; the previous `Unit` named the companion object instead.
    rootDAO.database.run(insertAction).map(_ => ())
  }

  override def deleteDocumentID(docID: DocumentIdentifier): Future[Unit] = ???

  /** Reads every row of the Documents table and converts each one to its identifier. */
  override def retrieveAllIDs(): Future[Seq[DocumentIdentifier]] = {
    val tableQuery: TableQuery[Documents] = new TableQuery(new Documents(_))

    // The table query is already a full-table query; no identity map is needed.
    val resultsFuture: Future[Seq[Document]] = rootDAO.database.run(tableQuery.result)

    resultsFuture.map(documents => documents.map(_.asDocumentIdentifier))
  }
}
--------------------------------------------------------------------------------
/src/main/scala/tools/ambitious/pdfextractiontoolkit/library/model/Table.scala:
--------------------------------------------------------------------------------
package tools.ambitious.pdfextractiontoolkit.library.model

/** An ordered list of rows; rows and columns are addressed with 1-based numbers. */
class Table(val rows: List[Row] = Nil) {
  def getCell(row: Int, column: Int): Cell = getRow(row).getCell(column)

  /**
   * @param number 1-based position of the row
   * @throws IllegalArgumentException when `number` is outside 1..numberOfRows
   */
  def getRow(number: Int): Row = {
    if (number <= numberOfRows && number > 0)
      rows(number-1)
    else
      throw new IllegalArgumentException("Invalid row number.")
  }

  lazy val numberOfRows: Int = rows.length

  /** Cell count of the widest row; 0 for a table without rows (`max` alone would throw there). */
  lazy val numberOfColumns: Int =
    if (rows.isEmpty) 0 else rows.map(_.numberOfCells).max

  override def toString: String =
    rows.map(row => row.toString).mkString("\n")

  /** Tables are equal when they hold the same number of rows and the rows are pairwise equal. */
  override def equals(obj: Any): Boolean =
    obj match {
      case table: Table => this.rows == table.rows
      case _ => false
    }

  // Kept consistent with equals. The rendered text is hashed so the result does not depend
  // on Row.hashCode. NOTE(review): assumes equal cells always have equal text.
  override def hashCode: Int = toString.hashCode

  def mergedWith(table: Table): Table =
    Table.merge(this, table)
}

object Table {
  def fromRows(rows: List[Row]): Table =
    new Table(rows)

  def fromRow(row: Row): Table =
    fromRows(List(row))

  /** Concatenates the rows of all tables, in order, into one table. */
  def merge(tables: List[Table]): Table =
    fromRows(tables.flatMap(table => table.rows))

  def merge(tableA: Table, tableB: Table): Table =
    merge(List(tableA, tableB))
}
--------------------------------------------------------------------------------
/src/main/scala/tools/ambitious/pdfextractiontoolkit/library/extraction/extractionconstraints/FirstOccurrenceOfStringExtractionConstraint.scala:
--------------------------------------------------------------------------------
package tools.ambitious.pdfextractiontoolkit.library.extraction.extractionconstraints

import technology.tabula
import tools.ambitious.pdfextractiontoolkit.library.extraction.tableextractors.TableExtractor
import tools.ambitious.pdfextractiontoolkit.library.extraction.{ExtractionUtils, StateBundle}
import tools.ambitious.pdfextractiontoolkit.library.model.geometry.Rectangle
import tools.ambitious.pdfextractiontoolkit.library.model.{Document, Page, Table}

/**
 * Extracts from the first page on which the text found in `textRegion` equals `text`.
 * The matching page is remembered in the state bundle, and the table is extracted from it
 * when the result is requested.
 */
case class FirstOccurrenceOfStringExtractionConstraint protected (text: String, textRegion: Rectangle, tableExtractor: TableExtractor)
  extends SimpleExtractionConstraint {

  override def onPage(page: Page, document: Document, stateBundle: StateBundle): Unit = {
    if (shouldExtractOnPage(page, document, stateBundle)) {
      stateBundle.state = Option.apply(page)
    }
  }

  /** The table extracted from the remembered page, or None when no page matched. */
  override def tableFromState(stateBundle: StateBundle): Option[Table] =
    stateBundle.state.asInstanceOf[Option[Page]]
      .flatMap(matchedPage => Option.apply(performExtraction(matchedPage)))

  /** True only for the first page whose text region holds exactly `text`. */
  def shouldExtractOnPage(page: Page, document: Document, stateBundle: StateBundle): Boolean = {
    // Check the cheap condition first: once a page has been remembered there is no need
    // to run a tabula extraction on every remaining page.
    stateBundle.state.isEmpty && {
      val table: tabula.Table = ExtractionUtils.extractTabulaTableFromPage(page, textRegion)
      val foundText = table.getCell(0, 0).getText

      text == foundText
    }
  }
}

object FirstOccurrenceOfStringExtractionConstraint {
  def withTextAndTableExtractor(text: String, textRegion: Rectangle, tableExtractor: TableExtractor) =
    new FirstOccurrenceOfStringExtractionConstraint(text, textRegion, tableExtractor)
}


--------------------------------------------------------------------------------
/src/test/scala/tools/ambitious/pdfextractiontoolkit/webapp/services/documentstorage/DocumentDescriptionSpec.scala:
--------------------------------------------------------------------------------
package tools.ambitious.pdfextractiontoolkit.webapp.services.documentstorage

import org.scalatest.FreeSpec
import spray.http.MediaType

class DocumentDescriptionSpec extends FreeSpec {

  s"a ${DocumentDescription.getClass.getSimpleName}" - {

    "can be created with a title 'testDoc'" in {
      val docDescription: DocumentDescription = DocumentDescription.withTitle("testDoc")

      assert(docDescription.title == "testDoc")
    }

    "can be created with a title 'testDoc' and a description 'a test document'" in {
      val docDescription: DocumentDescription =
        DocumentDescription.withTitleAndDescription("testDoc", "a test document")

      assert(docDescription.title == "testDoc")
      assert(docDescription.description.contains("a test document"))
    }

    "can be created with a title 'testDoc' and media type 'application/pdf'" in {
      val mediaType: MediaType = MediaType.custom("application/pdf")

      val docDescription: DocumentDescription =
        DocumentDescription.withTitleAndMediaType("testDoc", mediaType)

      assert(docDescription.title == "testDoc")
      assert(docDescription.mediaType == mediaType)
    }

    "can be created with a title 'testDoc' and a description 'a test document' and media type 'application/pdf'" in {
      val mediaType: MediaType = MediaType.custom("application/pdf")

      val docDescription: DocumentDescription =
        DocumentDescription.withTitleAndDescriptionAndMediaType("testDoc", "a test document", mediaType)

      assert(docDescription.title == "testDoc")
      assert(docDescription.description.contains("a test document"))
      assert(docDescription.mediaType == mediaType)
    }
  }
}

--------------------------------------------------------------------------------
/src/main/scala/tools/ambitious/pdfextractiontoolkit/library/extraction/Extractor.scala:
--------------------------------------------------------------------------------
package tools.ambitious.pdfextractiontoolkit.library.extraction

import tools.ambitious.pdfextractiontoolkit.library.extraction.extractionconstraints.ExtractionConstraint
import tools.ambitious.pdfextractiontoolkit.library.model._

import scala.concurrent.ExecutionContext.Implicits.global
import scala.concurrent.{Future, Promise}

/** Runs a fixed list of extraction constraints over a fixed list of documents. */
class Extractor protected(private val documents: List[Document], private val extractors: List[ExtractionConstraint]) {

  /**
   * Walks one document with all constraints. The walker's future is returned as is, so a
   * failed walk fails the result. The earlier Promise was completed only on success and
   * therefore never completed when the walk failed.
   */
  private def extractTablesFromDocument(document: Document): Future[Map[ExtractionConstraint, Table]] =
    DocumentWalker.toWalkWithExtractionConstraint(document, extractors).getTables

  /** Extracts from every document and gathers the per-document results into one ExtractionResult. */
  def extractTables: Future[ExtractionResult] = {
    val documentMap: Map[Document, Future[Map[ExtractionConstraint, Table]]] =
      documents.map(document => document -> extractTablesFromDocument(document)).toMap

    Future.sequence(documentMap.map(entry => entry._2.map(i => (entry._1, i))))
      .map(_.toMap)
      .map(ExtractionResult.withResultsMap)
  }
}

object Extractor {
  def fromDocumentsAndConstraints(documents: List[Document], extractors: List[ExtractionConstraint]): Extractor =
    new Extractor(documents, extractors)

  def fromDocumentsAndConstraints(documents: List[Document], extractors: ExtractionConstraint*): Extractor =
    fromDocumentsAndConstraints(documents, extractors.toList)

  def fromDocumentAndConstraints(document: Document, extractors: List[ExtractionConstraint]): Extractor =
    fromDocumentsAndConstraints(List(document), extractors)

  def fromDocumentAndConstraints(document: Document, extractors: ExtractionConstraint*): Extractor =
    fromDocumentAndConstraints(document, extractors.toList)
}
--------------------------------------------------------------------------------
/src/test/scala/tools/ambitious/pdfextractiontoolkit/webapp/data/RootDAOImplSpec.scala:
--------------------------------------------------------------------------------
package tools.ambitious.pdfextractiontoolkit.webapp.data

import java.net.URL
import java.nio.file.Files

import org.scalatest.FreeSpec
import slick.jdbc.meta.MTable
import tools.ambitious.pdfextractiontoolkit.utils.AmbitiousIoUtils.{ByteArrayUtils, URLUtils}

import
scala.concurrent.duration._ 11 | import scala.concurrent.{Await, Future} 12 | 13 | class RootDAOImplSpec extends FreeSpec { 14 | 15 | "the root DAO" - { 16 | val dao: RootDAO = RootDAO.forConfigName(DAOTestUtils.testDBConfigName) 17 | DAOTestUtils.testDBFile.getParent.toFile.mkdirs() 18 | 19 | "if the database doesn't exist" - { 20 | 21 | Files.deleteIfExists(DAOTestUtils.testDBFile) 22 | 23 | "a call to initialiseIfNeeded" - { 24 | 25 | Await.result(dao.initialiseIfNeeded(), 30.seconds) 26 | 27 | "will create a database" in { 28 | assert(Files.exists(DAOTestUtils.testDBFile)) 29 | } 30 | 31 | "will create the correct tables" in { 32 | val listTables: Future[Vector[MTable]] = dao.database.run(MTable.getTables) 33 | 34 | val tablesMetaData: Vector[MTable] = Await.result(listTables, 30.seconds) 35 | 36 | val actualTableNames = tablesMetaData 37 | .map(_.name.name) 38 | .toSet 39 | .filterNot(_ == "sqlite_sequence") // Metadata table that's always created; not relevant to this check. 40 | 41 | val expectedTableNames = Set("Documents") 42 | 43 | assert(expectedTableNames == actualTableNames) 44 | } 45 | } 46 | } 47 | 48 | "if the database does exist" - { 49 | // Wait for initialisation to finish so the database file is settled before it is hashed. 50 | Await.result(dao.initialiseIfNeeded(), 30.seconds) 51 | 52 | "does not change the database" in { 53 | val testDBUrl: URL = DAOTestUtils.testDBFile.toUri.toURL 54 | 55 | val originalHash = testDBUrl 56 | .toBytes 57 | .computeHashAsHex 58 | // Await the repeated call too; otherwise newHash could be read while it is still running. 59 | Await.result(dao.initialiseIfNeeded(), 30.seconds) 60 | 61 | val newHash = testDBUrl 62 | .toBytes 63 | .computeHashAsHex 64 | 65 | assert(originalHash == newHash) 66 | } 67 | } 68 | } 69 | } 70 | -------------------------------------------------------------------------------- /src/test/scala/tools/ambitious/pdfextractiontoolkit/library/model/PageSpec.scala: -------------------------------------------------------------------------------- 1 | package tools.ambitious.pdfextractiontoolkit.library.model 2 | 3 | import org.apache.pdfbox.pdmodel.{PDDocument, PDPage} 4 | import org.scalatest.FreeSpec 5 | 6 | class PageSpec extends
FreeSpec { 7 | val samplePDFPath = getClass.getResource("/simplePDFs/SimpleTest1Table.pdf") 8 | val twoPagedBlankDocumentPath = getClass.getResource("/simplePDFs/TwoPagedBlankDocument.pdf") 9 | 10 | "In the context of a PDDocument with two pages (TwoPagedBlankDocument.pdf)" - { 11 | val document: PDDocument = PDDocument.load(twoPagedBlankDocumentPath) 12 | 13 | "An IllegalArgumentException should be thrown when trying to call the constructor Page.fromPDDocument" in { 14 | val interceptException = intercept[IllegalArgumentException] { 15 | val page: Page = Page.fromPDDocument(document) 16 | } 17 | 18 | assert(interceptException.getMessage === "Page constructor fromPDDocument must supply a PDDocument with one page only.") 19 | } 20 | } 21 | 22 | "A Page should instantiate from a PDDocument with one page" in { 23 | val document: PDDocument = PDDocument.load(samplePDFPath) 24 | val page: Page = Page.fromPDDocument(document) 25 | } 26 | 27 | "A Page" - { 28 | val document: PDDocument = PDDocument.load(samplePDFPath) 29 | val page: Page = Page.fromPDDocument(document) 30 | 31 | "should have width equal to the document's PDPage width" in { 32 | val pDPage: PDPage = document.getDocumentCatalog.getAllPages.get(0).asInstanceOf[PDPage] 33 | assert(page.size.width == pDPage.getMediaBox.getWidth) 34 | } 35 | 36 | "should have height equal to the document's PDPage height" in { 37 | val pDPage: PDPage = document.getDocumentCatalog.getAllPages.get(0).asInstanceOf[PDPage] 38 | assert(page.size.height == pDPage.getMediaBox.getHeight) 39 | } 40 | } 41 | 42 | "A Page instantiated from a PDDocument with one page" - { 43 | val document: PDDocument = PDDocument.load(samplePDFPath) 44 | val page: Page = Page.fromPDDocument(document) 45 | 46 | "should return the same PDDocument" in { 47 | assert(page.asPDDocument == document) 48 | } 49 | } 50 | } 51 | -------------------------------------------------------------------------------- 
/src/test/scala/tools/ambitious/pdfextractiontoolkit/library/extraction/extractionconstraints/FirstOccurrenceOfStringExtractionConstraintSpec.scala: -------------------------------------------------------------------------------- 1 | package tools.ambitious.pdfextractiontoolkit.library.extraction.extractionconstraints 2 | 3 | import org.scalatest.FreeSpec 4 | import tools.ambitious.pdfextractiontoolkit.Resources 5 | import tools.ambitious.pdfextractiontoolkit.library.extraction._ 6 | import tools.ambitious.pdfextractiontoolkit.library.extraction.tableextractors.RegionBasedTableExtractor 7 | import tools.ambitious.pdfextractiontoolkit.library.model.geometry.{PositivePoint, Rectangle, Size} 8 | import tools.ambitious.pdfextractiontoolkit.library.model.{Document, Table} 9 | import tools.ambitious.pdfextractiontoolkit.library.util.CSVUtil 10 | 11 | import scala.concurrent.Await 12 | import scala.concurrent.duration._ 13 | 14 | class FirstOccurrenceOfStringExtractionConstraintSpec extends FreeSpec { 15 | val simpleTest2Tables2TitleURL = getClass.getResource("/simplePDFs/SimpleTest2Tables1Title.pdf") 16 | 17 | s"A ${FirstOccurrenceOfStringExtractionConstraint.getClass.getSimpleName} with string 'An example Title' and " + 18 | s"appropriate Window for SimpleTest2Tables1Title.pdf" - { 19 | val region = Rectangle.fromCornerAndSize(PositivePoint.at(168.48, 273.95), Size.fromWidthAndHeight(213.54, 303.5)) 20 | 21 | val tableExtractor = RegionBasedTableExtractor.forRegion(region) 22 | 23 | val textRegion = Rectangle.fromCornerAndSize(PositivePoint.at(185.38, 165.62), Size.fromWidthAndHeight(112.64, 16.16)) 24 | val extractionConstraint = FirstOccurrenceOfStringExtractionConstraint.withTextAndTableExtractor("An example Title", textRegion, tableExtractor) 25 | 26 | "when put through a walker with test document SimpleTest2Tables1Title.pdf" - { 27 | val document: Document = Document.fromPDFPath(simpleTest2Tables2TitleURL) 28 | val walker: DocumentWalker = 
DocumentWalker.toWalkWithExtractionConstraint(document, extractionConstraint) 29 | val tables: Map[ExtractionConstraint, Table] = Await.result(walker.getTables, 60.seconds) 30 | 31 | "should return the table at page 2" in { 32 | val table: Option[Table] = tables.get(extractionConstraint) 33 | val tableFromCSV: Table = CSVUtil.tableFromURL(Resources.simpleTest2Tables2TitlePage2CSVURL) 34 | 35 | assert(table.get == tableFromCSV) 36 | } 37 | } 38 | } 39 | } 40 | -------------------------------------------------------------------------------- /src/test/scala/tools/ambitious/pdfextractiontoolkit/library/model/geometry/RectangleSpec.scala: -------------------------------------------------------------------------------- 1 | package tools.ambitious.pdfextractiontoolkit.library.model.geometry 2 | 3 | import org.scalatest.FreeSpec 4 | 5 | class RectangleSpec extends FreeSpec { 6 | 7 | "A Rectangle" - { 8 | "defined by the points [1, 2] and [5, 4]" - { 9 | 10 | val theRectangle = Rectangle.fromCornerCoords(1, 2, 5, 4) 11 | 12 | standardRectangleTest(theRectangle, 13 | expectedArea = 8, 14 | expectedWidth = 4, 15 | expectedHeight = 2, 16 | expectedTop = 2, 17 | expectedBottom = 4, 18 | expectedLeft = 1, 19 | expectedRight = 5) 20 | } 21 | 22 | "defined by the points [4, 3] and [1, 2]" - { 23 | 24 | val theRectangle = Rectangle.fromCornerCoords(4, 3, 1, 2) 25 | 26 | standardRectangleTest(theRectangle, 27 | expectedArea = 3, 28 | expectedWidth = 3, 29 | expectedHeight = 1, 30 | expectedTop = 2, 31 | expectedBottom = 3, 32 | expectedLeft = 1, 33 | expectedRight = 4) 34 | 35 | } 36 | } 37 | 38 | def standardRectangleTest(rectangle:Rectangle, 39 | expectedArea:Double, 40 | expectedWidth:Double, 41 | expectedHeight:Double, 42 | expectedLeft:Double, 43 | expectedRight:Double, 44 | expectedTop:Double, 45 | expectedBottom:Double) = { 46 | s"should have an area of $expectedArea" in { 47 | assert(rectangle.size.area == expectedArea) 48 | } 49 | 50 | s"should have a width of 
$expectedWidth" in { 51 | assert(rectangle.size.width == expectedWidth) 52 | } 53 | 54 | s"should have a height of $expectedHeight" in { 55 | assert(rectangle.size.height == expectedHeight) 56 | } 57 | 58 | s"should have a left bound of $expectedLeft" in { 59 | assert(rectangle.left == expectedLeft) 60 | } 61 | 62 | s"should have a right bound of $expectedRight" in { 63 | assert(rectangle.right == expectedRight) 64 | } 65 | 66 | s"should have a top bound of $expectedTop" in { 67 | assert(rectangle.top == expectedTop) 68 | } 69 | 70 | s"should have a bottom bound of $expectedBottom" in { 71 | assert(rectangle.bottom == expectedBottom) 72 | } 73 | } 74 | } 75 | -------------------------------------------------------------------------------- /src/test/scala/tools/ambitious/pdfextractiontoolkit/webapp/services/documentstorage/DocumentLibraryImplSpec.scala: -------------------------------------------------------------------------------- 1 | package tools.ambitious.pdfextractiontoolkit.webapp.services.documentstorage 2 | 3 | import java.net.URL 4 | 5 | import org.scalamock.scalatest.MockFactory 6 | import org.scalatest.FreeSpec 7 | import tools.ambitious.pdfextractiontoolkit.Resources 8 | import tools.ambitious.pdfextractiontoolkit.webapp.data.DocumentInformationDao 9 | 10 | import scala.concurrent.ExecutionContext.Implicits.global 11 | import scala.concurrent.duration._ 12 | import scala.concurrent.{Await, Future} 13 | 14 | class DocumentLibraryImplSpec extends FreeSpec with MockFactory { 15 | 16 | s"a ${DocumentLibraryImpl.getClass.getSimpleName}" - { 17 | val mockFileStore: DocumentFileStore = mock[DocumentFileStore] 18 | val mockDao: DocumentInformationDao = mock[DocumentInformationDao] 19 | 20 | val documentLibrary: DocumentLibraryImpl = new DocumentLibraryImpl(mockFileStore, mockDao) 21 | 22 | val description: DocumentDescription = DocumentDescription.withTitle("testTitle") 23 | val source: URL = Resources.quickBrownFoxTxt 24 | 25 | val expectedHash: String = 
"58B433FA7E8B0F94B2FF02178E7768F5A329EF346D908C7B917824E5A4CA9575" 26 | 27 | val expectedID: DocumentIdentifier = DocumentIdentifier.withHashAndDescription(expectedHash, description) 28 | 29 | "will store a document when store is called" in { 30 | 31 | (mockFileStore.storeFileFor _).expects(expectedID, source) 32 | (mockDao.storeDocumentID _).expects(expectedID).returns(Future(Unit)) 33 | 34 | Await.result(documentLibrary.store(description, source), 30.seconds) 35 | } 36 | 37 | "will retrieve a document when retrieve is called" in { 38 | (mockFileStore.retrieveFileFor _).expects(expectedID).returns(Future(source)) 39 | 40 | val retrievedDocument = Await.result(documentLibrary.retrieve(expectedID), 30.seconds) 41 | assert(retrievedDocument == source) 42 | } 43 | 44 | "will delete a document when delete is called" in { 45 | (mockFileStore.deleteFileFor _).expects(expectedID) 46 | (mockDao.deleteDocumentID _).expects(expectedID).returns(Future(Unit)) 47 | 48 | Await.result(documentLibrary.delete(expectedID), 30.seconds) 49 | } 50 | 51 | "will list all documents when list is called" in { 52 | (mockDao.retrieveAllIDs _).expects().returns(Future(Seq())) 53 | Await.result(documentLibrary.list(), 30.seconds) 54 | } 55 | } 56 | 57 | } 58 | -------------------------------------------------------------------------------- /src/main/scala/tools/ambitious/pdfextractiontoolkit/library/model/Document.scala: -------------------------------------------------------------------------------- 1 | package tools.ambitious.pdfextractiontoolkit.library.model 2 | 3 | import java.net.URL 4 | import java.util 5 | 6 | import org.apache.pdfbox.pdmodel.font._ 7 | import org.apache.pdfbox.pdmodel.{PDDocument, PDPage} 8 | import org.apache.pdfbox.util.Splitter 9 | 10 | import scala.collection.JavaConverters._ 11 | 12 | class Document private (private val pDDocument: PDDocument, val pages: List[Page]) { 13 | def numberOfPages: Int = pages.length 14 | 15 | def getPage(number: Int): Page = { 16 | if 
(number <= numberOfPages && number > 0) 17 | pages(number-1) 18 | else 19 | throw new IllegalArgumentException("Invalid page number.") 20 | } 21 | 22 | def close() = pDDocument.close() 23 | 24 | def pageNumberOf(page: Page): Option[Int] = 25 | Option.apply(pages.indexOf(page)) 26 | .filter(_ >= 0) 27 | .map(_ + 1) 28 | } 29 | 30 | object Document { 31 | def fromPDFPath(path: URL): Document = { 32 | val pDDocument: PDDocument = PDDocument.load(path) 33 | 34 | val splitPDDocuments: List[PDDocument] = Document.splitPDDocumentIntoPDDocumentForEachPage(pDDocument) 35 | val pages = Page.listFromSinglePagePDDocuments(splitPDDocuments) 36 | 37 | new Document(pDDocument, pages) 38 | } 39 | 40 | def splitPDDocumentIntoPDDocumentForEachPage(document: PDDocument): List[PDDocument] = { 41 | val splitter: Splitter = new Splitter 42 | val splitPages: List[PDDocument] = splitter.split(document).asScala.toList 43 | 44 | // There is a bug in PDFBox Splitter that doesn't set fonts from the old 45 | // document to each new document. This manifests itself as a NullPointerException 46 | // when Tabula tries to extract a page. The following fixes that bug. 
47 | setFontsFromMasterPDDocumentToSplitPDDocuments(document, splitPages) 48 | 49 | splitPages 50 | } 51 | 52 | private def setFontsFromMasterPDDocumentToSplitPDDocuments(masterPDDocument: PDDocument, splitPDDocuments: List[PDDocument]) = { 53 | for (i <- 0 until masterPDDocument.getNumberOfPages) { 54 | val masterPage: PDPage = masterPDDocument.getDocumentCatalog.getAllPages.get(i).asInstanceOf[PDPage] 55 | val splitPage: PDPage = splitPDDocuments(i).getDocumentCatalog.getAllPages.get(0).asInstanceOf[PDPage] 56 | 57 | val fontsFromMasterPage: util.Map[String, PDFont] = masterPage.getResources.getFonts 58 | 59 | splitPage.getResources.setFonts(fontsFromMasterPage) 60 | } 61 | } 62 | } 63 | -------------------------------------------------------------------------------- /src/test/scala/tools/ambitious/pdfextractiontoolkit/library/extraction/tablemergers/SimpleTableMergerSpec.scala: -------------------------------------------------------------------------------- 1 | package tools.ambitious.pdfextractiontoolkit.library.extraction.tablemergers 2 | 3 | import org.scalatest.FreeSpec 4 | import tools.ambitious.pdfextractiontoolkit.Resources 5 | import tools.ambitious.pdfextractiontoolkit.library.model.Table 6 | import tools.ambitious.pdfextractiontoolkit.library.util.CSVUtil 7 | 8 | class SimpleTableMergerSpec extends FreeSpec { 9 | 10 | private val table1: Table = CSVUtil.tableFromURL(Resources.simpleTest2Tables2TitlePage1CSVURL) 11 | private val table2: Table = CSVUtil.tableFromURL(Resources.simpleTest2Tables2TitlePage2CSVURL) 12 | private val tables: List[Table] = List(table1, table2) 13 | 14 | s"A ${SimpleTableMerger.getClass.getSimpleName}" - { 15 | "that does not ignore header rows" - { 16 | val simpleTableMerger: SimpleTableMerger = SimpleTableMerger.create 17 | 18 | "when it merges the two tables" - { 19 | val merged: Table = simpleTableMerger.mergeTables(tables).get 20 | 21 | shouldReturnTableWith(table = merged, row = 1, column = 3, expected = "4") 22 | 
shouldReturnTableWith(table = merged, row = 6, column = 1, expected = "4") 23 | shouldReturnTableWith(table = merged, row = 7, column = 4, expected = "6") 24 | shouldReturnTableWith(table = merged, row = 9, column = 2, expected = "9") 25 | shouldReturnTableWith(table = merged, row = 20, column = 3, expected = "5") 26 | } 27 | } 28 | 29 | "created to ignore header rows" - { 30 | val simpleTableMerger: SimpleTableMerger = SimpleTableMerger.createIgnoringHeaderRows 31 | 32 | "when it merges the two tables" - { 33 | val merged: Table = simpleTableMerger.mergeTables(tables).get 34 | 35 | shouldReturnTableWith(table = merged, row = 1, column = 3, expected = "5") 36 | shouldReturnTableWith(table = merged, row = 6, column = 1, expected = "2") 37 | shouldReturnTableWith(table = merged, row = 7, column = 4, expected = "1") 38 | shouldReturnTableWith(table = merged, row = 9, column = 2, expected = "8") 39 | shouldReturnTableWith(table = merged, row = 20, column = 3, expected = "1") 40 | } 41 | } 42 | } 43 | 44 | private def shouldReturnTableWith(table: Table, row: Int, column: Int, expected: String):Unit = { 45 | s"should return a table with the value $expected at row $row and column $column" in { 46 | assert(table.getCell(row, column).text == expected) 47 | } 48 | } 49 | } 50 | -------------------------------------------------------------------------------- /src/test/scala/tools/ambitious/pdfextractiontoolkit/library/model/DocumentSpec.scala: -------------------------------------------------------------------------------- 1 | package tools.ambitious.pdfextractiontoolkit.library.model 2 | 3 | import org.apache.pdfbox.pdmodel.PDDocument 4 | import org.scalatest.FreeSpec 5 | 6 | class DocumentSpec extends FreeSpec { 7 | val samplePDFPath = getClass.getResource("/simplePDFs/SimpleTest1Table.pdf") 8 | val twoPagedDocumentPath = getClass.getResource("/simplePDFs/TwoPagedBlankDocument.pdf") 9 | 10 | "A Document should instantiate from PDF" in { 11 | val document = 
Document.fromPDFPath(samplePDFPath) 12 | } 13 | 14 | "A Document instantiated with SimpleTest1Table.pdf" - { 15 | val document = Document.fromPDFPath(samplePDFPath) 16 | 17 | "should have one page" in { 18 | assert(document.numberOfPages == 1) 19 | } 20 | } 21 | 22 | "A Document instantiated with TwoPagedBlankDocument.pdf" - { 23 | val document = Document.fromPDFPath(twoPagedDocumentPath) 24 | 25 | "should have two pages" in { 26 | assert(document.numberOfPages == 2) 27 | } 28 | 29 | "should be able to get the first page" in { 30 | val page: Page = document.getPage(1) 31 | } 32 | 33 | "should be able to get the second page" in { 34 | val page: Page = document.getPage(2) 35 | } 36 | 37 | "should throw an IllegalArgumentException when trying to get the third page" in { 38 | val interceptException = intercept[IllegalArgumentException] { 39 | val page: Page = document.getPage(3) 40 | } 41 | 42 | assert(interceptException.getMessage === "Invalid page number.") 43 | } 44 | 45 | "should say that its first page is page 1" in { 46 | val page1: Page = document.getPage(1) 47 | val pageNumOfPage1: Option[Int] = document.pageNumberOf(page1) 48 | assert(pageNumOfPage1 == Option.apply(1)) 49 | } 50 | 51 | "should say that its second page is page 2" in { 52 | val page2: Page = document.getPage(2) 53 | val pageNumOfPage2: Option[Int] = document.pageNumberOf(page2) 54 | assert(pageNumOfPage2 == Option.apply(2)) 55 | } 56 | } 57 | 58 | "A List of PDDocuments split from TwoPagedBlankDocument.pdf" - { 59 | val document: PDDocument = PDDocument.load(twoPagedDocumentPath) 60 | val pages: List[PDDocument] = Document.splitPDDocumentIntoPDDocumentForEachPage(document) 61 | 62 | "should have only one page in each PDDocument" in { 63 | for (pDDocument: PDDocument <- pages) 64 | assert(pDDocument.getDocumentCatalog.getAllPages.size == 1) 65 | } 66 | } 67 | } 68 | -------------------------------------------------------------------------------- 
/src/main/scala/tools/ambitious/pdfextractiontoolkit/library/extraction/DocumentWalker.scala: -------------------------------------------------------------------------------- 1 | package tools.ambitious.pdfextractiontoolkit.library.extraction 2 | 3 | import tools.ambitious.pdfextractiontoolkit.library.extraction.extractionconstraints.ExtractionConstraint 4 | import tools.ambitious.pdfextractiontoolkit.library.model.{Document, Page, Table} 5 | 6 | import scala.concurrent.ExecutionContext.Implicits.global 7 | import scala.concurrent.{Future, Promise} 8 | 9 | class DocumentWalker protected (val document:Document, val extractionConstraints: Set[ExtractionConstraint]) { 10 | private val stateBundles: Map[ExtractionConstraint, StateBundle] = extractionConstraints.map(_ -> StateBundle.create).toMap 11 | private val promise: Promise[Map[ExtractionConstraint, Table]] = Promise() 12 | // Completes `promise` with the walk's outcome, success or failure, so getTables can never hang on an exception. 13 | private def run() = 14 | promise.completeWith(Future { 15 | traverseDocument() 16 | getCompletedTables 17 | }) 18 | 19 | private def traverseDocument() = { 20 | callOnStartOnExtractionConstraints() 21 | 22 | document.pages.foreach(page => callOnPageOnExtractionConstraints(page)) 23 | 24 | callOnEndOnExtractionConstraints() 25 | } 26 | 27 | private def getCompletedTables: Map[ExtractionConstraint, Table] = 28 | extractionConstraints 29 | .map(extractionConstraint => extractionConstraint -> extractionConstraint.tableFromState(stateBundles(extractionConstraint))) 30 | .filter((tuple: (ExtractionConstraint, Option[Table])) => tuple._2.isDefined) 31 | .map((tuple: (ExtractionConstraint, Option[Table])) => tuple._1 -> tuple._2.get) 32 | .toMap 33 | 34 | def getTables: Future[Map[ExtractionConstraint, Table]] = 35 | promise.future 36 | 37 | private def callOnStartOnExtractionConstraints() = 38 | extractionConstraints.foreach(extractionConstraint => extractionConstraint.onStart(stateBundles(extractionConstraint))) 39 | 40 | private def callOnPageOnExtractionConstraints(page: Page) = 41 | 
extractionConstraints.foreach(extractionConstraint => extractionConstraint.onPage(page, document, stateBundles(extractionConstraint))) 42 | 43 | private def callOnEndOnExtractionConstraints() = 44 | extractionConstraints.foreach(extractionConstraint => extractionConstraint.onEnd(stateBundles(extractionConstraint))) 45 | } 46 | 47 | object DocumentWalker { 48 | def toWalkWithExtractionConstraint(document: Document, extractionConstraints: Set[ExtractionConstraint]): DocumentWalker = { 49 | val walker = new DocumentWalker(document, extractionConstraints) 50 | walker.run() 51 | walker 52 | } 53 | 54 | def toWalkWithExtractionConstraint(document: Document, extractionConstraints: Seq[ExtractionConstraint]): DocumentWalker = 55 | toWalkWithExtractionConstraint(document, extractionConstraints.toSet) 56 | 57 | def toWalkWithExtractionConstraint(document: Document, extractionConstraint: ExtractionConstraint): DocumentWalker = 58 | toWalkWithExtractionConstraint(document, Seq(extractionConstraint)) 59 | } -------------------------------------------------------------------------------- /src/test/scala/tools/ambitious/pdfextractiontoolkit/webapp/services/documentstorage/DocumentFileStoreImplSpec.scala: -------------------------------------------------------------------------------- 1 | package tools.ambitious.pdfextractiontoolkit.webapp.services.documentstorage 2 | 3 | import java.io.File 4 | import java.net.URL 5 | import java.nio.file.{Files, Path} 6 | 7 | import org.scalatest._ 8 | import spray.http.MediaType 9 | import _root_.tools.ambitious.pdfextractiontoolkit.Resources 10 | import _root_.tools.ambitious.pdfextractiontoolkit.utils.AmbitiousIoUtils._ 11 | import _root_.tools.ambitious.pdfextractiontoolkit.webapp.data.DAOTestUtils 12 | 13 | import scala.concurrent.Await 14 | import scala.concurrent.duration._ 15 | import scala.io.Source 16 | 17 | class DocumentFileStoreImplSpec extends FlatSpec with GivenWhenThen with BeforeAndAfter with OneInstancePerTest { 18 | private
val workingDirectory: Path = DAOTestUtils.createAndGetWorkingDirectory() 19 | private val documentFileStore: DocumentFileStore = new DocumentFileStoreImpl(workingDirectory) 20 | 21 | private val mediaType: MediaType = MediaType.custom("text/plain") 22 | private val documentDescription: DocumentDescription = DocumentDescription.withTitleAndMediaType("Test", mediaType) 23 | private val source: URL = Resources.quickBrownFoxTxt 24 | 25 | private val documentID: DocumentIdentifier = DocumentIdentifier.computeFor(documentDescription, source.toBytes) 26 | 27 | private val expectedSourceContents: String = "The quick brown fox jumped over the lazy dogs" 28 | private val expectedOutputFile: File = workingDirectory.resolve(documentID.hash.toString).toFile 29 | 30 | def storeTestFileInFileStore(): Unit = { 31 | Await.result(documentFileStore.storeFileFor(documentID, source), 30.seconds) 32 | } 33 | 34 | after { 35 | DAOTestUtils.cleanWorkingDirectory() 36 | } 37 | 38 | "A file store" should "store files with the expected name" in { 39 | When("a file is stored in the file store") 40 | storeTestFileInFileStore() 41 | 42 | Then("the file should exist on disk") 43 | assert(expectedOutputFile.isFile) 44 | } 45 | 46 | it should "store files with the expected content" in { 47 | When("a file is stored in the file store") 48 | storeTestFileInFileStore() 49 | 50 | Then("the file stored on disk should contain the contents of the source file") 51 | assert(Source.fromFile(expectedOutputFile).mkString == expectedSourceContents) 52 | } 53 | 54 | it should "delete stored files" in { 55 | Given("a file is in the file store") 56 | storeTestFileInFileStore() 57 | 58 | When("the file is deleted") 59 | Await.result(documentFileStore.deleteFileFor(documentID), 30.seconds) 60 | 61 | Then("the file should not exist on disk") 62 | assert(!expectedOutputFile.exists()) 63 | } 64 | 65 | it should "return quietly if asked to delete a missing file" in { 66 | Given("no file has been stored in the file 
store") 67 | 68 | When("the file is deleted") 69 | Await.result(documentFileStore.deleteFileFor(documentID), 30.seconds) 70 | 71 | Then("the file store should ignore the missing file") 72 | } 73 | } 74 | -------------------------------------------------------------------------------- /src/test/scala/tools/ambitious/pdfextractiontoolkit/library/extraction/ExtractorSpec.scala: -------------------------------------------------------------------------------- 1 | package tools.ambitious.pdfextractiontoolkit.library.extraction 2 | 3 | import org.scalatest.FreeSpec 4 | import tools.ambitious.pdfextractiontoolkit.Resources 5 | import tools.ambitious.pdfextractiontoolkit.library.extraction.extractionconstraints.{FirstOccurrenceOfStringExtractionConstraint, PageNumberExtractionConstraint} 6 | import tools.ambitious.pdfextractiontoolkit.library.extraction.tableextractors.RegionBasedTableExtractor 7 | import tools.ambitious.pdfextractiontoolkit.library.model.geometry.{PositivePoint, Rectangle, Size} 8 | import tools.ambitious.pdfextractiontoolkit.library.model.{Document, Table} 9 | import tools.ambitious.pdfextractiontoolkit.library.util.CSVUtil 10 | 11 | import scala.concurrent.Await 12 | import scala.concurrent.duration._ 13 | 14 | class ExtractorSpec extends FreeSpec { 15 | s"An ${Extractor.getClass.getSimpleName} with a document and a ${PageNumberExtractionConstraint.getClass.getSimpleName}" - { 16 | val document: Document = Document.fromPDFPath(Resources.simpleTest1TableURL) 17 | 18 | val region: Rectangle = Rectangle.fromCornerCoords(108, 81, 312, 305) 19 | val tableExtractor = RegionBasedTableExtractor.forRegion(region) 20 | val extractionConstraint = PageNumberExtractionConstraint.withPageNumberAndTableExtractor(1, tableExtractor) 21 | 22 | val extractor: Extractor = Extractor.fromDocumentAndConstraints(document, extractionConstraint) 23 | 24 | "should be able to extract the table and have it match the values from it's corresponding CSV file" in { 25 | val 
extractionResult: ExtractionResult = Await.result(extractor.extractTables, 60.seconds) 26 | 27 | document.close() 28 | 29 | val table: Table = extractionResult(document)(extractionConstraint) 30 | val tableFromCSV: Table = CSVUtil.tableFromURL(Resources.simpleTest1TableCSVURL) 31 | 32 | assert(table == tableFromCSV) 33 | } 34 | } 35 | 36 | s"An ${Extractor.getClass.getSimpleName} with one document and a ${FirstOccurrenceOfStringExtractionConstraint.getClass.getSimpleName}" - { 37 | val document: Document = Document.fromPDFPath(Resources.simpleTest2Tables2TitleURL) 38 | 39 | val region: Rectangle = Rectangle.fromCornerAndSize(PositivePoint.at(168.48, 273.95), Size.fromWidthAndHeight(213.54, 303.5)) 40 | val tableExtractor = RegionBasedTableExtractor.forRegion(region) 41 | 42 | val textRegion: Rectangle = Rectangle.fromCornerAndSize(PositivePoint.at(185.38, 165.62), Size.fromWidthAndHeight(112.64, 16.16)) 43 | val extractionConstraint = FirstOccurrenceOfStringExtractionConstraint.withTextAndTableExtractor("An example Title", textRegion, tableExtractor) 44 | 45 | val extractor: Extractor = Extractor.fromDocumentAndConstraints(document, extractionConstraint) 46 | 47 | "should be able to extract the table and have it match the values from it's corresponding CSV file" in { 48 | val extractionResult: ExtractionResult = Await.result(extractor.extractTables, 60.seconds) 49 | 50 | document.close() 51 | 52 | val table: Table = extractionResult(document)(extractionConstraint) 53 | val tableFromCSV: Table = CSVUtil.tableFromURL(Resources.simpleTest2Tables2TitlePage2CSVURL) 54 | 55 | assert(table == tableFromCSV) 56 | } 57 | } 58 | } 59 | -------------------------------------------------------------------------------- /src/test/scala/tools/ambitious/pdfextractiontoolkit/library/extraction/extractionconstraints/PageNumberExtractionConstraintSpec.scala: -------------------------------------------------------------------------------- 1 | package 
tools.ambitious.pdfextractiontoolkit.library.extraction.extractionconstraints 2 | 3 | import org.scalatest.FreeSpec 4 | import tools.ambitious.pdfextractiontoolkit.Resources 5 | import tools.ambitious.pdfextractiontoolkit.library.extraction._ 6 | import tools.ambitious.pdfextractiontoolkit.library.extraction.tableextractors.RegionBasedTableExtractor 7 | import tools.ambitious.pdfextractiontoolkit.library.extraction.tablemergers.SimpleTableMerger 8 | import tools.ambitious.pdfextractiontoolkit.library.model.geometry.{PositivePoint, Rectangle, Size} 9 | import tools.ambitious.pdfextractiontoolkit.library.model.{Document, Table} 10 | import tools.ambitious.pdfextractiontoolkit.library.util.CSVUtil 11 | 12 | import scala.concurrent.Await 13 | import scala.concurrent.duration._ 14 | 15 | class PageNumberExtractionConstraintSpec extends FreeSpec { 16 | val region: Rectangle = Rectangle.fromCornerAndSize(PositivePoint.at(168.48, 240), Size.fromWidthAndHeight(213.54, 340)) 17 | val tableExtractor = RegionBasedTableExtractor.forRegion(region) 18 | 19 | s"A ${PageNumberExtractionConstraint.getClass.getSimpleName}" - { 20 | "for page 2" - { 21 | val extractionConstraint: PageNumberExtractionConstraint = PageNumberExtractionConstraint.withPageNumberAndTableExtractor(2, tableExtractor) 22 | 23 | "when put through a walker with test document 2" - { 24 | val document: Document = Document.fromPDFPath(Resources.simpleTest2Tables2TitleURL) 25 | val walker: DocumentWalker = DocumentWalker.toWalkWithExtractionConstraint(document, extractionConstraint) 26 | val tables: Map[ExtractionConstraint, Table] = Await.result(walker.getTables, 60.seconds) 27 | 28 | "should return the table at page 2" in { 29 | val table: Option[Table] = tables.get(extractionConstraint) 30 | val tableFromCSV: Table = CSVUtil.tableFromURL(Resources.simpleTest2Tables2TitlePage2CSVURL) 31 | 32 | assert(table.get == tableFromCSV) 33 | } 34 | } 35 | } 36 | 37 | "for a page range from 1 to 2" - { 38 | val 
extractionConstraint: PageNumberExtractionConstraint = 39 | PageNumberExtractionConstraint.withPageRangeAndTableExtractor(Range.inclusive(1, 2), tableExtractor) 40 | 41 | "when put through a walker with test document 2" - { 42 | val document: Document = Document.fromPDFPath(Resources.simpleTest2Tables2TitleURL) 43 | val walker: DocumentWalker = DocumentWalker.toWalkWithExtractionConstraint(document, extractionConstraint) 44 | val tables: Map[ExtractionConstraint, Table] = Await.result(walker.getTables, 60.seconds) 45 | 46 | "should return the two tables merged" in { 47 | val table: Option[Table] = tables.get(extractionConstraint) 48 | 49 | val tableMerger: SimpleTableMerger = SimpleTableMerger.create 50 | 51 | val table1: Table = CSVUtil.tableFromURL(Resources.simpleTest2Tables2TitlePage1CSVURL) 52 | val table2: Table = CSVUtil.tableFromURL(Resources.simpleTest2Tables2TitlePage2CSVURL) 53 | val tablesToMerge: List[Table] = List(table1, table2) 54 | 55 | val tableFromCSV: Table = tableMerger.mergeTables(tablesToMerge).get 56 | 57 | assert(table.get == tableFromCSV) 58 | } 59 | } 60 | } 61 | // Registered with `in` so the check runs and is reported as a test instead of executing during suite construction. 62 | s"instantiated for a page number less than 1 should throw an IllegalArgumentException" in { 63 | val instantiatePageNumberTableExtractor = intercept[IllegalArgumentException] { 64 | 65 | val dummyExtractor = RegionBasedTableExtractor.forRegion(Rectangle.fromCornerCoords(0, 0, 0, 0)) 66 | 67 | PageNumberExtractionConstraint.withPageNumberAndTableExtractor(0, dummyExtractor) 68 | } 69 | assert(instantiatePageNumberTableExtractor.getMessage === "Page numbers can only be positive numbers.") 70 | } 71 | } 72 | } 73 | -------------------------------------------------------------------------------- /src/test/scala/tools/ambitious/pdfextractiontoolkit/library/expensereports/SummaryOfParliamentaryExpenditureByPeriodExtractionSpec.scala: -------------------------------------------------------------------------------- 1 | package
tools.ambitious.pdfextractiontoolkit.library.expensereports

import org.scalatest.{BeforeAndAfterAll, FlatSpec}
import tools.ambitious.pdfextractiontoolkit.library.extraction.extractionconstraints.FirstOccurrenceOfStringExtractionConstraint
import tools.ambitious.pdfextractiontoolkit.library.extraction.tableextractors.RegionBasedTableExtractor
import tools.ambitious.pdfextractiontoolkit.library.extraction.{ExtractionResult, Extractor}
import tools.ambitious.pdfextractiontoolkit.library.model.geometry.{PositivePoint, Rectangle, Size}
import tools.ambitious.pdfextractiontoolkit.library.model.{Document, Table}

import scala.concurrent.Await
import scala.concurrent.duration._

/**
 * Extracts the "Summary of Parliamentary Expenditure by Period" table from two
 * sample expense reports and checks its presence, row count and a few cells.
 *
 * Both documents and the extraction result are built once, when the suite is
 * constructed, and shared by every test.
 */
class SummaryOfParliamentaryExpenditureByPeriodExtractionSpec extends FlatSpec with BeforeAndAfterAll {

  val abbottTonyReport = getClass.getResource("/expenseReports/P34_ABBOTT_Tony.pdf")
  val leighAndrewReport = getClass.getResource("/expenseReports/P34_LEIGH_Andrew.pdf")

  val textToFind = "Summary of Parliamentary Expenditure by Period"

  // Region from which the table itself is extracted.
  val tableRegion = Rectangle.fromCornerAndSize(PositivePoint.at(34, 138), Size.fromWidthAndHeight(540, 608))
  val tableExtractor = RegionBasedTableExtractor.forRegion(tableRegion)

  // Region in which `textToFind` is looked for.
  val textRegion = Rectangle.fromCornerAndSize(PositivePoint.at(165, 90), Size.fromWidthAndHeight(280, 25))

  val extractionConstraint = FirstOccurrenceOfStringExtractionConstraint.withTextAndTableExtractor(textToFind, textRegion, tableExtractor)

  val abbottTonyDocument = Document.fromPDFPath(abbottTonyReport)
  val leighAndrewDocument = Document.fromPDFPath(leighAndrewReport)

  val extractor = Extractor.fromDocumentsAndConstraints(List(abbottTonyDocument, leighAndrewDocument), extractionConstraint)
  val extractionResult: ExtractionResult = Await.result(extractor.extractTables, 60.seconds)

  "the Extractor" should "extract a single table from the Tony Abbott report" in {
    assert(extractionResult.getResults(abbottTonyDocument)(extractionConstraint).isDefined)
  }

  it should "extract a single table from the Andrew Leigh report" in {
    assert(extractionResult.getResults(leighAndrewDocument)(extractionConstraint).isDefined)
  }

  val expectedRowsTonyAbbott = 24
  it should s"extract a table from the Tony Abbott Report with $expectedRowsTonyAbbott rows" in {
    val table: Table = extractionResult(abbottTonyDocument)(extractionConstraint)

    assert(table.numberOfRows == expectedRowsTonyAbbott)
  }

  val expectedRowsAndrewLeigh = 26
  it should s"extract a table from the Andrew Leigh Report with $expectedRowsAndrewLeigh rows" in {
    val table: Table = extractionResult(leighAndrewDocument)(extractionConstraint)

    assert(table.numberOfRows == expectedRowsAndrewLeigh)
  }

  it should "extract the expected values from the Tony Abbott report" in {
    val table: Table = extractionResult(abbottTonyDocument)(extractionConstraint)

    assertValueAtCell(expectedText = "Expenses From", table = table, rowNumber = 1, columnNumber = 2)
    assertValueAtCell(expectedText = "$628,736.33", table = table, rowNumber = 23, columnNumber = 2)
  }

  it should "extract the expected values from the Andrew Leigh report" in {
    val table: Table = extractionResult(leighAndrewDocument)(extractionConstraint)

    assertValueAtCell(expectedText = "Expenses From", table = table, rowNumber = 1, columnNumber = 2)
    assertValueAtCell(expectedText = "$109,760.32", table = table, rowNumber = 26, columnNumber = 2)
  }

  /** Asserts that the cell at (`rowNumber`, `columnNumber`) of `table` holds exactly `expectedText`. */
  private def assertValueAtCell(expectedText: String, table: Table, rowNumber: Int, columnNumber: Int) = {
    assert(table.getCell(rowNumber, columnNumber).text == expectedText)
  }

  /**
   * Closes the shared documents once, after the last test.
   *
   * The documents are suite-level values, so closing them in an after-each
   * hook would close them after the first test and then close them again
   * after every later test.
   */
  override def afterAll(): Unit = {
    try {
      // Nested so the second document is still closed if the first close throws.
      try abbottTonyDocument.close()
      finally leighAndrewDocument.close()
    } finally {
      super.afterAll()
    }
  }
}
--------------------------------------------------------------------------------
/src/test/scala/tools/ambitious/pdfextractiontoolkit/library/model/RowSpec.scala:
--------------------------------------------------------------------------------
package tools.ambitious.pdfextractiontoolkit.library.model

import org.scalatest.FreeSpec

/** Construction, cell access, string rendering and equality of `Row`. */
class RowSpec extends FreeSpec {

  /**
   * Registers, in the enclosing scope, the checks that `row` holds the texts
   * "1", "2" and "3" in its first, second and third cell (cells are addressed
   * from 1 throughout this spec).
   */
  private def shouldHoldOneTwoThree(row: Row): Unit = {
    "should have first cell with text 1" in {
      assert(row.getCell(1).text == "1")
    }

    "should have second cell with text 2" in {
      assert(row.getCell(2).text == "2")
    }

    "should have third cell with text 3" in {
      assert(row.getCell(3).text == "3")
    }
  }

  "A row that is instantiated without any arguments" - {
    val row: Row = new Row

    "should be empty" in {
      assert(row.isEmpty)
    }

    "should have no cells" in {
      assert(row.numberOfCells == 0)
    }
  }

  "A row that is instantiated with a cell containing the text 'test'" - {
    val row: Row = new Row(List(new Cell("test")))

    "should not be empty" in {
      assert(!row.isEmpty)
    }

    "should have the first cell contain the text 'test'" in {
      assert(row.getCell(1).text == "test")
    }

    "should have one cell" in {
      assert(row.numberOfCells == 1)
    }
  }

  "A row containing the entries 1, 2 and 3" - {
    val row: Row = Row.fromStrings(List("1", "2", "3"))

    "should return '1,2,3' when converted to String" in {
      assert(row.toString == "1,2,3")
    }

    "should have three cells" in {
      assert(row.numberOfCells == 3)
    }
  }

  "A row instantiated from three Cells containing 1, 2 and 3 consecutively" - {
    shouldHoldOneTwoThree(Row.fromCells(List(new Cell("1"), new Cell("2"), new Cell("3"))))
  }

  "A row instantiated from three Strings containing 1, 2 and 3 consecutively" - {
    shouldHoldOneTwoThree(Row.fromStrings(List("1", "2", "3")))
  }

  "A row instantiated from a single cell" - {
    val row: Row = Row.fromCell(new Cell("1"))

    "should have first cell with text 1" in {
      assert(row.getCell(1).text == "1")
    }
  }

  "A row instantiated from a single string" - {
    val row: Row = Row.fromString("1")

    "should have first cell with text 1" in {
      assert(row.getCell(1).text == "1")
    }
  }

  "Two rows with the same cells" - {
    val cellA: Cell = new Cell("test")
    val cellB: Cell = new Cell("test")

    val rowA: Row = Row.fromCells(List(cellA, cellB))
    val rowB: Row = Row.fromCells(List(cellA, cellB))

    "should be equal" in {
      assert(rowA == rowB)
    }

    "should not have the same reference" in {
      assert(rowA ne rowB)
    }
  }

  "Two rows with different cells" - {
    val rowA: Row = Row.fromStrings(List("a", "b"))
    val rowB: Row = Row.fromStrings(List("c", "d"))

    "should not be equal" in {
      assert(rowA != rowB)
    }
  }

  "Two rows with the first row having more cells than the second row" - {
    val rowA: Row = Row.fromStrings(List("a", "b"))
    val rowB: Row = Row.fromStrings(List("a"))

    "should not be equal" in {
      assert(rowA != rowB)
    }
  }

  "Two rows with the first row having less cells than the second row" - {
    val rowA: Row = Row.fromStrings(List("a"))
    val rowB: Row = Row.fromStrings(List("a", "b"))

    "should not be equal" in {
      assert(rowA != rowB)
    }
  }

  "A row with three cells" - {
    val row: Row = Row.fromStrings(List("a", "b", "c"))

    "should have three cells" in {
      assert(row.numberOfCells == 3)
    }
  }
}
--------------------------------------------------------------------------------
/src/test/scala/tools/ambitious/pdfextractiontoolkit/webapp/data/DocumentInformationDaoSpec.scala:
--------------------------------------------------------------------------------
package tools.ambitious.pdfextractiontoolkit.webapp.data

import slick.driver.SQLiteDriver.api._
import slick.lifted.{Query, TableQuery}
import spray.http.{MediaType, MediaTypes}
import tools.ambitious.pdfextractiontoolkit.AmbitiousToolsSpec
import tools.ambitious.pdfextractiontoolkit.webapp.data.model.{Document, Documents}
import tools.ambitious.pdfextractiontoolkit.webapp.services.documentstorage.{DocumentDescription, DocumentIdentifier}

import scala.concurrent.Await
import scala.concurrent.duration._

/**
 * Checks that `DocumentInformationDao` persists a document identifier's hash,
 * title, description and media type, and that stored identifiers can be read
 * back.
 *
 * NOTE(review): `rootDao` is created once for the whole suite. Whether rows
 * stored by one test are cleared before the next depends on
 * `AmbitiousToolsSpec` / `DAOTestUtils`, which are not visible here. Confirm
 * that the "empty identifier table" case really runs against an empty table.
 */
class DocumentInformationDaoSpec extends AmbitiousToolsSpec {

  val rootDao: RootDAO = DAOTestUtils.constructCleanRootDAO
  val dao = DocumentInformationDao.forRootDao(rootDao)

  // The single identifier stored and read back by every test below.
  val documentID: DocumentIdentifier = {
    val testDocumentHash: String = "425A"
    val testDocumentTitle: String = "testDoc"
    val testDocumentDescription: String = "A test document"
    val testDocumentMediaType: MediaType = MediaTypes.`application/pdf`

    val description: DocumentDescription = DocumentDescription
      .withTitleAndDescriptionAndMediaType(testDocumentTitle, testDocumentDescription, testDocumentMediaType)

    DocumentIdentifier.withHashAndDescription(testDocumentHash, description)
  }

  "the document information dao" should "write one database row per stored document ID" in {
    storeDocumentUsingDao(documentID)

    val allDocuments = retrieveAllDocsForIDFromDatabase(documentID)

    assert(allDocuments.length == 1)
  }

  it should "store a document's hash" in {
    storeDocumentUsingDao(documentID)

    val allDocuments = retrieveAllDocsForIDFromDatabase(documentID)

    assert(allDocuments.head.hash == documentID.hash)
  }

  it should "store the document's title" in {
    storeDocumentUsingDao(documentID)

    val allDocuments = retrieveAllDocsForIDFromDatabase(documentID)

    assert(allDocuments.head.title == documentID.description.title)
  }

  it should "store the document's description" in {
    storeDocumentUsingDao(documentID)

    val allDocuments = retrieveAllDocsForIDFromDatabase(documentID)

    assert(documentID.description.description.contains(allDocuments.head.description))
  }

  it should "store the document's media type" in {
    storeDocumentUsingDao(documentID)

    val allDocuments = retrieveAllDocsForIDFromDatabase(documentID)

    assert(allDocuments.head.mediaType == documentID.description.mediaType.toString())
  }

  it should "retrieve no identifiers from an empty identifier table" in {
    val allDocumentIDs: Seq[DocumentIdentifier] = retrieveAllDocumentIDsFromDao()

    assert(allDocumentIDs.isEmpty)
  }

  it should "retrieve one identifier from an identifier table with one record" in {
    Given("one document has been stored in the database")
    storeDocumentUsingDao(documentID)

    When("the document IDs are retrieved from the dao")
    val allDocIDs: Seq[DocumentIdentifier] = retrieveAllDocumentIDsFromDao()

    Then("there should be one document ID")
    assert(allDocIDs.size == 1)
  }

  it should "retrieve the correct document ID" in {
    Given("one document has been stored in the database")
    storeDocumentUsingDao(documentID)

    When("the document IDs are retrieved from the dao")
    val allDocIDs: Seq[DocumentIdentifier] = retrieveAllDocumentIDsFromDao()

    Then("the retrieved document ID should be the same as the stored one")
    val retrievedDocID = allDocIDs.head

    assert(retrievedDocID == documentID)
  }

  /** Stores `documentID` through the dao, waiting up to 30 seconds for completion. */
  def storeDocumentUsingDao(documentID: DocumentIdentifier): Unit = {
    Await.result(dao.storeDocumentID(documentID), 30.seconds)
  }

  /**
   * Reads the rows of the documents table directly (bypassing the dao) and
   * returns those whose hash equals `documentID.hash`.
   *
   * The rows are filtered after loading so the result matches the method's
   * name and parameter; without the filter every stored row would be returned
   * regardless of the identifier passed in.
   */
  def retrieveAllDocsForIDFromDatabase(documentID: DocumentIdentifier): Seq[Document] = {
    val tableQuery: TableQuery[Documents] = new TableQuery(new Documents(_))

    val query: Query[Documents, Document, Seq] = tableQuery.map(row => row)
    val allRows: Seq[Document] = Await.result(rootDao.database.run(query.result), 30.seconds)

    allRows.filter(_.hash == documentID.hash)
  }

  /** Returns every identifier the dao reports, waiting up to 30 seconds. */
  def retrieveAllDocumentIDsFromDao(): Seq[DocumentIdentifier] =
    Await.result(dao.retrieveAllIDs(), 30.seconds)
}
--------------------------------------------------------------------------------
/src/test/scala/tools/ambitious/pdfextractiontoolkit/library/model/TableSpec.scala:
--------------------------------------------------------------------------------
package tools.ambitious.pdfextractiontoolkit.library.model

import org.scalatest.FreeSpec

/** Construction, dimensions, string rendering, equality and merging of `Table`. */
class TableSpec extends FreeSpec {

  /**
   * Registers, in the enclosing scope, the checks shared by every merge
   * variant: `mergedTable` must consist of the two rows of `tableA` followed
   * by the two rows of `tableB`, four rows in total.
   */
  private def shouldHoldRowsOfBothTablesInOrder(mergedTable: Table, tableA: Table, tableB: Table): Unit = {
    "should have first row equal to the first row in tableA" in {
      assert(mergedTable.getRow(1) == tableA.getRow(1))
    }

    "should have second row equal to the second row in tableA" in {
      assert(mergedTable.getRow(2) == tableA.getRow(2))
    }

    "should have third row equal to the first row in tableB" in {
      assert(mergedTable.getRow(3) == tableB.getRow(1))
    }

    "should have fourth row equal to the second row in tableB" in {
      assert(mergedTable.getRow(4) == tableB.getRow(2))
    }

    "should have four rows" in {
      assert(mergedTable.numberOfRows == 4)
    }
  }

  "A table with a single row containing a single cell with the text 'test'" - {
    val row: Row = Row.fromString("test")
    val table: Table = Table.fromRow(row)

    "should have that cell in its 1,1 position" in {
      assert(table.getCell(1,1).text == "test")
    }

    "should have one row" in {
      assert(table.numberOfRows == 1)
    }

    "should have one column" in {
      assert(table.numberOfColumns == 1)
    }

    "should have its first row equal to the row we set" in {
      assert(table.getRow(1) == row)
    }
  }

  "A table with 2 rows, each containing three distinct entries" - {
    val rowA: Row = Row.fromStrings(List("1", "2", "3"))
    val rowB: Row = Row.fromStrings(List("4", "5", "6"))
    val table: Table = Table.fromRows(List(rowA, rowB))

    "should return '1,2,3\n4,5,6' when converted to String" in {
      assert(table.toString == "1,2,3\n4,5,6")
    }

    "should have two rows" in {
      assert(table.numberOfRows == 2)
    }

    "should have three columns" in {
      assert(table.numberOfColumns == 3)
    }

    "should have its first row equal to the first row we set" in {
      assert(table.getRow(1) == rowA)
    }

    "should have its second row equal to the second row we set" in {
      assert(table.getRow(2) == rowB)
    }
  }

  "A table instantiated from three rows each containing one cell with text 1, 2 and 3 consecutively" - {
    val cellContents = List("1", "2", "3")
    val rows: List[Row] = cellContents.map(i => Row.fromString(i))

    val table: Table = Table.fromRows(rows)

    "should have string '1\n2\n3'" in {
      assert(table.toString == "1\n2\n3")
    }

    "should have three rows" in {
      assert(table.numberOfRows == 3)
    }

    "should have one column" in {
      assert(table.numberOfColumns == 1)
    }

    "should have its first row equal to the first row we set" in {
      assert(table.getRow(1) == rows.head)
    }

    "should have its second row equal to the second row we set" in {
      assert(table.getRow(2) == rows(1))
    }

    "should have its third row equal to the third row we set" in {
      assert(table.getRow(3) == rows(2))
    }
  }

  "A table instantiated from a single row containing a cell with the text 'test'" - {
    val table: Table = Table.fromRow(Row.fromString("test"))

    "should have string 'test'" in {
      assert(table.toString == "test")
    }
  }

  "Two tables with the same rows" - {
    val cellA: Cell = new Cell("test")
    val cellB: Cell = new Cell("test")

    val rowA: Row = Row.fromCells(List(cellA, cellB))
    val rowB: Row = Row.fromCells(List(cellA, cellB))

    val tableA: Table = Table.fromRows(List(rowA, rowB))
    val tableB: Table = Table.fromRows(List(rowA, rowB))

    "should be equal" in {
      assert(tableA == tableB)
    }

    "should not have the same reference" in {
      assert(tableA ne tableB)
    }
  }

  "Two tables with different rows" - {
    val tableA: Table = Table.fromRows(List(Row.fromString("a"), Row.fromString("b")))
    val tableB: Table = Table.fromRows(List(Row.fromString("c"), Row.fromString("d")))

    "should not be equal" in {
      assert(tableA != tableB)
    }
  }

  "Two tables with the first table having more rows than the second table" - {
    val tableA: Table = Table.fromRows(List(Row.fromString("a"), Row.fromString("b")))
    val tableB: Table = Table.fromRow(Row.fromString("c"))

    "should not be equal" in {
      assert(tableA != tableB)
    }
  }

  // This scope compares tables by row count; its label previously described rows and cells.
  "Two tables with the first table having fewer rows than the second table" - {
    val tableA: Table = Table.fromRow(Row.fromString("a"))
    val tableB: Table = Table.fromRows(List(Row.fromString("b"), Row.fromString("c")))

    "should not be equal" in {
      assert(tableA != tableB)
    }
  }

  "A list of two tables that are merged" - {
    val tableA: Table = Table.fromRows(List(Row.fromString("a"), Row.fromString("b")))
    val tableB: Table = Table.fromRows(List(Row.fromString("c"), Row.fromString("d")))

    shouldHoldRowsOfBothTablesInOrder(Table.merge(List(tableA, tableB)), tableA, tableB)
  }

  "One table that is merged into another" - {
    val tableA: Table = Table.fromRows(List(Row.fromString("a"), Row.fromString("b")))
    val tableB: Table = Table.fromRows(List(Row.fromString("c"), Row.fromString("d")))

    shouldHoldRowsOfBothTablesInOrder(tableA.mergedWith(tableB), tableA, tableB)
  }

  "Two tables that are merged" - {
    val tableA: Table = Table.fromRows(List(Row.fromString("a"), Row.fromString("b")))
    val tableB: Table = Table.fromRows(List(Row.fromString("c"), Row.fromString("d")))

    shouldHoldRowsOfBothTablesInOrder(Table.merge(tableA, tableB), tableA, tableB)
  }
}
--------------------------------------------------------------------------------