├── project
    ├── build.properties
    └── plugins.sbt
├── .travis.yml
├── src
    ├── test
    │   ├── resources
    │   │   ├── simpleCSVs
    │   │   │   ├── simple3x3Digits.csv
    │   │   │   ├── SimpleTest1Table.csv
    │   │   │   ├── SimpleTest2Tables1TitlePage1.csv
    │   │   │   └── SimpleTest2Tables1TitlePage2.csv
    │   │   ├── textFiles
    │   │   │   └── quickBrown.txt
    │   │   ├── expenseReports
    │   │   │   ├── P34_LUNDY_Kate.pdf
    │   │   │   ├── P34_ABBOTT_Tony.pdf
    │   │   │   ├── P34_LEIGH_Andrew.pdf
    │   │   │   ├── P34_SESELJA_Zed.pdf
    │   │   │   ├── P34_BRODTMANN_Gai.pdf
    │   │   │   ├── P34_HUMPHRIES_Gary.pdf
    │   │   │   └── LICENSE
    │   │   └── simplePDFs
    │   │   │   ├── SimpleTest1Table.pdf
    │   │   │   ├── SimpleTest2Tables1Title.pdf
    │   │   │   └── TwoPagedBlankDocument.pdf
    │   └── scala
    │   │   └── tools
    │   │       └── ambitious
    │   │           └── pdfextractiontoolkit
    │   │               ├── AmbitiousToolsSpec.scala
    │   │               ├── library
    │   │                   ├── util
    │   │                   │   ├── TabulaConverterSpec.scala
    │   │                   │   └── CSVUtilSpec.scala
    │   │                   ├── model
    │   │                   │   ├── geometry
    │   │                   │   │   ├── PointSpec.scala
    │   │                   │   │   ├── PositivePointSpec.scala
    │   │                   │   │   ├── SizeSpec.scala
    │   │                   │   │   └── RectangleSpec.scala
    │   │                   │   ├── CellSpec.scala
    │   │                   │   ├── PageSpec.scala
    │   │                   │   ├── DocumentSpec.scala
    │   │                   │   ├── RowSpec.scala
    │   │                   │   └── TableSpec.scala
    │   │                   ├── SimpleExtractSpec.scala
    │   │                   ├── extraction
    │   │                   │   ├── ExtractionUtilsSpec.scala
    │   │                   │   ├── extractionconstraints
    │   │                   │   │   ├── FirstOccurrenceOfStringExtractionConstraintSpec.scala
    │   │                   │   │   └── PageNumberExtractionConstraintSpec.scala
    │   │                   │   ├── tablemergers
    │   │                   │   │   └── SimpleTableMergerSpec.scala
    │   │                   │   └── ExtractorSpec.scala
    │   │                   └── expensereports
    │   │                   │   └── SummaryOfParliamentaryExpenditureByPeriodExtractionSpec.scala
    │   │               ├── Resources.scala
    │   │               ├── webapp
    │   │                   ├── services
    │   │                   │   └── documentstorage
    │   │                   │   │   ├── DocumentIdentifierSpec.scala
    │   │                   │   │   ├── DocumentDescriptionSpec.scala
    │   │                   │   │   ├── DocumentLibraryImplSpec.scala
    │   │                   │   │   └── DocumentFileStoreImplSpec.scala
    │   │                   └── data
    │   │                   │   ├── DAOTestUtils.scala
    │   │                   │   ├── RootDAOImplSpec.scala
    │   │                   │   └── DocumentInformationDaoSpec.scala
    │   │               └── utils
    │   │                   └── AmbitiousIoUtilsSpec.scala
    └── main
    │   ├── scala
    │       └── tools
    │       │   └── ambitious
    │       │       └── pdfextractiontoolkit
    │       │           ├── library
    │       │               ├── model
    │       │               │   ├── geometry
    │       │               │   │   ├── Point.scala
    │       │               │   │   ├── PositivePoint.scala
    │       │               │   │   ├── Size.scala
    │       │               │   │   └── Rectangle.scala
    │       │               │   ├── Cell.scala
    │       │               │   ├── Page.scala
    │       │               │   ├── Row.scala
    │       │               │   ├── Table.scala
    │       │               │   └── Document.scala
    │       │               ├── extraction
    │       │               │   ├── StateBundle.scala
    │       │               │   ├── tableextractors
    │       │               │   │   ├── TableExtractor.scala
    │       │               │   │   └── RegionBasedTableExtractor.scala
    │       │               │   ├── tablemergers
    │       │               │   │   ├── TableMerger.scala
    │       │               │   │   └── SimpleTableMerger.scala
    │       │               │   ├── extractionconstraints
    │       │               │   │   ├── ExtractionConstraint.scala
    │       │               │   │   ├── SimpleExtractionConstraint.scala
    │       │               │   │   ├── MergingSimpleExtractionConstraint.scala
    │       │               │   │   ├── PageNumberExtractionConstraint.scala
    │       │               │   │   └── FirstOccurrenceOfStringExtractionConstraint.scala
    │       │               │   ├── ExtractionResult.scala
    │       │               │   ├── ExtractionUtils.scala
    │       │               │   ├── Extractor.scala
    │       │               │   └── DocumentWalker.scala
    │       │               └── util
    │       │               │   ├── CSVUtil.scala
    │       │               │   └── TabulaConverter.scala
    │       │           ├── webapp
    │       │               ├── data
    │       │               │   ├── RootDAO.scala
    │       │               │   ├── DocumentInformationDao.scala
    │       │               │   ├── RootDAOImpl.scala
    │       │               │   ├── model
    │       │               │   │   └── Documents.scala
    │       │               │   └── DocumentInformationDaoImpl.scala
    │       │               └── services
    │       │               │   └── documentstorage
    │       │               │       ├── DocumentFileStore.scala
    │       │               │       ├── DocumentLibrary.scala
    │       │               │       ├── DocumentIdentifier.scala
    │       │               │       ├── DocumentFileStoreImpl.scala
    │       │               │       ├── DocumentDescription.scala
    │       │               │       └── DocumentLibraryImpl.scala
    │       │           └── utils
    │       │               └── AmbitiousIoUtils.scala
    │   └── resources
    │       ├── log4j.properties
    │       └── application.conf
├── TODO.md
├── .gitignore
└── README.md


/project/build.properties:
--------------------------------------------------------------------------------
1 | sbt.version = 0.13.8


--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | language: scala
2 | scala:
3 |    - 2.11.7


--------------------------------------------------------------------------------
/src/test/resources/simpleCSVs/simple3x3Digits.csv:
--------------------------------------------------------------------------------
1 | 1,2,3
2 | 4,5,6
3 | 7,8,9


--------------------------------------------------------------------------------
/src/test/resources/textFiles/quickBrown.txt:
--------------------------------------------------------------------------------
1 | The quick brown fox jumped over the lazy dogs


--------------------------------------------------------------------------------
/project/plugins.sbt:
--------------------------------------------------------------------------------
1 | logLevel := Level.Warn
2 | addSbtPlugin("io.spray" % "sbt-revolver" % "0.7.2")


--------------------------------------------------------------------------------
/TODO.md:
--------------------------------------------------------------------------------
1 | # Things To Do
2 | 1. Close PDF documents correctly
3 | 2. Improve the tools.ambitious.pdfextractiontoolkit.library.extraction.Extractor.extractTables method
4 | 


--------------------------------------------------------------------------------
/src/test/resources/expenseReports/P34_LUNDY_Kate.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/gavia/PDFExtractionToolkit/master/src/test/resources/expenseReports/P34_LUNDY_Kate.pdf


--------------------------------------------------------------------------------
/src/test/resources/simplePDFs/SimpleTest1Table.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/gavia/PDFExtractionToolkit/master/src/test/resources/simplePDFs/SimpleTest1Table.pdf


--------------------------------------------------------------------------------
/src/test/resources/expenseReports/P34_ABBOTT_Tony.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/gavia/PDFExtractionToolkit/master/src/test/resources/expenseReports/P34_ABBOTT_Tony.pdf


--------------------------------------------------------------------------------
/src/test/resources/expenseReports/P34_LEIGH_Andrew.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/gavia/PDFExtractionToolkit/master/src/test/resources/expenseReports/P34_LEIGH_Andrew.pdf


--------------------------------------------------------------------------------
/src/test/resources/expenseReports/P34_SESELJA_Zed.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/gavia/PDFExtractionToolkit/master/src/test/resources/expenseReports/P34_SESELJA_Zed.pdf


--------------------------------------------------------------------------------
/src/test/resources/expenseReports/P34_BRODTMANN_Gai.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/gavia/PDFExtractionToolkit/master/src/test/resources/expenseReports/P34_BRODTMANN_Gai.pdf


--------------------------------------------------------------------------------
/src/test/resources/expenseReports/P34_HUMPHRIES_Gary.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/gavia/PDFExtractionToolkit/master/src/test/resources/expenseReports/P34_HUMPHRIES_Gary.pdf


--------------------------------------------------------------------------------
/src/test/resources/simplePDFs/SimpleTest2Tables1Title.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/gavia/PDFExtractionToolkit/master/src/test/resources/simplePDFs/SimpleTest2Tables1Title.pdf


--------------------------------------------------------------------------------
/src/test/resources/simplePDFs/TwoPagedBlankDocument.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/gavia/PDFExtractionToolkit/master/src/test/resources/simplePDFs/TwoPagedBlankDocument.pdf


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | /target/
 2 | /project/target/
 3 | *.class
 4 | *.iml
 5 | /lib_managed/
 6 | /src_managed/
 7 | *.ipr
 8 | *.iws
 9 | .idea
10 | out
11 | .DS_Store
12 | /working/
13 | /test_working/


--------------------------------------------------------------------------------
/src/test/resources/simpleCSVs/SimpleTest1Table.csv:
--------------------------------------------------------------------------------
 1 | 10,9,8,2
 2 | 2,4,8,5
 3 | 6,5,6,9
 4 | 3,4,4,5
 5 | 10,3,5,7
 6 | 4,1,4,5
 7 | 10,9,2,8
 8 | 1,1,8,4
 9 | 7,4,9,1
10 | 8,3,4,4
11 | 9,7,5,8
12 | 8,4,2,9
13 | 3,8,7,9
14 | 5,6,3,5
15 | 3,6,5,6
16 | 7,4,1,4
17 | 3,9,6,9
18 | 3,5,8,1


--------------------------------------------------------------------------------
/src/test/resources/expenseReports/LICENSE:
--------------------------------------------------------------------------------
1 | The files in this directory are licensed under Creative Commons Attribution 2.5 Australia licence, by the Department of Finance and Deregulation. For more information, go here: http://www.finance.gov.au/publications/parliamentarians-reporting/


--------------------------------------------------------------------------------
/src/main/scala/tools/ambitious/pdfextractiontoolkit/library/model/geometry/Point.scala:
--------------------------------------------------------------------------------
1 | package tools.ambitious.pdfextractiontoolkit.library.model.geometry
2 | 
3 | /** A two-dimensional point described by its `x` and `y` coordinates. */
4 | case class Point protected (x: Double, y: Double)
5 | 
6 | /** Companion offering the named factory `at` for [[Point]]. */
7 | object Point {
8 |   def at(x: Double, y: Double): Point = new Point(x, y)
9 | }


--------------------------------------------------------------------------------
/src/main/scala/tools/ambitious/pdfextractiontoolkit/library/extraction/StateBundle.scala:
--------------------------------------------------------------------------------
1 | package tools.ambitious.pdfextractiontoolkit.library.extraction
2 | 
3 | /** Mutable holder for one optional, untyped piece of state; starts out as `None`. */
4 | class StateBundle {
5 |   var state: Option[Any] = None
6 | }
7 | 
8 | /** Factory for fresh, empty [[StateBundle]] instances. */
9 | object StateBundle { def create: StateBundle = new StateBundle() }


--------------------------------------------------------------------------------
/src/test/scala/tools/ambitious/pdfextractiontoolkit/AmbitiousToolsSpec.scala:
--------------------------------------------------------------------------------
1 | package tools.ambitious.pdfextractiontoolkit
2 | 
3 | import org.scalatest.{FlatSpec, GivenWhenThen, OneInstancePerTest}
4 | 
5 | /** `FlatSpec` with `OneInstancePerTest` and `GivenWhenThen` mixed in; declares no tests itself (presumably a shared base class — confirm). */
6 | class AmbitiousToolsSpec extends FlatSpec with OneInstancePerTest with GivenWhenThen {
7 | 
8 | }
9 | 


--------------------------------------------------------------------------------
/src/test/resources/simpleCSVs/SimpleTest2Tables1TitlePage1.csv:
--------------------------------------------------------------------------------
 1 | 1,6,4,1
 2 | 10,2,5,6
 3 | 1,8,2,10
 4 | 4,3,6,3
 5 | 7,3,6,5
 6 | 4,3,4,6
 7 | 2,3,3,6
 8 | 2,10,5,1
 9 | 10,9,6,8
10 | 8,8,9,1
11 | 1,1,1,5
12 | 7,6,7,8
13 | 6,8,2,8
14 | 8,8,8,4
15 | 7,1,4,1
16 | 9,2,2,3
17 | 7,5,5,3
18 | 1,1,10,7
19 | 1,3,8,5
20 | 1,4,5,9


--------------------------------------------------------------------------------
/src/test/resources/simpleCSVs/SimpleTest2Tables1TitlePage2.csv:
--------------------------------------------------------------------------------
 1 | 4,6,7,2
 2 | 3,3,1,8
 3 | 5,9,8,4
 4 | 5,6,10,10
 5 | 10,5,4,4
 6 | 10,5,7,7
 7 | 4,4,4,7
 8 | 4,7,5,3
 9 | 2,8,2,2
10 | 8,7,2,7
11 | 7,2,6,2
12 | 10,1,8,7
13 | 2,7,4,7
14 | 5,2,7,3
15 | 7,2,10,3
16 | 9,5,3,8
17 | 2,1,8,6
18 | 2,3,6,8
19 | 3,1,6,7
20 | 10,2,6,2


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # The PDF Extraction Toolkit
2 | 
3 | [![Build Status](https://travis-ci.org/AmbitiousTools/PDFExtractionToolkit.svg?branch=master)](https://travis-ci.org/AmbitiousTools/PDFExtractionToolkit)
4 | 
5 | A project to provide a set of user-friendly utilities for extracting tabulated data from large numbers of similar PDF 
6 | files.


--------------------------------------------------------------------------------
/src/main/scala/tools/ambitious/pdfextractiontoolkit/library/extraction/tableextractors/TableExtractor.scala:
--------------------------------------------------------------------------------
1 | package tools.ambitious.pdfextractiontoolkit.library.extraction.tableextractors
2 | 
3 | import tools.ambitious.pdfextractiontoolkit.library.model.{Page, Table}
4 | 
5 | /** Strategy that produces a [[Table]] from a single [[Page]] through `getTable`. */
6 | trait TableExtractor {
7 |   def getTable(page: Page): Table
8 | }
9 | 


--------------------------------------------------------------------------------
/src/main/scala/tools/ambitious/pdfextractiontoolkit/library/extraction/tablemergers/TableMerger.scala:
--------------------------------------------------------------------------------
1 | package tools.ambitious.pdfextractiontoolkit.library.extraction.tablemergers
2 | 
3 | import tools.ambitious.pdfextractiontoolkit.library.model.Table
4 | 
5 | /** Strategy that combines a list of tables into at most one table; `mergeTables` may yield `None`. */
6 | trait TableMerger {
7 |   def mergeTables(toMerge: List[Table]): Option[Table]
8 | }
9 | 


--------------------------------------------------------------------------------
/src/main/scala/tools/ambitious/pdfextractiontoolkit/library/model/Cell.scala:
--------------------------------------------------------------------------------
 1 | package tools.ambitious.pdfextractiontoolkit.library.model
 2 | 
 3 | /**
 4 |  * A single table cell holding its text content.
 5 |  *
 6 |  * Equality and hashing are both based solely on `text`, so cells that compare
 7 |  * equal also behave consistently inside hash-based collections.
 8 |  *
 9 |  * @param text the cell's content; defaults to the empty string
10 |  */
11 | class Cell(val text: String = "") {
12 | 
13 |   /** True when the cell's text is the empty string. */
14 |   def isEmpty: Boolean = text == ""
15 | 
16 |   /** Two cells are equal exactly when their texts are equal; anything that is not a `Cell` is unequal. */
17 |   override def equals(obj: Any): Boolean = obj match {
18 |     case other: Cell => text == other.text
19 |     case _ => false
20 |   }
21 | 
22 |   // Must accompany equals: equal cells need identical hash codes. `##` is the
23 |   // null-safe hash, mirroring the null-tolerant `==` used in equals.
24 |   override def hashCode: Int = text.##
25 | }
26 | 


--------------------------------------------------------------------------------
/src/test/scala/tools/ambitious/pdfextractiontoolkit/library/util/TabulaConverterSpec.scala:
--------------------------------------------------------------------------------
 1 | package tools.ambitious.pdfextractiontoolkit.library.util
 2 | 
 3 | import org.scalatest.FreeSpec
 4 | 
 5 | /**
 6 |  * Placeholder spec: there is currently no easy way to mock Tabula tables/rows, so nothing is tested here yet.
 7 |  */
 8 | class TabulaConverterSpec extends FreeSpec {
 9 |   "placeholder test" in {
10 |   }
11 | }
12 | 


--------------------------------------------------------------------------------
/src/main/scala/tools/ambitious/pdfextractiontoolkit/webapp/data/RootDAO.scala:
--------------------------------------------------------------------------------
 1 | package tools.ambitious.pdfextractiontoolkit.webapp.data
 2 | 
 3 | import slick.driver.SQLiteDriver.api.Database
 4 | 
 5 | import scala.concurrent.Future
 6 | 
 7 | /** Handle on the application's Slick (SQLite driver) database, visible within the `data` package only. */
 8 | private[data] trait RootDAO {
 9 |   /** The underlying Slick `Database`. */
10 |   def database: Database
11 | 
12 |   /** Asynchronous initialisation hook. NOTE(review): the work performed is defined by the implementation — confirm in RootDAOImpl. */
13 |   def initialiseIfNeeded(): Future[Unit]
14 | }
15 | 
16 | /** Factory for [[RootDAO]] instances. */
17 | private[data] object RootDAO {
18 |   /** Builds a `RootDAOImpl` for the given configuration name. */
19 |   def forConfigName(configName: String): RootDAO = new RootDAOImpl(configName)
20 | }


--------------------------------------------------------------------------------
/src/main/resources/log4j.properties:
--------------------------------------------------------------------------------
 1 | log4j.rootLogger=ERROR, A1
 2 | log4j.logger.org.apache.pdfbox=ERROR, A1
 3 | log4j.logger.org.apache.pdfbox.cos.COSDocument=ERROR, A1
 4 | log4j.logger.org.apache.pdfbox.pdmodel.graphics.xobject.PDPixelMap=OFF
 5 | 
 6 | # A1 is set to be a ConsoleAppender.
 7 | log4j.appender.A1=org.apache.log4j.ConsoleAppender
 8 | 
 9 | # A1 uses PatternLayout.
10 | log4j.appender.A1.layout=org.apache.log4j.PatternLayout
11 | log4j.appender.A1.layout.ConversionPattern=%-4r [%t] %-5p %c %x - %m%n


--------------------------------------------------------------------------------
/src/main/scala/tools/ambitious/pdfextractiontoolkit/webapp/services/documentstorage/DocumentFileStore.scala:
--------------------------------------------------------------------------------
 1 | package tools.ambitious.pdfextractiontoolkit.webapp.services.documentstorage
 2 | 
 3 | import java.net.URL
 4 | 
 5 | import scala.concurrent.Future
 6 | 
 7 | /**
 8 |  * Contract for storing, retrieving and deleting the file that belongs to a
 9 |  * [[DocumentIdentifier]]. Every operation returns a `Future`.
10 |  *
11 |  * NOTE(review): failure behaviour (e.g. for an unknown `docID`) is left to the
12 |  * implementation — confirm in DocumentFileStoreImpl.
13 |  */
14 | private trait DocumentFileStore {
15 | 
16 |   /** Stores the file for `docID`, taking its content from `documentSource`. */
17 |   def storeFileFor(docID: DocumentIdentifier, documentSource: URL): Future[Unit]
18 | 
19 |   /** Yields a URL for the file stored for `docID`. */
20 |   def retrieveFileFor(docID: DocumentIdentifier): Future[URL]
21 | 
22 |   /** Deletes the file stored for `docID`. */
23 |   def deleteFileFor(docID: DocumentIdentifier): Future[Unit]
24 | 
25 | }
26 | 


--------------------------------------------------------------------------------
/src/main/scala/tools/ambitious/pdfextractiontoolkit/library/model/geometry/PositivePoint.scala:
--------------------------------------------------------------------------------
 1 | package tools.ambitious.pdfextractiontoolkit.library.model.geometry
 2 | 
 3 | /**
 4 |  * A [[Point]] whose coordinates are validated at construction time.
 5 |  *
 6 |  * Throws `IllegalArgumentException` when `x` or `y` is below zero; `x` is checked first.
 7 |  */
 8 | class PositivePoint protected (override val x: Double, override val y: Double) extends Point(x, y) {
 9 |   if (x < 0) throw new IllegalArgumentException("x must not be negative")
10 |   if (y < 0) throw new IllegalArgumentException("y must not be negative")
11 | }
12 | 
13 | /** Companion offering the named factory `at` for [[PositivePoint]]. */
14 | object PositivePoint {
15 |   def at(x: Double, y: Double): PositivePoint = new PositivePoint(x, y)
16 | }
17 | 


--------------------------------------------------------------------------------
/src/test/scala/tools/ambitious/pdfextractiontoolkit/library/model/geometry/PointSpec.scala:
--------------------------------------------------------------------------------
 1 | package tools.ambitious.pdfextractiontoolkit.library.model.geometry
 2 | 
 3 | import org.scalatest.FreeSpec
 4 | 
 5 | /** Checks that `Point.at` stores the coordinates it is given. */
 6 | class PointSpec extends FreeSpec {
 7 | 
 8 |   "A Point instantiated with x equal to 100.0 and y equal to 50.0" - {
 9 |     val expectedX = 100.0
10 |     val expectedY = 50.0
11 |     val point = Point.at(expectedX, expectedY)
12 | 
13 |     "should have x equal to 100.0" in {
14 |       assert(point.x == expectedX)
15 |     }
16 | 
17 |     "should have y equal to 50.0" in {
18 |       assert(point.y == expectedY)
19 |     }
20 |   }
21 | }
22 | 


--------------------------------------------------------------------------------
/src/main/scala/tools/ambitious/pdfextractiontoolkit/library/model/geometry/Size.scala:
--------------------------------------------------------------------------------
 1 | package tools.ambitious.pdfextractiontoolkit.library.model.geometry
 2 | 
 3 | /**
 4 |  * Non-negative dimensions together with a precomputed `area`.
 5 |  *
 6 |  * Construction throws `IllegalArgumentException` for a negative `width` or
 7 |  * `height`; `width` is checked first.
 8 |  */
 9 | case class Size protected (width: Double, height: Double) {
10 |   if (width < 0) throw new IllegalArgumentException("Width must not be negative")
11 |   if (height < 0) throw new IllegalArgumentException("Height must not be negative")
12 | 
13 |   val area: Double = math.abs(width * height)
14 | }
15 | 
16 | /** Companion offering the named factory `fromWidthAndHeight` for [[Size]]. */
17 | object Size {
18 |   def fromWidthAndHeight(width: Double, height: Double): Size = new Size(width, height)
19 | }
20 | 


--------------------------------------------------------------------------------
/src/main/resources/application.conf:
--------------------------------------------------------------------------------
 1 | akka {
 2 |   loglevel = INFO
 3 | }
 4 | 
 5 | spray.can.server {
 6 |   request-timeout = 1s
 7 | }
 8 | 
 9 | prodWorkingDir = "working"
10 | 
11 | testWorkingDir = "test_working"
12 | 
13 | prodDB = {
14 |   path = ${prodWorkingDir}"/PDFExtractionToolkit.db"
15 |   url = "jdbc:sqlite:"${prodDB.path}
16 |   driver = org.sqlite.JDBC
17 |   connectionPool = disabled
18 |   keepAliveConnection = true
19 | }
20 | 
21 | testDB = {
22 |   path = ${testWorkingDir}"/TestPDFExtractionToolkit.db"
23 |   url = "jdbc:sqlite:"${testDB.path}
24 |   driver = org.sqlite.JDBC
25 |   connectionPool = disabled
26 |   keepAliveConnection = true
27 | }


--------------------------------------------------------------------------------
/src/main/scala/tools/ambitious/pdfextractiontoolkit/library/util/CSVUtil.scala:
--------------------------------------------------------------------------------
 1 | package tools.ambitious.pdfextractiontoolkit.library.util
 2 | 
 3 | import java.io.File
 4 | import java.net.URL
 5 | 
 6 | import com.github.tototoshi.csv._
 7 | import tools.ambitious.pdfextractiontoolkit.library.model.{Row, Table}
 8 | 
 9 | /** Helpers for loading CSV data into [[Table]] instances. */
10 | object CSVUtil {
11 | 
12 |   /**
13 |    * Reads every line of the given CSV file into a [[Table]], one [[Row]] per line.
14 |    *
15 |    * The underlying reader is always closed, even when reading or conversion fails.
16 |    */
17 |   def tableFromFile(file: File): Table = {
18 |     val reader: CSVReader = CSVReader.open(file)
19 |     try {
20 |       // all() materialises the whole file, so nothing is read after close().
21 |       val lines: List[List[String]] = reader.all()
22 |       Table.fromRows(lines.map(line => Row.fromStrings(line)))
23 |     } finally {
24 |       reader.close()
25 |     }
26 |   }
27 | 
28 |   /** Loads a table from a URL that `java.io.File` can open through its URI (e.g. a `file:` URL). */
29 |   def tableFromURL(url: URL): Table =
30 |     tableFromFile(new File(url.toURI))
31 | }
32 | 


--------------------------------------------------------------------------------
/src/main/scala/tools/ambitious/pdfextractiontoolkit/library/extraction/extractionconstraints/ExtractionConstraint.scala:
--------------------------------------------------------------------------------
 1 | package tools.ambitious.pdfextractiontoolkit.library.extraction.extractionconstraints
 2 | 
 3 | import tools.ambitious.pdfextractiontoolkit.library.extraction.StateBundle
 4 | import tools.ambitious.pdfextractiontoolkit.library.model.{Document, Page, Table}
 5 | 
 6 | /**
 7 |  * Callbacks around an extraction pass plus access to the table it produced.
 8 |  *
 9 |  * `onStart` and `onEnd` default to no-ops; `onPage` and `tableFromState` must be
10 |  * supplied by implementations. NOTE(review): presumably driven by the document
11 |  * walker once at the start, once per page and once at the end — confirm.
12 |  */
13 | trait ExtractionConstraint {
14 | 
15 |   // Result types are spelled out: procedure syntax is deprecated in later Scala versions.
16 |   def onStart(stateBundle: StateBundle): Unit = {}
17 |   def onPage(page: Page, document: Document, stateBundle: StateBundle): Unit
18 |   def onEnd(stateBundle: StateBundle): Unit = {}
19 | 
20 |   /** Yields the table derived from `stateBundle`, or `None` when there is none. */
21 |   def tableFromState(stateBundle: StateBundle): Option[Table]
22 | 
23 | }
24 | 


--------------------------------------------------------------------------------
/src/main/scala/tools/ambitious/pdfextractiontoolkit/webapp/services/documentstorage/DocumentLibrary.scala:
--------------------------------------------------------------------------------
 1 | package tools.ambitious.pdfextractiontoolkit.webapp.services.documentstorage
 2 | 
 3 | import java.net.URL
 4 | 
 5 | import scala.concurrent.Future
 6 | 
 7 | /**
 8 |  * A globally available interface for manipulating the application's stored set of Documents.
 9 |  */
10 | trait DocumentLibrary {
11 | 
12 |   def store(description: DocumentDescription, documentSource: URL): Future[DocumentIdentifier]
13 | 
14 |   def retrieve(docID: DocumentIdentifier): Future[URL]
15 | 
16 |   def delete(docID: DocumentIdentifier): Future[Unit]
17 | 
18 |   def list(): Future[Seq[DocumentIdentifier]]
19 | 
20 | }
21 | 


--------------------------------------------------------------------------------
/src/main/scala/tools/ambitious/pdfextractiontoolkit/webapp/data/DocumentInformationDao.scala:
--------------------------------------------------------------------------------
 1 | package tools.ambitious.pdfextractiontoolkit.webapp.data
 2 | 
 3 | import tools.ambitious.pdfextractiontoolkit.webapp.services.documentstorage.DocumentIdentifier
 4 | 
 5 | import scala.concurrent.Future
 6 | 
 7 | /**
 8 |  * Asynchronous persistence operations for [[DocumentIdentifier]]s, visible within
 9 |  * the `webapp` package only.
10 |  *
11 |  * NOTE(review): failure behaviour (duplicates, unknown identifiers) is left to the
12 |  * implementation — confirm in DocumentInformationDaoImpl.
13 |  */
14 | private[webapp] trait DocumentInformationDao {
15 |   /** Stores the given identifier. */
16 |   def storeDocumentID(docID: DocumentIdentifier): Future[Unit]
17 | 
18 |   /** Deletes the given identifier. */
19 |   def deleteDocumentID(docID: DocumentIdentifier): Future[Unit]
20 | 
21 |   /** Yields all stored identifiers. */
22 |   def retrieveAllIDs(): Future[Seq[DocumentIdentifier]]
23 | }
24 | 
25 | /** Factory for [[DocumentInformationDao]] instances. */
26 | private[webapp] object DocumentInformationDao {
27 |   /** Builds a `DocumentInformationDaoImpl` on top of the given root DAO. */
28 |   def forRootDao(rootDao: RootDAO): DocumentInformationDao = new DocumentInformationDaoImpl(rootDao)
29 | }


--------------------------------------------------------------------------------
/src/test/scala/tools/ambitious/pdfextractiontoolkit/Resources.scala:
--------------------------------------------------------------------------------
 1 | package tools.ambitious.pdfextractiontoolkit
 2 | 
 3 | import java.net.URL
 4 | 
 5 | /** Classpath URLs of the fixture files used by the test suites. */
 6 | object Resources {
 7 |   // Looks a resource up through this object's class; the result is null when it is missing.
 8 |   private def locate(path: String): URL = getClass.getResource(path)
 9 | 
10 |   val quickBrownFoxTxt: URL = locate("/textFiles/quickBrown.txt")
11 | 
12 |   val simpleTest1TableURL: URL = locate("/simplePDFs/SimpleTest1Table.pdf")
13 |   val simpleTest1TableCSVURL: URL = locate("/simpleCSVs/SimpleTest1Table.csv")
14 | 
15 |   // NOTE(review): these names say "2Title" while the files they point at are named "...1Title...".
16 |   val simpleTest2Tables2TitleURL: URL = locate("/simplePDFs/SimpleTest2Tables1Title.pdf")
17 |   val simpleTest2Tables2TitlePage1CSVURL: URL = locate("/simpleCSVs/SimpleTest2Tables1TitlePage1.csv")
18 |   val simpleTest2Tables2TitlePage2CSVURL: URL = locate("/simpleCSVs/SimpleTest2Tables1TitlePage2.csv")
19 | }
20 | 


--------------------------------------------------------------------------------
/src/main/scala/tools/ambitious/pdfextractiontoolkit/library/extraction/extractionconstraints/SimpleExtractionConstraint.scala:
--------------------------------------------------------------------------------
 1 | package tools.ambitious.pdfextractiontoolkit.library.extraction.extractionconstraints
 2 | 
 3 | import tools.ambitious.pdfextractiontoolkit.library.extraction.StateBundle
 4 | import tools.ambitious.pdfextractiontoolkit.library.extraction.tableextractors.TableExtractor
 5 | import tools.ambitious.pdfextractiontoolkit.library.model.{Document, Page, Table}
 6 | 
 7 | /**
 8 |  * Base for constraints that read a table from a page through a single [[TableExtractor]].
 9 |  *
10 |  * Implementations supply the extractor and decide, page by page, whether to extract.
11 |  */
12 | trait SimpleExtractionConstraint extends ExtractionConstraint {
13 |   protected val tableExtractor: TableExtractor
14 | 
15 |   // Optional table slot available to implementations; initially `None`.
16 |   protected var table: Option[Table] = None
17 | 
18 |   def shouldExtractOnPage(page: Page, document: Document, stateBundle: StateBundle): Boolean
19 | 
20 |   /** Delegates to `tableExtractor` to read the table on `page`. */
21 |   protected def performExtraction(page: Page): Table = {
22 |     tableExtractor.getTable(page)
23 |   }
24 | }
25 | 


--------------------------------------------------------------------------------
/src/test/scala/tools/ambitious/pdfextractiontoolkit/webapp/services/documentstorage/DocumentIdentifierSpec.scala:
--------------------------------------------------------------------------------
 1 | package tools.ambitious.pdfextractiontoolkit.webapp.services.documentstorage
 2 | 
 3 | import org.scalatest.FreeSpec
 4 | 
 5 | /** Verifies that a [[DocumentIdentifier]] exposes the hash and description it was built with. */
 6 | class DocumentIdentifierSpec extends FreeSpec {
 7 | 
 8 |   // classOf[...] names the class itself; DocumentIdentifier.getClass would name the
 9 |   // companion object's class, whose simple name carries a trailing "$".
10 |   s"a ${classOf[DocumentIdentifier].getSimpleName}" - {
11 | 
12 |     "created with a hash and a description" - {
13 |       val hash: String = "425A"
14 |       val description: DocumentDescription = DocumentDescription.withTitle("testDoc")
15 | 
16 |       val docID: DocumentIdentifier = DocumentIdentifier.withHashAndDescription(hash, description)
17 | 
18 |       "has that hash and description" in {
19 |         assert(docID.hash == hash)
20 |         assert(docID.description == description)
21 |       }
22 | 
23 |     }
24 | 
25 |   }
26 | 
27 | }
28 | 


--------------------------------------------------------------------------------
/src/main/scala/tools/ambitious/pdfextractiontoolkit/webapp/services/documentstorage/DocumentIdentifier.scala:
--------------------------------------------------------------------------------
 1 | package tools.ambitious.pdfextractiontoolkit.webapp.services.documentstorage
 2 | 
 3 | import tools.ambitious.pdfextractiontoolkit.utils.AmbitiousIoUtils.ByteArrayUtils
 4 | 
 5 | /** Pairs a hash string with the [[DocumentDescription]] of a document. */
 6 | case class DocumentIdentifier protected(hash: String, description: DocumentDescription)
 7 | 
 8 | /** Factories for [[DocumentIdentifier]]. */
 9 | object DocumentIdentifier {
10 | 
11 |   /** Hashes `rawBytes` with `computeHashAsHex` and pairs the result with `documentDescription`. */
12 |   def computeFor(documentDescription: DocumentDescription, rawBytes: Array[Byte]): DocumentIdentifier =
13 |     withHashAndDescription(rawBytes.computeHashAsHex, documentDescription)
14 | 
15 |   /** Wraps an already-known hash together with a description. */
16 |   def withHashAndDescription(hash: String, description: DocumentDescription): DocumentIdentifier = {
17 |     new DocumentIdentifier(hash, description)
18 |   }
19 | 
20 | }


--------------------------------------------------------------------------------
/src/test/scala/tools/ambitious/pdfextractiontoolkit/library/model/geometry/PositivePointSpec.scala:
--------------------------------------------------------------------------------
 1 | package tools.ambitious.pdfextractiontoolkit.library.model.geometry
 2 | 
 3 | import org.scalatest.FreeSpec
 4 | 
 5 | /** Checks that `PositivePoint.at` rejects negative coordinates with the expected messages. */
 6 | class PositivePointSpec extends FreeSpec {
 7 | 
 8 |   "A PositivePoint instantiated with negative x and positive y should throw an IllegalArgumentException" in {
 9 |     val thrown = intercept[IllegalArgumentException](PositivePoint.at(-100, 50))
10 |     assert(thrown.getMessage === "x must not be negative")
11 |   }
12 | 
13 |   "A PositivePoint instantiated with positive x and negative y should throw an IllegalArgumentException" in {
14 |     val thrown = intercept[IllegalArgumentException](PositivePoint.at(100, -50))
15 |     assert(thrown.getMessage === "y must not be negative")
16 |   }
17 | 
18 | }
19 | 


--------------------------------------------------------------------------------
/src/main/scala/tools/ambitious/pdfextractiontoolkit/library/extraction/ExtractionResult.scala:
--------------------------------------------------------------------------------
 1 | package tools.ambitious.pdfextractiontoolkit.library.extraction
 2 | 
 3 | import tools.ambitious.pdfextractiontoolkit.library.extraction.extractionconstraints.ExtractionConstraint
 4 | import tools.ambitious.pdfextractiontoolkit.library.model.{Document, Table}
 5 | 
 6 | class ExtractionResult protected (private val resultsMap: Map[Document, Map[ExtractionConstraint, Table]]) {
 7 | 
 8 |   def getResults(document: Document)(extractionConstraint: ExtractionConstraint): Option[Table] =
 9 |     resultsMap(document).get(extractionConstraint)
10 | 
11 |   def apply(document: Document)(extractionConstraint: ExtractionConstraint): Table =
12 |     resultsMap(document)(extractionConstraint)
13 | 
14 | }
15 | 
/** Factory for [[ExtractionResult]], whose constructor is protected. */
object ExtractionResult {

  /** Wraps an already-built document -> constraint -> table map in an [[ExtractionResult]]. */
  def withResultsMap(resultsMap: Map[Document, Map[ExtractionConstraint, Table]]): ExtractionResult =
    new ExtractionResult(resultsMap)

}
21 | 


--------------------------------------------------------------------------------
/src/main/scala/tools/ambitious/pdfextractiontoolkit/library/extraction/tablemergers/SimpleTableMerger.scala:
--------------------------------------------------------------------------------
 1 | package tools.ambitious.pdfextractiontoolkit.library.extraction.tablemergers
 2 | 
 3 | import tools.ambitious.pdfextractiontoolkit.library.model.{Row, Table}
 4 | 
 5 | class SimpleTableMerger(val ignoreHeaders: Boolean) extends TableMerger {
 6 |   override def mergeTables(toMerge: List[Table]): Option[Table] = {
 7 |     if (ignoreHeaders) {
 8 |       val rowsMinusHeaders: List[Row] = toMerge.flatMap(_.rows.drop(1))
 9 |       if (rowsMinusHeaders.isEmpty) {
10 |         Option.empty
11 |       } else {
12 |         Option.apply(Table.fromRows(rowsMinusHeaders))
13 |       }
14 |     } else {
15 |       toMerge.reduceOption(Table.merge)
16 |     }
17 |   }
18 | }
19 | 
/** Named constructors for the two [[SimpleTableMerger]] modes. */
object SimpleTableMerger {
  /** A merger that keeps every row of every table. */
  def create: SimpleTableMerger = new SimpleTableMerger(ignoreHeaders = false)

  /** A merger that drops the first row of every table before concatenating. */
  def createIgnoringHeaderRows: SimpleTableMerger = new SimpleTableMerger(ignoreHeaders = true)
}
24 | 


--------------------------------------------------------------------------------
/src/main/scala/tools/ambitious/pdfextractiontoolkit/library/extraction/tableextractors/RegionBasedTableExtractor.scala:
--------------------------------------------------------------------------------
 1 | package tools.ambitious.pdfextractiontoolkit.library.extraction.tableextractors
 2 | 
 3 | import technology.tabula
 4 | import tools.ambitious.pdfextractiontoolkit.library.extraction.ExtractionUtils
 5 | import tools.ambitious.pdfextractiontoolkit.library.model.geometry.Rectangle
 6 | import tools.ambitious.pdfextractiontoolkit.library.model.{Page, Table}
 7 | import tools.ambitious.pdfextractiontoolkit.library.util.TabulaConverter
 8 | 
 9 | class RegionBasedTableExtractor protected (val region: Rectangle) extends TableExtractor {
10 |   override def getTable(page: Page): Table = {
11 |     val tabulaTable: tabula.Table = ExtractionUtils.extractTabulaTableFromPage(page, region)
12 |     TabulaConverter.tableFromTabulaTable(tabulaTable)
13 |   }
14 | }
15 | 
/** Factory for [[RegionBasedTableExtractor]], whose constructor is protected. */
object RegionBasedTableExtractor {
  /** Builds an extractor bound to the given page region. */
  def forRegion(region: Rectangle): RegionBasedTableExtractor =
    new RegionBasedTableExtractor(region)
}


--------------------------------------------------------------------------------
/src/test/scala/tools/ambitious/pdfextractiontoolkit/library/SimpleExtractSpec.scala:
--------------------------------------------------------------------------------
 1 | package tools.ambitious.pdfextractiontoolkit.library
 2 | 
 3 | import org.apache.pdfbox.pdmodel.PDDocument
 4 | import org.scalatest._
 5 | import technology.tabula.extractors.BasicExtractionAlgorithm
 6 | import technology.tabula.{ObjectExtractor, Page}
 7 | 
 8 | class SimpleExtractSpec extends FlatSpec {
 9 | 
10 |   val SIMPLE_TEST_1_TABLE = getClass.getResource("/simplePDFs/SimpleTest1Table.pdf")
11 | 
12 |   "The SimpleTest1Table.pdf table" should "have the value \"10\" in its first cell" in {
13 |     val objectExtractor = new ObjectExtractor(PDDocument.load(SIMPLE_TEST_1_TABLE))
14 |     val wholePage: Page = objectExtractor.extract(1)
15 |     val tablePageArea = wholePage.getArea(81, 108, 305, 312)
16 | 
17 |     val extractionAlgorithm = new BasicExtractionAlgorithm
18 |     val table = extractionAlgorithm.extract(tablePageArea).get(0)
19 | 
20 |     assert(table.getCell(0, 0).getText.trim == "10")
21 |   }
22 | }
23 | 


--------------------------------------------------------------------------------
/src/main/scala/tools/ambitious/pdfextractiontoolkit/utils/AmbitiousIoUtils.scala:
--------------------------------------------------------------------------------
 1 | package tools.ambitious.pdfextractiontoolkit.utils
 2 | 
 3 | import java.io.InputStream
 4 | import java.net.URL
 5 | import java.security.MessageDigest
 6 | 
 7 | import org.apache.commons.io.IOUtils
 8 | 
 9 | object AmbitiousIoUtils {
10 | 
11 |   private val digest: MessageDigest = MessageDigest.getInstance("SHA-256")
12 | 
13 |   implicit class URLUtils(url: URL) {
14 |     def toBytes: Array[Byte] = {
15 |       var inputStream: InputStream = null
16 | 
17 |       try {
18 |         inputStream = url.openStream()
19 | 
20 |         IOUtils.toByteArray(inputStream)
21 |       } finally {
22 |         if (inputStream != null) {
23 |           inputStream.close()
24 |         }
25 |       }
26 |     }
27 |   }
28 | 
29 |   implicit class ByteArrayUtils(bytes: Array[Byte]) {
30 |     def computeHashAsHex: String = {
31 |       digest.digest(bytes)
32 |         .map("%02X" format _)
33 |         .mkString
34 |     }
35 |   }
36 | 
37 | }
38 | 


--------------------------------------------------------------------------------
/src/main/scala/tools/ambitious/pdfextractiontoolkit/webapp/data/RootDAOImpl.scala:
--------------------------------------------------------------------------------
 1 | package tools.ambitious.pdfextractiontoolkit.webapp.data
 2 | 
 3 | import slick.driver.SQLiteDriver.api._
 4 | import slick.jdbc.meta.MTable
 5 | import tools.ambitious.pdfextractiontoolkit.webapp.data.model.Documents
 6 | 
 7 | import scala.concurrent.ExecutionContext.Implicits.global
 8 | import scala.concurrent.Future
 9 | 
/**
 * [[RootDAO]] backed by the Slick database described by a named configuration entry.
 *
 * @param databaseConfigName the configuration key handed to `Database.forConfig`
 */
private[data] class RootDAOImpl(databaseConfigName: String) extends RootDAO {
  val database = Database.forConfig(databaseConfigName)

  /** The database counts as initialised as soon as it contains at least one table. */
  private def isInitialised: Future[Boolean] =
    database.run(MTable.getTables).map(tables => tables.nonEmpty)

  /** Creates the schema unless the database already contains tables. */
  def initialiseIfNeeded(): Future[Unit] =
    isInitialised.flatMap { alreadyInitialised =>
      // Nothing to do: return an already-completed future holding the unit value.
      if (alreadyInitialised) Future.successful(()) else initialise()
    }

  private def initialise(): Future[Unit] =
    database.run(createTablesAction)

  private lazy val createTablesAction: DBIO[Unit] =
    TableQuery[Documents].schema.create
}
25 | 


--------------------------------------------------------------------------------
/src/main/scala/tools/ambitious/pdfextractiontoolkit/library/util/TabulaConverter.scala:
--------------------------------------------------------------------------------
 1 | package tools.ambitious.pdfextractiontoolkit.library.util
 2 | 
 3 | import java.util
 4 | 
 5 | import technology.tabula
 6 | import technology.tabula.{HasText, RectangularTextContainer}
 7 | import tools.ambitious.pdfextractiontoolkit.library.model.{Row, Table}
 8 | 
 9 | import scala.collection.JavaConverters._
10 | 
/**
 * Converts Tabula's table representation into the library's own [[Table]] / [[Row]] model.
 */
object TabulaConverter {

  /** Converts a whole Tabula table, row by row. */
  def tableFromTabulaTable(table: tabula.Table): Table = {
    val tabulaRows = table.getRows.asScala.toList
    Table.fromRows(rowsFromTabulaRows(tabulaRows))
  }

  /** Converts each Tabula row in turn, preserving order. */
  def rowsFromTabulaRows(rows: List[util.List[RectangularTextContainer[_ <: HasText]]]): List[Row] =
    rows.map(rowFromTabulaRow)

  /** Builds a [[Row]] from the trimmed text of each container in the Tabula row. */
  def rowFromTabulaRow(row: util.List[RectangularTextContainer[_ <: HasText]]): Row = {
    val cellTexts: List[String] = tabulaRowAsListOfTrimmedStrings(row)
    Row.fromStrings(cellTexts)
  }

  /** The text of every container in the row, with surrounding whitespace removed. */
  def tabulaRowAsListOfTrimmedStrings(row: util.List[RectangularTextContainer[_ <: HasText]]): List[String] =
    row.asScala.toList.map(_.getText.trim)
}
24 | 


--------------------------------------------------------------------------------
/src/test/scala/tools/ambitious/pdfextractiontoolkit/utils/AmbitiousIoUtilsSpec.scala:
--------------------------------------------------------------------------------
 1 | package tools.ambitious.pdfextractiontoolkit.utils
 2 | 
 3 | import org.scalatest.FreeSpec
 4 | import tools.ambitious.pdfextractiontoolkit.Resources
 5 | import tools.ambitious.pdfextractiontoolkit.utils.AmbitiousIoUtils.{ByteArrayUtils, URLUtils}
 6 | 
 7 | class AmbitiousIoUtilsSpec extends FreeSpec {
 8 |   "the toBytes method will return the bytes of the quick brown fox text file" in {
 9 |     val actualBytes: Array[Byte] = Resources.quickBrownFoxTxt.toBytes
10 |     val expectedBytes: Array[Byte] = "The quick brown fox jumped over the lazy dogs".getBytes("UTF-8")
11 | 
12 |     assert(expectedBytes sameElements actualBytes)
13 |   }
14 | 
15 |   "the compute hex method should return the correct hash of an array of bytes" in {
16 |     val input = Array(42, 16, 43).map(_.toByte)
17 | 
18 |     val actualHash = input.computeHashAsHex
19 |     val expectedHash = "220ED007F88E894C0AA52A193C826EED7B37AD84A2C4EA69FD538991550F8C46"
20 | 
21 |     assert(expectedHash == actualHash)
22 |   }
23 | }
24 | 


--------------------------------------------------------------------------------
/src/main/scala/tools/ambitious/pdfextractiontoolkit/library/extraction/ExtractionUtils.scala:
--------------------------------------------------------------------------------
 1 | package tools.ambitious.pdfextractiontoolkit.library.extraction
 2 | 
 3 | import java.util.NoSuchElementException
 4 | 
 5 | import technology.tabula
 6 | import technology.tabula.ObjectExtractor
 7 | import technology.tabula.extractors.BasicExtractionAlgorithm
 8 | import tools.ambitious.pdfextractiontoolkit.library.model.Page
 9 | import tools.ambitious.pdfextractiontoolkit.library.model.geometry.Rectangle
10 | 
/**
 * Helpers that bridge the library's page model and Tabula's extraction API.
 */
object ExtractionUtils {

  /**
   * Runs Tabula's basic extraction algorithm over one rectangular area of a page.
   *
   * @param page   the page to read; its backing document is handed to Tabula and its first page is used
   * @param portal the area of the page to extract from
   * @return the first table Tabula finds in the area, or a new empty `tabula.Table` when Tabula reports
   *         nothing for it
   */
  def extractTabulaTableFromPage(page: Page, portal: Rectangle): tabula.Table = {
    val wholePage: tabula.Page = new ObjectExtractor(page.asPDDocument).extract(1)

    try {
      val tablePageArea = wholePage.getArea(
        portal.top.toFloat,
        portal.left.toFloat,
        portal.bottom.toFloat,
        portal.right.toFloat)

      val tables = (new BasicExtractionAlgorithm).extract(tablePageArea)

      // Guard the index access: `get(0)` on an empty list raises IndexOutOfBoundsException, which the
      // handler below does not cover.
      if (tables.isEmpty) new tabula.Table else tables.get(0)
    } catch {
      case _: NoSuchElementException => new tabula.Table
    }
  }
}
30 | 


--------------------------------------------------------------------------------
/src/main/scala/tools/ambitious/pdfextractiontoolkit/library/model/geometry/Rectangle.scala:
--------------------------------------------------------------------------------
 1 | package tools.ambitious.pdfextractiontoolkit.library.model.geometry
 2 | 
 3 | case class Rectangle protected (topLeft: PositivePoint, bottomRight: PositivePoint) {
 4 | 
 5 |   lazy val size:Size = Size.fromWidthAndHeight(math.abs(bottomRight.x - topLeft.x), math.abs(bottomRight.y - topLeft.y))
 6 | 
 7 |   lazy val left :Double = math.min(topLeft.x, bottomRight.x)
 8 |   lazy val right :Double = math.max(topLeft.x, bottomRight.x)
 9 |   lazy val top :Double = math.min(topLeft.y, bottomRight.y)
10 |   lazy val bottom :Double = math.max(topLeft.y, bottomRight.y)
11 | }
12 | 
/** Named constructors for [[Rectangle]], whose constructor is protected. */
object Rectangle {

  /** Builds a rectangle from the raw coordinates of two corners; each pair goes through `PositivePoint.at`. */
  def fromCornerCoords(x1: Double, y1: Double, x2: Double, y2: Double): Rectangle =
    new Rectangle(PositivePoint.at(x1, y1), PositivePoint.at(x2, y2))

  /** Builds a rectangle from two existing corner points. */
  def fromCorners(topLeft: PositivePoint, bottomRight: PositivePoint): Rectangle =
    new Rectangle(topLeft, bottomRight)

  /** Builds a rectangle from one corner plus a size; the opposite corner is the corner offset by the size. */
  def fromCornerAndSize(corner: PositivePoint, size: Size): Rectangle = {
    val oppositeCorner = PositivePoint.at(corner.x + size.width, corner.y + size.height)
    new Rectangle(corner, oppositeCorner)
  }
}
26 | 


--------------------------------------------------------------------------------
/src/main/scala/tools/ambitious/pdfextractiontoolkit/library/model/Page.scala:
--------------------------------------------------------------------------------
 1 | package tools.ambitious.pdfextractiontoolkit.library.model
 2 | 
 3 | import org.apache.pdfbox.pdmodel.{PDDocument, PDPage}
 4 | import tools.ambitious.pdfextractiontoolkit.library.model.geometry.Size
 5 | 
 6 | class Page protected (private val document: PDDocument) {
 7 | 
 8 |   val asPDPage: PDPage = this.asPDDocument.getDocumentCatalog.getAllPages.get(0).asInstanceOf[PDPage]
 9 | 
10 |   private val mediaBox = asPDPage.getMediaBox
11 |   val size: Size = Size.fromWidthAndHeight(mediaBox.getWidth, mediaBox.getHeight)
12 | 
13 |   def asPDDocument: PDDocument = document
14 | }
15 | 
/** Factories for [[Page]], whose constructor is protected. */
object Page {

  /**
   * Wraps a PDFBox document as a [[Page]].
   *
   * @throws IllegalArgumentException if the document does not contain exactly one page
   */
  def fromPDDocument(document: PDDocument): Page = {
    val pageCount: Int = document.getDocumentCatalog.getAllPages.size

    if (pageCount == 1)
      new Page(document)
    else
      throw new IllegalArgumentException("Page constructor fromPDDocument must supply a PDDocument with one page only.")
  }

  /** Wraps every document in the list via [[fromPDDocument]], preserving order. */
  def listFromSinglePagePDDocuments(documents: List[PDDocument]): List[Page] =
    documents.map(fromPDDocument)
}


--------------------------------------------------------------------------------
/src/main/scala/tools/ambitious/pdfextractiontoolkit/webapp/services/documentstorage/DocumentFileStoreImpl.scala:
--------------------------------------------------------------------------------
 1 | package tools.ambitious.pdfextractiontoolkit.webapp.services.documentstorage
 2 | 
 3 | import java.net.URL
 4 | import java.nio.file.{Files, Path}
 5 | 
 6 | import org.apache.commons.io.FileUtils
 7 | 
 8 | import scala.concurrent.ExecutionContext.Implicits.global
 9 | import scala.concurrent.Future
10 | 
/**
 * [[DocumentFileStore]] that keeps each document as a file inside a working directory, using the
 * document's hash as the file name.
 *
 * @param workingDirectory the directory the document files are resolved against
 */
private class DocumentFileStoreImpl(val workingDirectory: Path) extends DocumentFileStore {

  /** Location of the file for a document: the working directory plus the document's hash. */
  private def computeExpectedPath(docID: DocumentIdentifier): Path =
    workingDirectory.resolve(docID.hash)

  /** Copies the content behind `source` to the document's expected path, inside a future. */
  override def storeFileFor(docID: DocumentIdentifier, source: URL): Future[Unit] =
    Future {
      val target: Path = computeExpectedPath(docID)
      FileUtils.copyURLToFile(source, target.toFile)
    }

  /** Deletes the document's file if it exists, inside a future. */
  override def deleteFileFor(docID: DocumentIdentifier): Future[Unit] =
    Future {
      val target: Path = computeExpectedPath(docID)
      Files.deleteIfExists(target)
      ()
    }

  // Not implemented yet: evaluating this throws NotImplementedError.
  override def retrieveFileFor(docID: DocumentIdentifier): Future[URL] = ???
}


--------------------------------------------------------------------------------
/src/main/scala/tools/ambitious/pdfextractiontoolkit/library/extraction/extractionconstraints/MergingSimpleExtractionConstraint.scala:
--------------------------------------------------------------------------------
 1 | package tools.ambitious.pdfextractiontoolkit.library.extraction.extractionconstraints
 2 | 
 3 | import tools.ambitious.pdfextractiontoolkit.library.extraction.StateBundle
 4 | import tools.ambitious.pdfextractiontoolkit.library.extraction.tablemergers.TableMerger
 5 | import tools.ambitious.pdfextractiontoolkit.library.model.{Document, Page, Table}
 6 | 
 7 | trait MergingSimpleExtractionConstraint extends SimpleExtractionConstraint {
 8 | 
 9 |   val tableMerger: TableMerger
10 | 
11 |   override def onStart(stateBundle: StateBundle): Unit = stateBundle.state = Option.apply(List())
12 | 
13 |   override def onPage(page: Page, document: Document, stateBundle: StateBundle): Unit = {
14 |     if (shouldExtractOnPage(page, document, stateBundle)) {
15 | 
16 |       val newList: List[Table] = stateBundle.state.asInstanceOf[Option[List[Table]]].get :+ performExtraction(page)
17 | 
18 |       stateBundle.state = Option.apply(newList)
19 |     }
20 |   }
21 | 
22 |   override def tableFromState(stateBundle: StateBundle): Option[Table] =
23 |     tableMerger.mergeTables(stateBundle.state.asInstanceOf[Option[List[Table]]].get)
24 | 
25 | }
26 | 


--------------------------------------------------------------------------------
/src/main/scala/tools/ambitious/pdfextractiontoolkit/webapp/services/documentstorage/DocumentDescription.scala:
--------------------------------------------------------------------------------
 1 | package tools.ambitious.pdfextractiontoolkit.webapp.services.documentstorage
 2 | 
 3 | import spray.http.MediaType
 4 | 
 5 | /**
 6 |  * A description of a document in the [[DocumentLibrary]], containing information that would be useful to a user.
 7 |  */
 8 | case class DocumentDescription protected (title: String,
 9 |                                           description: Option[String],
10 |                                           mediaType: MediaType = MediaType.custom("application/octet-stream")) {
11 | }
12 | 
/** Named constructors for [[DocumentDescription]], whose constructor is protected. */
object DocumentDescription {

  /** A description with a title, a free-text description and an explicit media type. */
  def withTitleAndDescriptionAndMediaType(title: String, description: String, mediaType: MediaType): DocumentDescription =
    new DocumentDescription(title = title, description = Some(description), mediaType = mediaType)

  /** A description with a title and an explicit media type, but no free-text description. */
  def withTitleAndMediaType(title: String, mediaType: MediaType): DocumentDescription =
    new DocumentDescription(title = title, description = None, mediaType = mediaType)

  /** A description with a title and free-text description, using the default media type. */
  def withTitleAndDescription(title: String, description: String): DocumentDescription =
    new DocumentDescription(title = title, description = Some(description))

  /** A description with only a title, using the default media type. */
  def withTitle(title: String): DocumentDescription =
    new DocumentDescription(title = title, description = None)
}
26 | 


--------------------------------------------------------------------------------
/src/main/scala/tools/ambitious/pdfextractiontoolkit/webapp/data/model/Documents.scala:
--------------------------------------------------------------------------------
 1 | package tools.ambitious.pdfextractiontoolkit.webapp.data.model
 2 | 
 3 | import slick.driver.SQLiteDriver.api._
 4 | import spray.http.MediaType
 5 | import tools.ambitious.pdfextractiontoolkit.webapp.services.documentstorage.{DocumentDescription, DocumentIdentifier}
 6 | 
 7 | case class Document(documentID: Option[Long], hash: String, title: String, description: String, mediaType: String) {
 8 | 
 9 |   lazy val asDocumentIdentifier: DocumentIdentifier = {
10 |     val parsedMediaType: MediaType = MediaType.custom(mediaType)
11 |     val documentDescription = DocumentDescription.withTitleAndDescriptionAndMediaType(title, description, parsedMediaType)
12 | 
13 |     DocumentIdentifier.withHashAndDescription(hash, documentDescription)
14 |   }
15 | }
16 | 
/**
 * Slick mapping of the `Documents` table onto the [[Document]] row class.
 */
class Documents(tag: Tag) extends Table[Document](tag, "Documents") {
  /** Auto-incremented primary key. */
  def documentID = column[Long]("documentID", O.PrimaryKey, O.AutoInc)

  def hash = column[String]("hash")

  def title = column[String]("title")

  def description = column[String]("description")

  def mediaType = column[String]("mediaType")

  /** Default projection; the key is lifted to an option to match `Document.documentID`. */
  def * =
    (documentID.?, hash, title, description, mediaType) <> ((Document.apply _).tupled, Document.unapply)
}
26 | 


--------------------------------------------------------------------------------
/src/test/scala/tools/ambitious/pdfextractiontoolkit/library/extraction/ExtractionUtilsSpec.scala:
--------------------------------------------------------------------------------
 1 | package tools.ambitious.pdfextractiontoolkit.library.extraction
 2 | 
 3 | import org.scalatest.FreeSpec
 4 | import technology.tabula
 5 | import tools.ambitious.pdfextractiontoolkit.Resources
 6 | import tools.ambitious.pdfextractiontoolkit.library.model.geometry.Rectangle
 7 | import tools.ambitious.pdfextractiontoolkit.library.model.{Document, Page, Table}
 8 | import tools.ambitious.pdfextractiontoolkit.library.util.{CSVUtil, TabulaConverter}
 9 | 
/**
 * Checks [[ExtractionUtils]] against the single-table sample PDF and its reference CSV.
 */
class ExtractionUtilsSpec extends FreeSpec {
  // The two halves of the name are concatenated, so the first must end with a space.
  "The method extractTabulaTableFromPage() should be able to extract the same values as those in the corresponding " +
    "csv file for simpleTest1Table.pdf file" in {
    val window: Rectangle = Rectangle.fromCornerCoords(108, 81, 312, 305)
    val page: Page = Document.fromPDFPath(Resources.simpleTest1TableURL).getPage(1)

    val extractedTabulaTable: tabula.Table = ExtractionUtils.extractTabulaTableFromPage(page, window)
    val extractedTable: Table = TabulaConverter.tableFromTabulaTable(extractedTabulaTable)
    val tableFromCSV: Table = CSVUtil.tableFromURL(Resources.simpleTest1TableCSVURL)

    assert(extractedTable == tableFromCSV)
  }
}
23 | 


--------------------------------------------------------------------------------
/src/test/scala/tools/ambitious/pdfextractiontoolkit/library/model/geometry/SizeSpec.scala:
--------------------------------------------------------------------------------
 1 | package tools.ambitious.pdfextractiontoolkit.library.model.geometry
 2 | 
 3 | import org.scalatest.FreeSpec
 4 | 
 5 | class SizeSpec extends FreeSpec {
 6 | 
 7 |   "A Size instantiated with width equal to 100.0 and height equal to 50.0" - {
 8 |     val size: Size = Size.fromWidthAndHeight(100.0, 50.0)
 9 | 
10 |     "should have width equal to 100.0" in {
11 |       assert(size.width == 100.0)
12 |     }
13 | 
14 |     "should have height equal to 50.0" in {
15 |       assert(size.height == 50.0)
16 |     }
17 | 
18 |     "should have area equal to 5000.0" in {
19 |       assert(size.area == 5000.0)
20 |     }
21 |   }
22 | 
23 |   "A Size instantiated with negative width and positive height should throw an IllegalArgumentException" in {
24 |     val instantiateSize = intercept[IllegalArgumentException] {
25 |       Size.fromWidthAndHeight(-100, 50)
26 |     }
27 |     assert(instantiateSize.getMessage === "Width must not be negative")
28 |   }
29 | 
30 |   "A Size instantiated with negative height and positive width should throw an IllegalArgumentException" in {
31 |     val instantiateSize = intercept[IllegalArgumentException] {
32 |       Size.fromWidthAndHeight(100, -50)
33 |     }
34 |     assert(instantiateSize.getMessage === "Height must not be negative")
35 |   }
36 | 
37 | }
38 | 


--------------------------------------------------------------------------------
/src/test/scala/tools/ambitious/pdfextractiontoolkit/library/model/CellSpec.scala:
--------------------------------------------------------------------------------
 1 | package tools.ambitious.pdfextractiontoolkit.library.model
 2 | 
 3 | import org.scalatest.FreeSpec
 4 | 
 5 | class CellSpec extends FreeSpec {
 6 |   "A cell that is instantiated without any arguments" - {
 7 |     val cell: Cell = new Cell
 8 | 
 9 |     "should have text set to an empty string" in {
10 |       assert(cell.text == "")
11 |     }
12 | 
13 |     "should return true when asked if empty" in {
14 |       assert(cell.isEmpty)
15 |     }
16 |   }
17 | 
18 |   "A cell that is instantiated with the string 'blah'" - {
19 |     val cell: Cell = new Cell("blah")
20 | 
21 |     "should have text set to 'blah'" in {
22 |       assert(cell.text == "blah")
23 |     }
24 | 
25 |     "should return false when asked if empty" in {
26 |       assert(!cell.isEmpty)
27 |     }
28 |   }
29 | 
30 |   "Two cells with the same text" - {
31 |     val cellA: Cell = new Cell("test")
32 |     val cellB: Cell = new Cell("test")
33 | 
34 |     "should be equal" in {
35 |       assert(cellA == cellB)
36 |     }
37 | 
38 |     "should not have the same reference" in {
39 |       assert(cellA ne cellB)
40 |     }
41 |   }
42 | 
43 |   "Two cells with different text" - {
44 |     val cellA: Cell = new Cell("test")
45 |     val cellB: Cell = new Cell("test2")
46 | 
47 |     "should not be equal" in {
48 |       assert(cellA != cellB)
49 |     }
50 |   }
51 | }
52 | 


--------------------------------------------------------------------------------
/src/test/scala/tools/ambitious/pdfextractiontoolkit/library/util/CSVUtilSpec.scala:
--------------------------------------------------------------------------------
 1 | package tools.ambitious.pdfextractiontoolkit.library.util
 2 | 
 3 | import java.io.File
 4 | 
 5 | import org.scalatest.FreeSpec
 6 | import tools.ambitious.pdfextractiontoolkit.library.model.{Row, Table}
 7 | 
 8 | class CSVUtilSpec extends FreeSpec {
 9 |   val simple3x3DigitsURL = getClass.getResource("/simpleCSVs/simple3x3Digits.csv")
10 | 
11 |   "A Table from CSV File" - {
12 |     val table: Table = CSVUtil.tableFromFile(new File(simple3x3DigitsURL.toURI))
13 | 
14 |     "should be equal to table: 1,2,3\n4,5,6\n7,8,9" in {
15 |       val rowA: Row = Row.fromStrings(List("1", "2", "3"))
16 |       val rowB: Row = Row.fromStrings(List("4", "5", "6"))
17 |       val rowC: Row = Row.fromStrings(List("7", "8", "9"))
18 | 
19 |       val tableB: Table = Table.fromRows(List(rowA, rowB, rowC))
20 | 
21 |       assert(table == tableB)
22 |     }
23 |   }
24 | 
25 |   "A Table from CSV URL" - {
26 |     val table: Table = CSVUtil.tableFromURL(simple3x3DigitsURL)
27 | 
28 |     "should be equal to table: 1,2,3\n4,5,6\n7,8,9" in {
29 |       val rowA: Row = Row.fromStrings(List("1", "2", "3"))
30 |       val rowB: Row = Row.fromStrings(List("4", "5", "6"))
31 |       val rowC: Row = Row.fromStrings(List("7", "8", "9"))
32 | 
33 |       val tableB: Table = Table.fromRows(List(rowA, rowB, rowC))
34 | 
35 |       assert(table == tableB)
36 |     }
37 |   }
38 | }
39 | 


--------------------------------------------------------------------------------
/src/test/scala/tools/ambitious/pdfextractiontoolkit/webapp/data/DAOTestUtils.scala:
--------------------------------------------------------------------------------
 1 | package tools.ambitious.pdfextractiontoolkit.webapp.data
 2 | 
 3 | import java.nio.file.{Files, Path, Paths}
 4 | 
 5 | import com.typesafe.config.{Config, ConfigFactory}
 6 | import org.apache.commons.io.FileUtils
 7 | 
 8 | import scala.concurrent.Await
 9 | import scala.concurrent.duration._
10 | 
/**
 * Shared fixtures for the data-layer tests: a scratch working directory and a freshly initialised test
 * database, both located through the application configuration.
 */
object DAOTestUtils {
  private val applicationConfig: Config = ConfigFactory.load()

  // Scratch directory, read from the "testWorkingDir" configuration key.
  private val workingDir: Path = Paths.get(applicationConfig.getString("testWorkingDir"))

  /** Location of the test database file, read from the "testDB.path" configuration key. */
  val testDBFile: Path = Paths.get(applicationConfig.getString("testDB.path"))

  /** Name of the configuration entry describing the test database. */
  val testDBConfigName: String = "testDB"

  /** Creates the working directory (and any missing parents) and returns its path. */
  def createAndGetWorkingDirectory(): Path = {
    workingDir.toFile.mkdirs()
    workingDir
  }

  /** Removes the working directory; failures are ignored by `deleteQuietly`. */
  def cleanWorkingDirectory(): Unit = {
    FileUtils.deleteQuietly(workingDir.toFile)
  }

  /**
   * Deletes any existing test database file, then builds a [[RootDAO]] for the test configuration and
   * waits up to 30 seconds for it to initialise.
   */
  def constructCleanRootDAO: RootDAO = {
    Files.deleteIfExists(testDBFile)
    DAOTestUtils.testDBFile.getParent.toFile.mkdirs()

    val freshDAO: RootDAO = RootDAO.forConfigName(testDBConfigName)
    Await.result(freshDAO.initialiseIfNeeded(), 30.seconds)

    freshDAO
  }
}
44 | 


--------------------------------------------------------------------------------
/src/main/scala/tools/ambitious/pdfextractiontoolkit/webapp/services/documentstorage/DocumentLibraryImpl.scala:
--------------------------------------------------------------------------------
 1 | package tools.ambitious.pdfextractiontoolkit.webapp.services.documentstorage
 2 | 
 3 | import java.net.URL
 4 | 
 5 | import tools.ambitious.pdfextractiontoolkit.utils.AmbitiousIoUtils._
 6 | import tools.ambitious.pdfextractiontoolkit.webapp.data.DocumentInformationDao
 7 | 
 8 | import scala.concurrent.ExecutionContext.Implicits.global
 9 | import scala.concurrent.Future
10 | 
/**
 * [[DocumentLibrary]] that keeps document content in a [[DocumentFileStore]] and document metadata in a
 * [[DocumentInformationDao]].
 *
 * @param fileStore where the document files are written and deleted
 * @param dao       where the document identifiers are recorded
 */
private class DocumentLibraryImpl (val fileStore: DocumentFileStore,
                                   val dao: DocumentInformationDao) extends DocumentLibrary {

  /**
   * Stores the document's content and then records its identifier.
   *
   * The returned future completes only after both steps have succeeded and fails if either fails, so a
   * failed file write is no longer silently dropped. The identifier is recorded only after the file has
   * been written.
   *
   * @return the identifier computed from the description and the document's bytes
   */
  override def store(description: DocumentDescription, documentSource: URL): Future[DocumentIdentifier] = {
    val bytes: Array[Byte] = documentSource.toBytes

    val docID: DocumentIdentifier = DocumentIdentifier.computeFor(description, bytes)

    for {
      _ <- fileStore.storeFileFor(docID, documentSource)
      _ <- dao.storeDocumentID(docID)
    } yield docID
  }

  override def retrieve(docID: DocumentIdentifier): Future[URL] = fileStore.retrieveFileFor(docID)

  /**
   * Deletes the document's file and then its recorded identifier.
   *
   * The returned future reflects the outcome of both steps rather than only the DAO's.
   */
  override def delete(docID: DocumentIdentifier): Future[Unit] =
    for {
      _ <- fileStore.deleteFileFor(docID)
      _ <- dao.deleteDocumentID(docID)
    } yield ()

  override def list(): Future[Seq[DocumentIdentifier]] = dao.retrieveAllIDs()

}

object DocumentLibraryImpl {
}


--------------------------------------------------------------------------------
/src/main/scala/tools/ambitious/pdfextractiontoolkit/library/model/Row.scala:
--------------------------------------------------------------------------------
 1 | package tools.ambitious.pdfextractiontoolkit.library.model
 2 | 
 3 | class Row(private val cells: List[Cell] = Nil) {
 4 |   def getCell(number: Int) = {
 5 |     if (number <= numberOfCells && number > 0)
 6 |       cells(number-1)
 7 |     else
 8 |       throw new IllegalArgumentException(s"Invalid cell number $number")
 9 |   }
10 | 
11 |   def numberOfCells: Int = cells.length
12 | 
13 |   def isEmpty: Boolean = numberOfCells == 0
14 | 
15 |   override def toString: String =
16 |     cells.map(cell => cell.text).mkString(",")
17 | 
18 |   override def equals(obj: Any): Boolean = {
19 |     obj match {
20 |       case row: Row =>
21 |         var cellComparison: Boolean = this.numberOfCells == row.numberOfCells
22 |         for (i <- 1 to cells.length) {
23 |           try {
24 |             cellComparison = cellComparison && (this.getCell(i) == row.getCell(i))
25 |           } catch {
26 |             case _: Throwable => cellComparison = false
27 |           }
28 |         }
29 |         cellComparison
30 |       case _ => false
31 |     }
32 |   }
33 | }
34 | 
/** Convenience constructors for [[Row]]. */
object Row {
  /** A row holding exactly the given cells, in order. */
  def fromCells(cells: List[Cell]): Row = new Row(cells)

  /** A row holding a single cell. */
  def fromCell(cell: Cell): Row = fromCells(cell :: Nil)

  /** A row with one cell per string, in order. */
  def fromStrings(strings: List[String]): Row = {
    val cells: List[Cell] = strings.map(new Cell(_))
    fromCells(cells)
  }

  /** A row holding a single cell with the given text. */
  def fromString(string: String): Row = fromStrings(string :: Nil)
}


--------------------------------------------------------------------------------
/src/main/scala/tools/ambitious/pdfextractiontoolkit/library/extraction/extractionconstraints/PageNumberExtractionConstraint.scala:
--------------------------------------------------------------------------------
 1 | package tools.ambitious.pdfextractiontoolkit.library.extraction.extractionconstraints
 2 | 
 3 | import tools.ambitious.pdfextractiontoolkit.library.extraction.StateBundle
 4 | import tools.ambitious.pdfextractiontoolkit.library.extraction.tableextractors.TableExtractor
 5 | import tools.ambitious.pdfextractiontoolkit.library.extraction.tablemergers.{SimpleTableMerger, TableMerger}
 6 | import tools.ambitious.pdfextractiontoolkit.library.model.{Document, Page}
 7 | 
 8 | import scala.collection.immutable.Range.Inclusive
 9 | 
/**
 * Extracts a table from every page whose number is in `pages`, and merges the results with a plain
 * [[SimpleTableMerger]].
 *
 * @param pages          the page numbers to extract from; every number must be positive
 * @param tableExtractor the extractor applied to each matching page
 * @throws IllegalArgumentException if any page number is zero or negative
 */
case class PageNumberExtractionConstraint protected (pages: Set[Int], tableExtractor: TableExtractor) extends MergingSimpleExtractionConstraint {
  if (!pages.forall(_ > 0)) {
    throw new IllegalArgumentException("Page numbers can only be positive numbers.")
  }

  /** True when the document can report a number for the page and that number is one of `pages`. */
  override def shouldExtractOnPage(page: Page, document: Document, stateBundle: StateBundle): Boolean =
    document.pageNumberOf(page).exists(pageNumber => pages.contains(pageNumber))

  override val tableMerger: TableMerger = SimpleTableMerger.create
}
/** Named constructors for [[PageNumberExtractionConstraint]], whose constructor is protected. */
object PageNumberExtractionConstraint {
  /** A constraint that extracts from a single page. */
  def withPageNumberAndTableExtractor(pageNumber: Int, tableExtractor: TableExtractor): PageNumberExtractionConstraint =
    new PageNumberExtractionConstraint(Set(pageNumber), tableExtractor)

  /** A constraint that extracts from every page in an inclusive range. */
  def withPageRangeAndTableExtractor(range: Inclusive, tableExtractor: TableExtractor): PageNumberExtractionConstraint = {
    val pageNumbers: Set[Int] = range.toSet
    new PageNumberExtractionConstraint(pageNumbers, tableExtractor)
  }
}
27 | 
28 | 


--------------------------------------------------------------------------------
/src/main/scala/tools/ambitious/pdfextractiontoolkit/webapp/data/DocumentInformationDaoImpl.scala:
--------------------------------------------------------------------------------
 1 | package tools.ambitious.pdfextractiontoolkit.webapp.data
 2 | 
 3 | import slick.driver.SQLiteDriver.api._
 4 | import slick.lifted.{Query, TableQuery}
 5 | import tools.ambitious.pdfextractiontoolkit.webapp.data.model.{Document, Documents}
 6 | import tools.ambitious.pdfextractiontoolkit.webapp.services.documentstorage.DocumentIdentifier
 7 | 
 8 | import scala.concurrent.ExecutionContext.Implicits.global
 9 | import scala.concurrent.Future
10 | 
/**
 * Slick-backed [[DocumentInformationDao]] that stores document identifiers through the
 * table mapped by `Documents`, using the database exposed by `rootDAO`.
 */
private class DocumentInformationDaoImpl(val rootDAO: RootDAO) extends DocumentInformationDao {
  // One shared handle on the table mapping; all operations below go through it.
  private val documents: TableQuery[Documents] = new TableQuery(new Documents(_))

  /**
   * Inserts a row describing `docID`.
   *
   * A missing description is stored as the empty string.
   *
   * @return a future that completes once the insert action has run
   */
  override def storeDocumentID(docID: DocumentIdentifier): Future[Unit] = {
    val newRow = Document(None,
      docID.hash,
      docID.description.title,
      docID.description.description.getOrElse(""),
      docID.description.mediaType.toString())

    // Map to the unit *value*; `Unit` alone names the companion object.
    rootDAO.database.run(documents += newRow).map(_ => ())
  }

  /** Not implemented yet: calling this throws `NotImplementedError`. */
  override def deleteDocumentID(docID: DocumentIdentifier): Future[Unit] = ???

  /** Reads every stored row and converts each one with `asDocumentIdentifier`. */
  override def retrieveAllIDs(): Future[Seq[DocumentIdentifier]] = {
    val storedRows: Future[Seq[Document]] = rootDAO.database.run(documents.result)

    storedRows.map(_.map(_.asDocumentIdentifier))
  }
}


--------------------------------------------------------------------------------
/src/main/scala/tools/ambitious/pdfextractiontoolkit/library/model/Table.scala:
--------------------------------------------------------------------------------
 1 | package tools.ambitious.pdfextractiontoolkit.library.model
 2 | 
 3 | class Table(val rows: List[Row] = Nil) {
 4 |   def getCell(row: Int, column: Int): Cell = getRow(row).getCell(column)
 5 | 
 6 |   def getRow(number: Int): Row = {
 7 |     if (number <= numberOfRows && number > 0)
 8 |       rows(number-1)
 9 |     else
10 |       throw new IllegalArgumentException("Invalid row number.")
11 |   }
12 | 
13 |   lazy val numberOfRows: Int = rows.length
14 | 
15 |   lazy val numberOfColumns: Int = rows.map(_.numberOfCells).max
16 | 
17 |   override def toString: String =
18 |     rows.map(row => row.toString).mkString("\n")
19 | 
20 |   override def equals(obj: Any): Boolean = {
21 |     obj match {
22 |       case table: Table =>
23 |         var rowComparison: Boolean = this.numberOfRows == table.numberOfRows
24 |         for (i <- 1 to this.numberOfRows) {
25 |           try {
26 |             rowComparison = rowComparison && (this.getRow(i) == table.getRow(i))
27 |           } catch {
28 |             case _: Throwable => rowComparison = false
29 |           }
30 |         }
31 |         rowComparison
32 |       case _ => false
33 |     }
34 |   }
35 | 
36 |   def mergedWith(table: Table): Table =
37 |     Table.merge(this, table)
38 | }
39 | 
/** Factory and merge helpers for [[Table]]. */
object Table {
  /** Wraps `rows` in a table. */
  def fromRows(rows: List[Row]): Table = new Table(rows)

  /** Builds a table consisting of the single `row`. */
  def fromRow(row: Row): Table = new Table(row :: Nil)

  /** Concatenates the rows of all `tables`, in list order, into one table. */
  def merge(tables: List[Table]): Table = new Table(tables.flatMap(_.rows))

  /** Concatenates the rows of `tableA` followed by those of `tableB`. */
  def merge(tableA: Table, tableB: Table): Table = new Table(tableA.rows ++ tableB.rows)
}


--------------------------------------------------------------------------------
/src/main/scala/tools/ambitious/pdfextractiontoolkit/library/extraction/extractionconstraints/FirstOccurrenceOfStringExtractionConstraint.scala:
--------------------------------------------------------------------------------
 1 | package tools.ambitious.pdfextractiontoolkit.library.extraction.extractionconstraints
 2 | 
 3 | import technology.tabula
 4 | import tools.ambitious.pdfextractiontoolkit.library.extraction.tableextractors.TableExtractor
 5 | import tools.ambitious.pdfextractiontoolkit.library.extraction.{ExtractionUtils, StateBundle}
 6 | import tools.ambitious.pdfextractiontoolkit.library.model.geometry.Rectangle
 7 | import tools.ambitious.pdfextractiontoolkit.library.model.{Document, Page, Table}
 8 | 
 9 | case class FirstOccurrenceOfStringExtractionConstraint protected (text: String, textRegion: Rectangle, tableExtractor: TableExtractor)
10 |   extends SimpleExtractionConstraint {
11 | 
12 |   override def onPage(page: Page, document: Document, stateBundle: StateBundle): Unit = {
13 |     if (shouldExtractOnPage(page, document, stateBundle)) {
14 |       stateBundle.state = Option.apply(page)
15 |     }
16 |   }
17 | 
18 |   override def tableFromState(stateBundle: StateBundle): Option[Table] = {
19 |     if (stateBundle.state.isDefined) {
20 |       val extractedTable: Table = performExtraction(stateBundle.state.asInstanceOf[Option[Page]].get)
21 | 
22 |       Option.apply(extractedTable)
23 |     } else {
24 |       Option.empty
25 |     }
26 |   }
27 | 
28 |   def shouldExtractOnPage(page: Page, document: Document, stateBundle: StateBundle): Boolean = {
29 |     val table: tabula.Table = ExtractionUtils.extractTabulaTableFromPage(page, textRegion)
30 |     val foundText = table.getCell(0, 0).getText
31 | 
32 |     text == foundText && stateBundle.state.isEmpty
33 |   }
34 | }
35 | 
/** Factory for [[FirstOccurrenceOfStringExtractionConstraint]]. */
object FirstOccurrenceOfStringExtractionConstraint {
  /**
   * Builds a constraint that looks for `text` inside `textRegion` and is associated with
   * `tableExtractor`.
   */
  def withTextAndTableExtractor(text: String, textRegion: Rectangle, tableExtractor: TableExtractor): FirstOccurrenceOfStringExtractionConstraint =
    new FirstOccurrenceOfStringExtractionConstraint(text, textRegion, tableExtractor)
}
40 | 
41 | 


--------------------------------------------------------------------------------
/src/test/scala/tools/ambitious/pdfextractiontoolkit/webapp/services/documentstorage/DocumentDescriptionSpec.scala:
--------------------------------------------------------------------------------
 1 | package tools.ambitious.pdfextractiontoolkit.webapp.services.documentstorage
 2 | 
 3 | import org.scalatest.FreeSpec
 4 | import spray.http.MediaType
 5 | 
 6 | class DocumentDescriptionSpec extends FreeSpec {
 7 | 
 8 |   s"a ${DocumentDescription.getClass.getSimpleName}" - {
 9 | 
10 |     "can be created with a title 'testDoc'" in {
11 |       val docDescription: DocumentDescription = DocumentDescription.withTitle("testDoc")
12 | 
13 |       assert(docDescription.title == "testDoc")
14 |     }
15 | 
16 |     "can be created with a title 'testDoc' and a description 'a test document'" in {
17 |       val docDescription: DocumentDescription =
18 |         DocumentDescription.withTitleAndDescription("testDoc", "a test document")
19 | 
20 |       assert(docDescription.title == "testDoc")
21 |       assert(docDescription.description.contains("a test document"))
22 |     }
23 | 
24 |     "can be created with a title 'testDoc' and media type 'application/pdf'" in {
25 |       val mediaType: MediaType = MediaType.custom("application/pdf")
26 | 
27 |       val docDescription: DocumentDescription =
28 |         DocumentDescription.withTitleAndMediaType("testDoc", mediaType)
29 | 
30 |       assert(docDescription.title == "testDoc")
31 |       assert(docDescription.mediaType == mediaType)
32 |     }
33 | 
34 |     "can be created with a title 'testDoc' and a description 'a test document' and media type 'application/pdf'" in {
35 |       val mediaType: MediaType = MediaType.custom("application/pdf")
36 | 
37 |       val docDescription: DocumentDescription =
38 |         DocumentDescription.withTitleAndDescriptionAndMediaType("testDoc", "a test document", mediaType)
39 | 
40 |       assert(docDescription.title == "testDoc")
41 |       assert(docDescription.description.contains("a test document"))
42 |       assert(docDescription.mediaType == mediaType)
43 |     }
44 |   }
45 | }
46 | 


--------------------------------------------------------------------------------
/src/main/scala/tools/ambitious/pdfextractiontoolkit/library/extraction/Extractor.scala:
--------------------------------------------------------------------------------
 1 | package tools.ambitious.pdfextractiontoolkit.library.extraction
 2 | 
 3 | import tools.ambitious.pdfextractiontoolkit.library.extraction.extractionconstraints.ExtractionConstraint
 4 | import tools.ambitious.pdfextractiontoolkit.library.model._
 5 | 
 6 | import scala.concurrent.ExecutionContext.Implicits.global
 7 | import scala.concurrent.{Future, Promise}
 8 | 
 9 | class Extractor protected(private val documents: List[Document], private val extractors: List[ExtractionConstraint]) {
10 | 
11 |   private def extractTablesFromDocument(document: Document): Future[Map[ExtractionConstraint, Table]] = {
12 |     val walker: DocumentWalker = DocumentWalker.toWalkWithExtractionConstraint(document, extractors)
13 | 
14 |     val promise: Promise[Map[ExtractionConstraint, Table]] = Promise()
15 |     walker.getTables.onSuccess {
16 |       case tables => promise.success(tables)
17 |     }
18 |     promise.future
19 |   }
20 | 
21 |   def extractTables: Future[ExtractionResult] = {
22 |     val documentMap: Map[Document, Future[Map[ExtractionConstraint, Table]]] =
23 |       documents.map(document => document -> extractTablesFromDocument(document)).toMap
24 | 
25 |     Future.sequence(documentMap.map(entry => entry._2.map(i => (entry._1, i))))
26 |       .map(_.toMap)
27 |       .map(ExtractionResult.withResultsMap)
28 |   }
29 | }
30 | 
/** Factory methods for [[Extractor]], for one or many documents and a list or varargs of constraints. */
object Extractor {
  /** Builds an extractor for several documents and a list of constraints. */
  def fromDocumentsAndConstraints(documents: List[Document], extractors: List[ExtractionConstraint]): Extractor =
    new Extractor(documents, extractors)

  /** Varargs form of the factory for several documents. */
  def fromDocumentsAndConstraints(documents: List[Document], extractors: ExtractionConstraint*): Extractor =
    new Extractor(documents, extractors.toList)

  /** Builds an extractor for one document and a list of constraints. */
  def fromDocumentAndConstraints(document: Document, extractors: List[ExtractionConstraint]): Extractor =
    new Extractor(document :: Nil, extractors)

  /** Varargs form of the factory for one document. */
  def fromDocumentAndConstraints(document: Document, extractors: ExtractionConstraint*): Extractor =
    new Extractor(document :: Nil, extractors.toList)
}


--------------------------------------------------------------------------------
/src/test/scala/tools/ambitious/pdfextractiontoolkit/webapp/data/RootDAOImplSpec.scala:
--------------------------------------------------------------------------------
 1 | package tools.ambitious.pdfextractiontoolkit.webapp.data
 2 | 
 3 | import java.net.URL
 4 | import java.nio.file.Files
 5 | 
 6 | import org.scalatest.FreeSpec
 7 | import slick.jdbc.meta.MTable
 8 | import tools.ambitious.pdfextractiontoolkit.utils.AmbitiousIoUtils.{ByteArrayUtils, URLUtils}
 9 | 
10 | import scala.concurrent.duration._
11 | import scala.concurrent.{Await, Future}
12 | 
/**
 * Checks that `RootDAO.initialiseIfNeeded` creates the test database with the expected
 * tables when it is missing, and leaves an existing database file unchanged.
 */
class RootDAOImplSpec extends FreeSpec {

  // Upper bound for every asynchronous DAO call awaited in this spec.
  private val timeout = 30.seconds

  "the root DAO" - {
    val dao: RootDAO = RootDAO.forConfigName(DAOTestUtils.testDBConfigName)
    DAOTestUtils.testDBFile.getParent.toFile.mkdirs()

    "if the database doesn't exist" - {

      Files.deleteIfExists(DAOTestUtils.testDBFile)

      "a call to initialiseIfNeeded" - {

        Await.result(dao.initialiseIfNeeded(), timeout)

        "will create a database" in {
          assert(Files.exists(DAOTestUtils.testDBFile))
        }

        "will create the correct tables" in {
          val listTables: Future[Vector[MTable]] = dao.database.run(MTable.getTables)

          val tablesMetaData: Vector[MTable] = Await.result(listTables, timeout)

          val actualTableNames = tablesMetaData
            .map(_.name.name)
            .toSet
            .filterNot(_ == "sqlite_sequence") // Metadata table that's always created; not relevant to this check.

          val expectedTableNames = Set("Documents")

          assert(expectedTableNames == actualTableNames)
        }
      }
    }

    "if the database does exist" - {

      // Wait for initialisation to finish so the hashes below are taken from a settled file.
      Await.result(dao.initialiseIfNeeded(), timeout)

      "does not change the database" in {
        val testDBUrl: URL = DAOTestUtils.testDBFile.toUri.toURL

        val originalHash = testDBUrl
          .toBytes
          .computeHashAsHex

        // Await the second call too; otherwise the comparison could run before it does anything.
        Await.result(dao.initialiseIfNeeded(), timeout)

        val newHash = testDBUrl
          .toBytes
          .computeHashAsHex

        assert(originalHash == newHash)
      }
    }
  }
}
70 | 


--------------------------------------------------------------------------------
/src/test/scala/tools/ambitious/pdfextractiontoolkit/library/model/PageSpec.scala:
--------------------------------------------------------------------------------
 1 | package tools.ambitious.pdfextractiontoolkit.library.model
 2 | 
 3 | import org.apache.pdfbox.pdmodel.{PDDocument, PDPage}
 4 | import org.scalatest.FreeSpec
 5 | 
 6 | class PageSpec extends FreeSpec {
 7 |   val samplePDFPath = getClass.getResource("/simplePDFs/SimpleTest1Table.pdf")
 8 |   val twoPagedBlankDocumentPath = getClass.getResource("/simplePDFs/TwoPagedBlankDocument.pdf")
 9 | 
10 |   "In the context of a PDDocument with two pages (TwoPagedBlankDocument.pdf)" - {
11 |     val document: PDDocument = PDDocument.load(twoPagedBlankDocumentPath)
12 | 
13 |     "An IllegalArgumentException should be thrown when trying to call the constructor Page.fromPDDocument" in {
14 |       val interceptException = intercept[IllegalArgumentException] {
15 |         val page: Page = Page.fromPDDocument(document)
16 |       }
17 | 
18 |       assert(interceptException.getMessage === "Page constructor fromPDDocument must supply a PDDocument with one page only.")
19 |     }
20 |   }
21 | 
22 |   "A Page should instantiate from a PDDocument with one page" in {
23 |     val document: PDDocument = PDDocument.load(samplePDFPath)
24 |     val page: Page = Page.fromPDDocument(document)
25 |   }
26 | 
27 |   "A Page" - {
28 |     val document: PDDocument = PDDocument.load(samplePDFPath)
29 |     val page: Page = Page.fromPDDocument(document)
30 | 
31 |     "should have width equal to the document's PDPage width" in {
32 |       val pDPage: PDPage = document.getDocumentCatalog.getAllPages.get(0).asInstanceOf[PDPage]
33 |       assert(page.size.width == pDPage.getMediaBox.getWidth)
34 |     }
35 | 
36 |     "should have height equal to the document's PDPage height" in {
37 |       val pDPage: PDPage = document.getDocumentCatalog.getAllPages.get(0).asInstanceOf[PDPage]
38 |       assert(page.size.height == pDPage.getMediaBox.getHeight)
39 |     }
40 |   }
41 | 
42 |   "A Page instantiated from a PDDocument with one page" - {
43 |     val document: PDDocument = PDDocument.load(samplePDFPath)
44 |     val page: Page = Page.fromPDDocument(document)
45 | 
46 |     "should return the same PDDocument" in {
47 |       assert(page.asPDDocument == document)
48 |     }
49 |   }
50 | }
51 | 


--------------------------------------------------------------------------------
/src/test/scala/tools/ambitious/pdfextractiontoolkit/library/extraction/extractionconstraints/FirstOccurrenceOfStringExtractionConstraintSpec.scala:
--------------------------------------------------------------------------------
 1 | package tools.ambitious.pdfextractiontoolkit.library.extraction.extractionconstraints
 2 | 
 3 | import org.scalatest.FreeSpec
 4 | import tools.ambitious.pdfextractiontoolkit.Resources
 5 | import tools.ambitious.pdfextractiontoolkit.library.extraction._
 6 | import tools.ambitious.pdfextractiontoolkit.library.extraction.tableextractors.RegionBasedTableExtractor
 7 | import tools.ambitious.pdfextractiontoolkit.library.model.geometry.{PositivePoint, Rectangle, Size}
 8 | import tools.ambitious.pdfextractiontoolkit.library.model.{Document, Table}
 9 | import tools.ambitious.pdfextractiontoolkit.library.util.CSVUtil
10 | 
11 | import scala.concurrent.Await
12 | import scala.concurrent.duration._
13 | 
/**
 * Walks SimpleTest2Tables1Title.pdf with a first-occurrence constraint and compares the
 * extracted table with the CSV fixture for page 2.
 */
class FirstOccurrenceOfStringExtractionConstraintSpec extends FreeSpec {
  val simpleTest2Tables2TitleURL = getClass.getResource("/simplePDFs/SimpleTest2Tables1Title.pdf")

  s"A ${FirstOccurrenceOfStringExtractionConstraint.getClass.getSimpleName} with string 'An example Title' and appropriate Window for SimpleTest2Tables1Title.pdf" - {
    // Region the table is extracted from.
    val tableRegion = Rectangle.fromCornerAndSize(
      PositivePoint.at(168.48, 273.95),
      Size.fromWidthAndHeight(213.54, 303.5))

    // Region that is checked for the title text.
    val titleRegion = Rectangle.fromCornerAndSize(
      PositivePoint.at(185.38, 165.62),
      Size.fromWidthAndHeight(112.64, 16.16))

    val extractionConstraint = FirstOccurrenceOfStringExtractionConstraint.withTextAndTableExtractor(
      "An example Title",
      titleRegion,
      RegionBasedTableExtractor.forRegion(tableRegion))

    "when put through a walker with test document SimpleTest2Tables1Title.pdf" - {
      val document: Document = Document.fromPDFPath(simpleTest2Tables2TitleURL)
      val walker: DocumentWalker = DocumentWalker.toWalkWithExtractionConstraint(document, extractionConstraint)
      val tables: Map[ExtractionConstraint, Table] = Await.result(walker.getTables, 60.seconds)

      "should return the table at page 2" in {
        val expected: Table = CSVUtil.tableFromURL(Resources.simpleTest2Tables2TitlePage2CSVURL)
        // Map.apply fails the test with NoSuchElementException when nothing was extracted.
        val extracted: Table = tables(extractionConstraint)

        assert(extracted == expected)
      }
    }
  }
}
40 | 


--------------------------------------------------------------------------------
/src/test/scala/tools/ambitious/pdfextractiontoolkit/library/model/geometry/RectangleSpec.scala:
--------------------------------------------------------------------------------
 1 | package tools.ambitious.pdfextractiontoolkit.library.model.geometry
 2 | 
 3 | import org.scalatest.FreeSpec
 4 | 
 5 | class RectangleSpec extends FreeSpec {
 6 | 
 7 |   "A Rectangle" - {
 8 |     "defined by the points [1, 2] and [5, 4]" - {
 9 | 
10 |       val theRectangle = Rectangle.fromCornerCoords(1, 2, 5, 4)
11 | 
12 |       standardRectangleTest(theRectangle,
13 |         expectedArea = 8,
14 |         expectedWidth = 4,
15 |         expectedHeight = 2,
16 |         expectedTop = 2,
17 |         expectedBottom = 4,
18 |         expectedLeft = 1,
19 |         expectedRight = 5)
20 |     }
21 | 
22 |     "defined by the points [4, 3] and [1, 2]" - {
23 | 
24 |       val theRectangle = Rectangle.fromCornerCoords(4, 3, 1, 2)
25 | 
26 |       standardRectangleTest(theRectangle,
27 |         expectedArea = 3,
28 |         expectedWidth = 3,
29 |         expectedHeight = 1,
30 |         expectedTop = 2,
31 |         expectedBottom = 3,
32 |         expectedLeft = 1,
33 |         expectedRight = 4)
34 | 
35 |     }
36 |   }
37 | 
38 |   def standardRectangleTest(rectangle:Rectangle,
39 |                             expectedArea:Double,
40 |                             expectedWidth:Double,
41 |                             expectedHeight:Double,
42 |                             expectedLeft:Double,
43 |                             expectedRight:Double,
44 |                             expectedTop:Double,
45 |                             expectedBottom:Double) = {
46 |     s"should have an area of $expectedArea" in {
47 |       assert(rectangle.size.area == expectedArea)
48 |     }
49 | 
50 |     s"should have a width of $expectedWidth" in {
51 |       assert(rectangle.size.width == expectedWidth)
52 |     }
53 | 
54 |     s"should have a height of $expectedHeight" in {
55 |       assert(rectangle.size.height == expectedHeight)
56 |     }
57 | 
58 |     s"should have a left bound of $expectedLeft" in {
59 |       assert(rectangle.left == expectedLeft)
60 |     }
61 | 
62 |     s"should have a right bound of $expectedRight" in {
63 |       assert(rectangle.right == expectedRight)
64 |     }
65 | 
66 |     s"should have a top bound of $expectedTop" in {
67 |       assert(rectangle.top == expectedTop)
68 |     }
69 | 
70 |     s"should have a bottom bound of $expectedBottom" in {
71 |       assert(rectangle.bottom == expectedBottom)
72 |     }
73 |   }
74 | }
75 | 


--------------------------------------------------------------------------------
/src/test/scala/tools/ambitious/pdfextractiontoolkit/webapp/services/documentstorage/DocumentLibraryImplSpec.scala:
--------------------------------------------------------------------------------
 1 | package tools.ambitious.pdfextractiontoolkit.webapp.services.documentstorage
 2 | 
 3 | import java.net.URL
 4 | 
 5 | import org.scalamock.scalatest.MockFactory
 6 | import org.scalatest.FreeSpec
 7 | import tools.ambitious.pdfextractiontoolkit.Resources
 8 | import tools.ambitious.pdfextractiontoolkit.webapp.data.DocumentInformationDao
 9 | 
10 | import scala.concurrent.ExecutionContext.Implicits.global
11 | import scala.concurrent.duration._
12 | import scala.concurrent.{Await, Future}
13 | 
/**
 * Verifies that `DocumentLibraryImpl` forwards store, retrieve, delete and list calls to
 * its file store and DAO collaborators, which are mocked here.
 */
class DocumentLibraryImplSpec extends FreeSpec with MockFactory {

  // Upper bound for every library call awaited in this spec.
  private val timeout = 30.seconds

  s"a ${DocumentLibraryImpl.getClass.getSimpleName}" - {
    val mockFileStore: DocumentFileStore = mock[DocumentFileStore]
    val mockDao: DocumentInformationDao = mock[DocumentInformationDao]

    val documentLibrary: DocumentLibraryImpl = new DocumentLibraryImpl(mockFileStore, mockDao)

    val description: DocumentDescription = DocumentDescription.withTitle("testTitle")
    val source: URL = Resources.quickBrownFoxTxt

    val expectedHash: String = "58B433FA7E8B0F94B2FF02178E7768F5A329EF346D908C7B917824E5A4CA9575"

    val expectedID: DocumentIdentifier = DocumentIdentifier.withHashAndDescription(expectedHash, description)

    "will store a document when store is called" in {

      (mockFileStore.storeFileFor _).expects(expectedID, source)
      // Stubs return already-completed futures; `Future(Unit)` would wrap the companion object.
      (mockDao.storeDocumentID _).expects(expectedID).returns(Future.successful(()))

      Await.result(documentLibrary.store(description, source), timeout)
    }

    "will retrieve a document when retrieve is called" in {
      (mockFileStore.retrieveFileFor _).expects(expectedID).returns(Future.successful(source))

      val retrievedDocument = Await.result(documentLibrary.retrieve(expectedID), timeout)
      assert(retrievedDocument == source)
    }

    "will delete a document when delete is called" in {
      (mockFileStore.deleteFileFor _).expects(expectedID)
      (mockDao.deleteDocumentID _).expects(expectedID).returns(Future.successful(()))

      Await.result(documentLibrary.delete(expectedID), timeout)
    }

    "will list all documents when list is called" in {
      (mockDao.retrieveAllIDs _).expects().returns(Future.successful(Seq.empty[DocumentIdentifier]))
      Await.result(documentLibrary.list(), timeout)
    }
  }

}
58 | 


--------------------------------------------------------------------------------
/src/main/scala/tools/ambitious/pdfextractiontoolkit/library/model/Document.scala:
--------------------------------------------------------------------------------
 1 | package tools.ambitious.pdfextractiontoolkit.library.model
 2 | 
 3 | import java.net.URL
 4 | import java.util
 5 | 
 6 | import org.apache.pdfbox.pdmodel.font._
 7 | import org.apache.pdfbox.pdmodel.{PDDocument, PDPage}
 8 | import org.apache.pdfbox.util.Splitter
 9 | 
10 | import scala.collection.JavaConverters._
11 | 
/**
 * A PDF document together with its pages. Pages are addressed with 1-based numbers.
 *
 * @param pDDocument the PDFBox document this instance was built from
 * @param pages      the pages of the document, in order
 */
class Document private (private val pDDocument: PDDocument, val pages: List[Page]) {
  def numberOfPages: Int = pages.length

  /**
   * Returns the page with the given 1-based number.
   *
   * @throws IllegalArgumentException if `number` is outside `1..numberOfPages`
   */
  def getPage(number: Int): Page = {
    val outOfRange = number < 1 || number > numberOfPages
    if (outOfRange)
      throw new IllegalArgumentException("Invalid page number.")

    pages(number - 1)
  }

  // NOTE(review): only the master PDDocument is closed here; whether the per-page
  // documents behind `pages` also need closing cannot be told from this class — confirm.
  def close() = pDDocument.close()

  /** The 1-based number of `page` within this document, or `None` if it is not one of `pages`. */
  def pageNumberOf(page: Page): Option[Int] =
    pages.indexOf(page) match {
      case index if index >= 0 => Some(index + 1)
      case _ => None
    }
}
29 | 
/** Factory for [[Document]] plus the PDFBox splitting helper it relies on. */
object Document {
  /**
   * Loads the PDF at `path`, splits it into single-page documents and wraps them as pages.
   *
   * If splitting or page construction fails, the loaded document is closed before the
   * failure is rethrown, so it does not stay open.
   */
  def fromPDFPath(path: URL): Document = {
    val pDDocument: PDDocument = PDDocument.load(path)

    try {
      val splitPDDocuments: List[PDDocument] = Document.splitPDDocumentIntoPDDocumentForEachPage(pDDocument)
      val pages = Page.listFromSinglePagePDDocuments(splitPDDocuments)

      new Document(pDDocument, pages)
    } catch {
      case scala.util.control.NonFatal(cause) =>
        // Keep the original failure as the primary one; a failing close is attached to it.
        try pDDocument.close()
        catch { case scala.util.control.NonFatal(closeFailure) => cause.addSuppressed(closeFailure) }
        throw cause
    }
  }

  /** Splits `document` with a PDFBox `Splitter` and copies the master fonts onto each part. */
  def splitPDDocumentIntoPDDocumentForEachPage(document: PDDocument): List[PDDocument] = {
    val splitter: Splitter = new Splitter
    val splitPages: List[PDDocument] = splitter.split(document).asScala.toList

    // There is a bug in PDFBox Splitter that doesn't set fonts from the old
    // document to each new document. This manifests itself as a NullPointerException
    // when Tabula tries to extract a page. The following fixes that bug.
    setFontsFromMasterPDDocumentToSplitPDDocuments(document, splitPages)

    splitPages
  }

  /**
   * Copies the font map of master page `i` onto the only page of split document `i`.
   * Assumes the splitter produced one document per master page, in page order.
   */
  private def setFontsFromMasterPDDocumentToSplitPDDocuments(masterPDDocument: PDDocument, splitPDDocuments: List[PDDocument]) = {
    // Fetch the master page list once instead of on every iteration.
    val masterPages = masterPDDocument.getDocumentCatalog.getAllPages

    // Walk the split documents directly: indexing a List by position is linear per access.
    splitPDDocuments.take(masterPages.size).zipWithIndex.foreach { case (splitPDDocument, index) =>
      val masterPage: PDPage = masterPages.get(index).asInstanceOf[PDPage]
      val splitPage: PDPage = splitPDDocument.getDocumentCatalog.getAllPages.get(0).asInstanceOf[PDPage]

      val fontsFromMasterPage: util.Map[String, PDFont] = masterPage.getResources.getFonts

      splitPage.getResources.setFonts(fontsFromMasterPage)
    }
  }
}
63 | 


--------------------------------------------------------------------------------
/src/test/scala/tools/ambitious/pdfextractiontoolkit/library/extraction/tablemergers/SimpleTableMergerSpec.scala:
--------------------------------------------------------------------------------
 1 | package tools.ambitious.pdfextractiontoolkit.library.extraction.tablemergers
 2 | 
 3 | import org.scalatest.FreeSpec
 4 | import tools.ambitious.pdfextractiontoolkit.Resources
 5 | import tools.ambitious.pdfextractiontoolkit.library.model.Table
 6 | import tools.ambitious.pdfextractiontoolkit.library.util.CSVUtil
 7 | 
 8 | class SimpleTableMergerSpec extends FreeSpec {
 9 | 
10 |   private val table1: Table = CSVUtil.tableFromURL(Resources.simpleTest2Tables2TitlePage1CSVURL)
11 |   private val table2: Table = CSVUtil.tableFromURL(Resources.simpleTest2Tables2TitlePage2CSVURL)
12 |   private val tables: List[Table] = List(table1, table2)
13 | 
14 |   s"A ${SimpleTableMerger.getClass.getSimpleName}" - {
15 |     "that does not ignore header rows" - {
16 |       val simpleTableMerger: SimpleTableMerger = SimpleTableMerger.create
17 | 
18 |       "when it merges the two tables" - {
19 |         val merged: Table = simpleTableMerger.mergeTables(tables).get
20 | 
21 |         shouldReturnTableWith(table = merged, row = 1, column = 3, expected = "4")
22 |         shouldReturnTableWith(table = merged, row = 6, column = 1, expected = "4")
23 |         shouldReturnTableWith(table = merged, row = 7, column = 4, expected = "6")
24 |         shouldReturnTableWith(table = merged, row = 9, column = 2, expected = "9")
25 |         shouldReturnTableWith(table = merged, row = 20, column = 3, expected = "5")
26 |       }
27 |     }
28 | 
29 |     "created to ignore header rows" - {
30 |       val simpleTableMerger: SimpleTableMerger = SimpleTableMerger.createIgnoringHeaderRows
31 | 
32 |       "when it merges the two tables" - {
33 |         val merged: Table = simpleTableMerger.mergeTables(tables).get
34 | 
35 |         shouldReturnTableWith(table = merged, row = 1, column = 3, expected = "5")
36 |         shouldReturnTableWith(table = merged, row = 6, column = 1, expected = "2")
37 |         shouldReturnTableWith(table = merged, row = 7, column = 4, expected = "1")
38 |         shouldReturnTableWith(table = merged, row = 9, column = 2, expected = "8")
39 |         shouldReturnTableWith(table = merged, row = 20, column = 3, expected = "1")
40 |       }
41 |     }
42 |   }
43 | 
44 |   private def shouldReturnTableWith(table: Table, row: Int, column: Int, expected: String):Unit = {
45 |     s"should return a table with the value $expected at row $row and column $column" in {
46 |       assert(table.getCell(row, column).text == expected)
47 |     }
48 |   }
49 | }
50 | 


--------------------------------------------------------------------------------
/src/test/scala/tools/ambitious/pdfextractiontoolkit/library/model/DocumentSpec.scala:
--------------------------------------------------------------------------------
 1 | package tools.ambitious.pdfextractiontoolkit.library.model
 2 | 
 3 | import org.apache.pdfbox.pdmodel.PDDocument
 4 | import org.scalatest.FreeSpec
 5 | 
 6 | class DocumentSpec extends FreeSpec {
 7 |   val samplePDFPath = getClass.getResource("/simplePDFs/SimpleTest1Table.pdf")
 8 |   val twoPagedDocumentPath = getClass.getResource("/simplePDFs/TwoPagedBlankDocument.pdf")
 9 | 
10 |   "A Document should instantiate from PDF" in {
11 |     val document = Document.fromPDFPath(samplePDFPath)
12 |   }
13 | 
14 |   "A Document instantiated with SimpleTest1Table.pdf" - {
15 |     val document = Document.fromPDFPath(samplePDFPath)
16 | 
17 |     "should have one page" in {
18 |       assert(document.numberOfPages == 1)
19 |     }
20 |   }
21 | 
22 |   "A Document instantiated with TwoPagedBlankDocument.pdf" - {
23 |     val document = Document.fromPDFPath(twoPagedDocumentPath)
24 | 
25 |     "should have two pages" in {
26 |       assert(document.numberOfPages == 2)
27 |     }
28 | 
29 |     "should be able to get the first page" in {
30 |       val page: Page = document.getPage(1)
31 |     }
32 | 
33 |     "should be able to get the second page" in {
34 |       val page: Page = document.getPage(2)
35 |     }
36 | 
37 |     "should throw an IllegalArgumentException when trying to get the third page" in {
38 |       val interceptException = intercept[IllegalArgumentException] {
39 |         val page: Page = document.getPage(3)
40 |       }
41 | 
42 |       assert(interceptException.getMessage === "Invalid page number.")
43 |     }
44 | 
45 |     "should say that its first page is page 1" in {
46 |       val page1: Page = document.getPage(1)
47 |       val pageNumOfPage1: Option[Int] = document.pageNumberOf(page1)
48 |       assert(pageNumOfPage1 == Option.apply(1))
49 |     }
50 |     
51 |     "should say that its second page is page 2" in {
52 |       val page2: Page = document.getPage(2)
53 |       val pageNumOfPage2: Option[Int] = document.pageNumberOf(page2)
54 |       assert(pageNumOfPage2 == Option.apply(2))
55 |     }
56 |   }
57 | 
58 |   "A List of PDDocuments split from TwoPagedBlankDocument.pdf" - {
59 |     val document: PDDocument = PDDocument.load(twoPagedDocumentPath)
60 |     val pages: List[PDDocument] = Document.splitPDDocumentIntoPDDocumentForEachPage(document)
61 | 
62 |     "should have only one page in each PDDocument" in {
63 |       for (pDDocument: PDDocument <- pages)
64 |         assert(pDDocument.getDocumentCatalog.getAllPages.size == 1)
65 |     }
66 |   }
67 | }
68 | 


--------------------------------------------------------------------------------
/src/main/scala/tools/ambitious/pdfextractiontoolkit/library/extraction/DocumentWalker.scala:
--------------------------------------------------------------------------------
 1 | package tools.ambitious.pdfextractiontoolkit.library.extraction
 2 | 
 3 | import tools.ambitious.pdfextractiontoolkit.library.extraction.extractionconstraints.ExtractionConstraint
 4 | import tools.ambitious.pdfextractiontoolkit.library.model.{Document, Page, Table}
 5 | 
 6 | import scala.concurrent.ExecutionContext.Implicits.global
 7 | import scala.concurrent.{Future, Promise}
 8 | 
 9 | /**
10 |  * Walks the pages of a [[Document]] once, notifying every [[ExtractionConstraint]] of the start
11 |  * of the walk, of each page, and of the end, then collects the tables the constraints produce.
12 |  *
13 |  * The constructor is protected; the companion object's factory methods create a walker and
14 |  * start it. The walk runs in a `Future` on the global execution context and its outcome is
15 |  * published through [[getTables]].
16 |  *
17 |  * @param document              the document whose pages are visited
18 |  * @param extractionConstraints the constraints notified during the walk
19 |  */
20 | class DocumentWalker protected (val document:Document, val extractionConstraints: Set[ExtractionConstraint]) {
21 |   // One state bundle per constraint; it is handed to every callback made on that constraint.
22 |   private val stateBundles: Map[ExtractionConstraint, StateBundle] = extractionConstraints.map(_ -> StateBundle.create).toMap
23 |   private val promise: Promise[Map[ExtractionConstraint, Table]] = Promise()
24 | 
25 |   /**
26 |    * Starts the walk in the background.
27 |    *
28 |    * The promise is completed with the outcome of the whole walk: the extracted tables on success,
29 |    * or the thrown exception on failure. Capturing the failure matters because otherwise the
30 |    * future returned by [[getTables]] would never complete when a constraint throws.
31 |    */
32 |   private def run() =
33 |     Future {
34 |       promise.complete(scala.util.Try {
35 |         traverseDocument()
36 |         getCompletedTables
37 |       })
38 |     }
39 | 
40 |   /** Drives the walk: start callback, one callback per page of `document.pages`, end callback. */
41 |   private def traverseDocument() = {
42 |     callOnStartOnExtractionConstraints()
43 | 
44 |     document.pages.foreach(page => callOnPageOnExtractionConstraints(page))
45 | 
46 |     callOnEndOnExtractionConstraints()
47 |   }
48 | 
49 |   /** Asks each constraint for its table and keeps only the constraints that produced one. */
50 |   private def getCompletedTables: Map[ExtractionConstraint, Table] =
51 |     extractionConstraints
52 |       .map(extractionConstraint => extractionConstraint -> extractionConstraint.tableFromState(stateBundles(extractionConstraint)))
53 |       .collect { case (extractionConstraint, Some(table)) => extractionConstraint -> table }
54 |       .toMap
55 | 
56 |   /**
57 |    * @return a future holding the table of every constraint that produced one, or failed with the
58 |    *         exception that aborted the walk
59 |    */
60 |   def getTables: Future[Map[ExtractionConstraint, Table]] =
61 |     promise.future
62 | 
63 |   private def callOnStartOnExtractionConstraints() =
64 |     extractionConstraints.foreach(extractionConstraint => extractionConstraint.onStart(stateBundles(extractionConstraint)))
65 | 
66 |   private def callOnPageOnExtractionConstraints(page: Page) =
67 |     extractionConstraints.foreach(extractionConstraint => extractionConstraint.onPage(page, document, stateBundles(extractionConstraint)))
68 | 
69 |   private def callOnEndOnExtractionConstraints() =
70 |     extractionConstraints.foreach(extractionConstraint => extractionConstraint.onEnd(stateBundles(extractionConstraint)))
71 | }
72 | 
73 | /** Factory methods that build a [[DocumentWalker]] and immediately start its walk. */
74 | object DocumentWalker {
75 |   /** Creates a walker for `document` over the given set of constraints and starts it. */
76 |   def toWalkWithExtractionConstraint(document: Document, extractionConstraints: Set[ExtractionConstraint]): DocumentWalker = {
77 |     val walker = new DocumentWalker(document, extractionConstraints)
78 |     walker.run()
79 |     walker
80 |   }
81 | 
82 |   /** Convenience overload; the sequence is converted to a set, so duplicate constraints collapse. */
83 |   def toWalkWithExtractionConstraint(document: Document, extractionConstraints: Seq[ExtractionConstraint]): DocumentWalker =
84 |     toWalkWithExtractionConstraint(document, extractionConstraints.toSet)
85 | 
86 |   /** Convenience overload for walking a document with a single constraint. */
87 |   def toWalkWithExtractionConstraint(document: Document, extractionConstraint: ExtractionConstraint): DocumentWalker =
88 |     toWalkWithExtractionConstraint(document, Seq(extractionConstraint))
89 | }


--------------------------------------------------------------------------------
/src/test/scala/tools/ambitious/pdfextractiontoolkit/webapp/services/documentstorage/DocumentFileStoreImplSpec.scala:
--------------------------------------------------------------------------------
 1 | package tools.ambitious.pdfextractiontoolkit.webapp.services.documentstorage
 2 | 
 3 | import java.io.File
 4 | import java.net.URL
 5 | import java.nio.file.{Files, Path}
 6 | 
 7 | import org.scalatest._
 8 | import spray.http.MediaType
 9 | import _root_.tools.ambitious.pdfextractiontoolkit.Resources
10 | import _root_.tools.ambitious.pdfextractiontoolkit.utils.AmbitiousIoUtils._
11 | import _root_.tools.ambitious.pdfextractiontoolkit.webapp.data.DAOTestUtils
12 | 
13 | import scala.concurrent.Await
14 | import scala.concurrent.duration._
15 | import scala.io.Source
16 | 
17 | /**
18 |  * Exercises [[DocumentFileStoreImpl]] against a working directory on disk: storing a file,
19 |  * checking the stored name and contents, and deleting it again.
20 |  *
21 |  * The `after` hook cleans the working directory once each test has run.
22 |  */
23 | class DocumentFileStoreImplSpec extends FlatSpec with GivenWhenThen with BeforeAndAfter with OneInstancePerTest {
24 |   private val workingDirectory: Path = DAOTestUtils.createAndGetWorkingDirectory()
25 |   private val documentFileStore: DocumentFileStore = new DocumentFileStoreImpl(workingDirectory)
26 | 
27 |   private val mediaType: MediaType = MediaType.custom("text/plain")
28 |   private val documentDescription: DocumentDescription = DocumentDescription.withTitleAndMediaType("Test", mediaType)
29 |   private val source: URL = Resources.quickBrownFoxTxt
30 | 
31 |   private val documentID: DocumentIdentifier = DocumentIdentifier.computeFor(documentDescription, source.toBytes)
32 | 
33 |   private val expectedSourceContents: String = "The quick brown fox jumped over the lazy dogs"
34 |   // The tests expect the store to name the stored file after the identifier's hash.
35 |   private val expectedOutputFile: File = workingDirectory.resolve(documentID.hash.toString).toFile
36 | 
37 |   /** Stores the test resource under `documentID`, blocking until the store's future completes. */
38 |   def storeTestFileInFileStore(): Unit = {
39 |     Await.result(documentFileStore.storeFileFor(documentID, source), 30.seconds)
40 |   }
41 | 
42 |   /** Reads the whole of `file` as text and always releases the underlying file handle. */
43 |   private def readContentsOf(file: File): String = {
44 |     val fileSource = Source.fromFile(file)
45 |     try fileSource.mkString
46 |     finally fileSource.close()
47 |   }
48 | 
49 |   after {
50 |     DAOTestUtils.cleanWorkingDirectory()
51 |   }
52 | 
53 |   "A file store" should "store files with the expected name" in {
54 |     When("a file is stored in the file store")
55 |     storeTestFileInFileStore()
56 | 
57 |     Then("the file should exist on disk")
58 |     assert(expectedOutputFile.isFile)
59 |   }
60 | 
61 |   it should "store files with the expected content" in {
62 |     When("a file is stored in the file store")
63 |     storeTestFileInFileStore()
64 | 
65 |     Then("the file stored on disk should contain the contents of the source file")
66 |     assert(readContentsOf(expectedOutputFile) == expectedSourceContents)
67 |   }
68 | 
69 |   it should "delete stored files" in {
70 |     Given("a file is in the file store")
71 |     storeTestFileInFileStore()
72 | 
73 |     When("the file is deleted")
74 |     Await.result(documentFileStore.deleteFileFor(documentID), 30.seconds)
75 | 
76 |     Then("the file should not exist on disk")
77 |     assert(!expectedOutputFile.exists())
78 |   }
79 | 
80 |   it should "return quietly if asked to delete a missing file" in {
81 |     Given("no file has been stored in the file store")
82 | 
83 |     When("the file is deleted")
84 |     // No explicit assertion: Await.result rethrows a failed future, so reaching the end is the check.
85 |     Await.result(documentFileStore.deleteFileFor(documentID), 30.seconds)
86 | 
87 |     Then("the file store should ignore the missing file")
88 |   }
89 | }
90 | 


--------------------------------------------------------------------------------
/src/test/scala/tools/ambitious/pdfextractiontoolkit/library/extraction/ExtractorSpec.scala:
--------------------------------------------------------------------------------
 1 | package tools.ambitious.pdfextractiontoolkit.library.extraction
 2 | 
 3 | import org.scalatest.FreeSpec
 4 | import tools.ambitious.pdfextractiontoolkit.Resources
 5 | import tools.ambitious.pdfextractiontoolkit.library.extraction.extractionconstraints.{FirstOccurrenceOfStringExtractionConstraint, PageNumberExtractionConstraint}
 6 | import tools.ambitious.pdfextractiontoolkit.library.extraction.tableextractors.RegionBasedTableExtractor
 7 | import tools.ambitious.pdfextractiontoolkit.library.model.geometry.{PositivePoint, Rectangle, Size}
 8 | import tools.ambitious.pdfextractiontoolkit.library.model.{Document, Table}
 9 | import tools.ambitious.pdfextractiontoolkit.library.util.CSVUtil
10 | 
11 | import scala.concurrent.Await
12 | import scala.concurrent.duration._
13 | 
14 | /**
15 |  * End-to-end tests for [[Extractor]]: a table is extracted from a sample PDF and compared with
16 |  * the table loaded from the matching CSV resource.
17 |  */
18 | class ExtractorSpec extends FreeSpec {
19 |   s"An ${Extractor.getClass.getSimpleName} with a document and a ${PageNumberExtractionConstraint.getClass.getSimpleName}" - {
20 |     val document: Document = Document.fromPDFPath(Resources.simpleTest1TableURL)
21 | 
22 |     val region: Rectangle = Rectangle.fromCornerCoords(108, 81, 312, 305)
23 |     val tableExtractor = RegionBasedTableExtractor.forRegion(region)
24 |     val extractionConstraint = PageNumberExtractionConstraint.withPageNumberAndTableExtractor(1, tableExtractor)
25 | 
26 |     val extractor: Extractor = Extractor.fromDocumentAndConstraints(document, extractionConstraint)
27 | 
28 |     "should be able to extract the table and have it match the values from it's corresponding CSV file" in {
29 |       // Close the document even when extraction fails or times out.
30 |       val extractionResult: ExtractionResult =
31 |         try Await.result(extractor.extractTables, 60.seconds)
32 |         finally document.close()
33 | 
34 |       val table: Table = extractionResult(document)(extractionConstraint)
35 |       val tableFromCSV: Table = CSVUtil.tableFromURL(Resources.simpleTest1TableCSVURL)
36 | 
37 |       assert(table == tableFromCSV)
38 |     }
39 |   }
40 | 
41 |   s"An ${Extractor.getClass.getSimpleName} with one document and a ${FirstOccurrenceOfStringExtractionConstraint.getClass.getSimpleName}" - {
42 |     val document: Document = Document.fromPDFPath(Resources.simpleTest2Tables2TitleURL)
43 | 
44 |     val region: Rectangle = Rectangle.fromCornerAndSize(PositivePoint.at(168.48, 273.95), Size.fromWidthAndHeight(213.54, 303.5))
45 |     val tableExtractor = RegionBasedTableExtractor.forRegion(region)
46 | 
47 |     val textRegion: Rectangle = Rectangle.fromCornerAndSize(PositivePoint.at(185.38, 165.62), Size.fromWidthAndHeight(112.64, 16.16))
48 |     val extractionConstraint = FirstOccurrenceOfStringExtractionConstraint.withTextAndTableExtractor("An example Title", textRegion, tableExtractor)
49 | 
50 |     val extractor: Extractor = Extractor.fromDocumentAndConstraints(document, extractionConstraint)
51 | 
52 |     "should be able to extract the table and have it match the values from it's corresponding CSV file" in {
53 |       // Close the document even when extraction fails or times out.
54 |       val extractionResult: ExtractionResult =
55 |         try Await.result(extractor.extractTables, 60.seconds)
56 |         finally document.close()
57 | 
58 |       val table: Table = extractionResult(document)(extractionConstraint)
59 |       val tableFromCSV: Table = CSVUtil.tableFromURL(Resources.simpleTest2Tables2TitlePage2CSVURL)
60 | 
61 |       assert(table == tableFromCSV)
62 |     }
63 |   }
64 | }
65 | 


--------------------------------------------------------------------------------
/src/test/scala/tools/ambitious/pdfextractiontoolkit/library/extraction/extractionconstraints/PageNumberExtractionConstraintSpec.scala:
--------------------------------------------------------------------------------
 1 | package tools.ambitious.pdfextractiontoolkit.library.extraction.extractionconstraints
 2 | 
 3 | import org.scalatest.FreeSpec
 4 | import tools.ambitious.pdfextractiontoolkit.Resources
 5 | import tools.ambitious.pdfextractiontoolkit.library.extraction._
 6 | import tools.ambitious.pdfextractiontoolkit.library.extraction.tableextractors.RegionBasedTableExtractor
 7 | import tools.ambitious.pdfextractiontoolkit.library.extraction.tablemergers.SimpleTableMerger
 8 | import tools.ambitious.pdfextractiontoolkit.library.model.geometry.{PositivePoint, Rectangle, Size}
 9 | import tools.ambitious.pdfextractiontoolkit.library.model.{Document, Table}
10 | import tools.ambitious.pdfextractiontoolkit.library.util.CSVUtil
11 | 
12 | import scala.concurrent.Await
13 | import scala.concurrent.duration._
14 | 
15 | /**
16 |  * Tests for [[PageNumberExtractionConstraint]]: extraction from a single page, extraction from a
17 |  * page range with the per-page tables merged, and rejection of page number 0.
18 |  */
19 | class PageNumberExtractionConstraintSpec extends FreeSpec {
20 |   val region: Rectangle = Rectangle.fromCornerAndSize(PositivePoint.at(168.48, 240), Size.fromWidthAndHeight(213.54, 340))
21 |   val tableExtractor = RegionBasedTableExtractor.forRegion(region)
22 | 
23 |   s"A ${PageNumberExtractionConstraint.getClass.getSimpleName}" - {
24 |     "for page 2" - {
25 |       val extractionConstraint: PageNumberExtractionConstraint = PageNumberExtractionConstraint.withPageNumberAndTableExtractor(2, tableExtractor)
26 | 
27 |       "when put through a walker with test document 2" - {
28 |         val document: Document = Document.fromPDFPath(Resources.simpleTest2Tables2TitleURL)
29 |         val walker: DocumentWalker = DocumentWalker.toWalkWithExtractionConstraint(document, extractionConstraint)
30 |         val tables: Map[ExtractionConstraint, Table] = Await.result(walker.getTables, 60.seconds)
31 | 
32 |         "should return the table at page 2" in {
33 |           val table: Option[Table] = tables.get(extractionConstraint)
34 |           val tableFromCSV: Table = CSVUtil.tableFromURL(Resources.simpleTest2Tables2TitlePage2CSVURL)
35 | 
36 |           // Compare as Options so a missing table fails the assertion instead of throwing from `.get`.
37 |           assert(table == Some(tableFromCSV))
38 |         }
39 |       }
40 |     }
41 | 
42 |     "for a page range from 1 to 2" - {
43 |       val extractionConstraint: PageNumberExtractionConstraint =
44 |         PageNumberExtractionConstraint.withPageRangeAndTableExtractor(Range.inclusive(1, 2), tableExtractor)
45 | 
46 |       "when put through a walker with test document 2" - {
47 |         val document: Document = Document.fromPDFPath(Resources.simpleTest2Tables2TitleURL)
48 |         val walker: DocumentWalker = DocumentWalker.toWalkWithExtractionConstraint(document, extractionConstraint)
49 |         val tables: Map[ExtractionConstraint, Table] = Await.result(walker.getTables, 60.seconds)
50 | 
51 |         "should return the two tables merged" in {
52 |           val table: Option[Table] = tables.get(extractionConstraint)
53 | 
54 |           val tableMerger: SimpleTableMerger = SimpleTableMerger.create
55 | 
56 |           val table1: Table = CSVUtil.tableFromURL(Resources.simpleTest2Tables2TitlePage1CSVURL)
57 |           val table2: Table = CSVUtil.tableFromURL(Resources.simpleTest2Tables2TitlePage2CSVURL)
58 |           val tablesToMerge: List[Table] = List(table1, table2)
59 | 
60 |           val tableFromCSV: Table = tableMerger.mergeTables(tablesToMerge).get
61 | 
62 |           assert(table == Some(tableFromCSV))
63 |         }
64 |       }
65 |     }
66 | 
67 |     // Registered with `in` so the check runs as a test; inside a `-` scope it would execute while
68 |     // the suite is being constructed and never be reported as a test.
69 |     "instantiated for a page number less than 1 should throw an IllegalArgumentException" in {
70 |       val instantiatePageNumberTableExtractor = intercept[IllegalArgumentException] {
71 | 
72 |         val dummyExtractor = RegionBasedTableExtractor.forRegion(Rectangle.fromCornerCoords(0, 0, 0, 0))
73 | 
74 |         PageNumberExtractionConstraint.withPageNumberAndTableExtractor(0, dummyExtractor)
75 |       }
76 |       assert(instantiatePageNumberTableExtractor.getMessage === "Page numbers can only be positive numbers.")
77 |     }
78 |   }
79 | }
80 | 


--------------------------------------------------------------------------------
/src/test/scala/tools/ambitious/pdfextractiontoolkit/library/expensereports/SummaryOfParliamentaryExpenditureByPeriodExtractionSpec.scala:
--------------------------------------------------------------------------------
 1 | package tools.ambitious.pdfextractiontoolkit.library.expensereports
 2 | 
 3 | import org.scalatest.{BeforeAndAfterEach, FlatSpec}
 4 | import tools.ambitious.pdfextractiontoolkit.library.extraction.extractionconstraints.FirstOccurrenceOfStringExtractionConstraint
 5 | import tools.ambitious.pdfextractiontoolkit.library.extraction.tableextractors.RegionBasedTableExtractor
 6 | import tools.ambitious.pdfextractiontoolkit.library.extraction.{ExtractionResult, Extractor}
 7 | import tools.ambitious.pdfextractiontoolkit.library.model.geometry.{PositivePoint, Rectangle, Size}
 8 | import tools.ambitious.pdfextractiontoolkit.library.model.{Document, Table}
 9 | 
10 | import scala.concurrent.Await
11 | import scala.concurrent.duration._
12 | 
13 | /**
14 |  * Runs a [[FirstOccurrenceOfStringExtractionConstraint]] over two sample expense reports and
15 |  * checks the size and selected cells of the extracted tables.
16 |  *
17 |  * Extraction happens once, while the suite is constructed; the tests only inspect its result.
18 |  */
19 | class SummaryOfParliamentaryExpenditureByPeriodExtractionSpec extends FlatSpec with BeforeAndAfterEach {
20 | 
21 |   val abbottTonyReport = getClass.getResource("/expenseReports/P34_ABBOTT_Tony.pdf")
22 |   val leighAndrewReport = getClass.getResource("/expenseReports/P34_LEIGH_Andrew.pdf")
23 | 
24 |   val textToFind = "Summary of Parliamentary Expenditure by Period"
25 | 
26 |   val tableRegion = Rectangle.fromCornerAndSize(PositivePoint.at(34, 138), Size.fromWidthAndHeight(540, 608))
27 |   val tableExtractor = RegionBasedTableExtractor.forRegion(tableRegion)
28 | 
29 |   val textRegion = Rectangle.fromCornerAndSize(PositivePoint.at(165, 90), Size.fromWidthAndHeight(280, 25))
30 | 
31 |   val extractionConstraint = FirstOccurrenceOfStringExtractionConstraint.withTextAndTableExtractor(textToFind, textRegion, tableExtractor)
32 | 
33 |   val abbottTonyDocument = Document.fromPDFPath(abbottTonyReport)
34 |   val leighAndrewDocument = Document.fromPDFPath(leighAndrewReport)
35 | 
36 |   val extractor = Extractor.fromDocumentsAndConstraints(List(abbottTonyDocument, leighAndrewDocument), extractionConstraint)
37 |   val extractionResult: ExtractionResult = Await.result(extractor.extractTables, 60.seconds)
38 | 
39 |   "the Extractor" should "extract a single table from the Tony Abbott report" in {
40 |     assert(extractionResult.getResults(abbottTonyDocument)(extractionConstraint).isDefined)
41 |   }
42 | 
43 |   it should "extract a single table from the Andrew Leigh report" in {
44 |     assert(extractionResult.getResults(leighAndrewDocument)(extractionConstraint).isDefined)
45 |   }
46 | 
47 |   val expectedRowsTonyAbbott = 24
48 |   it should s"extract a table from the Tony Abbott Report with $expectedRowsTonyAbbott rows" in {
49 |     val table: Table = extractionResult(abbottTonyDocument)(extractionConstraint)
50 | 
51 |     assert(table.numberOfRows == expectedRowsTonyAbbott)
52 |   }
53 | 
54 |   val expectedRowsAndrewLeigh = 26
55 |   it should s"extract a table from the Andrew Leigh Report with $expectedRowsAndrewLeigh rows" in {
56 |     val table: Table = extractionResult(leighAndrewDocument)(extractionConstraint)
57 | 
58 |     assert(table.numberOfRows == expectedRowsAndrewLeigh)
59 |   }
60 | 
61 |   it should "extract the expected values from the Tony Abbott report" in {
62 |     val table: Table = extractionResult(abbottTonyDocument)(extractionConstraint)
63 | 
64 |     assertValueAtCell(expectedText = "Expenses From", table = table, rowNumber = 1, columnNumber = 2)
65 |     assertValueAtCell(expectedText = "$628,736.33", table = table, rowNumber = 23, columnNumber = 2)
66 |   }
67 | 
68 |   it should "extract the expected values from the Andrew Leigh report" in {
69 |     val table: Table = extractionResult(leighAndrewDocument)(extractionConstraint)
70 | 
71 |     assertValueAtCell(expectedText = "Expenses From", table = table, rowNumber = 1, columnNumber = 2)
72 |     assertValueAtCell(expectedText = "$109,760.32", table = table, rowNumber = 26, columnNumber = 2)
73 |   }
74 | 
75 |   /** Asserts that the cell at the given row and column of `table` holds `expectedText`. */
76 |   private def assertValueAtCell(expectedText: String, table: Table, rowNumber: Int, columnNumber: Int) = {
77 |     assert(table.getCell(rowNumber, columnNumber).text == expectedText)
78 |   }
79 | 
80 |   /**
81 |    * Closes both documents after each test. `super.afterEach()` is invoked first, as the
82 |    * `BeforeAndAfterEach` contract asks of overriding code, and the nested `finally` makes sure
83 |    * the second document is closed even if closing the first one throws.
84 |    */
85 |   override def afterEach(): Unit = {
86 |     try super.afterEach()
87 |     finally {
88 |       try abbottTonyDocument.close()
89 |       finally leighAndrewDocument.close()
90 |     }
91 |   }
92 | }
93 | 


--------------------------------------------------------------------------------
/src/test/scala/tools/ambitious/pdfextractiontoolkit/library/model/RowSpec.scala:
--------------------------------------------------------------------------------
  1 | package tools.ambitious.pdfextractiontoolkit.library.model
  2 | 
  3 | import org.scalatest.FreeSpec
  4 | 
  5 | /**
  6 |  * Behavioural tests for [[Row]]: construction from cells and strings, cell access by number
  7 |  * (the first cell is number 1), string conversion, cell counts and equality.
  8 |  */
  9 | class RowSpec extends FreeSpec {
 10 | 
 11 |   "A row that is instantiated without any arguments" - {
 12 |     val row: Row = new Row
 13 | 
 14 |     "should be empty" in {
 15 |       assert(row.isEmpty)
 16 |     }
 17 | 
 18 |     "should have no cells" in {
 19 |       assert(row.numberOfCells == 0)
 20 |     }
 21 |   }
 22 | 
 23 |   "A row that is instantiated with a cell containing the text 'test'" - {
 24 |     val row: Row = new Row(List(new Cell("test")))
 25 | 
 26 |     "should not be empty" in {
 27 |       assert(!row.isEmpty)
 28 |     }
 29 | 
 30 |     "should have the first cell contain the text 'test'" in {
 31 |       assert(row.getCell(1).text == "test")
 32 |     }
 33 | 
 34 |     "should have one cell" in {
 35 |       assert(row.numberOfCells == 1)
 36 |     }
 37 |   }
 38 | 
 39 |   "A row containing the entries 1, 2 and 3" - {
 40 |     val row: Row = Row.fromStrings(List("1", "2", "3"))
 41 | 
 42 |     "should return '1,2,3' when converted to String" in {
 43 |       assert(row.toString == "1,2,3")
 44 |     }
 45 | 
 46 |     "should have three cells" in {
 47 |       assert(row.numberOfCells == 3)
 48 |     }
 49 |   }
 50 | 
 51 |   "A row instantiated from three Cells containing 1, 2 and 3 consecutively" - {
 52 |     val row: Row = Row.fromCells(List(new Cell("1"), new Cell("2"), new Cell("3")))
 53 | 
 54 |     "should have first cell with text 1" in {
 55 |       assert(row.getCell(1).text == "1")
 56 |     }
 57 | 
 58 |     "should have second cell with text 2" in {
 59 |       assert(row.getCell(2).text == "2")
 60 |     }
 61 | 
 62 |     "should have third cell with text 3" in {
 63 |       assert(row.getCell(3).text == "3")
 64 |     }
 65 |   }
 66 | 
 67 |   "A row instantiated from three Strings containing 1, 2 and 3 consecutively" - {
 68 |     val cellContents: List[String] = List("1", "2", "3")
 69 |     val row: Row = Row.fromStrings(cellContents)
 70 | 
 71 |     "should have first cell with text 1" in {
 72 |       assert(row.getCell(1).text == "1")
 73 |     }
 74 | 
 75 |     "should have second cell with text 2" in {
 76 |       assert(row.getCell(2).text == "2")
 77 |     }
 78 | 
 79 |     "should have third cell with text 3" in {
 80 |       assert(row.getCell(3).text == "3")
 81 |     }
 82 |   }
 83 | 
 84 |   "A row instantiated from a single cell" - {
 85 |     val row: Row = Row.fromCell(new Cell("1"))
 86 | 
 87 |     "should have first cell with text 1" in {
 88 |       assert(row.getCell(1).text == "1")
 89 |     }
 90 |   }
 91 | 
 92 |   "A row instantiated from a single string" - {
 93 |     val row: Row = Row.fromString("1")
 94 | 
 95 |     "should have first cell with text 1" in {
 96 |       assert(row.getCell(1).text == "1")
 97 |     }
 98 |   }
 99 | 
100 |   "Two rows with the same cells" - {
101 |     // Both rows are built from the very same Cell instances; only the Row objects differ.
102 |     val cellA: Cell = new Cell("test")
103 |     val cellB: Cell = new Cell("test")
104 | 
105 |     val rowA: Row = Row.fromCells(List(cellA, cellB))
106 |     val rowB: Row = Row.fromCells(List(cellA, cellB))
107 | 
108 |     "should be equal" in {
109 |       assert(rowA == rowB)
110 |     }
111 | 
112 |     "should not have the same reference" in {
113 |       assert(rowA ne rowB)
114 |     }
115 |   }
116 | 
117 |   "Two rows with different cells" - {
118 |     val rowA: Row = Row.fromStrings(List("a", "b"))
119 |     val rowB: Row = Row.fromStrings(List("c", "d"))
120 | 
121 |     "should not be equal" in {
122 |       assert(rowA != rowB)
123 |     }
124 |   }
125 | 
126 |   "Two rows with the first row having more cells than the second row" - {
127 |     val rowA: Row = Row.fromStrings(List("a", "b"))
128 |     val rowB: Row = Row.fromStrings(List("a"))
129 | 
130 |     "should not be equal" in {
131 |       assert(rowA != rowB)
132 |     }
133 |   }
134 | 
135 |   "Two rows with the first row having less cells than the second row" - {
136 |     val rowA: Row = Row.fromStrings(List("a"))
137 |     val rowB: Row = Row.fromStrings(List("a", "b"))
138 | 
139 |     "should not be equal" in {
140 |       assert(rowA != rowB)
141 |     }
142 |   }
143 | 
144 |   "A row with three cells" - {
145 |     val row: Row = Row.fromStrings(List("a", "b", "c"))
146 | 
147 |     "should have three cells" in {
148 |       assert(row.numberOfCells == 3)
149 |     }
150 |   }
151 | }
152 | 


--------------------------------------------------------------------------------
/src/test/scala/tools/ambitious/pdfextractiontoolkit/webapp/data/DocumentInformationDaoSpec.scala:
--------------------------------------------------------------------------------
  1 | package tools.ambitious.pdfextractiontoolkit.webapp.data
  2 | 
  3 | import slick.driver.SQLiteDriver.api._
  4 | import slick.lifted.{Query, TableQuery}
  5 | import spray.http.{MediaType, MediaTypes}
  6 | import tools.ambitious.pdfextractiontoolkit.AmbitiousToolsSpec
  7 | import tools.ambitious.pdfextractiontoolkit.webapp.data.model.{Document, Documents}
  8 | import tools.ambitious.pdfextractiontoolkit.webapp.services.documentstorage.{DocumentDescription, DocumentIdentifier}
  9 | 
 10 | import scala.concurrent.Await
 11 | import scala.concurrent.duration._
 12 | 
 13 | /**
 14 |  * Tests for [[DocumentInformationDao]]: storing a document identifier writes the expected row,
 15 |  * and stored identifiers can be read back through the DAO.
 16 |  */
 17 | class DocumentInformationDaoSpec extends AmbitiousToolsSpec {
 18 | 
 19 |   val rootDao: RootDAO = DAOTestUtils.constructCleanRootDAO
 20 |   val dao = DocumentInformationDao.forRootDao(rootDao)
 21 | 
 22 |   // Fixed identifier used by every test in this spec.
 23 |   val documentID: DocumentIdentifier = {
 24 |     val testDocumentHash: String = "425A"
 25 |     val testDocumentTitle: String = "testDoc"
 26 |     val testDocumentDescription: String = "A test document"
 27 |     val testDocumentMediaType: MediaType = MediaTypes.`application/pdf`
 28 | 
 29 |     val description: DocumentDescription = DocumentDescription
 30 |       .withTitleAndDescriptionAndMediaType(testDocumentTitle, testDocumentDescription, testDocumentMediaType)
 31 | 
 32 |     DocumentIdentifier.withHashAndDescription(testDocumentHash, description)
 33 |   }
 34 | 
 35 |   "the document information dao" should "write one database row per stored document ID" in {
 36 |     storeDocumentUsingDao(documentID)
 37 | 
 38 |     val allDocuments = retrieveAllDocsForIDFromDatabase(documentID)
 39 | 
 40 |     assert(allDocuments.length == 1)
 41 |   }
 42 | 
 43 |   it should "store the document's hash" in {
 44 |     storeDocumentUsingDao(documentID)
 45 | 
 46 |     val allDocuments = retrieveAllDocsForIDFromDatabase(documentID)
 47 | 
 48 |     assert(allDocuments.head.hash == documentID.hash)
 49 |   }
 50 | 
 51 |   it should "store the document's title" in {
 52 |     storeDocumentUsingDao(documentID)
 53 | 
 54 |     val allDocuments = retrieveAllDocsForIDFromDatabase(documentID)
 55 | 
 56 |     assert(allDocuments.head.title == documentID.description.title)
 57 |   }
 58 | 
 59 |   it should "store the document's description" in {
 60 |     storeDocumentUsingDao(documentID)
 61 | 
 62 |     val allDocuments = retrieveAllDocsForIDFromDatabase(documentID)
 63 | 
 64 |     // NOTE(review): presumably `description.description` is an Option and this is Option.contains - confirm.
 65 |     assert(documentID.description.description.contains(allDocuments.head.description))
 66 |   }
 67 | 
 68 |   it should "store the document's media type" in {
 69 |     storeDocumentUsingDao(documentID)
 70 | 
 71 |     val allDocuments = retrieveAllDocsForIDFromDatabase(documentID)
 72 | 
 73 |     assert(allDocuments.head.mediaType == documentID.description.mediaType.toString())
 74 |   }
 75 | 
 76 |   it should "retrieve no identifiers from an empty identifier table" in {
 77 |     val allDocumentIDs: Seq[DocumentIdentifier] = retrieveAllDocumentIDsFromDao()
 78 | 
 79 |     assert(allDocumentIDs.isEmpty)
 80 |   }
 81 | 
 82 |   it should "retrieve one identifier from an identifier table with one record" in {
 83 |     Given("one document has been stored in the database")
 84 |     storeDocumentUsingDao(documentID)
 85 | 
 86 |     When("the document IDs are retrieved from the dao")
 87 |     val allDocIDs: Seq[DocumentIdentifier] = retrieveAllDocumentIDsFromDao()
 88 | 
 89 |     Then("there should be one document ID")
 90 |     assert(allDocIDs.size == 1)
 91 |   }
 92 | 
 93 |   it should "retrieve the correct document ID" in {
 94 |     Given("one document has been stored in the database")
 95 |     storeDocumentUsingDao(documentID)
 96 | 
 97 |     When("the document IDs are retrieved from the dao")
 98 |     val allDocIDs: Seq[DocumentIdentifier] = retrieveAllDocumentIDsFromDao()
 99 | 
100 |     Then("the retrieved document ID should be the same as the stored one")
101 |     val retrievedDocID = allDocIDs.head
102 | 
103 |     assert(retrievedDocID == documentID)
104 |   }
105 | 
106 |   /** Stores `documentID` through the DAO, blocking until the DAO's future completes. */
107 |   def storeDocumentUsingDao(documentID: DocumentIdentifier): Unit = {
108 |     Await.result(dao.storeDocumentID(documentID), 30.seconds)
109 |   }
110 | 
111 |   /**
112 |    * Reads the rows of the documents table directly from the database, bypassing the DAO.
113 |    *
114 |    * NOTE(review): the `documentID` argument is not used; the query returns every row of the
115 |    * table rather than only the rows for that identifier.
116 |    */
117 |   def retrieveAllDocsForIDFromDatabase(documentID: DocumentIdentifier): Seq[Document] = {
118 |     val tableQuery: TableQuery[Documents] = new TableQuery(new Documents(_))
119 | 
120 |     val query: Query[Documents, Document, Seq] = tableQuery.map(row => row)
121 |     Await.result(rootDao.database.run(query.result), 30.seconds)
122 |   }
123 | 
124 |   /** Fetches all document identifiers through the DAO, blocking until they are available. */
125 |   def retrieveAllDocumentIDsFromDao(): Seq[DocumentIdentifier] = {
126 |     val allDocumentIDs: Seq[DocumentIdentifier] = Await.result(dao.retrieveAllIDs(), 30.seconds)
127 |     allDocumentIDs
128 |   }
129 | }
130 | 


--------------------------------------------------------------------------------
/src/test/scala/tools/ambitious/pdfextractiontoolkit/library/model/TableSpec.scala:
--------------------------------------------------------------------------------
  1 | package tools.ambitious.pdfextractiontoolkit.library.model
  2 | 
  3 | import org.scalatest.FreeSpec
  4 | 
  5 | class TableSpec extends FreeSpec {
  6 | 
  7 |   "A table with a single row containing a single cell with the text 'test'" - {
  8 |     val row: Row = Row.fromString("test")
  9 |     val table: Table = Table.fromRow(row)
 10 | 
 11 |     "should have that cell in it's 1,1 position" in {
 12 |       assert(table.getCell(1,1).text == "test")
 13 |     }
 14 | 
 15 |     "should have one row" in {
 16 |       assert(table.numberOfRows == 1)
 17 |     }
 18 | 
 19 |     "should have one column" in {
 20 |       assert(table.numberOfColumns == 1)
 21 |     }
 22 | 
 23 |     "should have it's first row equal to the row we set" in {
 24 |       assert(table.getRow(1) == row)
 25 |     }
 26 |   }
 27 | 
 28 |   "A table with 2 rows, each containing three distinct entries" - {
 29 |     val rowA: Row = Row.fromStrings(List("1", "2", "3"))
 30 |     val rowB: Row = Row.fromStrings(List("4", "5", "6"))
 31 |     val table: Table = Table.fromRows(List(rowA, rowB))
 32 | 
 33 |     "should return '1,2,3\n4,5,6' when converted to String" in {
 34 |       assert(table.toString == "1,2,3\n4,5,6")
 35 |     }
 36 | 
 37 |     "should have two rows" in {
 38 |       assert(table.numberOfRows == 2)
 39 |     }
 40 | 
 41 |     "should have three columns" in {
 42 |       assert(table.numberOfColumns == 3)
 43 |     }
 44 | 
 45 |     "should have it's first row equal to the first row we set" in {
 46 |       assert(table.getRow(1) == rowA)
 47 |     }
 48 | 
 49 |     "should have it's second row equal to the second row we set" in {
 50 |       assert(table.getRow(2) == rowB)
 51 |     }
 52 |   }
 53 | 
 54 |   "A table instantiated from three rows each containing one cell with text 1, 2 and 3 consecutively" - {
 55 |     val cellContents = List("1", "2", "3")
 56 |     val rows: List[Row] = cellContents.map(i => Row.fromString(i))
 57 | 
 58 |     val table: Table = Table.fromRows(rows)
 59 | 
 60 |     "should have string '1\n2\n3" in {
 61 |       assert(table.toString == "1\n2\n3")
 62 |     }
 63 | 
 64 |     "should have three rows" in {
 65 |       assert(table.numberOfRows == 3)
 66 |     }
 67 | 
 68 |     "should have one column" in {
 69 |       assert(table.numberOfColumns == 1)
 70 |     }
 71 | 
 72 |     "should have it's first row equal to the first row we set" in {
 73 |       assert(table.getRow(1) == rows.head)
 74 |     }
 75 | 
 76 |     "should have it's second row equal to the second row we set" in {
 77 |       assert(table.getRow(2) == rows(1))
 78 |     }
 79 | 
 80 |     "should have it's third row equal to the third row we set" in {
 81 |       assert(table.getRow(3) == rows(2))
 82 |     }
 83 |   }
 84 | 
 85 |   "A table instantiated from a single row containing a cell with the text 'test'" - {
 86 |     val table: Table = Table.fromRow(Row.fromString("test"))
 87 | 
 88 |     "should have string 'test'" in {
 89 |       assert(table.toString == "test")
 90 |     }
 91 |   }
 92 | 
 93 |   // Builds two tables in separate calls from the same Row instances; the tests expect ==
 94 |   // to hold between them while they remain different object references (ne).
 95 |   "Two tables with the same rows" - {
 96 |     val firstCell: Cell = new Cell("test")
 97 |     val secondCell: Cell = new Cell("test")
 98 |     def newRow: Row = Row.fromCells(List(firstCell, secondCell))
 99 |     val (upperRow, lowerRow) = (newRow, newRow)
100 |     def newTable: Table = Table.fromRows(List(upperRow, lowerRow))
101 |     val (tableA, tableB) = (newTable, newTable)
102 | 
103 |     "should be equal" in {
104 |       assert(tableA == tableB)
105 |     }
106 | 
107 |     "should not have the same reference" in {
108 |       assert(tableA ne tableB)
109 |     }
110 |   }
111 | 
112 |   // Both tables have two rows, but the row texts differ ("a", "b" versus "c", "d").
113 |   "Two tables with different rows" - {
114 |     val leftTable: Table = Table.fromRows(List("a", "b").map(Row.fromString(_)))
115 |     val rightTable: Table = Table.fromRows(List("c", "d").map(Row.fromString(_)))
116 |     "should not be equal" in {
117 |       assert(leftTable != rightTable)
118 |     }
119 |   }
120 | 
121 |   // The first table is built from two rows and the second from a single row.
122 |   "Two tables with the first table having more rows than the second table" - {
123 |     val longerTable: Table = Table.fromRows(List("a", "b").map(Row.fromString(_)))
124 |     val shorterTable: Table = Table.fromRow(Row.fromString("c"))
125 |     "should not be equal" in {
126 |       assert(longerTable != shorterTable)
127 |     }
128 |   }
129 | 
130 |   // The first table is built from one row and the second from two, so they must compare unequal.
131 |   "Two tables with the first table having fewer rows than the second table" - {
132 |     val tableA: Table = Table.fromRow(Row.fromString("a"))
133 |     val tableB: Table = Table.fromRows(List(Row.fromString("b"), Row.fromString("c")))
134 |     "should not be equal" in {
135 |       assert(tableA != tableB)
136 |     }
137 |   }
138 | 
139 |   // Table.merge applied to a List of tables: tableA's rows are expected first, then tableB's.
140 |   "A list of two tables that are merged" - {
141 |     val tableA: Table = Table.fromRows(List("a", "b").map(Row.fromString(_)))
142 |     val tableB: Table = Table.fromRows(List("c", "d").map(Row.fromString(_)))
143 |     val combined: Table = Table.merge(List(tableA, tableB))
144 | 
145 |     "should have first row equal to the first row in tableA" in {
146 |       assert(combined.getRow(1) == tableA.getRow(1))
147 |     }
148 | 
149 |     "should have second row equal to the second row in tableA" in {
150 |       assert(combined.getRow(2) == tableA.getRow(2))
151 |     }
152 | 
153 |     "should have third row equal to the first row in tableB" in {
154 |       assert(combined.getRow(3) == tableB.getRow(1))
155 |     }
156 | 
157 |     "should have fourth row equal to the second row in tableB" in {
158 |       assert(combined.getRow(4) == tableB.getRow(2))
159 |     }
160 | 
161 |     "should have four rows" in {
162 |       assert(combined.numberOfRows == 4)
163 |     }
164 |   }
165 | 
166 |   // Instance-level merge: tableA.mergedWith(tableB) should hold tableA's rows, then tableB's.
167 |   "One table that is merged into another" - {
168 |     val tableA: Table = Table.fromRows(List("a", "b").map(Row.fromString(_)))
169 |     val tableB: Table = Table.fromRows(List("c", "d").map(Row.fromString(_)))
170 |     val result: Table = tableA.mergedWith(tableB)
171 | 
172 |     "should have first row equal to the first row in tableA" in {
173 |       assert(result.getRow(1) == tableA.getRow(1))
174 |     }
175 | 
176 |     "should have second row equal to the second row in tableA" in {
177 |       assert(result.getRow(2) == tableA.getRow(2))
178 |     }
179 | 
180 |     "should have third row equal to the first row in tableB" in {
181 |       assert(result.getRow(3) == tableB.getRow(1))
182 |     }
183 | 
184 |     "should have fourth row equal to the second row in tableB" in {
185 |       assert(result.getRow(4) == tableB.getRow(2))
186 |     }
187 | 
188 |     "should have four rows" in {
189 |       assert(result.numberOfRows == 4)
190 |     }
191 |   }
192 | 
193 |   // Two-argument Table.merge(tableA, tableB): tableA's rows are expected first, then tableB's.
194 |   "Two tables that are merged" - {
195 |     val tableA: Table = Table.fromRows(List("a", "b").map(Row.fromString(_)))
196 |     val tableB: Table = Table.fromRows(List("c", "d").map(Row.fromString(_)))
197 |     val joined: Table = Table.merge(tableA, tableB)
198 | 
199 |     "should have first row equal to the first row in tableA" in {
200 |       assert(joined.getRow(1) == tableA.getRow(1))
201 |     }
202 | 
203 |     "should have second row equal to the second row in tableA" in {
204 |       assert(joined.getRow(2) == tableA.getRow(2))
205 |     }
206 | 
207 |     "should have third row equal to the first row in tableB" in {
208 |       assert(joined.getRow(3) == tableB.getRow(1))
209 |     }
210 | 
211 |     "should have fourth row equal to the second row in tableB" in {
212 |       assert(joined.getRow(4) == tableB.getRow(2))
213 |     }
214 | 
215 |     "should have four rows" in {
216 |       assert(joined.numberOfRows == 4)
217 |     }
218 |   }
219 | }
220 | 


--------------------------------------------------------------------------------