├── PULL_REQUEST_TEMPLATE.md
├── src
    ├── .gitignore
    ├── test-runner.cpp
    ├── DCTDecode.h
    ├── external
    │   └── Profiler.h
    ├── letter_grouper.h
    ├── deflate.h
    ├── line_grouper.h
    ├── graphicsstate.h
    ├── box.cpp
    ├── object_class.h
    ├── word_grouper.h
    ├── streams.cpp
    ├── streams.h
    ├── tokenizer.h
    ├── charstring.cpp
    ├── whitespace.h
    ├── line_grouper.cpp
    ├── font.h
    ├── charstring.h
    ├── matrix.h
    ├── document.h
    ├── page.h
    ├── font.cpp
    ├── textbox.cpp
    ├── crypto.h
    ├── text_element.h
    ├── graphicobject.h
    ├── word_grouper.cpp
    ├── text_element.cpp
    ├── xref.h
    ├── textbox.h
    ├── encoding.h
    ├── glyphwidths.h
    └── object_class.cpp
├── LICENSE
├── tests
    ├── testthat.R
    └── testthat
    │   ├── test-cpp.R
    │   └── test-pdrf.R
├── inst
    └── extdata
    │   ├── gg.pdf
    │   ├── tex.pdf
    │   ├── adobe.pdf
    │   ├── leeds.pdf
    │   ├── luck.pdf
    │   ├── rcpp.pdf
    │   ├── sams.pdf
    │   ├── barcodes.pdf
    │   ├── pdfinfo.pdf
    │   ├── chestpain.pdf
    │   └── testreader.pdf
├── .gitignore
├── TODO.md
├── NEWS.md
├── .Rbuildignore
├── R
    ├── catch-routine-registration.R
    ├── PDFR-package.R
    ├── data.R
    ├── utils.R
    └── RcppExports.R
├── man
    ├── pdfdoc.Rd
    ├── pdfboxes.Rd
    ├── run_testthat_tests.Rd
    ├── get_xref.Rd
    ├── getpagestring.Rd
    ├── draw_glyph.Rd
    ├── pdfgraphics.Rd
    ├── get_object.Rd
    ├── pdfgrobs.Rd
    ├── getglyphmap.Rd
    ├── pdfpage.Rd
    ├── pdfr_paths.Rd
    ├── pdfplot.Rd
    └── PDFR-package.Rd
├── PDFR.Rproj
├── .github
    └── ISSUE_TEMPLATE
    │   └── feature_request.md
├── NAMESPACE
├── DESCRIPTION
├── LICENSE.md
├── CONTRIBUTING.md
├── CODE_OF_CONDUCT.md
├── codemeta.json
├── README.Rmd
└── README.md


/PULL_REQUEST_TEMPLATE.md:
--------------------------------------------------------------------------------
1 | JUST ASK
2 | 


--------------------------------------------------------------------------------
/src/.gitignore:
--------------------------------------------------------------------------------
1 | *.o
2 | *.so
3 | *.dll
4 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | YEAR: 2023
2 | COPYRIGHT HOLDER: Allan Cameron
3 | 


--------------------------------------------------------------------------------
/tests/testthat.R:
--------------------------------------------------------------------------------
1 | library(testthat)
2 | library(PDFR)
3 | 
4 | test_check("PDFR")
5 | 


--------------------------------------------------------------------------------
/inst/extdata/gg.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AllanCameron/PDFR/HEAD/inst/extdata/gg.pdf


--------------------------------------------------------------------------------
/inst/extdata/tex.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AllanCameron/PDFR/HEAD/inst/extdata/tex.pdf


--------------------------------------------------------------------------------
/inst/extdata/adobe.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AllanCameron/PDFR/HEAD/inst/extdata/adobe.pdf


--------------------------------------------------------------------------------
/inst/extdata/leeds.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AllanCameron/PDFR/HEAD/inst/extdata/leeds.pdf


--------------------------------------------------------------------------------
/inst/extdata/luck.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AllanCameron/PDFR/HEAD/inst/extdata/luck.pdf


--------------------------------------------------------------------------------
/inst/extdata/rcpp.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AllanCameron/PDFR/HEAD/inst/extdata/rcpp.pdf


--------------------------------------------------------------------------------
/inst/extdata/sams.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AllanCameron/PDFR/HEAD/inst/extdata/sams.pdf


--------------------------------------------------------------------------------
/inst/extdata/barcodes.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AllanCameron/PDFR/HEAD/inst/extdata/barcodes.pdf


--------------------------------------------------------------------------------
/inst/extdata/pdfinfo.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AllanCameron/PDFR/HEAD/inst/extdata/pdfinfo.pdf


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .Rproj.user
2 | .Rhistory
3 | .RData
4 | .Ruserdata
5 | src/*.o
6 | src/*.so
7 | src/*.dll
8 | 


--------------------------------------------------------------------------------
/inst/extdata/chestpain.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AllanCameron/PDFR/HEAD/inst/extdata/chestpain.pdf


--------------------------------------------------------------------------------
/inst/extdata/testreader.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AllanCameron/PDFR/HEAD/inst/extdata/testreader.pdf


--------------------------------------------------------------------------------
/tests/testthat/test-cpp.R:
--------------------------------------------------------------------------------
1 | context("C++")
2 | test_that("Catch unit tests pass", {
3 |     expect_cpp_tests_pass("PDFR")
4 | })
5 | 


--------------------------------------------------------------------------------
/TODO.md:
--------------------------------------------------------------------------------
1 | # TODO
2 | 
3 | - Group glyphs together into paragraphs, tables etc
4 | - Complete documentation
5 | - Submit to CRAN
6 | 
7 | 


--------------------------------------------------------------------------------
/NEWS.md:
--------------------------------------------------------------------------------
1 | # PDFR 0.1.0
2 | 
3 | * Added a `NEWS.md` file to track changes to the package.
4 | * Refactor and update documentation to allow PDFR to pass `devtools::check()` with no errors, warnings, or notes (@elipousson, #4).
5 | 


--------------------------------------------------------------------------------
/src/test-runner.cpp:
--------------------------------------------------------------------------------
1 | /*
2 |  * Please do not edit this file -- it ensures that your package will export a
3 |  * 'run_testthat_tests()' C routine that can be used to run the Catch unit tests
4 |  * available in your package.
5 |  */
6 | #define TESTTHAT_TEST_RUNNER
7 | #include <testthat.h>
8 | 


--------------------------------------------------------------------------------
/.Rbuildignore:
--------------------------------------------------------------------------------
 1 | ^.*\.Rproj$
 2 | ^\.Rproj\.user$
 3 | README.md
 4 | TODO.md
 5 | CONTRIBUTING.md
 6 | CODE_OF_CONDUCT.md
 7 | PULL_REQUEST_TEMPLATE.md
 8 | inflate_method.md
 9 | working.R
10 | .gitignore
11 | ^\.github
12 | headerMap.txt
13 | ^data-raw$
14 | ^LICENSE\.md$
15 | ^codemeta\.json$
16 | ^README\.Rmd$
17 | 


--------------------------------------------------------------------------------
/R/catch-routine-registration.R:
--------------------------------------------------------------------------------
1 | # This dummy function definition is included with the package to ensure that
2 | # 'tools::package_native_routine_registration_skeleton()' generates the required
3 | # registration info for the 'run_testthat_tests' symbol.
4 | (function() {  # anonymous and never invoked here, so the .Call below is not executed
5 |   .Call("run_testthat_tests", PACKAGE = "PDFR")
6 | })
7 | 


--------------------------------------------------------------------------------
/R/PDFR-package.R:
--------------------------------------------------------------------------------
 1 | #' @useDynLib PDFR, .registration = TRUE
 2 | #' @keywords internal
 3 | "_PACKAGE"
 4 | 
 5 | ## usethis namespace: start
 6 | #' @importFrom Rcpp evalCpp
 7 | #' @importFrom cli cli_abort
 8 | #' @importFrom rlang is_character is_true is_false is_raw check_installed
 9 | #'   has_length abort caller_env
10 | ## usethis namespace: end
11 | NULL
12 | 


--------------------------------------------------------------------------------
/man/pdfdoc.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/pdrf.R
 3 | \name{pdfdoc}
 4 | \alias{pdfdoc}
 5 | \title{pdfdoc}
 6 | \usage{
 7 | pdfdoc(pdf)
 8 | }
 9 | \arguments{
10 | \item{pdf}{a valid pdf file location}
11 | }
12 | \value{
13 | a data frame of all text elements in a document
14 | }
15 | \description{
16 | Returns contents of all pdf pages
17 | }
18 | \examples{
19 | pdfdoc(pdfr_paths$leeds)
20 | }
21 | 


--------------------------------------------------------------------------------
/PDFR.Rproj:
--------------------------------------------------------------------------------
 1 | Version: 1.0
 2 | 
 3 | RestoreWorkspace: Default
 4 | SaveWorkspace: Default
 5 | AlwaysSaveHistory: Default
 6 | 
 7 | EnableCodeIndexing: Yes
 8 | UseSpacesForTab: Yes
 9 | NumSpacesForTab: 2
10 | Encoding: UTF-8
11 | 
12 | RnwWeave: Sweave
13 | LaTeX: pdfLaTeX
14 | 
15 | AutoAppendNewline: Yes
16 | StripTrailingWhitespace: Yes
17 | 
18 | BuildType: Package
19 | PackageUseDevtools: Yes
20 | PackageInstallArgs: --no-multiarch --with-keep.source
21 | PackageRoxygenize: rd,collate,namespace
22 | 


--------------------------------------------------------------------------------
/man/pdfboxes.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/pdrf.R
 3 | \name{pdfboxes}
 4 | \alias{pdfboxes}
 5 | \title{pdfboxes}
 6 | \usage{
 7 | pdfboxes(pdf, pagenum)
 8 | }
 9 | \arguments{
10 | \item{pdf}{a valid pdf file location}
11 | 
12 | \item{pagenum}{the page number to be plotted}
13 | }
14 | \value{
15 | a ggplot
16 | }
17 | \description{
18 | Plots the bounding boxes of text elements from a page as a ggplot.
19 | }
20 | \examples{
21 | pdfboxes(pdfr_paths$leeds, 1)
22 | }
23 | 


--------------------------------------------------------------------------------
/man/run_testthat_tests.Rd:
--------------------------------------------------------------------------------
 1 | \docType{data}
 2 | \name{run_testthat_tests}
 3 | \alias{run_testthat_tests}
 4 | \title{A tool used for symbol registration}
 5 | \format{A list of 4 fields
 6 | \describe{
 7 |   \item{name}{run_testthat_tests}
 8 |   \item{address}{a pointer to this symbol}
 9 |   \item{dll}{the compiled file where the symbol is contained}
10 |   \item{numParameters}{no parameters}
11 | }}
12 | \usage{
13 | run_testthat_tests
14 | }
15 | \description{
16 | A registered native symbol used in testing
17 | }
18 | \keyword{tests}
19 | 


--------------------------------------------------------------------------------
/man/get_xref.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/pdrf.R
 3 | \name{get_xref}
 4 | \alias{get_xref}
 5 | \title{Get a pdf's xref table as an R dataframe}
 6 | \usage{
 7 | get_xref(pdf)
 8 | }
 9 | \arguments{
10 | \item{pdf}{a valid pdf file location or raw data vector}
11 | }
12 | \value{
13 | a data frame showing the bytewise positions of each object in the pdf
14 | }
15 | \description{
16 | Get a pdf's xref table as an R dataframe
17 | }
18 | \examples{
19 | get_xref(pdfr_paths$leeds)
20 | }
21 | 


--------------------------------------------------------------------------------
/man/getpagestring.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/pdrf.R
 3 | \name{getpagestring}
 4 | \alias{getpagestring}
 5 | \title{pagestring}
 6 | \usage{
 7 | getpagestring(pdf, page)
 8 | }
 9 | \arguments{
10 | \item{pdf}{a valid pdf file location}
11 | 
12 | \item{page}{the page number to be extracted}
13 | }
14 | \value{
15 | a single string containing the page description program
16 | }
17 | \description{
18 | Returns contents of a pdf page description program
19 | }
20 | \examples{
21 | getpagestring(pdfr_paths$leeds, 1)
22 | }
23 | 


--------------------------------------------------------------------------------
/man/draw_glyph.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/pdrf.R
 3 | \name{draw_glyph}
 4 | \alias{draw_glyph}
 5 | \title{draw_glyph}
 6 | \usage{
 7 | draw_glyph(fontfile, glyph)
 8 | }
 9 | \arguments{
10 | \item{fontfile}{a raw vector representing a font file}
11 | 
12 | \item{glyph}{the character to be drawn. Can be text or an integer}
13 | }
14 | \value{
15 | no return
16 | }
17 | \description{
18 | Draws glyphs from a truetype font as grid grobs
19 | }
20 | \examples{
21 | \dontrun{
22 | if(interactive()){
23 |  # ttf <- "raw vector with font file"
24 |  draw_glyph(ttf, "a")
25 |  }
26 | }
27 | }
28 | 


--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/feature_request.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | name: Feature request
 3 | about: Suggest an idea for this project
 4 | 
 5 | ---
 6 | 
 7 | **Is your feature request related to a problem? Please describe.**
 8 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...]
 9 | 
10 | **Describe the solution you'd like**
11 | A clear and concise description of what you want to happen.
12 | 
13 | **Describe alternatives you've considered**
14 | A clear and concise description of any alternative solutions or features you've considered.
15 | 
16 | **Additional context**
17 | Add any other context or screenshots about the feature request here.
18 | 


--------------------------------------------------------------------------------
/man/pdfgraphics.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/pdrf.R
 3 | \name{pdfgraphics}
 4 | \alias{pdfgraphics}
 5 | \title{pdfgraphics}
 6 | \usage{
 7 | pdfgraphics(file, pagenum, scale = 1)
 8 | }
 9 | \arguments{
10 | \item{file}{a valid pdf file location}
11 | 
12 | \item{pagenum}{the page number to be plotted}
13 | 
14 | \item{scale}{Scale used for linewidth and text size. Passed to
15 | `ggplot2::geom_text()` size parameter as scale * size/3}
16 | }
17 | \value{
18 | a ggplot
19 | }
20 | \description{
21 | Plots the graphical elements of a pdf page as a ggplot
22 | }
23 | \examples{
24 | pdfgraphics(pdfr_paths$leeds, 1)
25 | 
26 | }
27 | 


--------------------------------------------------------------------------------
/man/get_object.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/pdrf.R
 3 | \name{get_object}
 4 | \alias{get_object}
 5 | \title{Get the contents of a pdf object}
 6 | \usage{
 7 | get_object(pdf, number)
 8 | }
 9 | \arguments{
10 | \item{pdf}{a valid pdf file location}
11 | 
12 | \item{number}{the object number}
13 | }
14 | \value{
15 | a named vector of the dictionary and stream of the pdf object
16 | }
17 | \description{
18 | Returns a list consisting of a named vector representing key:value pairs
19 | in a specified object. It also contains any stream data associated with
20 | the object.
21 | }
22 | \examples{
23 | get_object(pdfr_paths$leeds, 1)
24 | }
25 | 


--------------------------------------------------------------------------------
/man/pdfgrobs.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/pdrf.R
 3 | \name{pdfgrobs}
 4 | \alias{pdfgrobs}
 5 | \title{pdfgrobs}
 6 | \usage{
 7 | pdfgrobs(file_name, pagenum, scale = dev.size()[2]/10, enc = "UTF-8")
 8 | }
 9 | \arguments{
10 | \item{file_name}{a valid pdf file location}
11 | 
12 | \item{pagenum}{the page number to be plotted}
13 | 
14 | \item{scale}{Document scale. Defaults to `dev.size()[2]/10`}
15 | 
16 | \item{enc}{Document encoding. Defaults to "UTF-8"}
17 | }
18 | \value{
19 | invisibly returns grobs as well as drawing them
20 | }
21 | \description{
22 | Plots the graphical elements of a pdf page as grobs
23 | }
24 | \examples{
25 | pdfgrobs(pdfr_paths$leeds, 1)
26 | }
27 | 


--------------------------------------------------------------------------------
/man/getglyphmap.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/pdrf.R
 3 | \name{getglyphmap}
 4 | \alias{getglyphmap}
 5 | \title{Return map of glyphs from a page}
 6 | \usage{
 7 | getglyphmap(pdf, page = 1)
 8 | }
 9 | \arguments{
10 | \item{pdf}{a valid pdf file location}
11 | 
12 | \item{page}{the page number from which to extract glyphs}
13 | }
14 | \value{
15 | a dataframe of all entries of font encoding tables with width mapping
16 | }
17 | \description{
18 | Used mainly for debugging, this function returns an R dataframe, one row for
19 | each byte that may be used as a glyph. It shows the unicode number of
20 | each interpreted glyph, as well as its width in text space.
21 | }
22 | \examples{
23 | getglyphmap(pdfr_paths$leeds, 1)
24 | }
25 | 


--------------------------------------------------------------------------------
/man/pdfpage.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/pdrf.R
 3 | \name{pdfpage}
 4 | \alias{pdfpage}
 5 | \title{pdfpage}
 6 | \usage{
 7 | pdfpage(pdf, page = 1, atomic = FALSE, table_only = TRUE)
 8 | }
 9 | \arguments{
10 | \item{pdf}{a valid pdf file location}
11 | 
12 | \item{page}{the page number to be extracted}
13 | 
14 | \item{atomic}{a boolean - should each letter be treated individually?}
15 | 
16 | \item{table_only}{a boolean - return data frame alone, as opposed to list}
17 | }
18 | \value{
19 | a list containing data frames
20 | }
21 | \description{
22 | Returns contents of a pdf page
23 | }
24 | \examples{
25 | 
26 | head(pdfpage(pdfr_paths$leeds, page = 1))
27 | 
28 | head(pdfpage(pdfr_paths$chestpain, page = c(1:2)))
29 | 
30 | }
31 | 


--------------------------------------------------------------------------------
/man/pdfr_paths.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/data.R
 3 | \docType{data}
 4 | \name{pdfr_paths}
 5 | \alias{pdfr_paths}
 6 | \title{Paths to test pdfs}
 7 | \format{
 8 | A list of 9 pdf files
 9 | \describe{
10 |   \item{barcodes}{a pdf constructed in Rstudio}
11 |   \item{chestpain}{a flow-chart for chest pain management}
12 |   \item{pdfinfo}{information about the pdf format}
13 |   \item{adobe}{an official adobe document}
14 |   \item{leeds}{a table-rich local government document}
15 |   \item{sams}{a document based on svg}
16 |   \item{testreader}{a simple pdf test}
17 |   \item{tex}{a simple tex test}
18 |   \item{rcpp}{a CRAN package vignette}
19 | }
20 | }
21 | \usage{
22 | pdfr_paths
23 | }
24 | \description{
25 | A list of paths to locally stored test pdfs
26 | }
27 | \keyword{datasets}
28 | 


--------------------------------------------------------------------------------
/man/pdfplot.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/pdrf.R
 3 | \name{pdfplot}
 4 | \alias{pdfplot}
 5 | \title{pdfplot}
 6 | \usage{
 7 | pdfplot(pdf, page = 1, atomic = FALSE, boxes = FALSE, textsize = 1)
 8 | }
 9 | \arguments{
10 | \item{pdf}{a valid pdf file location}
11 | 
12 | \item{page}{the page number to be plotted}
13 | 
14 | \item{atomic}{a boolean - should each letter be treated individually?}
15 | 
16 | \item{boxes}{Show the calculated text bounding boxes}
17 | 
18 | \item{textsize}{the scale of the text to be shown}
19 | }
20 | \value{
21 | a ggplot
22 | }
23 | \description{
24 | Plots the text elements from a page as a ggplot.
25 | The aim is not a complete pdf rendering but to help identify elements of
26 | interest in the data frame of text elements to convert to data points.
27 | }
28 | \examples{
29 | pdfplot(pdfr_paths$leeds, 1)
30 | }
31 | 


--------------------------------------------------------------------------------
/man/PDFR-package.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/PDFR-package.R
 3 | \docType{package}
 4 | \name{PDFR-package}
 5 | \alias{PDFR}
 6 | \alias{PDFR-package}
 7 | \title{PDFR: Extract Text From PDFs In An R Friendly Way}
 8 | \description{
 9 | Extracts text from PDF into an R dataframe giving the content, size, position and font of any text elements. This information can then be manipulated in R.
10 | }
11 | \seealso{
12 | Useful links:
13 | \itemize{
14 |   \item \url{https://github.com/AllanCameron/PDFR}
15 |   \item Report bugs at \url{https://github.com/AllanCameron/PDFR/issues}
16 | }
17 | 
18 | }
19 | \author{
20 | \strong{Maintainer}: Allan Cameron \email{Allan.Cameron@nhs.net} [copyright holder]
21 | 
22 | Other contributors:
23 | \itemize{
24 |   \item Eli Pousson \email{eli.pousson@gmail.com} (\href{https://orcid.org/0000-0001-8280-1706}{ORCID}) [contributor]
25 | }
26 | 
27 | }
28 | \keyword{internal}
29 | 


--------------------------------------------------------------------------------
/NAMESPACE:
--------------------------------------------------------------------------------
 1 | # Generated by roxygen2: do not edit by hand
 2 | 
 3 | export(draw_glyph)
 4 | export(get_object)
 5 | export(get_xref)
 6 | export(getglyphmap)
 7 | export(getpagestring)
 8 | export(pdfboxes)
 9 | export(pdfdoc)
10 | export(pdfgraphics)
11 | export(pdfgrobs)
12 | export(pdfpage)
13 | export(pdfplot)
14 | export(pdfr_paths)
15 | importFrom(Rcpp,evalCpp)
16 | importFrom(cli,cli_abort)
17 | importFrom(grDevices,dev.size)
18 | importFrom(grDevices,rgb)
19 | importFrom(grid,gpar)
20 | importFrom(grid,grid.draw)
21 | importFrom(grid,grid.newpage)
22 | importFrom(grid,grid.path)
23 | importFrom(grid,grid.rect)
24 | importFrom(grid,pushViewport)
25 | importFrom(grid,viewport)
26 | importFrom(rlang,abort)
27 | importFrom(rlang,caller_env)
28 | importFrom(rlang,check_installed)
29 | importFrom(rlang,has_length)
30 | importFrom(rlang,is_character)
31 | importFrom(rlang,is_false)
32 | importFrom(rlang,is_raw)
33 | importFrom(rlang,is_true)
34 | useDynLib(PDFR, .registration = TRUE)
35 | 


--------------------------------------------------------------------------------
/DESCRIPTION:
--------------------------------------------------------------------------------
 1 | Type: Package
 2 | Package: PDFR
 3 | Title: Extract Text From PDFs In An R Friendly Way
 4 | Version: 0.1.0
 5 | Authors@R: c(
 6 |     person("Allan", "Cameron", , "Allan.Cameron@nhs.net", role = c("aut", "cre", "cph")),
 7 |     person("Eli", "Pousson", , "eli.pousson@gmail.com", role = "ctb",
 8 |            comment = c(ORCID = "0000-0001-8280-1706"))
 9 |   )
10 | Maintainer: Allan Cameron <Allan.Cameron@nhs.scot>
11 | Description: Extracts text from PDF into an R dataframe giving the
12 |     content, size, position and font of any text elements. This
13 |     information can then be manipulated in R.
14 | License: MIT + file LICENSE
15 | URL: https://github.com/AllanCameron/PDFR
16 | BugReports: https://github.com/AllanCameron/PDFR/issues
17 | Depends: 
18 |     R (>= 2.10)
19 | Imports: 
20 |     cli,
21 |     grDevices,
22 |     grid,
23 |     Rcpp,
24 |     rlang
25 | Suggests: 
26 |     ggplot2,
27 |     testthat
28 | LinkingTo: 
29 |     Rcpp,
30 |     testthat
31 | Encoding: UTF-8
32 | LazyData: true
33 | RoxygenNote: 7.2.3
34 | StagedInstall: no
35 | SystemRequirements: C++11
36 | 


--------------------------------------------------------------------------------
/LICENSE.md:
--------------------------------------------------------------------------------
 1 | # MIT License
 2 | 
 3 | Copyright (c) 2023 Allan Cameron
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/src/DCTDecode.h:
--------------------------------------------------------------------------------
 1 | //---------------------------------------------------------------------------//
 2 | //                                                                           //
 3 | //  PDFR DCTDecode header file                                               //
 4 | //                                                                           //
 5 | //  Copyright (C) 2018 - 2021 by Allan Cameron                               //
 6 | //                                                                           //
 7 | //  Licensed under the MIT license - see https://mit-license.org             //
 8 | //  or the LICENSE file in the project root directory                        //
 9 | //                                                                           //
10 | //---------------------------------------------------------------------------//
11 | 
12 | #ifndef PDFR_DCT
13 | 
14 | //---------------------------------------------------------------------------//
15 | 
16 | #define PDFR_DCT
17 | 
18 | #include "streams.h"
19 | 
20 | class DCTDecode : public Stream // adds no members; only forwards input to Stream
21 | {
22 | public:
23 |   DCTDecode(const std::string* input) : Stream(*input) {} // input is dereferenced: must not be null
24 |   DCTDecode(const CharString& input) : Stream(input) {}   // passes input to the Stream base as-is
25 | };
26 | 
27 | //---------------------------------------------------------------------------//
28 | 
29 | #endif
30 | 


--------------------------------------------------------------------------------
/tests/testthat/test-pdrf.R:
--------------------------------------------------------------------------------
 1 | context("test-pdrf")
 2 | barcodes   <- pdfpage(pdfr_paths[[1]], 1, FALSE, FALSE)$Elements
 3 | chestpain  <- pdfpage(pdfr_paths[[2]], 1, FALSE, FALSE)$Elements
 4 | pdfinfo    <- pdfpage(pdfr_paths[[3]], 1, FALSE, FALSE)$Elements
 5 | adobe      <- pdfpage(pdfr_paths[[4]], 1, FALSE, FALSE)$Elements
 6 | leeds      <- pdfpage(pdfr_paths[[5]], 1, FALSE, FALSE)$Elements
 7 | sams       <- pdfpage(pdfr_paths[[6]], 1, FALSE, FALSE)$Elements
 8 | testreader <- pdfpage(pdfr_paths[[7]], 1, FALSE, FALSE)$Elements
 9 | tex        <- pdfpage(pdfr_paths[[8]], 3, FALSE, FALSE)$Elements
10 | rcpp       <- pdfpage(pdfr_paths[[9]], 1, FALSE, FALSE)$Elements
11 | 
12 | # The first text element of the chestpain page must match the expected string.
13 | test_that("Encoding works", {
14 |   expect_match(chestpain$text[1], "ACUTE CARDIAC CHEST PAIN GUIDELINES")
15 | })
16 | 
17 | # The space-joined text of the tex page must contain the expected pattern.
18 | test_that("Ligatures are properly encoded", {
19 |   expect_match(paste(tex$text, collapse = " "), "fi")
20 | })
21 | 
22 | # Even the narrowest element (right - left) must be wider than 95.
23 | test_that("Widths are non-zero", {
24 |   expect_gt(min(testreader$right - testreader$left), 95)
25 | })
26 | 
27 | # pdfdoc() on a whole file should run without output, warnings or messages.
28 | test_that("Whole document can be parsed", {
29 |   expect_silent(pdfdoc(pdfr_paths[[2]]))
30 | })
31 | 
32 | # pdfpage() should accept a vector of page numbers silently.
33 | test_that("Multiple pages can be parsed", {
34 |   expect_silent(pdfpage(pdfr_paths[[2]], 1:2))
35 | })
36 | 
37 | # A numeric pdf argument must raise an error.
38 | test_that("Errors as expected", {
39 |   expect_error(pdfpage(2, 1:2))
40 | })
41 | 


--------------------------------------------------------------------------------
/R/data.R:
--------------------------------------------------------------------------------
 1 | ##---------------------------------------------------------------------------##
 2 | #' Paths to test pdfs
 3 | #'
 4 | #' A list of paths to locally stored test pdfs
 5 | #'
 6 | #' @format A list of 9 pdf files
 7 | #' \describe{
 8 | #'   \item{barcodes}{a pdf constructed in Rstudio}
 9 | #'   \item{chestpain}{a flow-chart for chest pain management}
10 | #'   \item{pdfinfo}{information about the pdf format}
11 | #'   \item{adobe}{an official adobe document}
12 | #'   \item{leeds}{a table-rich local government document}
13 | #'   \item{sams}{a document based on svg}
14 | #'   \item{testreader}{a simple pdf test}
15 | #'   \item{tex}{a simple tex test}
16 | #'   \item{rcpp}{a CRAN package vignette}
17 | #' }
18 | #' @export
19 | ##---------------------------------------------------------------------------##
20 | 
21 | # Look up each listed pdf in the package's extdata directory, keyed by name.
22 | pdfr_paths <- lapply(
23 |   c(
24 |     barcodes = "barcodes.pdf", chestpain = "chestpain.pdf",
25 |     pdfinfo = "pdfinfo.pdf", adobe = "adobe.pdf", leeds = "leeds.pdf",
26 |     sams = "sams.pdf", testreader = "testreader.pdf", tex = "tex.pdf",
27 |     rcpp = "rcpp.pdf"
28 |   ),
29 |   # system.file() gives "" when the file cannot be found in the package.
30 |   function(pdf_name) system.file("extdata", pdf_name, package = "PDFR")
31 | )
32 | 


--------------------------------------------------------------------------------
/R/utils.R:
--------------------------------------------------------------------------------
 1 | # Register these names via utils::globalVariables() so R CMD check does not
 2 | # flag them as undefined (presumably column names used elsewhere -- confirm).
 3 | utils::globalVariables(c(
 4 |   "X", "Y", "bottom", "box", "fill", "left", "midx", "midy", "poly", "right",
 5 |   "size", "stroke", "text", "top", "xmax", "xmin", "ymax", "ymin"
 6 | ))
 7 | 
 8 | #' Abort unless `pdf` is a raw vector or a single ".pdf" path
 9 | #' @param pdf Object to check
10 | #' @param call Environment forwarded to `cli_abort()` as its `call` argument
11 | #' @keywords internal
12 | #' @noRd
13 | check_pdf <- function(pdf, call = caller_env()) {
14 |   # A character input must be exactly one string ending in ".pdf" (any case).
15 |   is_path <- is_character(pdf)
16 |   wrong_type <- !is_raw(pdf) && is_false(is_path)
17 |   too_many <- is_path && is_min_length(pdf)
18 |   bad_ext <- is_path && !is_pdf_fileext(pdf[1])
19 |   # Any single failure triggers the same error message.
20 |   if (wrong_type || too_many || bad_ext) {
21 |     cli_abort(
22 |       "{.arg pdf} must be a single path to a valid pdf file or a raw vector
23 |       string, not {.obj_type_friendly {pdf}}.",
24 |       call = call
25 |     )
26 |   }
27 | }
28 | 
29 | #' Test whether strings end in a ".pdf" file extension
30 | #'
31 | #' @param x Character vector (or coercible object) of candidate file names.
32 | #' @inheritParams base::grepl
33 | #' @keywords internal
34 | #' @noRd
35 | is_pdf_fileext <- function(x, ignore.case = TRUE) {
36 |   grepl(pattern = "[.]pdf$", x = x, ignore.case = ignore.case)
37 | }
38 | 
39 | #' Does x contain a file separator character?
40 | #'
41 | #' @param x Character vector to search for a file separator.
42 | #' @param fsep Literal separator, not a regex. Defaults to `.Platform$file.sep`
43 | #' @keywords internal
44 | #' @noRd
45 | is_fsep_path <- function(x, fsep = .Platform$file.sep) {
46 |   grepl(fsep, x, fixed = TRUE) # literal match: "\\" alone is an invalid regex
47 | }
48 | 
49 | #' Is x at least n elements long?
50 | #'
51 | #' @param x Object whose length is checked.
52 | #' @param n Minimum length for which `TRUE` is returned. Defaults to 2.
53 | #' @keywords internal
54 | #' @noRd
55 | is_min_length <- function(x, n = 2) {
56 |   n <= length(x)
57 | }
58 | 


--------------------------------------------------------------------------------
/src/external/Profiler.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | #include <stdio.h>
 3 | #include <functional>
 4 | #include <chrono>
 5 | #include <thread>
 6 | #include <mutex>
 7 | #include <set>
 8 | 
 9 | #ifdef PROFILER_PDFR
10 | // Two-step paste: __LINE__ has to pass through one extra macro level to be
11 | // expanded before ## joins the tokens. Pasting it directly produces the
12 | // literal identifier `__node__LINE__`, so two uses in one scope collide.
13 | #define PROFC_CONCAT_INNER_(a, b) a##b
14 | #define PROFC_CONCAT_(a, b) PROFC_CONCAT_INNER_(a, b)
15 | 
16 | // Times the enclosing scope under `name`. A function-local static ProfileNode
17 | // accumulates every pass through the scope, the node is registered with
18 | // TheNodeList, and a ScopedTimer reports the elapsed time to the node when
19 | // the scope is left.
20 | #define PROFC_NODE(name)                                                \
21 |   static ProfileNode PROFC_CONCAT_(profc_node_, __LINE__)(name);        \
22 |   TheNodeList::Instance().AddNode(                                      \
23 |       &PROFC_CONCAT_(profc_node_, __LINE__));                           \
24 |   ScopedTimer PROFC_CONCAT_(profc_timer_, __LINE__)(std::bind(          \
25 |       &ProfileNode::Accumulate, &PROFC_CONCAT_(profc_node_, __LINE__),  \
26 |       std::placeholders::_1));
15 | 
16 | // Accumulates the call count and total elapsed time recorded for one named
17 | // profiling site. Accumulate() is serialised by a mutex.
18 | class ProfileNode {
19 |  public:
20 |   // Starts with no calls and zero elapsed time. The duration is zeroed
21 |   // explicitly because a default-constructed duration is not guaranteed to
22 |   // hold zero.
23 |   explicit ProfileNode(const std::string& name)
24 |       : name_(name), count_(0), elapsed_us_(0) {
25 |   }
26 |   // Adds one timed call of duration `us` to the running totals.
27 |   void Accumulate(std::chrono::microseconds us) {
28 |     // lock_guard releases the mutex on every exit path.
29 |     std::lock_guard<std::mutex> guard(the_lock_);
30 |     count_++;
31 |     elapsed_us_ += us;
32 |   }
33 |   // Prints one table row: name, call count, total milliseconds and mean
34 |   // microseconds per call.
35 |   void Print() {
36 |     // A node that was registered but never timed has no calls to average
37 |     // over; report 0 instead of dividing by zero.
38 |     const int mean_us =
39 |         count_ > 0 ? static_cast<int>(elapsed_us_.count() / count_) : 0;
40 |     printf(
41 |         "%-25s %10d %10dms %10dus\n", name_.c_str(), count_,
42 |         static_cast<int>(std::chrono::duration_cast<std::chrono::milliseconds>(
43 |                              elapsed_us_).count()),
44 |         mean_us);
45 |   }
46 | 
47 |  private:
48 |   std::string name_;                      // Label shown in the table
49 |   int count_;                             // Number of Accumulate() calls
50 |   std::chrono::microseconds elapsed_us_;  // Sum of all reported durations
51 |   std::mutex the_lock_;                   // Guards count_ and elapsed_us_
52 | };
40 | 
41 | // Measures its own lifetime: the clock is read on construction and again on
42 | // destruction, and the difference is passed to the callback in microseconds.
43 | class ScopedTimer {
44 |  public:
45 |   // `callback` receives the elapsed time when the timer is destroyed.
46 |   explicit ScopedTimer(std::function<void(std::chrono::microseconds)> callback)
47 |       : callback_(callback), start_(std::chrono::steady_clock::now()) {
48 |   }
49 |   ~ScopedTimer() {
50 |     // steady_clock is monotonic, so the interval cannot come out negative or
51 |     // inflated when the wall clock is adjusted while the timer is running.
52 |     const auto elapsed = std::chrono::steady_clock::now() - start_;
53 |     callback_(std::chrono::duration_cast<std::chrono::microseconds>(elapsed));
54 |   }
55 |   // Non-copyable: a copy would report the same interval a second time.
56 |   ScopedTimer(const ScopedTimer&) = delete;
57 |   ScopedTimer& operator=(const ScopedTimer&) = delete;
58 | 
59 |  private:
60 |   std::function<void(std::chrono::microseconds)> callback_;
61 |   std::chrono::steady_clock::time_point start_;
62 | };
59 | 
60 | // Registry of every ProfileNode that has been added, with helpers to print
61 | // them as a table. The table is also printed when the registry is destroyed.
62 | class TheNodeList {
63 |  public:
64 |   // Returns the single shared registry, created on first use.
65 |   static TheNodeList& Instance() {
66 |     static TheNodeList nodes;
67 |     return nodes;
68 |   }
69 |   // NOTE(review): the destructor dereferences the stored pointers; confirm
70 |   // that every registered node outlives the registry.
71 |   ~TheNodeList() {
72 |     Print();
73 |   }
74 |   // Registers a node; adding the same pointer again has no effect because
75 |   // the container is a set. No locking is done here.
76 |   void AddNode(ProfileNode* node) {
77 |     nodes_.insert(node);
78 |   }
79 |   // Writes the table header followed by one row per registered node.
80 |   void Print() {
81 |     printf("--------------------------------------------------------------\n");
82 |     printf("name                           count      elapsed      us/call\n");
83 |     for (ProfileNode* entry : nodes_) {
84 |       entry->Print();
85 |     }
86 |   }
87 |   // Both of these simply forward to Print().
88 |   void printNodeList() { Print(); }
89 |   void endprofiler() { Print(); }
90 | 
91 |  private:
92 |   std::set<ProfileNode*> nodes_;
93 | };
83 | #endif
84 | 
85 | #ifndef PROFILER_PDFR  // profiling switched off
86 | #define PROFC_NODE(name)  ((void)0);  // expands to a statement that does nothing
87 | #endif
88 | 


--------------------------------------------------------------------------------
/R/RcppExports.R:
--------------------------------------------------------------------------------
  1 | # Generated by using Rcpp::compileAttributes() -> do not edit by hand
  2 | # Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393
  3 | 
  4 | .get_xref <- function(file_name) {
  5 |     .Call(`_PDFR_GetXrefFromString`, file_name)
  6 | }
  7 | 
  8 | .get_xrefraw <- function(raw_file) {
  9 |     .Call(`_PDFR_GetXrefFromRaw`, raw_file)
 10 | }
 11 | 
 12 | .get_obj <- function(file_name, object_number) {
 13 |     .Call(`_PDFR_GetObjectFromString`, file_name, object_number)
 14 | }
 15 | 
 16 | .get_objraw <- function(raw_file, object_number) {
 17 |     .Call(`_PDFR_GetObjectFromRaw`, raw_file, object_number)
 18 | }
 19 | 
 20 | .pdfpage <- function(file_name, page_number, each_glyph) {
 21 |     .Call(`_PDFR_GetPdfPageFromString`, file_name, page_number, each_glyph)
 22 | }
 23 | 
 24 | .pdfpageraw <- function(raw_file, page_number, atoms) {
 25 |     .Call(`_PDFR_GetPdfPageFromRaw`, raw_file, page_number, atoms)
 26 | }
 27 | 
 28 | .getglyphmap <- function(file_name, page_number) {
 29 |     .Call(`_PDFR_GetGlyphMap`, file_name, page_number)
 30 | }
 31 | 
 32 | .pagestring <- function(file_name, page_number) {
 33 |     .Call(`_PDFR_GetPageStringFromString`, file_name, page_number)
 34 | }
 35 | 
 36 | .pagestringraw <- function(raw_file, page_number) {
 37 |     .Call(`_PDFR_GetPageStringFromRaw`, raw_file, page_number)
 38 | }
 39 | 
 40 | .pdfdoc <- function(file_name) {
 41 |     .Call(`_PDFR_GetPdfDocumentFromString`, file_name)
 42 | }
 43 | 
 44 | .pdfdocraw <- function(file_name) {
 45 |     .Call(`_PDFR_GetPdfDocumentFromRaw`, file_name)
 46 | }
 47 | 
 48 | .pdfboxesString <- function(file_name, page_number) {
 49 |     .Call(`_PDFR_GetPdfBoxesFromString`, file_name, page_number)
 50 | }
 51 | 
 52 | .pdfboxesRaw <- function(file_name, page_number) {
 53 |     .Call(`_PDFR_GetPdfBoxesFromRaw`, file_name, page_number)
 54 | }
 55 | 
 56 | .GetPaths <- function(file_name, page_number) {
 57 |     .Call(`_PDFR_GetPaths`, file_name, page_number)
 58 | }
 59 | 
 60 | .GetGrobs <- function(file_name, page_number) {
 61 |     .Call(`_PDFR_GetGrobs`, file_name, page_number)
 62 | }
 63 | 
 64 | ReadFontTable <- function(raw) {
 65 |     .Call(`_PDFR_ReadFontTable`, raw)
 66 | }
 67 | 
 68 | GetFontFileHeader <- function(raw) {
 69 |     .Call(`_PDFR_GetFontFileHeader`, raw)
 70 | }
 71 | 
 72 | GetFontFileCMap <- function(raw) {
 73 |     .Call(`_PDFR_GetFontFileCMap`, raw)
 74 | }
 75 | 
 76 | GetFontFileMaxp <- function(raw) {
 77 |     .Call(`_PDFR_GetFontFileMaxp`, raw)
 78 | }
 79 | 
 80 | GetFontFileLoca <- function(raw) {
 81 |     .Call(`_PDFR_GetFontFileLoca`, raw)
 82 | }
 83 | 
 84 | GetFontFileGlyph <- function(raw, glyph) {
 85 |     .Call(`_PDFR_GetFontFileGlyph`, raw, glyph)
 86 | }
 87 | 
 88 | GetFontFilePostTable <- function(raw) {
 89 |     .Call(`_PDFR_GetFontFilePostTable`, raw)
 90 | }
 91 | 
 92 | GetFontFileNameTable <- function(raw) {
 93 |     .Call(`_PDFR_GetFontFileNameTable`, raw)
 94 | }
 95 | 
 96 | GetFontFileOS2Table <- function(raw) {
 97 |     .Call(`_PDFR_GetFontFileOS2Table`, raw)
 98 | }
 99 | 
100 | .stopCpp <- function() {
101 |     invisible(.Call(`_PDFR_stopCpp`))
102 | }
103 | 
104 | 


--------------------------------------------------------------------------------
/src/letter_grouper.h:
--------------------------------------------------------------------------------
 1 | //----------------------------------------------------------------------------//
 2 | //                                                                            //
 3 | //  PDFR LetterGrouper header file                                            //
 4 | //                                                                            //
 5 | //  Copyright (C) 2018 - 2019 by Allan Cameron                                //
 6 | //                                                                            //
 7 | //  Licensed under the MIT license - see https://mit-license.org              //
 8 | //  or the LICENSE file in the project root directory                         //
 9 | //                                                                            //
10 | //----------------------------------------------------------------------------//
11 | 
12 | #ifndef PDFR_LGROUPER
13 | 
14 | //----------------------------------------------------------------------------//
15 | 
16 | #define PDFR_LGROUPER
17 | 
18 | /* The LetterGrouper class co-ordinates the grouping of glyphs into words. In
19 |  * terms of program structure, this comes directly after the parser step that
20 |  * reads the page description program. The goal of this class is to clump
21 |  * adjoining glyphs to form strings. Mostly, these will form words, but if
22 |  * actual spaces are included as glyphs then grouped strings of words will be
23 |  * included in the output.
24 |  *
25 |  * This is the first step of a "meet-in-the-middle" document reconstruction,
26 |  * which will use these strings as the atoms from which to form structures such
27 |  * as paragraphs, headers and tables.
28 |  */
29 | 
30 | #include "textbox.h"
31 | 
32 | //----------------------------------------------------------------------------//
33 | // The LetterGrouper class has a constructor that takes ownership of a
34 | // TextBox, and two output methods: Output() hands back a TextBox and Out()
35 | // returns a TextTable. Its private methods are used only in construction of
36 | // the output. The main private member is a map of vectors of TextPointers,
37 | // each vector representing all glyphs in one of 256 equally sized cells on
38 | // the page. Each glyph is addressable by two numbers - the grid number and
39 | // the position of the glyph in the cell's vector.
40 | 
41 | class LetterGrouper
42 | {
43 |  public:
44 |   using TextPointer = std::shared_ptr<TextElement>;
45 |   // Constructor: receives the TextBox by unique_ptr, i.e. takes ownership.
46 |   LetterGrouper(std::unique_ptr<TextBox>);
47 | 
48 |   // Passes text elements to WordGrouper for further construction if needed
49 |   std::unique_ptr<TextBox> Output();
50 |   TextTable Out(); // output table to interface if ungrouped words needed
51 | 
52 |  private:
53 |   // Uniquely owned TextBox; presumably the parser output given to the ctor
54 |   std::unique_ptr<TextBox> text_box_;
55 | 
56 |   // Main data member - a 16 x 16 grid of cells, each with a TextPointer vector
57 |   std::unordered_map<uint8_t, std::vector<TextPointer>> grid_;
58 | 
59 |   // private methods
60 |   void MakeGrid_();                       // Assigns glyphs to a 16 x 16 grid
61 |   void CompareCells_();                   // Co-ordinates matching between cells
62 |   void MatchRight_(TextPointer, uint8_t); // Compares all glyphs in cell
63 |   void Merge_();                          // Joins matching glyphs together
64 | };
65 | 
66 | //----------------------------------------------------------------------------//
67 | 
68 | #endif
69 | 


--------------------------------------------------------------------------------
/src/deflate.h:
--------------------------------------------------------------------------------
 1 | //---------------------------------------------------------------------------//
 2 | //                                                                           //
 3 | //  PDFR Deflate header file                                                 //
 4 | //                                                                           //
 5 | //  Copyright (C) 2018 - 2019 by Allan Cameron                               //
 6 | //                                                                           //
 7 | //  Licensed under the MIT license - see https://mit-license.org             //
 8 | //  or the LICENSE file in the project root directory                        //
 9 | //                                                                           //
10 | //---------------------------------------------------------------------------//
11 | 
12 | #ifndef PDFR_DEFLATE
13 | 
14 | //---------------------------------------------------------------------------//
15 | 
16 | #define PDFR_DEFLATE
17 | 
18 | #include<map>
19 | #include "streams.h"
20 | 
21 | std::string FlateDecode(std::string* message);
22 | std::string FlateDecode(const CharString& message);
23 | 
24 | //---------------------------------------------------------------------------//
25 | // This class reinvents the wheel in an attempt to free the library from
26 | // dependencies. It is a full implementation of Deflate decompression. It uses
27 | // std::unordered_map for storing and looking up Huffman trees and inherits
28 | // from Stream to give it an easy interface to the underlying stream. Only the
29 | // constructors are declared public here.
30 | 
31 | class Deflate : public Stream
32 | {
33 |  public:
34 |   // String and byte-vector constructors. The latter converts to a string.
35 |   Deflate(const std::string*);
36 |   Deflate(const CharString&);
37 | 
38 |  private:
39 |   bool is_last_block_;    // Flag so decompressor knows when to stop
40 | 
41 |   // The fixed literal and distance maps are used if compression used a
42 |   // fixed dictionary. Usually this only happens with short messages.
43 |   static const std::unordered_map<uint32_t, uint32_t> fixed_literal_map_;
44 |   static const std::unordered_map<uint32_t, uint32_t> fixed_distance_map_;
45 | 
46 |   // If we come across a length code or a distance code, we need to know
47 |   // how many extra bytes to read. This is looked up in these tables.
48 |   static const std::vector<uint32_t> length_table_;
49 |   static const std::vector<uint32_t> distance_table_;
50 | 
51 |   // Whether it's fixed or dynamic compression, we want to end up with a
52 |   // literal and distance map that we can look up.
53 |   std::unordered_map<uint32_t, uint32_t> literal_map_;
54 |   std::unordered_map<uint32_t, uint32_t> distance_map_;
55 | 
56 |   void CheckHeader_();             // Read first two bytes to ensure valid
57 |   void ReadBlock_();               // Co-ordinates reading of a single block
58 |   void BuildDynamicCodeTable_();   // Builds lookup tables for each block
59 |   void ReadCodes_();               // Actual reading of compressed data
60 |   void HandlePointer_(uint32_t);   // Deals with length & distance pointers
61 | 
62 |   // Finds the next code in the input stream using given lookup table
63 |   uint32_t ReadCode_(std::unordered_map<uint32_t, uint32_t>&);
64 | 
65 |   // Creates a Huffman tree from a vector of bit lengths.
66 |   std::unordered_map<uint32_t, uint32_t>
67 |     Huffmanize_(const std::vector<uint32_t>&);
68 | };
69 | 
70 | 
71 | #endif
72 | 


--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
 1 | # Contributing
 2 | 
 3 | I am an amateur, with little time to spend on this project and no experience whatsoever in C++. If you are interested in contributing, I would be delighted to hear from you: please [get in touch](mailto:Allan.Cameron@nhs.net).
 4 | 
 5 | To keep the C++ codebase consistent, I will declare some coding conventions here. This is mostly for my own benefit, but I would ask any contributors to keep to these conventions too where possible.
 6 | 
 7 | # Conventions
 8 | 
 9 | ## Naming conventions
10 | - All variable names are written in `snake_case` with no capitals.
11 | - Aim for descriptive names over saving horizontal space, e.g. `temporary_byte_vector` is better than `tmpvec`
12 | - Prefer named iterators in a loop rather than `i`; e.g. `for (size_t entry = 0; entry < table.size(); ++entry)` unless there is no meaningful name to apply.
13 | - All private data members are suffixed with a single underscore: `private_member_`
14 | - All function / method names are written in `CamelCase`.
15 | - Class, struct, enum and type names are written in `CamelCase`.
16 | - Suffix private methods with an underscore - `MyPrivateMethod_();`
17 | - Use descriptive variable names in class method declarations, as these will help document the class user interface. Private methods don't always need a variable name, or can be short descriptive names if preferred.
18 | 
19 | The following code block demonstrates most of these naming conventions:
20 | 
21 | ```cpp
22 | //---------------------------------------------------------------------------//
23 | // Method to make things OK
24 | 
25 | std::string MakeEverythingOK(std::string input_string)
26 | {
27 |   std::string ok_suffix = " is OK";
28 |   input_string.append(ok_suffix);
29 |   return input_string;
30 | }
31 | 
32 | //---------------------------------------------------------------------------//
33 | // Make the data member OK
34 | 
35 | void MyClass::MakeAnOKMember_(const std::string& input_string)
36 | {
37 |   ok_data_member_ = MakeEverythingOK(input_string);
38 | }
39 | 
40 | //---------------------------------------------------------------------------//
41 | ```
42 | 
43 | 
44 | ## Comments
45 | - Every file begins with the MIT license header
46 | - Most comments should have the single-line `//` format rather than `/* Multi-line */` type. An exception can be made for large introductory comments explaining the rationale for a class at the top of a header file, just below the license.
47 | - Prefer verbose comments, even though the naming rules should make the code largely self-commenting. It takes less time to understand what's going on if things are well commented. Of course, we want to avoid being silly with the likes of
48 | ```cpp
49 | return result; // Returns the result
50 | ```
51 | but the general rule is, if it is quicker to understand it with a comment, the comment goes in.
52 | 
53 | ## Layout
54 | - Indentation is in [Allman style](https://en.wikipedia.org/wiki/Indentation_style#Allman_style). Yes it wastes vertical space, but I just find it more readable.
55 | - Indentation is with two spaces. No tabs allowed.
56 | - The maximum line width is 80 characters. No exceptions.
57 | - All function definitions are separated by an 80-character comment line as shown in the snippet above, with a brief description commented below, a line break, then the function, followed by a line break then the next comment line.
58 | - Class definitions are declared public members first, then private members.
59 | - The keywords `public:` and `private:` in a class definition get a single space indentation.
60 | 


--------------------------------------------------------------------------------
/CODE_OF_CONDUCT.md:
--------------------------------------------------------------------------------
 1 | # Contributor Covenant Code of Conduct
 2 | 
 3 | ## Our Pledge
 4 | 
 5 | In the interest of fostering an open and welcoming environment, we as
 6 | contributors and maintainers pledge to making participation in our project and
 7 | our community a harassment-free experience for everyone, regardless of age, body
 8 | size, disability, ethnicity, sex characteristics, gender identity and expression,
 9 | level of experience, education, socio-economic status, nationality, personal
10 | appearance, race, religion, or sexual identity and orientation.
11 | 
12 | ## Our Standards
13 | 
14 | Examples of behavior that contributes to creating a positive environment
15 | include:
16 | 
17 | * Using welcoming and inclusive language
18 | * Being respectful of differing viewpoints and experiences
19 | * Gracefully accepting constructive criticism
20 | * Focusing on what is best for the community
21 | * Showing empathy towards other community members
22 | 
23 | Examples of unacceptable behavior by participants include:
24 | 
25 | * The use of sexualized language or imagery and unwelcome sexual attention or
26 |  advances
27 | * Trolling, insulting/derogatory comments, and personal or political attacks
28 | * Public or private harassment
29 | * Publishing others' private information, such as a physical or electronic
30 |  address, without explicit permission
31 | * Other conduct which could reasonably be considered inappropriate in a
32 |  professional setting
33 | 
34 | ## Our Responsibilities
35 | 
36 | Project maintainers are responsible for clarifying the standards of acceptable
37 | behavior and are expected to take appropriate and fair corrective action in
38 | response to any instances of unacceptable behavior.
39 | 
40 | Project maintainers have the right and responsibility to remove, edit, or
41 | reject comments, commits, code, wiki edits, issues, and other contributions
42 | that are not aligned to this Code of Conduct, or to ban temporarily or
43 | permanently any contributor for other behaviors that they deem inappropriate,
44 | threatening, offensive, or harmful.
45 | 
46 | ## Scope
47 | 
48 | This Code of Conduct applies both within project spaces and in public spaces
49 | when an individual is representing the project or its community. Examples of
50 | representing a project or community include using an official project e-mail
51 | address, posting via an official social media account, or acting as an appointed
52 | representative at an online or offline event. Representation of a project may be
53 | further defined and clarified by project maintainers.
54 | 
55 | ## Enforcement
56 | 
57 | Instances of abusive, harassing, or otherwise unacceptable behavior may be
58 | reported by contacting the project team at Allan.Cameron@nhs.net. All
59 | complaints will be reviewed and investigated and will result in a response that
60 | is deemed necessary and appropriate to the circumstances. The project team is
61 | obligated to maintain confidentiality with regard to the reporter of an incident.
62 | Further details of specific enforcement policies may be posted separately.
63 | 
64 | Project maintainers who do not follow or enforce the Code of Conduct in good
65 | faith may face temporary or permanent repercussions as determined by other
66 | members of the project's leadership.
67 | 
68 | ## Attribution
69 | 
70 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4,
71 | available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html
72 | 
73 | [homepage]: https://www.contributor-covenant.org
74 | 
75 | For answers to common questions about this code of conduct, see
76 | https://www.contributor-covenant.org/faq
77 | 


--------------------------------------------------------------------------------
/src/line_grouper.h:
--------------------------------------------------------------------------------
 1 | //---------------------------------------------------------------------------//
 2 | //                                                                           //
 3 | //  PDFR LineGrouper header file                                             //
 4 | //                                                                           //
 5 | //  Copyright (C) 2018 - 2019 by Allan Cameron                               //
 6 | //                                                                           //
 7 | //  Licensed under the MIT license - see https://mit-license.org             //
 8 | //  or the LICENSE file in the project root directory                        //
 9 | //                                                                           //
10 | //---------------------------------------------------------------------------//
11 | 
12 | #ifndef PDFR_LINEGOUPER
13 | 
14 | //---------------------------------------------------------------------------//
15 | 
16 | #define PDFR_LINEGOUPER
17 | 
18 | #include "textbox.h"
19 | 
20 | 
21 | //---------------------------------------------------------------------------//
22 | /* The LineGrouper class takes the output of the whitespace class, which is
23 |  * a vector of TextBoxes - that is, a box containing a vector of text elements.
24 |  * What we want is to change this so that we have a 1:1 correspondence between
25 |  * boxes and text elements, but for the text elements to be joined-up, logical
26 |  * components of the document such as paragraphs, headers, table entries and so
27 |  * on.
28 |  *
29 |  * This requires a few different processes. First, we need to arrange all the
30 |  * text elements in the boxes into the correct "reading order". Since we have
31 |  * already split elements by whitespace, this should be a simple matter of
32 |  * sorting top to bottom and left to right.
33 |  *
34 |  * Secondly, we need to determine whether there are logical breaks between
35 |  * the lines of text, or whether they are supposed to join together. We do this
36 |  * by taking clues such as the size of line spacing and the alignment of text to
37 |  * spot paragraph breaks.
38 |  *
39 |  * Thirdly, we need to work out how lines are meant to be joined together.
40 |  * Usually, they should be joined with a space. However, if a line is to be
41 |  * joined to the one below but already ends in a space or ends in a hyphen,
42 |  * it should be joined without a space.
43 |  *
44 |  * The LineGrouper class modifies the std::vector<TextBox> class, so we only
45 |  * need to pass a pointer to this
46 |  *
47 |  */
48 | 
49 | class LineGrouper
50 | {
51 |  public:
52 |   using TextPointer = std::shared_ptr<TextElement>;
53 | 
54 |   // Built from a PageBox, which the parameter name describes as coming from
55 |   // the whitespace step
56 |   LineGrouper(PageBox page_box_from_whitespace);
57 | 
58 |   // Hands back the stored PageBox converted by its CastToTextBox() method
59 |   TextBox Output() { return text_boxes_.CastToTextBox(); }
60 | 
61 |  private:
62 |   void FindBreaks_(TextBox&);   // Identifies paragraph breaks
63 |   void LineEndings_(TextBox&);  // Adjusts line endings to facilitate pasting
64 |   void PasteLines_(TextBox&);   // Pastes TextElements in the TextBoxes together
65 | 
66 |   // Comparator for the reading order of elements in a text box: an element
67 |   // with a greater bottom value is ordered first, and between two elements
68 |   // with equal bottom values the one with the smaller left value is ordered
69 |   // first. Every other pairing compares false.
70 |   struct ReadingOrder_
71 |   {
72 |     bool operator() (const TextPointer& first, const TextPointer& second) const
73 |     {
74 |       return first->GetBottom() > second->GetBottom() ||
75 |              (first->GetBottom() == second->GetBottom() &&
76 |               first->GetLeft()   <  second->GetLeft());
77 |     }
78 |   };
79 | 
80 |   // private data member
81 |   PageBox text_boxes_;
82 | };
83 | 
84 | 
85 | //---------------------------------------------------------------------------//
86 | 
87 | #endif
88 | 


--------------------------------------------------------------------------------
/src/graphicsstate.h:
--------------------------------------------------------------------------------
 1 | //---------------------------------------------------------------------------//
 2 | //                                                                           //
 3 | //  PDFR GraphicsState header file                                           //
 4 | //                                                                           //
 5 | //  Copyright (C) 2018 - 2021 by Allan Cameron                               //
 6 | //                                                                           //
 7 | //  Licensed under the MIT license - see https://mit-license.org             //
 8 | //  or the LICENSE file in the project root directory                        //
 9 | //                                                                           //
10 | //---------------------------------------------------------------------------//
11 | 
12 | #ifndef PDFR_GS
13 | 
14 | //---------------------------------------------------------------------------//
15 | 
16 | #define PDFR_GS
17 | 
18 | #include "matrix.h"
19 | #include "page.h"
20 | #include "graphicobject.h"
21 | 
22 | /*---------------------------------------------------------------------------*/
23 | 
24 | class TextState
25 | {
26 |   public:
27 |     float tc;                            // Character spacing
28 |     float tw;                            // Word spacing
29 |     float th;                            // Horizontal scaling
30 |     float tl;                            // Text leading
31 |     float tfs;                           // Font size
32 |     float trise;                         // Text rise
33 |     std::string tf;                      // Font name
34 |     int tmode;                           // Text printing mode
35 |     std::shared_ptr<Font> current_font;  // Left empty by the constructor
36 | 
37 |     // All values start at zero / empty except horizontal scaling, which is 100
38 |     TextState() : tc(0), tw(0), th(100), tl(0), tfs(0), trise(0),
39 |                   tf(""), tmode(0), current_font() {}
40 | };
40 | 
41 | //---------------------------------------------------------------------------//
42 | 
43 | class GraphicsState  // Bundle of drawing parameters; every member is public
44 | {
45 |   public:
46 |     Matrix                   CTM;  // presumably current transformation matrix
47 |     Path                     clipping_path;  // set to a rectangle by the ctor
48 |     std::vector<std::string> colour_space_stroke,  // starts as {"/DeviceGray"}
49 |                              colour_space_fill;    // starts as {"/DeviceGray"}
50 |     std::vector<float>       colour,  // starts as {0, 0, 0}
51 |                              fill;    // starts as {0, 0, 0}
52 |     TextState                text_state;  // default-constructed TextState
53 |     Matrix                   tm_state,  // default-constructed Matrix
54 |                              td_state;  // default-constructed Matrix
55 |     float                    line_width;  // starts at 1
56 |     int                      line_cap,   // starts at 0
57 |                              line_join;  // starts at 0
58 |     float                    miter_limit;  // starts at 10.0
59 |     std::string              rendering_intent;  // "/RelativeColorimetric"
60 |     bool                     stroke_adjustment;  // starts false
61 |     std::vector<int>         dash_array;  // starts as the single element 0
62 |     std::vector<std::string> blending_mode;  // starts as {"Normal"}
63 |     std::string              soft_mask;  // starts as "None"
64 |     float                    alpha_constant;  // starts at 1.0
65 |     bool                     alpha_source;  // starts false
66 | 
67 |     GraphicsState(std::shared_ptr<Page> p) :  // p is dereferenced unchecked
68 |                       CTM(Matrix()), clipping_path(Path()),
69 |                       colour_space_stroke({"/DeviceGray"}),
70 |                       colour_space_fill({"/DeviceGray"}),
71 |                       colour({0, 0, 0}), fill({0, 0, 0}),
72 |                       text_state(TextState()), tm_state(Matrix()),
73 |                       td_state(Matrix()), line_width(1),
74 |                       line_cap(0), line_join(0), miter_limit(10.0),
75 |                       rendering_intent("/RelativeColorimetric"),
76 |                       stroke_adjustment(false),
77 |                       dash_array({0}),
78 |                       blending_mode({"Normal"}), soft_mask("None"),
79 |                       alpha_constant(1.0), alpha_source(false)
80 |     {  // the clipping path becomes the outline of the box from GetMinbox()
81 |       std::shared_ptr<Box> b = p->GetMinbox();
82 |       clipping_path.SetX({b->GetLeft(),   b->GetLeft(), b->GetRight(),
83 |                           b->GetRight(),  b->GetLeft()});  // ends at start x
84 |       clipping_path.SetY({b->GetBottom(), b->GetTop(), b->GetTop(),
85 |                           b->GetBottom(), b->GetBottom()});  // ends at start y
86 |     }
87 | 
88 | };
89 | 
90 | //---------------------------------------------------------------------------//
91 | 
92 | #endif
93 | 
94 | 
95 | 


--------------------------------------------------------------------------------
/src/box.cpp:
--------------------------------------------------------------------------------
 1 | //---------------------------------------------------------------------------//
 2 | //                                                                           //
 3 | //  PDFR box implementation file                                             //
 4 | //                                                                           //
 5 | //  Copyright (C) 2018 - 2019 by Allan Cameron                               //
 6 | //                                                                           //
 7 | //  Licensed under the MIT license - see https://mit-license.org             //
 8 | //  or the LICENSE file in the project root directory                        //
 9 | //                                                                           //
10 | //---------------------------------------------------------------------------//
11 | 
12 | 
13 | #include "box.h"
14 | using namespace std;
15 | 
16 | //---------------------------------------------------------------------------//
17 | // Every vertex of the final polygon surrounding each text element contains
18 | // information about its position. However, to "connect" the vertices so as to
19 | // arrange them in clockwise order, it also needs to know which direction
20 | // the incoming and outgoing edges are "pointing". We do this by working out
21 | // for each vertex whether there is whitespace immediately to the NorthWest,
22 | // NorthEast, SouthEast and SouthWest of the vertex. These are recorded as the
23 | // four lowest order bits in a single "flags" byte; thus a vertex that had
24 | // whitespace to the NorthWest and SouthWest (as it would if it lay along the
25 | // middle of the left edge of a text polygon), would have its flags set to
26 | // 1001 in binary (or 0x9 in hexadecimal, or a byte value of 0x09 provided we
27 | // mask the flag byte with & 0x0f). Since we know that a masked flag value of
28 | // 0x09 must represent a point lying on a left edge, its "incoming" edge must
29 | // be travelling North (since we are concerned with clockwise ordering), and
30 | // its outgoing edge must also be pointing North. To make all this clearer we
31 | // want each Vertex to specify its incoming and outgoing directions. We can
32 | // therefore just look up the four lowest order bits in each Vertex's flags
33 | // using this unordered map to get the implied direction based on the
34 | // surrounding whitespace.
35 | 
36 | unordered_map<uint8_t, pair<Direction, Direction>> Vertex::arrows_ =
37 | {
38 |   {0x00, {None, None}},   {0x01, {North, West}}, {0x02, {West, South}},
39 |   {0x03, {West, West}},   {0x04, {South, East}}, {0x05, {None, None}},
40 |   {0x06, {South, South}}, {0x07, {South, West}}, {0x08, {East, North}},
41 |   {0x09, {North, North}}, {0x0A, {None, None}},  {0x0B, {West, North}},
42 |   {0x0C, {East, East}},   {0x0D, {North, East}}, {0x0E, {East, South}},
43 |   {0x0F, {None, None}}
44 | };
45 | 
46 | //---------------------------------------------------------------------------//
47 | // Create a vertex from a given corner of the box
48 | // (0 = top-left, 1 = top-right, 2 = bottom-left, 3 = bottom-right)
49 | // Note, the given vertex is automatically flagged as being impinged at the
50 | // correct compass direction
51 | 
52 | shared_ptr<Vertex> Box::GetVertex(int corner)
53 | {
54 |   // Corners 0 to 3 each get one pre-set direction flag; any other corner
55 |   // number falls through to a zeroed vertex at the origin.
56 |   if (corner == 0) return std::make_shared<Vertex>(left_,  top_,    0x02);
57 |   if (corner == 1) return std::make_shared<Vertex>(right_, top_,    0x01);
58 |   if (corner == 2) return std::make_shared<Vertex>(left_,  bottom_, 0x04);
59 |   if (corner == 3) return std::make_shared<Vertex>(right_, bottom_, 0x08);
60 | 
61 |   // Fallback for an unrecognised corner number
62 |   return std::make_shared<Vertex>(0, 0, 0);
63 | }
64 | 
65 | //---------------------------------------------------------------------------//
66 | // Marks a box's impingement on a given vertex. This records whether moving
67 | // an arbitrarily small distance in a given direction from the vertex will
68 | // place one inside the current box.
69 | 
70 | void Box::RecordImpingementOn(Vertex& corner)
71 | {
72 |   if (IsNorthWestOf(corner)) corner.SetFlags(0x08);  // NorthWest bit
73 |   if (IsNorthEastOf(corner)) corner.SetFlags(0x04);  // NorthEast bit
74 |   if (IsSouthEastOf(corner)) corner.SetFlags(0x02);  // SouthEast bit
75 |   if (IsSouthWestOf(corner)) corner.SetFlags(0x01);  // SouthWest bit
76 | }
77 | 


--------------------------------------------------------------------------------
/src/object_class.h:
--------------------------------------------------------------------------------
 1 | //---------------------------------------------------------------------------//
 2 | //                                                                           //
 3 | //  PDFR Object header file                                                  //
 4 | //                                                                           //
 5 | //  Copyright (C) 2018 - 2019 by Allan Cameron                               //
 6 | //                                                                           //
 7 | //  Licensed under the MIT license - see https://mit-license.org             //
 8 | //  or the LICENSE file in the project root directory                        //
 9 | //                                                                           //
10 | //---------------------------------------------------------------------------//
11 | 
12 | #ifndef PDFR_OBJECT
13 | 
14 | //---------------------------------------------------------------------------//
15 | 
16 | #define PDFR_OBJECT
17 | 
18 | /* This is the fourth header file in a daisy-chain of main headers which builds
19 |  * up an interface for parsing pdf files. It comes directly after xref.h and is
20 |  * the last step before the main document class is declared.
21 |  *
22 |  * The object class comprises the data and functions needed to represent a pdf
23 |  * object. Each Object object is made of two main items of data: a
24 |  * dictionary (which can be empty), and a pair of size_t indicating the offset
25 |  * of the stream's start and stop. The reason we don't just build the stream
26 |  * is that decryption and deflation of large streams is computationally
27 |  * expensive, and we should only do it on request. As an object may be requested
28 |  * more than once however, if we have gone to the trouble of calculating the
29 |  * stream, it is stored as a private data member.
30 |  *
31 |  * Of course, for objects to have this memory of their state, they need to
32 |  * stay in scope from creation until the program exits. This is done by keeping
33 |  * a vector of retrieved objects in the document class, which persists through
34 |  * the lifetime of the program.
35 |  *
36 |  * The job of finding the object, parsing its dictionary and decoding its stream
37 |  * is abstracted away using this class, so that pdf objects can be directly
38 |  * interrogated for key:value pairs and their streams can be parsed as plain
39 |  * text where appropriate. This means that logical structures such as pages,
40 |  * fonts and form objects can be built by interfacing directly with pdf objects
41 |  * rather than indirectly through byte offsets and binary streams
42 |  */
43 | 
44 | #include "xref.h"
45 | 
46 | //---------------------------------------------------------------------------//
47 | 
48 | class Object
49 | {
50 |  public:
51 |   // Get pdf object from a given object number
52 |   Object(std::shared_ptr<const XRef> xref_ptr, int object_number);
53 | 
54 |   // Get stream object from inside the holding object, given object number
55 |   Object(std::shared_ptr<Object> holding_object_ptr, int object_number);
56 | 
57 |   // No default constructor: an Object is always built from one of the above
58 |   Object() = delete;
59 | 
60 |   // Returns an Object's stream as a string
61 |   std::string& GetStream();
62 | 
63 |   // Returns an Object's Dictionary
64 |   Dictionary& GetDictionary();
65 |   // NOTE(review): const Object& here, but the inline definition takes Object&
66 |   friend std::ostream& operator<<(std::ostream& os, const Object& obj);
67 | 
68 |  private:
69 |   std::shared_ptr<const XRef> xref_;      // Pointer to creating xref
70 |   int object_number_;                     // The object knows its own number
71 |   Dictionary header_;                     // The object's dictionary
72 |   std::string stream_;                    // The object's stream or contents
73 |   CharString raw_stream_;                 // Start position and length of stream
74 | 
75 |   // A lookup of start / stop positions of the objects within an object stream
76 |   std::shared_ptr<std::unordered_map<int, std::pair<int, int>>> stream_index_;
77 | 
78 |   // private methods
79 |   void IndexObjectStream_();
80 |   void ReadStream_();
81 | };
82 | 
83 | //---------------------------------------------------------------------------//
84 | 
85 | inline std::ostream& operator<<(std::ostream& os, Object& obj)
86 | {
87 |   // Dictionary first, then a blank line and a heading, then the stream text
88 |   return os << obj.GetDictionary() << "\n\nStream:\n" << obj.GetStream();
89 | }
90 | 
91 | #endif
92 | 


--------------------------------------------------------------------------------
/src/word_grouper.h:
--------------------------------------------------------------------------------
 1 | //---------------------------------------------------------------------------//
 2 | //                                                                           //
 3 | //  PDFR WordGrouper header file                                            //
 4 | //                                                                           //
 5 | //  Copyright (C) 2018 - 2019 by Allan Cameron                               //
 6 | //                                                                           //
 7 | //  Licensed under the MIT license - see https://mit-license.org             //
 8 | //  or the LICENSE file in the project root directory                        //
 9 | //                                                                           //
10 | //---------------------------------------------------------------------------//
11 | 
12 | #ifndef PDFR_WGROUPER
13 | 
14 | //---------------------------------------------------------------------------//
15 | 
16 | #define PDFR_WGROUPER
17 | 
18 | /* The word grouper takes all of the words stuck together by the letter grouper
19 |  * and attempts to join them into lines of text. It does this primarily by
20 |  * identifying whether two adjacent words are close enough to be joined by a
21 |  * single space character.
22 |  *
23 |  * There are a few caveats to this. Often text will be in columns, and we don't
24 |  * want words at the right edge of one column to join to words in the adjacent
25 |  * column if they are close together. The word grouper attempts to prevent this
26 |  * by identifying words on the page whose left edges are aligned. If several
27 |  * words have matching left edges, then they probably form a left-aligned
28 |  * column. Any word with its left edge on a left-aligned column should not be
29 |  * allowed to be joined to a word to its right.
30 |  *
31 |  * This isn't perfect, since we may get false positives, when words
32 |  * coincidentally line up within a body of text. The higher we stipulate the
33 |  * number of words that must be aligned to count as a column, the less likely
34 |  * this is to happen, but we will then run the risk of false negatives, where
35 |  * adjacent columns get stuck together. Therefore, the more left edges we find
36 |  * and the higher the likelihood of a column being present, the smaller the gap
37 |  * that is allowed to be bridged.
38 |  *
39 |  * We carry out a similar process for right-aligned and centre-aligned text.
40 |  * Right-aligned text is intolerant of anything to the left joining and centre-
41 |  * aligned text is intolerant of left or right joins.
42 |  */
43 | 
44 | #include "textbox.h"
45 | 
46 | //---------------------------------------------------------------------------//
47 | // The word grouper class takes ownership of the letter grouper's output
48 | // TextBox in its constructor. It makes a table of the x values of the left,
49 | // right and centre points of each word, and uses these to infer which word
50 | // pairs are eligible for sticking together.
51 | 
52 | class WordGrouper
53 | {
54 |  public:
55 |   // Constructor - takes the main textbox as output from LetterGrouper
56 |   WordGrouper(std::unique_ptr<TextBox> output_from_lettergrouper);
57 | 
58 |   // Hand the text box on to the next phase of layout analysis (moves it out)
59 |   std::unique_ptr<TextBox> Output() { return std::move(text_box_); }
60 | 
61 |   // Output text elements with sizes, fonts, positions to API
62 |   TextTable Out() const;
63 | 
64 |  private:
65 |   // Make a table of values in a vector of floats rounded to one decimal place
66 |   void Tabulate_(const std::vector<float>&, std::unordered_map<int, size_t>&);
67 | 
68 |   // Use tabulate function to find likely left, right or mid-aligned columns
69 |   void FindEdges_();
70 | 
71 |   // Tell the text elements whether they form an edge or not
72 |   void AssignEdges_();
73 | 
74 |   // Join eligible adjacent glyphs together and merge their properties
75 |   void FindRightMatch_();
76 | 
77 | // private data members
78 |   std::unordered_map<int, size_t> left_edges_,        // The tables of edges
79 |                                   right_edges_,
80 |                                   mids_;
81 |   std::unique_ptr<TextBox>        text_box_;           // The main data member
82 | };
83 | 
84 | //---------------------------------------------------------------------------//
85 | 
86 | #endif
87 | 


--------------------------------------------------------------------------------
/src/streams.cpp:
--------------------------------------------------------------------------------
  1 | //---------------------------------------------------------------------------//
  2 | //                                                                           //
  3 | //  PDFR Streams implementation file                                         //
  4 | //                                                                           //
  5 | //  Copyright (C) 2018 - 2019 by Allan Cameron                               //
  6 | //                                                                           //
  7 | //  Licensed under the MIT license - see https://mit-license.org             //
  8 | //  or the LICENSE file in the project root directory                        //
  9 | //                                                                           //
 10 | //---------------------------------------------------------------------------//
 11 | 
 12 | /* Streams are normally compressed in PDFs, and the majority appear to be
 13 |  * compressed in DEFLATE format. I have used inheritance here with the Stream
 14 |  * class playing the role of base class and the various types of compression
 15 |  * having their own derived classes, so that the interface remains standardized
 16 |  * and new classes for each type of compression could be added as needed.
 17 |  *
 18 |  * The Stream Class itself is effectively an abstract class. Its constructor
 19 |  * is protected so it can only be called by the derived class constructors.
 20 |  */
 21 | 
 22 | #include<string>
 23 | #include<vector>
 24 | #include<iostream>
 25 | #include<stdexcept>
 26 | #include<map>
 27 | #include<algorithm>
 28 | #include "streams.h"
 29 | 
 30 | using namespace std;
 31 | 
 32 | 
 33 | Stream::Stream(const string* input)    // input is dereferenced: never null
 34 |   : input_(*input),
 35 |     output_(),                           // Output starts out empty
 36 |     input_position_(input_.begin()),     // Both iterators start at the
 37 |     output_position_(output_.begin()),   // beginning of their strings
 38 |     unconsumed_bits_(0), unconsumed_bit_value_(0) {}
 39 | 
 40 | Stream::Stream(const CharString& input)  // Same set-up, from a CharString
 41 |   : input_(input),
 42 |     output_(),                           // Output starts out empty
 43 |     input_position_(input_.begin()),     // Both iterators start at the
 44 |     output_position_(output_.begin()),   // beginning of their strings
 45 |     unconsumed_bits_(0), unconsumed_bit_value_(0) {}
 46 | 
 47 | /*---------------------------------------------------------------------------*/
 48 | 
 49 | uint32_t Stream::GetByte()   // Next byte as 0-255, or 256 once input is spent
 50 | {
 51 |   const bool exhausted = (input_position_ == input_.end());
 52 |   return exhausted ? 256 : (uint8_t) *input_position_++;
 53 | }
 54 | 
 55 | 
 56 | /*---------------------------------------------------------------------------*/
 57 | 
 58 | uint32_t Stream::PeekByte()  // Next byte without consuming it; 256 at the end
 59 | {
 60 |   // Reading with GetByte() and stepping back is wrong at the end of input:
 61 |   // GetByte() doesn't advance there, so the step back would rewind a byte.
 62 |   return input_position_ == input_.end() ? 256 : (uint8_t) *input_position_;
 63 | }
 64 | 
 65 | /*---------------------------------------------------------------------------*/
 66 | 
 67 | void Stream::Reset()         // Rewinds the input and discards all output
 68 | {
 69 |   unconsumed_bits_ = 0;                // No partial byte is pending...
 70 |   unconsumed_bit_value_ = 0;           // ...so no left-over bit values
 71 |   output_.clear();
 72 |   output_position_ = output_.begin();  // Taken after clear() on purpose
 73 |   input_position_ = input_.begin();
 74 | }
 75 | 
 76 | /*---------------------------------------------------------------------------*/
 77 | 
 78 | uint32_t Stream::GetBits(uint32_t n_bits)
 79 | {
 80 |   // Accumulate in 64 bits so shifts stay defined for any n_bits up to 32
 81 |   if (n_bits > 32) throw runtime_error("Cannot read more than 32 bits");
 82 |   uint64_t value_read = unconsumed_bit_value_;
 83 |   uint8_t bits_read = unconsumed_bits_;
 84 |   while (bits_read < n_bits)
 85 |   {
 86 |     uint64_t new_byte = GetByte();
 87 |     if (new_byte == 256) throw runtime_error("Unexpected end of stream");
 88 |     value_read |= new_byte << bits_read;   // New bytes go above older bits
 89 |     bits_read += 8;
 90 |   }
 91 |   // The low n_bits are the result; whatever is left waits for the next call
 92 |   uint32_t result = (uint32_t) (value_read & ((uint64_t(1) << n_bits) - 1));
 93 |   unconsumed_bit_value_ = (uint32_t) (value_read >> n_bits);
 94 |   unconsumed_bits_ = bits_read - n_bits;
 95 |   return result;
 96 | }
 97 | 
 98 | /*---------------------------------------------------------------------------*/
 99 | 
100 | uint32_t Stream::BitFlip(uint32_t value, uint32_t n_bits)
101 | {
102 |   // Shift bits out of the low end of value into the low end of the result
103 |   uint32_t reversed = 0;
104 |   for (uint32_t remaining = n_bits; remaining > 0; --remaining, value >>= 1)
105 |   {
106 |     reversed = (reversed << 1) | (value & 1);
107 |   }
108 |   return reversed;
109 | }
110 | 


--------------------------------------------------------------------------------
/codemeta.json:
--------------------------------------------------------------------------------
  1 | {
  2 |   "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
  3 |   "@type": "SoftwareSourceCode",
  4 |   "identifier": "PDFR",
  5 |   "description": "Extracts text from PDF into an R dataframe giving the content, size, position and font of any text elements. This information can then be manipulated in R.",
  6 |   "name": "PDFR: Extract Text From PDFs In An R Friendly Way",
  7 |   "codeRepository": "https://github.com/AllanCameron/PDFR",
  8 |   "issueTracker": "https://github.com/AllanCameron/PDFR/issues",
  9 |   "license": "https://spdx.org/licenses/MIT",
 10 |   "version": "0.1.0",
 11 |   "programmingLanguage": {
 12 |     "@type": "ComputerLanguage",
 13 |     "name": "R",
 14 |     "url": "https://r-project.org"
 15 |   },
 16 |   "runtimePlatform": "R version 4.2.0 Patched (2022-05-23 r82396)",
 17 |   "author": [
 18 |     {
 19 |       "@type": "Person",
 20 |       "givenName": "Allan",
 21 |       "familyName": "Cameron",
 22 |       "email": "Allan.Cameron@nhs.net"
 23 |     }
 24 |   ],
 25 |   "contributor": [
 26 |     {
 27 |       "@type": "Person",
 28 |       "givenName": "Eli",
 29 |       "familyName": "Pousson",
 30 |       "email": "eli.pousson@gmail.com",
 31 |       "@id": "https://orcid.org/0000-0001-8280-1706"
 32 |     }
 33 |   ],
 34 |   "copyrightHolder": [
 35 |     {
 36 |       "@type": "Person",
 37 |       "givenName": "Allan",
 38 |       "familyName": "Cameron",
 39 |       "email": "Allan.Cameron@nhs.net"
 40 |     }
 41 |   ],
 42 |   "maintainer": [
 43 |     {
 44 |       "@type": "Person",
 45 |       "givenName": "Allan",
 46 |       "familyName": "Cameron",
 47 |       "email": "Allan.Cameron@nhs.net"
 48 |     }
 49 |   ],
 50 |   "softwareSuggestions": [
 51 |     {
 52 |       "@type": "SoftwareApplication",
 53 |       "identifier": "ggplot2",
 54 |       "name": "ggplot2",
 55 |       "provider": {
 56 |         "@id": "https://cran.r-project.org",
 57 |         "@type": "Organization",
 58 |         "name": "Comprehensive R Archive Network (CRAN)",
 59 |         "url": "https://cran.r-project.org"
 60 |       },
 61 |       "sameAs": "https://CRAN.R-project.org/package=ggplot2"
 62 |     },
 63 |     {
 64 |       "@type": "SoftwareApplication",
 65 |       "identifier": "testthat",
 66 |       "name": "testthat",
 67 |       "provider": {
 68 |         "@id": "https://cran.r-project.org",
 69 |         "@type": "Organization",
 70 |         "name": "Comprehensive R Archive Network (CRAN)",
 71 |         "url": "https://cran.r-project.org"
 72 |       },
 73 |       "sameAs": "https://CRAN.R-project.org/package=testthat"
 74 |     }
 75 |   ],
 76 |   "softwareRequirements": {
 77 |     "1": {
 78 |       "@type": "SoftwareApplication",
 79 |       "identifier": "R",
 80 |       "name": "R",
 81 |       "version": ">= 2.10"
 82 |     },
 83 |     "2": {
 84 |       "@type": "SoftwareApplication",
 85 |       "identifier": "cli",
 86 |       "name": "cli",
 87 |       "provider": {
 88 |         "@id": "https://cran.r-project.org",
 89 |         "@type": "Organization",
 90 |         "name": "Comprehensive R Archive Network (CRAN)",
 91 |         "url": "https://cran.r-project.org"
 92 |       },
 93 |       "sameAs": "https://CRAN.R-project.org/package=cli"
 94 |     },
 95 |     "3": {
 96 |       "@type": "SoftwareApplication",
 97 |       "identifier": "grDevices",
 98 |       "name": "grDevices"
 99 |     },
100 |     "4": {
101 |       "@type": "SoftwareApplication",
102 |       "identifier": "grid",
103 |       "name": "grid"
104 |     },
105 |     "5": {
106 |       "@type": "SoftwareApplication",
107 |       "identifier": "Rcpp",
108 |       "name": "Rcpp",
109 |       "provider": {
110 |         "@id": "https://cran.r-project.org",
111 |         "@type": "Organization",
112 |         "name": "Comprehensive R Archive Network (CRAN)",
113 |         "url": "https://cran.r-project.org"
114 |       },
115 |       "sameAs": "https://CRAN.R-project.org/package=Rcpp"
116 |     },
117 |     "6": {
118 |       "@type": "SoftwareApplication",
119 |       "identifier": "rlang",
120 |       "name": "rlang",
121 |       "provider": {
122 |         "@id": "https://cran.r-project.org",
123 |         "@type": "Organization",
124 |         "name": "Comprehensive R Archive Network (CRAN)",
125 |         "url": "https://cran.r-project.org"
126 |       },
127 |       "sameAs": "https://CRAN.R-project.org/package=rlang"
128 |     },
129 |     "SystemRequirements": "C++11"
130 |   },
131 |   "fileSize": "46223.28KB",
132 |   "readme": "https://github.com/AllanCameron/PDFR/blob/master/README.md",
133 |   "developmentStatus": "https://lifecycle.r-lib.org/articles/stages.html#experimental",
134 |   "keywords": ["pdf-format", "pdf", "extract-text", "data-scientists"]
135 | }
136 | 


--------------------------------------------------------------------------------
/src/streams.h:
--------------------------------------------------------------------------------
  1 | //---------------------------------------------------------------------------//
  2 | //                                                                           //
  3 | //  PDFR Streams header file                                                 //
  4 | //                                                                           //
  5 | //  Copyright (C) 2018 - 2019 by Allan Cameron                               //
  6 | //                                                                           //
  7 | //  Licensed under the MIT license - see https://mit-license.org             //
  8 | //  or the LICENSE file in the project root directory                        //
  9 | //                                                                           //
 10 | //---------------------------------------------------------------------------//
 11 | 
 12 | #ifndef PDFR_STREAMS
 13 | 
 14 | //---------------------------------------------------------------------------//
 15 | 
 16 | #define PDFR_STREAMS
 17 | 
 18 | /* Streams in pdf files are usually made up of a sequence of non-ascii bytes
 19 |  * intended to represent raw data. When they occur they are always part of a pdf
 20 |  * object, which will always start with a <<dictionary>>. At the end of the
 21 |  * dictionary, after the closing brackets, comes the keyword 'stream', usually
 22 |  * (?always) followed by two whitespace bytes: \r and \n. The data then begins.
 23 |  * The end of the stream is declared by the sequence (\r\nendstream).
 24 |  *
 25 |  * The data can represent many different things including pictures, fonts,
 26 |  * annotations and postscript-type page descriptions. For the purposes of text
 27 |  * extraction, it is mainly the latter we are interested in.
 28 |  *
 29 |  * The raw data in the stream is almost always compressed, so needs to be
 30 |  * decompressed before being processed. That is the purpose of the stream class.
 31 |  *
 32 |  * At present, only the flatedecode decompression algorithm is implemented.
 33 |  * I have yet to find a pdf file that uses anything else for page description
 34 |  * to allow testing.
 35 |  *
 36 |  * The possible stream types are:
 37 |  *
 38 |  *  Ascii85Stream,
 39 |  *  AsciiHexStream,
 40 |  *  DecodeStream,
 41 |  *  FlateStream,
 42 |  *  NullStream,
 43 |  *  PredictorStream,
 44 |  *  RunLengthStream,
 45 |  *  StreamsSequenceStream,
 46 |  *  StringStream,
 47 |  *  LZWStream
 48 |  *
 49 |  * This header is required by the xref class, as it needs to be able to deflate
 50 |  * xrefstreams.
 51 |  */
 52 | 
 53 | #include "utilities.h"
 54 | 
 55 | //---------------------------------------------------------------------------//
 56 | // The Stream class is the base class for the different streams used in pdfs.
 57 | // It provides a unified interface, with an input string, an output string,
 58 | // and an iterator for each. It allows for consumption of individual bytes or
 59 | // even for bits within bytes, while keeping track of its reading position and
 60 | // signalling when the end of a stream has been reached without throwing.
 61 | 
 62 | class Stream
 63 | {
 64 |  // The constructors are protected to make this an abstract class.
 65 |  protected:
 66 |   Stream(const std::string*);
 67 |   Stream(const CharString&);
 68 | 
 69 |  public:
 70 |   std::string Output(){return output_;}        // Getter for output (a copy)
 71 |   uint32_t GetByte();                          // Consumes next byte
 72 |   uint32_t PeekByte();                         // Looks but doesn't consume
 73 |   void Reset();                                // Returns stream to start
 74 |   uint32_t GetBits(uint32_t n);                // Get next n bits
 75 |   uint32_t BitFlip(uint32_t value, uint32_t);  // Reverses bit order
 76 | 
 77 |   // Appends byte to output and moves the output iterator to the new end
 78 |   void WriteOutput(uint8_t byte)
 79 |   {
 80 |     output_.append(1, (char) byte);
 81 |     output_position_ = output_.end();
 82 |   }
 83 | 
 84 |   // Writes a repeat sequence from earlier in the output to the end of the
 85 |   // output. Used in Deflate and LZW. No check that distance <= output size.
 86 |   void AppendPrevious(uint32_t distance, uint32_t len)
 87 |   {
 88 |     for (unsigned i = 0; i < len; ++i)
 89 |       WriteOutput(*(output_position_ - distance));
 90 |   }
 91 |   // NOTE(review): reserve()/shrink_to_fit() may invalidate output_position_
 92 |   void SetExpansionRatio(uint8_t r) {output_.reserve(input_.size() * r);}
 93 |   void ShrinkToFit() { output_.shrink_to_fit();}
 94 |   char GetOutput(){return *output_position_++;} // Reads, then advances iterator
 95 | 
 96 |   uint64_t GetEightBytes();
 97 | 
 98 |  private:
 99 |   CharString input_;                            // The input string
100 |   std::string output_;                          // The output string
101 |   const char* input_position_;                  // Input iterator
102 |   std::string::const_iterator output_position_; // Output iterator
103 |   uint8_t unconsumed_bits_;                     // Bit iterator
104 |   uint32_t unconsumed_bit_value_;               // Keeps track of unused bits
105 | };
106 | 
107 | #endif
108 | 
109 | 


--------------------------------------------------------------------------------
/src/tokenizer.h:
--------------------------------------------------------------------------------
  1 | //---------------------------------------------------------------------------//
  2 | //                                                                           //
  3 | //  PDFR tokenizer header file                                               //
  4 | //                                                                           //
  5 | //  Copyright (C) 2018 - 2019 by Allan Cameron                               //
  6 | //                                                                           //
  7 | //  Licensed under the MIT license - see https://mit-license.org             //
  8 | //  or the LICENSE file in the project root directory                        //
  9 | //                                                                           //
 10 | //---------------------------------------------------------------------------//
 11 | 
 12 | #ifndef PDFR_TOKEN
 13 | 
 14 | //---------------------------------------------------------------------------//
 15 | 
 16 | #define PDFR_TOKEN
 17 | 
 18 | /* The tokenizer class represents the last of our dealings with the actual
 19 |  * pdf file. After this stage, we have a complete description of the text on
 20 |  * the page including the size and position of every correctly-encoded glyph.
 21 |  * The subsequent steps will use only this data to try to reconstruct useful
 22 |  * semantic information from the text position in an attempt to provide useable
 23 |  * data, and to output the result to a variety of formats.
 24 |  *
 25 |  * The tokenizer class is used to read page description programs from the
 26 |  * page contents objects (and form xobjects). Rather than using regex to do
 27 |  * this (which is extremely slow), we use a custom-built lexer. This takes the
 28 |  * page program as a text string and goes through each character, identifying
 29 |  * tokens as it goes and storing them in a buffer until it can be decided what
 30 |  * type of token it has read. It switches state according to a finite set of
 31 |  * rules so that it knows when to pass the buffer to the parser for
 32 |  * parsing.
 33 |  *
 34 |  * Its interface is very simple - create the object by feeding it a string and
 35 |  * a pointer to the graphics state. It will tokenize the string and send it
 36 |  * to the parser for parsing
 37 |  *
 38 |  * It has a number of private members because it is a fairly complex lexer and
 39 |  * is easier to maintain as a collection of functions that pass private members
 40 |  * around, rather than one huge hairball function.
 41 |  */
 42 | 
 43 | #include "parser.h"
 44 | 
 45 | //---------------------------------------------------------------------------//
 46 | // The Tokenizer class. Its public interface is a single constructor, and the
 47 | // tokens it finds go to the supplied Parser. The private members allow state
 48 | // to be passed between member functions during instruction set creation.
 49 | 
 50 | class Tokenizer
 51 | {
 52 |  public:
 53 |   // Constructor. Takes a reference to the page description program string
 54 |   // and a pointer to a fresh Parser object
 55 |   Tokenizer(const std::string& input_string, Parser* parser);
 56 | 
 57 |  private:
 58 |   // Private data members
 59 | 
 60 |   Reader it_;                             // Supplies characters via GetChar()
 61 |   Token::TokenState state_;               // Current Tokenizer state
 62 |   Parser* interpreter_;                   // The Parser instructions are sent to
 63 |   static std::string in_loop_;            // Prevents an infinite loop
 64 | 
 65 |   // const member functions, each forwarding to the Reader
 66 |   char GetChar()         const {return it_.GetChar();}
 67 |   CharType GetCharType() const {return it_.GetCharType();}
 68 |   bool empty()           const {return it_.empty();}
 69 | 
 70 |   // private methods
 71 |   void NewSymbolState_();    //--------//---------------------------------------
 72 |   void ResourceState_();               //
 73 |   void IdentifierState_();             //
 74 |   void NumberState_();                 // These private member functions handle
 75 |   void StringState_();                 // the various states of the lexer,
 76 |   void ArrayState_();                  // responding variously to each character
 77 |   void EscapeState_();                 // they come across to build the result
 78 |   void HexStringState_();              //
 79 |   void DictionaryState_();             //
 80 |   void WaitState_();         //--------//---------------------------------------
 81 | 
 82 |   // Frequently used helper functions to update buffer and state
 83 |   void PushBuffer_(const Token::TokenState, const Token::TokenState);
 84 |   void HandleXObject_();
 85 | 
 86 |   // Some simple inlined helpers
 87 |   void NewToken_(const Token::TokenState T) {it_.Clear(); state_ = T;}
 88 | 
 89 |   void Skip_() { ++it_; it_.Clear(); }   // Advance the reader, then Clear() it
 90 | 
 91 |   void HandleLAB_()          // Skip a char; '<' next => DICT, else HEXSTRING
 92 |   {
 93 |     Skip_();
 94 |     if (GetChar() == '<') state_ = Token::DICT;
 95 |     else state_ = Token::HEXSTRING;
 96 |   }
 97 | 
 98 |   void HandleLCB_()          // Skip a char; '\' next => escape, else STRING
 99 |   {
100 |     Skip_();
101 |     if (GetChar() == '\\') EscapeState_();
102 |     else state_ = Token::STRING;
103 |   }
104 | };
105 | 
106 | //---------------------------------------------------------------------------//
107 | 
108 | #endif
109 | 


--------------------------------------------------------------------------------
/src/charstring.cpp:
--------------------------------------------------------------------------------
  1 | //---------------------------------------------------------------------------//
  2 | //                                                                           //
  3 | //  PDFR CharString implementation file                                      //
  4 | //                                                                           //
  5 | //  Copyright (C) 2018 - 2019 by Allan Cameron                               //
  6 | //                                                                           //
  7 | //  Licensed under the MIT license - see https://mit-license.org             //
  8 | //  or the LICENSE file in the project root directory                        //
  9 | //                                                                           //
 10 | //---------------------------------------------------------------------------//
 11 | 
#include "charstring.h"
#include<algorithm>
#include<cstring>
#include<stdexcept>
 14 | 
 15 | /*--------------------------------------------------------------------------*/
 16 | // Returns a pointer to the beginning of the first instance of a target
 17 | // character literal in a CharString
 18 | 
 19 | const char* CharString::find(const char* target) const
 20 | {
 21 |   int first_char = -1;
 22 |   size_t target_index = 0;
 23 | 
 24 |   for (auto it = this->begin(); it != this->end(); ++it)
 25 |   {
 26 |     if (*it == *(target + target_index))
 27 |     {
 28 |       if (first_char == -1) first_char = it - this->begin();
 29 |       ++target_index;
 30 |     }
 31 |     else
 32 |     {
 33 |       if (*(target) == *it)
 34 |       {
 35 |         first_char = it - this->begin();
 36 |         target_index = 1;
 37 |       }
 38 |       else
 39 |       {
 40 |         first_char = -1;
 41 |         target_index = 0;
 42 |       }
 43 |     }
 44 |     if (*(target + target_index) == '\0') return this->begin() + first_char;
 45 |   }
 46 |   return this->end();
 47 | }
 48 | 
 49 | const char* CharString::find(const CharString& target) const
 50 | {
 51 |   int first_char = -1;
 52 |   size_t target_index = 0;
 53 | 
 54 |   for (auto it = this->begin(); it != this->end(); ++it)
 55 |   {
 56 |     if (*it == target[target_index])
 57 |     {
 58 |       if (first_char == -1) first_char = it - this->begin();
 59 |       ++target_index;
 60 |     }
 61 |     else
 62 |     {
 63 |       if (*(target.begin()) == *it)
 64 |       {
 65 |         first_char = it - this->begin();
 66 |         target_index = 1;
 67 |       }
 68 |       else
 69 |       {
 70 |         first_char = -1;
 71 |         target_index = 0;
 72 |       }
 73 |     }
 74 |     if (target[target_index] == '\0') return this->begin() + first_char;
 75 |   }
 76 |   return this->end();
 77 | }
 78 | 
 79 | /*--------------------------------------------------------------------------*/
 80 | 
 81 | std::ostream& operator<<(std::ostream& os, const CharString& cs)
 82 | {
 83 |   for(auto it = cs.begin(); it != cs.end(); ++it) os << *it;
 84 |   return os;
 85 | }
 86 | 
 87 | /*--------------------------------------------------------------------------*/
 88 | // A CharString matches a C-string only if all characters in the two strings
 89 | // match, not including the C-string's terminal nul character.
 90 | 
 91 | bool CharString::operator==(const char* cstring) const
 92 | {
 93 |     if (length_ == 0) return false;
 94 |     for (size_t i = 0; i < length_; ++i)
 95 |     {
 96 |       if (*(begin_ + i) != *(cstring + i)) return false;
 97 |       if (*(cstring + i) == '\0') return false;
 98 |       if (length_ - i == 1 && *(cstring + i + 1) != '\0') return false;
 99 |     }
100 |     return true;
101 | }
102 | 
103 | /*--------------------------------------------------------------------------*/
104 | 
105 | bool CharString::operator==(const CharString& other) const
106 | {
107 |   if (length_ != other.length_) return false;
108 |   if (begin_ == other.begin_) return true;
109 |   for (size_t i = 0; i < length_; ++i)
110 |   {
111 |     if (*(begin_ + i) != other[i]) return false;
112 |   }
113 |   return true;
114 | }
115 | 
116 | /*--------------------------------------------------------------------------*/
117 | 
118 | bool CharString::operator==(const std::string& stdstring) const
119 | {
120 |   if (length_ != stdstring.size()) return false;
121 |   for (size_t i = 0; i < length_; ++i)
122 |   {
123 |     if (*(begin_ + i) != stdstring[i]) return false;
124 |   }
125 |   return true;
126 | }
127 | 
128 | /*--------------------------------------------------------------------------*/
129 | 
130 | CharString CharString::substr(size_t start, size_t length) const
131 | {
132 |   if (start >= this->size())
133 |   {
134 |     throw std::runtime_error("Invalid substring range in CharString::substr");
135 |   }
136 | 
137 |   if (start + length > this->size())
138 |   {
139 |     length = this->size() - start;
140 |   }
141 | 
142 |   return CharString(this->begin(), start, start + length);
143 | }
144 | 
145 | CharString CharString::CarveOut(const char* left, const char* right) const
146 | {
147 |   size_t leftsize = 0;
148 |   while(*(leftsize + left)) ++leftsize;
149 |   const char* newstart = find(left);
150 |   if (newstart == end()) return *this; else newstart += leftsize;
151 |   CharString leftchunk(newstart, end() - newstart);
152 |   const char* newend = leftchunk.find(right);
153 |   return CharString(newstart, newend - newstart);
154 | }
155 | 


--------------------------------------------------------------------------------
/src/whitespace.h:
--------------------------------------------------------------------------------
 1 | //---------------------------------------------------------------------------//
 2 | //                                                                           //
 3 | //  PDFR whitespace header file                                              //
 4 | //                                                                           //
 5 | //  Copyright (C) 2018 - 2019 by Allan Cameron                               //
 6 | //                                                                           //
 7 | //  Licensed under the MIT license - see https://mit-license.org             //
 8 | //  or the LICENSE file in the project root directory                        //
 9 | //                                                                           //
10 | //---------------------------------------------------------------------------//
11 | 
12 | #ifndef PDFR_WSPACE
13 | 
14 | //---------------------------------------------------------------------------//
15 | 
16 | #define PDFR_WSPACE
17 | 
18 | /* This class's job is to take the output of the parser and to carry out the
19 |  * first stage of page segmentation. It does this by dividing the page into
20 |  * a large number of tall vertical strips. Any strips that encounter an
21 |  * obstruction (i.e. one or more glyphs) as they go from the top to the bottom
22 |  * of the page are divided so they do not overlap the glyphs. Thus, if there
23 |  * are n rows of text that the strip would otherwise cross, the strip is
24 |  * divided into n + 1 segments.
25 |  *
26 |  * Once the strips are all calculated, they will cover all the significant empty
27 |  * spaces (henceforth whitespace) in a document, leaving islands of text content
28 |  * uncovered. These islands are physically, and usually logically related.
29 |  *
30 |  * However, there is some work to be done to identify the islands in question.
31 |  * Firstly, we need to ensure that contiguous whitespace is joined together as
32 |  * far as possible. This is done by looking to the right of each strip segment.
33 |  * If the strip immediately to the right has the same top and bottom value, then
34 |  * it is joined to the strip to the left by reducing its left value to the
35 |  * same as the test strip, then flagging the test strip for deletion.
36 |  *
37 |  * The procedure may also leave small holes in the text islands due to
38 |  * whitespace between words and lines. We remove these based on size criteria.
39 |  *
40 |  * Once we have our final set of whitespace boxes, we look at each vertex in
41 |  * each whitespace box to determine which quadrants contain whitespace. None
42 |  * should contain zero or four quadrants, and they should all lie on either a
43 |  * page margin or the margin of a text island.
44 |  *
45 |  * The point of doing this is that we can use this information to draw a line
46 |  * clockwise around the edge of each island by identifying the configuration
47 |  * of whitespace around each vertex. Once we have drawn the polygons defining
48 |  * each island, we can then assign glyphs to be inside one of these polygons.
49 |  * This gives us a group of page segments along with the glyphs they contain.
50 |  * We can then use this information to group letters and words together,
51 |  * establish a reading order and attempt classification of text elements based
52 |  * on size, shape, position and order on the page.
53 |  */
54 | 
55 | #include "textbox.h"
56 | 
57 | //---------------------------------------------------------------------------//
58 | // The whitespace class takes a word grouper as an argument in its constructor
59 | // and from that uses a sequence of helper functions to construct its final
60 | // output, which is a vector of WS_box containing the text boxes for a page.
61 | 
62 | class Whitespace
63 | {
64 |  public:
65 |   using TextPointer = std::shared_ptr<TextElement>;
66 |   // constructor
67 |   Whitespace(std::unique_ptr<TextBox> ouput_from_wordgrouper);
68 | 
69 |   //  Output the text element groups directly
70 |   PageBox Output();
71 | 
72 |   // Output the final text box co-ordinates
73 |   std::vector<Box> WSBoxOut() const;
74 | 
75 |  private:
76 |   //The main output is a collection of pairs of text boxes with their elements
77 |   std::unique_ptr<TextBox> text_box_; // A copy of word grouper's output
78 |   std::unordered_map<size_t, std::vector<std::shared_ptr<Vertex>>> polygonmap_;
79 |   std::vector<Box> boxes_;  // Used in construction AND output
80 |   std::vector<std::shared_ptr<Vertex>> vertices_; // Used to make polygons
81 |   static const size_t DIVISIONS_ = 200; // number of strips used for whitespace
82 | 
83 |   void PageDimensions_();    // Gets page margins
84 |   void CleanAndSortBoxes_(); // Helper to remove Boxes flagged for deletion
85 |   void MakeStrips_();        // Cover the whitespace with tall thin strips
86 |   void MergeStrips_();       // merge adjacent strips into boxes
87 |   void RemoveSmall_();       // remove insufficiently tall boxes
88 |   void MakeVertices_();      // use Boxes to find vertices of polygons
89 |   void TidyVertices_();      // identify and remove the unneeded vertices
90 |   void TracePolygons_();     // trace around polygons by following vertices
91 |   void MakePolygonMap_();    // map polygons to size_t keys
92 |   void PolygonMax_();        // find bounding boxes of polygons
93 |   void RemoveEngulfed_();    // remove boxes within other boxes
94 | };
95 | 
96 | //---------------------------------------------------------------------------//
97 | 
98 | #endif
99 | 


--------------------------------------------------------------------------------
/src/line_grouper.cpp:
--------------------------------------------------------------------------------
  1 | //----------------------------------------------------------------------------//
  2 | //                                                                            //
  3 | //  PDFR LineGrouper implementation file                                      //
  4 | //                                                                            //
  5 | //  Copyright (C) 2018 - 2019 by Allan Cameron                                //
  6 | //                                                                            //
  7 | //  Licensed under the MIT license - see https://mit-license.org              //
  8 | //  or the LICENSE file in the project root directory                         //
  9 | //                                                                            //
 10 | //----------------------------------------------------------------------------//
 11 | 
 12 | #include<algorithm>
 13 | #include<unordered_map>
 14 | #include<memory>
 15 | #include<stdexcept>
 16 | #include "line_grouper.h"
 17 | 
 18 | //----------------------------------------------------------------------------//
 19 | 
 20 | using namespace std;
 21 | 
 22 | //----------------------------------------------------------------------------//
 23 | // The LineGrouper constructor takes the WordGrouper output and goes through all
 24 | // of its text boxes. If the elements within each box can be grouped together
 25 | // into a single logical component, then they are glued together into a logical
 26 | // unit. Otherwise, the box is split vertically.
 27 | 
 28 | LineGrouper::LineGrouper(PageBox text_boxes)
 29 |   : text_boxes_(text_boxes)
 30 | {
 31 |   size_t i = 0;
 32 | 
 33 |   // If there are no textboxes, there is nothing to do.
 34 |   if ( !text_boxes_.empty())
 35 |   {
 36 |     while (i < text_boxes_.size())
 37 |     {
 38 |       // There is no point processing a textbox with 0 or 1 elements.
 39 |       if (text_boxes_[i].size() < 2){++i; continue;}
 40 | 
 41 |       // Ensures the text elements are in the correct reading order in the box.
 42 |       sort(text_boxes_[i].begin(), text_boxes_[i].end(), ReadingOrder_());
 43 | 
 44 |       // Finds logical breaks within the text box and splits if needed.
 45 |       FindBreaks_(text_boxes_[i]);
 46 | 
 47 |       // After splitting, there may only be 1 element left in the box.
 48 |       if (text_boxes_[i].size() < 2){++i; continue;}
 49 | 
 50 |       // Makes sure the lines have correct final character before being pasted
 51 |       LineEndings_(text_boxes_[i]);
 52 | 
 53 |       // Pastes the text elements together
 54 |       PasteLines_(text_boxes_[i++]);
 55 |     }
 56 |   }
 57 | };
 58 | 
 59 | //----------------------------------------------------------------------------//
 60 | // Since the TextElements are now sorted by reading order, we can compare
 61 | // consecutive elements in a textbox to work out whether they belong to the
 62 | // same logical group. If they don't, then we call SplitBox_ to seperate them.
 63 | //
 64 | // This method identifies whether a new line is indented compared the previous
 65 | // line.
 66 | 
 67 | void LineGrouper::FindBreaks_(TextBox& text_box)
 68 | {
 69 |   // For each TextElement in the TextBox
 70 |   for (size_t i = 1; i < text_box.size(); ++i)
 71 |   {
 72 |     if (text_box[i]->GetBottom() < text_box[i - 1]->GetBottom() && // Below
 73 |         text_box[i]->GetLeft() - text_box[i - 1]->GetLeft() > 0.1) // To left
 74 |     {
 75 |       auto slice_at = text_box[i - 1]->GetBottom();
 76 |       auto&& new_box = text_box.SplitIntoTopAndBottom(slice_at);
 77 |       if (!new_box.empty()) text_boxes_.push_back(new_box);
 78 |       break;
 79 |     }
 80 |   }
 81 | }
 82 | 
 83 | //----------------------------------------------------------------------------//
 84 | // To join lines together properly, we normally want to add a space to seperate
 85 | // the word ending the line above and the first word of the line below. However,
 86 | // we don't want to add an extra whitespace if the line already ends in one.
 87 | // Furthermore, we don't want to add a space between the two fragments of a
 88 | // hyphenated word, and instead we should just remove the hyphen.
 89 | //
 90 | // This method handles these various possibilities
 91 | 
 92 | void LineGrouper::LineEndings_(TextBox& text_box)
 93 | {
 94 |   // For each element in the TextBox
 95 |   for (size_t i = 0; i < text_box.size() - 1; ++i)
 96 |   {
 97 |     auto& element = text_box[i];
 98 |     switch (element->GetGlyph().back())
 99 |     {
100 |       case 0x0020:                          break;
101 |       case 0x00A0:                          break;
102 |       case 0x002d: element->PopLastGlyph(); break;
103 |       case 0x2010: element->PopLastGlyph(); break;
104 |       case 0x2011: element->PopLastGlyph(); break;
105 |       case 0x2012: element->PopLastGlyph(); break;
106 |       case 0x2013: element->PopLastGlyph(); break;
107 |       case 0x2014: element->PopLastGlyph(); break;
108 |       case 0x2015: element->PopLastGlyph(); break;
109 |       default:     element->AddSpace();
110 |     }
111 |   }
112 | }
113 | 
114 | //----------------------------------------------------------------------------//
115 | // Combines the text elements into a single element with the textbox
116 | 
117 | void LineGrouper::PasteLines_(TextBox& text_box)
118 | {
119 |   for (auto& element : text_box)
120 |   {
121 |     if (&element != &(text_box[0]))
122 |     {
123 |       text_box[0]->ConcatenateUnicode(element->GetGlyph());
124 |     }
125 |   }
126 |   text_box.resize(1);
127 | }
128 | 
129 | 
130 | 


--------------------------------------------------------------------------------
/src/font.h:
--------------------------------------------------------------------------------
  1 | //---------------------------------------------------------------------------//
  2 | //                                                                           //
  3 | //  PDFR Font header file                                                    //
  4 | //                                                                           //
  5 | //  Copyright (C) 2018 - 2019 by Allan Cameron                               //
  6 | //                                                                           //
  7 | //  Licensed under the MIT license - see https://mit-license.org             //
  8 | //  or the LICENSE file in the project root directory                        //
  9 | //                                                                           //
 10 | //---------------------------------------------------------------------------//
 11 | 
 12 | #ifndef PDFR_FONT
 13 | 
 14 | //---------------------------------------------------------------------------//
 15 | 
 16 | #define PDFR_FONT
 17 | 
 18 | /* This is the seventh step of a daisy-chain of headers comprising the PDFR
 19 |  * program as described in headerMap.txt. It #includes the two files that
 20 |  * comprise the 6th step - encoding.h and glyphwidths.h.
 21 |  *
 22 |  * Most of the hard work in creating fonts has been done in the previous step -
 23 |  * working out which glyphs are intended by an input pdf string, and what size
 24 |  * those glyphs should be when printed.
 25 |  *
 26 |  * The job of the Font class is therefore to co-ordinate the process of font
 27 |  * creation by using these other two classes, then combining their results into
 28 |  * a structure that I have called a glyphmap. This is a map which maps any
 29 |  * raw character from the pdf input to a pair indicating the intended Unicode
 30 |  * output glyph and glyph width for that character code.
 31 |  *
 32 |  * Its public interface includes constructors which require a document pointer,
 33 |  * the font dictionary and the ID of the font, used as shorthand in the
 34 |  * page dictionary.
 35 |  *
 36 |  * The remainder of the public members are: a getter for the actual font name;
 37 |  * an enumerator of all the RawChars mapped in the glyphmap, and a function to
 38 |  * safely interrogate the glyphmap, returning a vector of paired Unicode code
 39 |  * points and integer widths for each glyph, given an input vector of RawChars.
 40 |  *
 41 |  */
 42 | 
 43 | #include<utility>
 44 | #include<string>
 45 | #include<vector>
 46 | #include<unordered_map>
 47 | #include<memory>
 48 | #include "truetype.h"
 49 | 
 50 | class Dictionary;
 51 | class Document;
 52 | using Unicode = uint16_t;
 53 | using RawChar = uint16_t;
 54 | 
 55 | 
 56 | //---------------------------------------------------------------------------//
 57 | // The GlyphMap is the main data member of the Font class. Although it is
 58 | // constructed from standard library components, it needs a shorthand name
 59 | 
 60 | typedef std::pair<Unicode, float> GlyphData;
 61 | typedef std::unordered_map<RawChar, GlyphData> GlyphMap;
 62 | 
 63 | //---------------------------------------------------------------------------//
 64 | // Each Font object is created and stored as an object in a pdf page, as this
 65 | // is how the pdf is logically organised. However, its public methods are
 66 | // called by other classes, which use Font objects to interpret pdf strings.
 67 | 
 68 | class Font
 69 | {
 70 |  public:
 71 |   // Constructor
 72 |   Font(std::shared_ptr<Document> document_ptr,
 73 |        Dictionary& font_dictionary_ptr,
 74 |        const std::string& id);
 75 | 
 76 |   // public methods
 77 |   std::string GetFontName();            // Gets the actual PostScript Font name
 78 |   std::vector<RawChar> GetGlyphKeys();  // Returns vector of all mapped RawChars
 79 | 
 80 |   // The most important public method is MapRawChar, which takes a vector of
 81 |   // uint16_t representing raw character codes, and returns a vector of pairs
 82 |   // containing the Unicode interpretation and its associated width
 83 |   std::vector<GlyphData> MapRawChar(const std::vector<RawChar>& raw_chars);
 84 | 
 85 | private:
 86 |   // private data members
 87 |   std::shared_ptr<Document> document_;  // - Pointer to the containing document
 88 |   Dictionary& font_dictionary_;         // - The main font dictionary
 89 |   std::string font_id_,                 // - The name the font as used in PDF
 90 |               font_name_,               // - The actual name of the font
 91 |               fontfile_;                // - The bytes making up the font
 92 |   GlyphMap glyph_map_;                  // - Main data member, mapping RawChar
 93 |                                         //   to a {Unicode, width} pair.
 94 |   std::shared_ptr<TTFont> font_data_;
 95 | 
 96 |   std::vector<Path> GetGlyphPath(RawChar ch, float x_scale, float y_scale,
 97 |                                  float x_offset, float y_offset) {
 98 |     return font_data_->ReadGlyf(ch).AsPath(x_scale, y_scale,
 99 |                                            x_offset, y_offset);
100 |   }
101 | 
102 |   std::vector<Path> GetGlyphPath(RawChar ch, float x_scale, float y_scale) {
103 |     return font_data_->ReadGlyf(ch).AsPath(x_scale, y_scale);
104 |   }
105 | 
106 |   std::vector<Path> GetGlyphPath(RawChar ch, float scale) {
107 |     return font_data_->ReadGlyf(ch).AsPath(scale, scale);
108 |   }
109 | 
110 |   // private methods
111 |   void ReadFontName_();                  // Finds the postscript font name
112 |   void MakeGlyphTable_();                // Co-ordinates font construction
113 |   void GetFontFile_();                   // Gets TTF data
114 | };
115 | 
116 | //---------------------------------------------------------------------------//
117 | 
118 | #endif
119 | 


--------------------------------------------------------------------------------
/src/charstring.h:
--------------------------------------------------------------------------------
  1 | //---------------------------------------------------------------------------//
  2 | //                                                                           //
  3 | //  PDFR CharString header file                                              //
  4 | //                                                                           //
  5 | //  Copyright (C) 2018 - 2019 by Allan Cameron                               //
  6 | //                                                                           //
  7 | //  Licensed under the MIT license - see https://mit-license.org             //
  8 | //  or the LICENSE file in the project root directory                        //
  9 | //                                                                           //
 10 | //---------------------------------------------------------------------------//
 11 | 
 12 | 
 13 | #ifndef PDFR_CHARSTRING
 14 | 
 15 | //---------------------------------------------------------------------------//
 16 | 
 17 | #define PDFR_CHARSTRING
 18 | 
 19 | #include<string>
 20 | #include<iostream>
 21 | 
 22 | // The CharString class offers cheap, read-only access to std::strings and
 23 | // C-style strings without having to copy data at any point. It is effectively
 24 | // a glorified struct{const char* string, size_t length;} with member functions
 25 | // such as operator[](), size(), begin(), end(), find() and substr() that
 26 | // function as one might expect. It can be created from a std::string, a
 27 | // const char* or a string literal, and can be compared directly for equality
 28 | // against each of these using operator==(). It has its own output stream
 29 | // method so it can be written directly to the console.
 30 | //
 31 | // Although it doesn't own any other resources, it will only remain valid as
 32 | // long as the string to which it points exists, and this can make it
 33 | // problematic unless care is taken to ensure its lifetime falls strictly
 34 | // within the lifetime of the pointed-to string.
 35 | //
 36 | // It is used in PDFR because the entire pdf file is read into the free store as
 37 | // an std::string and sits there for the duration of the parsing process. It
 38 | // is therefore a safe and efficient tool for this job.
 39 | //
 40 | // This class is a wheel that has been reinvented many times, not least by the
 41 | // C++17 addition of string_view. My guess is that string_view is much more
 42 | // efficient, safe and portable than this class, but isn't available in C++11.
 43 | // I have tried to give the member functions the same names as those in
 44 | // string_view so that the code base can be easily upgraded in the future.
 45 | //
 46 | // Most of the methods are inlined, and only those that are a bit more complex
 47 | // such as the find() and substr() methods are defined separately in the
 48 | // implementation file
 49 | 
 50 | class CharString
 51 | {
 52 | public:
 53 |   // There are several ways to construct a Charstring:
 54 | 
 55 |   // Give it a pointer, a starting offset and an endpoint
 56 |   CharString(const char* ptr, size_t start, size_t end) :
 57 |   begin_(ptr + start), length_(end - start) {}
 58 | 
 59 |   // Or just a pointer and a length
 60 |   CharString(const char* ptr, size_t length) : begin_(ptr), length_(length) {}
 61 | 
 62 |   // Or just a pointer to a zero-terminated string
 63 |   CharString(const char* ptr) :
 64 |   begin_(ptr), length_(0) { while (*(begin_ + length_)) ++length_; }
 65 | 
 66 |   // Or an std::string
 67 |   CharString(const std::string& s) : begin_(s.c_str()), length_(s.size()) {}
 68 | 
 69 |   // Or an std::string with a starting offset
 70 |   CharString(const std::string& str, size_t start) :
 71 |   begin_(str.c_str() + start), length_(str.size() - start) {}
 72 | 
 73 |   // Or another CharString
 74 |   CharString(const CharString&) = default;
 75 |   CharString& operator=(const CharString& chunk) = default;
 76 |   CharString& operator=(CharString&& chunk) noexcept = default;
 77 | 
 78 |   // Empty constructor
 79 |   CharString() : begin_(nullptr), length_(0) {}
 80 | 
 81 |   // The comparators are seperately defined
 82 |   bool operator==(const CharString& other)   const;
 83 |   bool operator==(const std::string& string) const;
 84 |   bool operator==(const char* cstring)       const;
 85 | 
 86 |   // Find and substr also require seperate definition
 87 |   const char* find(const char* target)                     const;
 88 |   const char* find(const CharString& target)               const;
 89 |   CharString substr(size_t start, size_t length)           const;
 90 |   CharString CarveOut(const char* left, const char* right) const;
 91 | 
 92 |   // The basic reading operations are all inlined
 93 |   char operator[](int index)               const {return *(begin_ + index);}
 94 |   std::string AsString()                   const {return {begin_, length_};}
 95 |   const char* begin()                      const {return begin_;}
 96 |   const char* end()                        const {return begin_ + length_;}
 97 |   char back()                              const {return *(end() - 1);}
 98 |   bool empty()                             const {return length_ == 0;}
 99 |   size_t size()                            const {return length_;}
100 |   const char* find(const std::string& str) const {return find(str.c_str());}
101 |   bool contains(const char* target)        const {return find(target) != end();}
102 |   bool contains(std::string target)        const {return find(target) != end();}
103 | 
104 | private:
105 |   const char* begin_;
106 |   size_t length_;
107 | };
108 | 
109 | // Declaration for output stream interface doesn't need to be a member
110 | std::ostream& operator<<(std::ostream& os, const CharString& charstring);
111 | 
112 | // Create a string literal CharString directly
113 | inline CharString operator "" _cs(const char* cstr) { return CharString(cstr);}
114 | 
115 | #endif
116 | 


--------------------------------------------------------------------------------
/src/matrix.h:
--------------------------------------------------------------------------------
  1 | //---------------------------------------------------------------------------//
  2 | //                                                                           //
  3 | //  PDFR Matrix header file                                                  //
  4 | //                                                                           //
  5 | //  Copyright (C) 2018 - 2021 by Allan Cameron                               //
  6 | //                                                                           //
  7 | //  Licensed under the MIT license - see https://mit-license.org             //
  8 | //  or the LICENSE file in the project root directory                        //
  9 | //                                                                           //
 10 | //---------------------------------------------------------------------------//
 11 | 
 12 | #ifndef PDFR_MATRIX
 13 | 
 14 | //---------------------------------------------------------------------------//
 15 | 
 16 | #define PDFR_MATRIX
 17 | 
#include<array>
#include<cstddef>
#include<memory>
#include<stdexcept>
#include<string>
#include<utility>
#include<vector>
 22 | 
 23 | //---------------------------------------------------------------------------//
 24 | // To define the position of elements on a page, the pdf page description
 25 | // program uses 3 * 3 matrices. These allow for arbitrary scaling, rotation,
 26 | // translation and skewing. Since the last column of a transformation
 27 | // matrix is always {0, 0,  1}, the matrices in pdf are defined by just six
 28 | // numbers in the page description program.
 29 | //
 30 | // For example, the entry "11 12 13 14 15 16 Tm" represents the following
 31 | // 3x3 transformation matrix:
 32 | //
 33 | //                      |   11    12    0  |
 34 | //                      |                  |
 35 | //                      |   13    14    0  |
 36 | //                      |                  |
 37 | //                      |   15    16    1  |
 38 | //
 39 | // The matrices all use floating point numbers and are all 3 x 3. Although we
 40 | // could just model them with a length 9 array of floats, it makes things a bit
 41 | // easier to just define a 3 x 3 float matrix here. That way, we can easily
// add or multiply two matrices using '+' and '*' instead of calling named
 43 | // functions. This is despite the fact that the underlying data member is
 44 | // a std::array<float, 9> anyway.
 45 | 
 46 | class Matrix
 47 | {
 48 | public:
 49 |   // The default constructor returns a 3 x 3 identity matrix
 50 |   Matrix(): data_(std::array<float, 9> {1.0, 0, 0, 0, 1.0, 0, 0, 0, 1.0}) {}
 51 | 
 52 |   // We can create a Matrix directly from a length-9 array of floats
 53 |   Matrix(std::array<float, 9> float_array): data_(float_array){}
 54 | 
 55 |   // This constructor takes a vector of 6 strings representing floats and
 56 |   // turns them into a 3 x 3 matrix as specified by the pdf page descriptor
 57 |   Matrix(const std::vector<std::string>& string_vector)
 58 |   {
 59 |     if (string_vector.size() < 6)
 60 |     {
 61 |       throw std::runtime_error("Can't create Matrix with fewer than 6 floats");
 62 |     }
 63 | 
 64 |     data_ = {stof(string_vector[0]), stof(string_vector[1]), 0,
 65 |              stof(string_vector[2]), stof(string_vector[3]), 0,
 66 |              stof(string_vector[4]), stof(string_vector[5]), 1};
 67 |   }
 68 | 
 69 |   // Assignment constructor
 70 |   Matrix& operator=(const Matrix& other)
 71 |   {
 72 |     this->data_ = other.data_;
 73 |     return *this;
 74 |   }
 75 | 
 76 |   // Operator overload of '*': returns dot product of two matrices
 77 |   Matrix operator*(const Matrix& other)
 78 |   {
 79 |     std::array<float, 9> new_data {};
 80 | 
 81 |     // Use indices to fill by loop
 82 |     for (size_t i = 0; i < 9; ++i)
 83 |     {
 84 |       new_data[i] = (data_[i % 3 + 0] * other.data_[3 * (i / 3) + 0] +
 85 |         data_[i % 3 + 3] * other.data_[3 * (i / 3) + 1] +
 86 |         data_[i % 3 + 6] * other.data_[3 * (i / 3) + 2] );
 87 |     }
 88 | 
 89 |     return Matrix(new_data);
 90 |   }
 91 | 
 92 |   // Transforms this matrix into the dot product of *this and t_other
 93 |   void operator*=(const Matrix& other)
 94 |   {
 95 |     std::array<float, 9> new_data {};
 96 | 
 97 |     // Use indices to fill by loop
 98 |     for (size_t i = 0; i < 9; ++i)
 99 |     {
100 |       new_data[i] = (data_[i % 3 + 0] * other.data_[3 * (i / 3) + 0] +
101 |         data_[i % 3 + 3] * other.data_[3 * (i / 3) + 1] +
102 |         data_[i % 3 + 6] * other.data_[3 * (i / 3) + 2] );
103 |     }
104 |     // Swap rather than copy the array used as the data member
105 |     std::swap(this->data_, new_data);
106 |   }
107 | 
108 |   // Overloaded + operator returns the element-by-element addition of Matrices
109 |   Matrix operator+(const Matrix& other)
110 |   {
111 |     std::array<float, 9> new_data {};
112 |     for (size_t element = 0; element < 9; ++element)
113 |     {
114 |       new_data[element] = this->data_[element] + other.data_[element];
115 |     }
116 |     return Matrix(new_data);
117 |   }
118 | 
119 |   // Transforms *this into *this + t_other using element-by-element addition
120 |   void operator+=(const Matrix& other)
121 |   {
122 |     for (size_t element = 0; element < 9; ++element)
123 |     {
124 |       this->data_[element] += other.data_[element];
125 |     }
126 |   }
127 | 
128 |   // Gets a reference to an element of the data member
129 |   float& operator[](size_t index)
130 |   {
131 |     return data_[index];
132 |   }
133 | 
134 |   std::array<float, 2> transformXY(float x, float y)
135 |   {
136 |     std::array<float, 2> result = {data_[0] * x + data_[3] * y + data_[6],
137 |                                    data_[1] * x + data_[4] * y + data_[7]};
138 |     return result;
139 |   }
140 | 
141 | private:
142 |   std::array<float, 9> data_;   // The actual data member
143 | };
144 | 
145 | //---------------------------------------------------------------------------//
146 | 
147 | #endif
148 | 


--------------------------------------------------------------------------------
/src/document.h:
--------------------------------------------------------------------------------
  1 | //---------------------------------------------------------------------------//
  2 | //                                                                           //
  3 | //  PDFR Document header file                                                //
  4 | //                                                                           //
  5 | //  Copyright (C) 2018 - 2019 by Allan Cameron                               //
  6 | //                                                                           //
  7 | //  Licensed under the MIT license - see https://mit-license.org             //
  8 | //  or the LICENSE file in the project root directory                        //
  9 | //                                                                           //
 10 | //---------------------------------------------------------------------------//
 11 | 
 12 | #ifndef PDFR_DOCUMENT
 13 | 
 14 | //---------------------------------------------------------------------------//
 15 | 
 16 | #define PDFR_DOCUMENT
 17 | 
 18 | /* This is the fifth header file in a daisy-chain of headers that build up the
 19 |  * tools required to parse pdfs. It follows just after the definition of the
 20 |  * object_class.
 21 |  *
 22 |  * The tools already in place have abstracted away decryption, decompression,
 23 |  * bytewise navigation of the file and parsing of dictionaries. The job of the
 24 |  * Document class is therefore to act as an interface to use the pdf objects
 25 |  * from which we build up logical structures such as fonts, xobjects and pages.
 26 |  *
 27 |  * The previous classes have been encapsulated as far as possible to be able to
 28 |  * work in isolation with minimal knowledge of each other. The Document class
 29 |  * in contrast acts as a creator, container and user of these objects.
 30 |  *
 31 |  * Each Document will have one and only one xref class. Instead of a pointer to
 32 |  * the xref as in other classes, the xref is actually a data member of the
 33 |  * Document class. PDF objects are created and stored in a map for easy access.
 34 |  * The file string is stored here and any other class that needs to read the
 35 |  * file accesses a pointer to the filestring held in the Document class.
 36 |  *
 37 |  * The Document class is therefore self-contained, in that after the initial
 * step of reading in the file, it has everything it needs to build up its
 39 |  * own components and interface. The logical PDF structures we go on to build
 40 |  * only need to know about the Document class, and can use it as the interface
 41 |  * they need. They "see" the pdf as a random access collection of numbered
 42 |  * objects with key:value dictionaries and uncompressed streams without being
 43 |  * concerned about how that is implemented.
 44 |  *
 45 |  * The Document also needs to have an outline of its own logical structure,
 46 |  * in terms of the pages it contains and where they are located. Part of the
 47 |  * task of Document creation is therefore to count and locate the objects
 48 |  * that act as page descriptors. It does this by finding the catalog
 49 |  * dictionary and then following pointers to dictionaries that contain
 50 |  * individual page headers. There is then a "getter" function for other classes
 51 |  * to access the dictionary pertaining to a particular page
 52 |  */
 53 | 
 54 | #include<string>
 55 | #include<vector>
 56 | #include<unordered_map>
 57 | #include<memory>
 58 | 
 59 | class Dictionary;
 60 | class XRef;
 61 | class Object;
 62 | 
 63 | //---------------------------------------------------------------------------//
// The public interface of the Document class comprises constructors and three
// member functions - one to return any object from the pdf, one to retrieve
// a specific page header, and one to list the object numbers of all the page
// headers.
 67 | 
 68 | class Document
 69 | {
 70 |  public:
 71 |   // Constructor to create Document from file path
 72 |   Document(const std::string& file_path)
 73 |    : file_string_(GetFile(file_path))
 74 |    { BuildDocument_(); }
 75 | 
 76 |   // Constructor to create Document from raw data
 77 |   Document(const std::vector<uint8_t>& byte_vector)
 78 |    : file_string_(std::string(byte_vector.begin(), byte_vector.end()))
 79 |    { BuildDocument_(); }
 80 | 
 81 | 
 82 |   // Gets a pointer to the Object specified by object_number. If the object has
 83 |   // previously been accessed, it will retrieve a pointer from the Object cache.
 84 |   // If it has not been accessed before, it will first create it. If the object
 85 |   // is inside an object stream, it will automatically add the holding object to
 86 |   // the cache as well.
 87 |   std::shared_ptr<Object> GetObject(int object_number);
 88 | 
 89 |   // Returns the main header dictionary for page specified by page_number
 90 |   Dictionary GetPageHeader(size_t page_number);
 91 | 
 92 |   // Accesses the private member containing object numbers of all page headers.
 93 |   std::vector<int> GetPageObjectNumbers() const {return page_object_numbers_;};
 94 | 
 95 |  private:
 96 |   const std::string file_string_;         // Full contents of file
 97 |   std::shared_ptr<const XRef> xref_;      // Pointer to creating XRef object
 98 |   std::vector<int> page_object_numbers_;  // The object numbers of page headers
 99 | 
100 |   // This map holds Object pointers. Since some objects may be read
101 |   // multiple times, it is best to store them when they are first created,
102 |   // then return the stored object on request rather than creating a new
103 |   // instance of the object every time it is requested.
104 |   std::unordered_map <int, std::shared_ptr<Object>> object_cache_;
105 | 
106 |   void BuildDocument_();      // The constructors use this as a common pathway
107 | 
108 |   // This function effectively builds the pages tree.
109 |   std::vector<int> ExpandKids_(const std::vector<int>& object_numbers);
110 | };
111 | 
112 | //---------------------------------------------------------------------------//
113 | 
114 | 
115 | 
116 | #endif
117 | 


--------------------------------------------------------------------------------
/src/page.h:
--------------------------------------------------------------------------------
  1 | //---------------------------------------------------------------------------//
  2 | //                                                                           //
  3 | //  PDFR Page header file                                                    //
  4 | //                                                                           //
  5 | //  Copyright (C) 2018 - 2019 by Allan Cameron                               //
  6 | //                                                                           //
  7 | //  Licensed under the MIT license - see https://mit-license.org             //
  8 | //  or the LICENSE file in the project root directory                        //
  9 | //                                                                           //
 10 | //---------------------------------------------------------------------------//
 11 | 
 12 | #ifndef PDFR_PAGE
 13 | 
 14 | //---------------------------------------------------------------------------//
 15 | 
 16 | #define PDFR_PAGE
 17 | 
 18 | /* This is the eighth in a sequence of daisy-chained headers that build up the
 19 |  * tools needed to read and parse the text content of pdfs. It comes after
 20 |  * font.h in the sequence and is the last step of constructing the logical
 21 |  * structure of pdf files.
 22 |  *
 23 |  * Each Page object represents a page in the pdf document, taking as its
 24 |  * construction parameters just a document pointer and a page number.
 25 |  *
 26 |  * The Page object acts as a container and organiser of the data required to
 27 |  * build a representation of the page. This includes the page dimensions,
 28 |  * the Font objects used on the page, any xobjects, and the contents of the
 29 |  * page (as a page description program).
 30 |  *
 31 |  * The document and pagenumber are used to find the appropriate page header
 32 |  * dictionary. This gives the page dimensions, contents and resources (such
 33 |  * as fonts and xobjects). These items are pulled in from the relevant
 34 |  * pdf objects and processed to get the data members.
 35 |  *
 36 |  * The public interface is more substantial with the Page class than with other
 37 |  * classes. The reason for this is that some of the data held by the Page class
 38 |  * may be useful to the end user rather than just being abstractions accessed
 * by other classes. Some of the downstream classes will also need members of
 40 |  * the interface however - the parser class needs to access the fonts,
 41 |  * page contents and Xobjects for example.
 42 |  */
 43 | 
 44 | #include "font.h"
 45 | #include "dictionary.h"
 46 | #include "object_class.h"
 47 | #include<list>
 48 | 
 49 | class Box;
 50 | 
 51 | //---------------------------------------------------------------------------//
 52 | 
 53 | class Page
 54 | {
 55 |  public:
 56 |   // Constructor
 57 |   Page(std::shared_ptr<Document> document_pointer, int page_number);
 58 | 
 59 |   // Move constructor
 60 |   Page(Page&& other_page) noexcept {*this = std::move(other_page);}
 61 | 
 62 |   // lvalue assignment operator
 63 |   Page& operator=(const Page& other_page)
 64 |   {
 65 |     *this = other_page;
 66 |     return *this;
 67 |   }
 68 | 
 69 |   // rvalue assignment operator
 70 |   Page& operator=(Page&& other_page) noexcept
 71 |   {
 72 |     *this = std::move(other_page);
 73 |     return *this;
 74 |   }
 75 | 
 76 |   // Returns PostScript font names
 77 |   std::vector<std::string> GetFontNames();
 78 | 
 79 |   // Returns page description program
 80 |   const std::string& GetPageContents();
 81 | 
 82 |   // Returns a pointer to the contents of an XObject used by the page
 83 |   std::shared_ptr<std::string> GetXObject(const std::string& x_object_name);
 84 | 
 85 |   // Returns a pointer to the Font object from a given font name
 86 |   std::shared_ptr<Font> GetFont(const std::string& font_name);
 87 | 
 88 |   // Returns a Box object describing the page's bounding box.
 89 |   std::shared_ptr<Box> GetMinbox() const { return minbox_;}
 90 | 
 91 |   // Since the font map is a static object, it should be cleared at the end
 92 |   // of processing any particular document. Important!
 93 |   void ClearFontMap() { fontmap_.clear(); };
 94 | 
 95 |   // Allows a dictionary to be returned either directly or via reference
 96 |   Dictionary FollowToDictionary(Dictionary&,  const std::string&);
 97 | 
 98 |   std::list<std::pair<std::string, int>> SubXobjects(int xobj_num);
 99 | 
100 |  private:
101 |   std::shared_ptr<Document>   document_;        // Pointer to main document
102 |   int                         page_number_;     // [Zero-indexed] page number
103 |   Dictionary                  header_,          // The page's header dictionary
104 |                               resources_,       // Resource sub-dictionary
105 |                               fonts_;           // Font sub-dictionary
106 |   std::shared_ptr<Box>        minbox_;          // Page bounding Box
107 |   std::string                 content_string_;  // The page PostScript program
108 |   float                       rotate_;          // Page rotation in degrees
109 | 
110 |   // A map of Xobject strings, which are fragments of page description programs
111 |   std::unordered_map<std::string, std::string> xobjects_;
112 | 
113 |   // The actual storage container for fonts, mapped to their pdf names
114 |   static std::unordered_map<std::string, std::shared_ptr<Font>> fontmap_;
115 | 
116 |   // private methods
117 |   void ReadXObjects_();     // Write form XObjects to the xobject map
118 |   void ReadBoxes_();        // Store bounding boxes and calculate the smallest
119 |   void ReadHeader_();       // Find the correct header dictionary in document
120 |   void ReadResources_();    // Obtain the resource dictionary
121 |   void ReadFonts_();        // Get font dictionary and build fontmap
122 |   void ReadContents_();     // find content objects to Write contentstring
123 | 
124 |   // Gets the leaf nodes of a content tree
125 |   std::vector<int> ExpandContents_(std::vector<int>);
126 | };
127 | 
128 | //---------------------------------------------------------------------------//
129 | 
130 | #endif
131 | 


--------------------------------------------------------------------------------
/src/font.cpp:
--------------------------------------------------------------------------------
  1 | //---------------------------------------------------------------------------//
  2 | //                                                                           //
  3 | //  PDFR Font implementation file                                            //
  4 | //                                                                           //
  5 | //  Copyright (C) 2018 - 2019 by Allan Cameron                               //
  6 | //                                                                           //
  7 | //  Licensed under the MIT license - see https://mit-license.org             //
  8 | //  or the LICENSE file in the project root directory                        //
  9 | //                                                                           //
 10 | //---------------------------------------------------------------------------//
 11 | 
 12 | #include "utilities.h"
 13 | #include "dictionary.h"
 14 | #include "glyphwidths.h"
 15 | #include "encoding.h"
 16 | #include "font.h"
 17 | #include "object_class.h"
 18 | #include "document.h"
 19 | 
 20 | //---------------------------------------------------------------------------//
 21 | 
 22 | using namespace std;
 23 | 
 24 | /*---------------------------------------------------------------------------*/
 25 | // The Font constructor simply initializes the private data members, calls
 26 | // getFontName() to get the postscript font title, and then makeGlyphTable()
 27 | // to create the main data member
 28 | 
 29 | Font::Font(shared_ptr<Document> document_ptr,
 30 |            Dictionary& font_dictionary,
 31 |            const string& font_id)
 32 |   : document_(document_ptr),
 33 |     font_dictionary_(font_dictionary),
 34 |     font_id_(font_id)
 35 | {
 36 |   ReadFontName_();
 37 |   MakeGlyphTable_();
 38 |   GetFontFile_();
 39 |   if(fontfile_.size() > 0) font_data_ = std::make_shared<TTFont>(fontfile_);
 40 | }
 41 | 
 42 | /*---------------------------------------------------------------------------*/
 43 | // Obtains the font's relevant TrueType file
 44 | 
 45 | void Font::GetFontFile_()
 46 | {
 47 |   if(font_dictionary_.ContainsReferences("/FontDescriptor"))
 48 |   {
 49 |     int descriptor_ref = font_dictionary_.GetReference("/FontDescriptor");
 50 |     std::shared_ptr<Object> descriptor = document_->GetObject(descriptor_ref);
 51 |     if(descriptor->GetDictionary().ContainsReferences("/FontFile2"))
 52 |     {
 53 |       int fontfile_ref = descriptor->GetDictionary().GetReference("/FontFile2");
 54 |       std::shared_ptr<Object> font_obj = document_->GetObject(fontfile_ref);
 55 |       fontfile_ = font_obj->GetStream();
 56 |     }
 57 |   }
 58 | }
 59 | 
 60 | /*---------------------------------------------------------------------------*/
 61 | // Obtains the font's PostScript name from the font dictionary
 62 | 
 63 | void Font::ReadFontName_()
 64 | {
 65 |   // Reads /BaseFont entry
 66 |   string base_font(font_dictionary_.GetString("/BaseFont"));
 67 | 
 68 |   if (base_font.size() > 7 && base_font[7] == '+')
 69 |   {
 70 |     font_name_ = base_font.substr(8, base_font.size() - 8);
 71 |   }
 72 |   else
 73 |   {
 74 |     font_name_ = base_font.substr(1, base_font.size() - 1);
 75 |   }
 76 | }
 77 | 
 78 | /*---------------------------------------------------------------------------*/
 79 | // Most of the work asked of an object of the Font class will be to provide
 80 | // interpretations of raw character codes, in terms of the actual glyphs and
 81 | // their sizes intended by the document. This public method allows a vector
 82 | // of raw characters to be interpreted. It returns a vector of the same length
 83 | // as the input vector, containing a pair of {Unicode glyph, width} at each
 84 | // position
 85 | 
 86 | vector<pair<Unicode, float>> Font::MapRawChar(const vector<RawChar>& raw_vector)
 87 | {
 88 |   vector<pair<Unicode, float>> result;
 89 |   result.reserve(raw_vector.size());
 90 | 
 91 |   for (const auto& raw_char : raw_vector)
 92 |   {
 93 |     auto finder = glyph_map_.find(raw_char);
 94 |     if (finder != glyph_map_.end())
 95 |     {
 96 |       result.push_back(finder->second);
 97 |     }
 98 |   }
 99 | 
100 |   return result;
101 | }
102 | 
103 | /*---------------------------------------------------------------------------*/
104 | // The Font class subcontracts most of the work of its own construction out to
105 | // the encoding and glyphwidth classes. This private method co-ordinates the
106 | // building of the glyphmap using these two component classes
107 | 
108 | void Font::MakeGlyphTable_()
109 | {
110 |   // Create Encoding object
111 |   Encoding encodings(font_dictionary_, document_);
112 | 
113 |   // Create glyphwidth object
114 |   GlyphWidths widths(font_dictionary_, document_);
115 | 
116 |   // get all the mapped RawChars from the Encoding object
117 |   auto encoding_map = encodings.GetEncodingKeys();
118 | 
119 |   // We need to know whether the width code points refer to the width of raw
120 |   // character codes or to the final Unicode translations
121 | 
122 |   // If the widths refer to RawChar code points, map every RawChar to a width
123 |   if (widths.WidthsAreForRaw())
124 |   {
125 |     for (auto& key_value_pair : *encoding_map)
126 |     {
127 |       auto& key = key_value_pair.first;
128 |       glyph_map_[key] = make_pair(encodings.Interpret(key),
129 |                                   widths.GetWidth(key));
130 |     }
131 |   }
132 |   // Otherwise widths refer to Unicode glyphs, so map each to a width
133 |   else
134 |   {
135 |     for (auto& key_value_pair : *encoding_map)
136 |     {
137 |       auto& key = key_value_pair.first;
138 |       glyph_map_[key] = make_pair(encodings.Interpret(key),
139 |                                   widths.GetWidth(encodings.Interpret(key)));
140 |     }
141 |   }
142 | }
143 | 
144 | /*---------------------------------------------------------------------------*/
145 | // Public getter for FontName
146 | 
147 | std::string Font::GetFontName()
148 | {
149 |   return font_name_;
150 | }
151 | 
152 | /*---------------------------------------------------------------------------*/
153 | // Public getter for the keys of the glyphmap, needed to output the map from
154 | // the program if required
155 | 
156 | std::vector<RawChar> Font::GetGlyphKeys()
157 | {
158 |   return GetKeys(glyph_map_);
159 | }
160 | 


--------------------------------------------------------------------------------
/src/textbox.cpp:
--------------------------------------------------------------------------------
  1 | //---------------------------------------------------------------------------//
  2 | //                                                                           //
//  PDFR TextBox implementation file                                         //
  4 | //                                                                           //
  5 | //  Copyright (C) 2018 - 2019 by Allan Cameron                               //
  6 | //                                                                           //
  7 | //  Licensed under the MIT license - see https://mit-license.org             //
  8 | //  or the LICENSE file in the project root directory                        //
  9 | //                                                                           //
 10 | //---------------------------------------------------------------------------//
 11 | 
 12 | #include "utilities.h"
 13 | #include "box.h"
 14 | #include "font.h"
 15 | #include "textbox.h"
 16 | 
 17 | using namespace std;
 18 | 
 19 | //---------------------------------------------------------------------------//
 20 | // Converts TextBox to TextTable
 21 | TextTable::TextTable(const TextBox& text_box):
 22 | Box((Box) text_box)
 23 | {
 24 |   for (auto ptr = text_box.cbegin(); ptr != text_box.cend(); ++ptr)
 25 |   {
 26 |     auto& element = *ptr;
 27 |     if (!element->IsConsumed())
 28 |     {
 29 |       this->text_.push_back(element->Utf());
 30 |       this->lefts_.push_back(element->GetLeft());
 31 |       this->bottoms_.push_back(element->GetBottom());
 32 |       this->rights_.push_back(element->GetRight());
 33 |       this->fonts_.push_back(element->GetFontName());
 34 |       this->tops_.push_back(element->GetTop());
 35 |       this->sizes_.push_back(element->GetSize());
 36 |     }
 37 |   }
 38 | }
 39 | 
 40 | //---------------------------------------------------------------------------//
 41 | 
 42 | void TextBox::RemoveDuplicates()
 43 | {
 44 |   for (auto this_row = data_.begin(); this_row != data_.end(); ++this_row)
 45 |   {
 46 |     if ((*this_row)->IsConsumed()) continue;
 47 |     for (auto other_row = this_row; other_row != data_.end(); ++other_row)
 48 |     {
 49 |       if (other_row == this_row) continue;
 50 | 
 51 |       if (**other_row == **this_row)
 52 |       {
 53 |         (*other_row)->Consume();
 54 |       }
 55 |     }
 56 |   }
 57 | }
 58 | 
 59 | //---------------------------------------------------------------------------//
 60 | // Join another text table to this one
 61 | 
 62 | void TextTable::Join(TextTable& other)
 63 | {
 64 |   this->Merge(other);
 65 |   Concatenate(this->text_,    other.text_);
 66 |   Concatenate(this->lefts_,   other.lefts_);
 67 |   Concatenate(this->bottoms_, other.bottoms_);
 68 |   Concatenate(this->rights_,  other.rights_);
 69 |   Concatenate(this->fonts_,   other.fonts_);
 70 |   Concatenate(this->tops_,    other.tops_);
 71 | }
 72 | 
 73 | 
 74 | //----------------------------------------------------------------------------//
 75 | // Divides a TextBox into two by a horizontal line given as a y value
 76 | 
 77 | TextBox TextBox::SplitIntoTopAndBottom(float top_edge)
 78 | {
 79 |   if (this->empty()) return TextBox(); // Don't split the box if it's empty
 80 | 
 81 |   // Lambda to find elements whose bottom edge is below the cutoff
 82 |   auto FindLower = [&](TextPointer text_ptr) -> bool
 83 |                    { return text_ptr->GetTop() < top_edge; };
 84 | 
 85 |   // Gets an iterator to the first element below the cutoff
 86 |   auto split_at = find_if(this->begin(), this->end(), FindLower);
 87 | 
 88 |   // We won't split the box if all or none of the elements would be moved
 89 |   // to a new box
 90 |   if (split_at == this->begin() || split_at == this->end()) return TextBox();
 91 | 
 92 |   // Create a new textbox using a vector of all elements below the cutoff
 93 |   // and a down-cast copy of the text box
 94 |   std::vector<TextPointer> lower_contents {split_at, this->end()};
 95 |   auto lower = TextBox(std::move(lower_contents), (Box) *this);
 96 | 
 97 |   // Now we can erase the lower elements we have just copied from the upper box
 98 |   this->erase(split_at, this->end());
 99 | 
100 |   // We also need to readjust the margins of our bounding boxes based on their
101 |   // new contents
102 |   this->SetBottom(this->back()->GetBottom());
103 |   lower.SetTop(lower.front()->GetTop());
104 | 
105 |   // The upper box has been changed in place,
106 |   return lower;
107 | }
108 | 
109 | //----------------------------------------------------------------------------//
110 | // Divides a TextBox into two by a vertical line given as an x value
111 | 
112 | TextBox TextBox::SplitIntoLeftAndRight(float left_edge)
113 | {
114 |   if (this->empty()) return TextBox(); // Don't split the box if it's empty
115 | 
116 |     // This lambda defines a TextPointer sort from left to right
117 |   auto LeftSort = [ ](const TextPointer& a, const TextPointer& b) -> bool
118 |                   { return a->GetLeft() < b->GetLeft(); };
119 | 
120 |   std::stable_sort(this->begin(), this->end(), LeftSort);
121 | 
122 |   // Lambda to find elements whose left edge is below the cutoff
123 |   auto FindLeftMost = [&](TextPointer text_ptr) -> bool
124 |                    { return text_ptr->GetLeft() < left_edge; };
125 | 
126 |   // Gets an iterator to the first element right of the cutoff
127 |   auto split_at = find_if(this->begin(), this->end(), FindLeftMost);
128 | 
129 |   // We won't split the box if all or none of the elements would be moved
130 |   // to a new box
131 |   if (split_at == this->begin() || split_at == this->end()) return TextBox();
132 | 
133 |   // Create a new textbox using a vector of all elements below the cutoff
134 |   // and a down-cast copy of the text box
135 |   std::vector<TextPointer> rightmost_contents {split_at, this->end()};
136 |   auto rightmost = TextBox(std::move(rightmost_contents), (Box) *this);
137 | 
138 |   // Now we can erase the lower elements we have just copied from the upper box
139 |   this->erase(split_at, this->end());
140 | 
141 |   // We also need to readjust the margins of our bounding boxes based on their
142 |   // new contents
143 |   this->SetRight(this->back()->GetRight());
144 |   rightmost.SetLeft(rightmost.front()->GetTop());
145 | 
146 |   // The upper box has been changed in place,
147 |   return rightmost;
148 | }
149 | 


--------------------------------------------------------------------------------
/README.Rmd:
--------------------------------------------------------------------------------
 1 | ---
 2 | output: github_document
 3 | ---
 4 | 
 5 | <!-- README.md is generated from README.Rmd. Please edit that file -->
 6 | 
 7 | ```{r, include = FALSE}
 8 | knitr::opts_chunk$set(
 9 |   collapse = TRUE,
10 |   comment = "#>",
11 |   fig.path = "man/figures/README-",
12 |   out.width = "100%"
13 | )
14 | ```
15 | 
16 | # PDFR
17 | 
18 | <!-- badges: start -->
19 | [![Lifecycle: experimental](https://img.shields.io/badge/lifecycle-experimental-orange.svg)](https://lifecycle.r-lib.org/articles/stages.html#experimental)
20 |  [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
21 | <!-- badges: end -->
22 | 
23 | The goal of PDFR is to aid data scientists who need the ability to extract data from files in pdf format. PDFR is a new C++ based R library to extract usable text from portable document format (pdf) files.
24 | 
25 | The majority of the code base is written in C++ with a view to being ported to other languages, but at present it is constructed to be built as an R package.
26 | 
27 | ## Installation
28 | 
29 | You can install the development version of PDFR from [GitHub](https://github.com/) with:
30 | 
31 | ``` r
32 | # install.packages("pak")
33 | pak::pkg_install("AllanCameron/PDFR")
34 | ```
35 | 
36 | ## Usage
37 | 
38 | The main function used to extract all data from a pdf page to an R data frame is `pdfpage()`. This accepts either the path to a pdf or a raw data vector representing a pdf. For example, this is how you extract all text from page 1 of the barcodes PDF from `pdfr_paths`:
39 | 
40 | ```{r}
41 | library(PDFR)
42 | 
43 | barcodes <- system.file("extdata", "barcodes.pdf", package = "PDFR")
44 | pdfpage(barcodes, 1)
45 | ```
46 | 
47 | ## Background
48 | 
49 | The current version is at an early stage of development. It will work with most pdfs, but there are some unsupported features which may lead to some pdfs producing runtime errors.
50 | 
51 | Documents encrypted using the standard method and which can be opened without a password are supported. Password-based encryption is currently unsupported.
52 | 
53 | If there are any suggestions for development please submit a feature request, or let me know about pdfs that break the package.
54 | 
55 | ## Motivation
56 | 
57 | Extracting useful data from pdf is difficult for two reasons. Firstly, the pdf format primarily consists of binary data, which is laid out in such a way as to provide quick random access to pdf *objects* as required by a pdf reader. The text elements as seen on the page are usually encoded in a binary stream within the document. Even when the binary stream is decoded, the text items exist as individual elements within a page description program, which has to be parsed before the text can be extracted. It is therefore not a trivial matter to extract the "raw text" from a pdf file into a format in which it can be read by R, though there exist some excellent tools that can do this quickly. In particular, [pdftools](https://ropensci.org/blog/2016/03/01/pdftools-and-jeroen/) provides an R interface to some of Poppler's pdf tools, and can quickly and reliably extract text wholesale from pdf. 
58 | 
59 | The second problem is that, unlike some other common file types used to exchange information on the internet (e.g. html, xml, csv, JSON), the raw text extracted from a pdf does not have a fixed structure to provide semantic information about the data to allow it to be processed easily by a data scientist. 
60 | 
 61 | Humans can read data from pdfs easily, yet the format is difficult to convert into machine-readable data. This mismatch is explained by the fact that humans use the structure of the page layout to provide the semantic context to the data. When the structure is lost (as it often is when copying and pasting from a PDF), the text becomes very difficult for a human reader to interpret. The computer does not know how to interpret the characters' positions, so it cannot classify the characters by semantics as a human reader (usually) can.
62 | 
63 | The idea behind PDFR is to try to extract raw text then use the positioning and formatting data from the extracted text to reconstruct some of the semantic content that would otherwise be lost. For example, identifying and grouping letters into words, words into paragraphs or into tables. 
64 | 
65 | Ultimately, to extract useful data, the user will need the option to control how and to what extent text elements are grouped. For example, they may need the fine control of having every letter's position on the page (e.g. to accurately reconstruct a part of the document on a plot), or may wish to extract a corpus of plain text from a book as a set of paragraphs or even whole pages.
66 |  
67 | PDFR is written in C++ 11 and has no external dependencies, but makes extensive use of the C++ standard libraries. Rather than being based on an existing library such as [xpdf](https://www.xpdfreader.com/) or [Poppler](https://poppler.freedesktop.org/), it was written from scratch with the specific goal of making text extraction easier for R users. Most of the design is new, an attempt to implement the text extraction elements of the pdf standard [ISO 32000](https://www.iso.org/standard/51502.html), though it borrows some concepts from existing open-source libraries such as Poppler and [pdfjs](https://mozilla.github.io/pdf.js/).
68 | 
69 | Clearly, the package would not exist without the excellent [Rcpp](http://www.rcpp.org/) package. Much of the pdf parsing would take too long to do in R, but having the facility to write C++ extensions makes pdf parsing feasible, and even pretty quick in some cases.
70 | 
71 | ## Related projects
72 | 
73 | - [pdftools](https://github.com/ropensci/pdftools): Text Extraction, Rendering and Converting of PDF Documents.
 74 | - [qpdf](https://github.com/ropensci/qpdf): Content-preserving transformations of PDF files such as split, combine, and compress. This package interfaces directly to the ‘qpdf’ C++ API and does not require any command line utilities.
75 | - [tabulizer](https://github.com/ropensci/tabulizer): Bindings for Tabula PDF Table Extractor Library
76 | - [PDE](https://github.com/erikstricker/PDE): The PDE (Pdf Data Extractor) allows the extraction of information and tables optionally based on search words from PDF (Portable Document Format) files and enables the visualization of the results, both by providing a convenient user-interface.
77 | - [xmpdf](https://github.com/trevorld/r-xmpdf): Edit XMP metadata and PDF bookmarks/documentation info.
78 | 


--------------------------------------------------------------------------------
/src/crypto.h:
--------------------------------------------------------------------------------
  1 | //---------------------------------------------------------------------------//
  2 | //                                                                           //
  3 | //  PDFR crypto header file                                                  //
  4 | //                                                                           //
  5 | //  Copyright (C) 2018 by Allan Cameron                                      //
  6 | //                                                                           //
  7 | //  Permission is hereby granted, free of charge, to any person obtaining    //
  8 | //  a copy of this software and associated documentation files               //
  9 | //  (the "Software"), to deal in the Software without restriction, including //
 10 | //  without limitation the rights to use, copy, modify, merge, publish,      //
 11 | //  distribute, sublicense, and/or sell copies of the Software, and to       //
 12 | //  permit persons to whom the Software is furnished to do so, subject to    //
 13 | //  the following conditions:                                                //
 14 | //                                                                           //
 15 | //  The above copyright notice and this permission notice shall be included  //
 16 | //  in all copies or substantial portions of the Software.                   //
 17 | //                                                                           //
 18 | //  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS  //
 19 | //  OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF               //
 20 | //  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.   //
 21 | //  IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY     //
 22 | //  CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,     //
 23 | //  TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE        //
 24 | //  SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.                   //
 25 | //                                                                           //
 26 | //---------------------------------------------------------------------------//
 27 | 
 28 | #ifndef PDFR_CRYPTO
 29 | 
 30 | //---------------------------------------------------------------------------//
 31 | 
 32 | #define PDFR_CRYPTO
 33 | 
 34 | /* This header file includes the declaration of a class which contains the
 35 |  * algorithms needed to decrypt protected pdfs.
 36 |  *
 37 |  * This only applies to situations in which a password is not required to
 38 |  * open the file. It allows reading of pdfs in which the ability to copy and
 39 |  * paste, save or modify the file have been disabled by the owner but it can
 40 |  * still be opened and read by anyone without a user password.
 41 |  *
 42 |  * Most pdfs will open without the need for decryption, but some (such as the
 43 |  * ISO 32000 pdf reference document itself) are useless without the ability to
 44 |  * decrypt.
 45 |  *
 46 |  * Decryption is quite well encapsulated here. The implementation of decryption
 47 |  * is left to private member functions. The decryption itself is called only
 48 |  * when an object stream is extracted at the point of pdf object creation and
 49 |  * is accessed via a wrapper function in the xref class. The public interface
 50 |  * is a single function to decrypt a stream given the raw stream, the object
 51 |  * number and the generation number of the pdf object in which the stream
 52 |  * resides.
 53 |  */
 54 | 
 55 | #include<string>
 56 | #include<vector>
 57 | #include<deque>         // Needed for md5mix function
 58 | #include<memory>
 59 | #include "charstring.h"
 60 | class Dictionary;
 61 | 
 62 | 
 63 | //---------------------------------------------------------------------------//
 64 | // The md5 algorithm makes use of 4-byte numbers (unsigned long or uint32_t).
 65 | // To shorten the name and make it explicit what we are talking about I have
 66 | // typedef'd uint32_t as FourBytes
 67 | 
 68 | typedef uint32_t FourBytes;
 69 | 
 70 | //---------------------------------------------------------------------------//
 71 | // Class definition for crypto
 72 | 
 73 | class Crypto
 74 | {
 75 |  public:
 76 |   // Constructors
 77 |   Crypto(const Dictionary& encryption_dictionary,
 78 |          const Dictionary& trailer_dictionary);
 79 | 
 80 |   // This is the main decryption function which is also the public interface for
 81 |   // the class. It takes the raw stream, the object and generation numbers then
 82 |   // returns the decrypted stream.
 83 |   std::string DecryptStream(const std::string& stream_to_be_decoded,
 84 |                             int object_number,
 85 |                             int object_generation_number) const;
 86 | 
 87 |   std::string DecryptStream(const CharString&, int, int) const;
 88 | 
 89 | 
 90 | private:
 91 |   // private data members
 92 |   const Dictionary& encryption_dictionary_;
 93 |   const Dictionary& trailer_;
 94 |   int   revision_;
 95 |   std::vector<uint8_t> filekey_;
 96 |   static const std::vector<uint8_t> default_user_password_;
 97 |   static const std::vector<FourBytes> md5_table;
 98 |   static const std::vector<std::vector<FourBytes>> mixarray;
 99 | 
100 |   // Chops FourBytes into 4 bytes
101 |   std::vector<uint8_t> ChopLong_(FourBytes) const;
102 | 
103 |   // Return permission flags for file
104 |   std::vector<uint8_t> ReadPermissions_(const std::string&);
105 | 
106 |   // Helper function for md5
107 |   void Md5Mix_(int, std::deque<FourBytes>&, std::vector<FourBytes>&) const;
108 | 
109 |   // Gives md5 hash of a vector of raw bytes
110 |   std::vector<uint8_t> Md5_(const std::vector<uint8_t>&) const;
111 | 
112 |   // Gives md5 hash of a string (as bytes)
113 |   std::vector<uint8_t> Md5_(const std::string&) const;
114 | 
115 |   // Gives rc4 cipher of message:key pair, given key and message
116 |   void Rc4_(std::vector<uint8_t>&, const std::vector<uint8_t>&) const;
117 | 
118 |   // Gets /O and /U cipher
119 |   std::vector<uint8_t> ReadPassword_(const std::string&);
120 | 
121 |   // Constructs file key
122 |   void ReadFileKey_();
123 | 
124 |   // Checks file key (revision 2)
125 |   void CheckKeyR2_();
126 | 
127 |   // Checks file key (revision 3)
128 |   void CheckKeyR3_();
129 | 
130 |   // Ensure the ID is read correctly whether hex or plain bytes
131 |   std::vector<uint8_t> ParseID_(const std::string&);
132 | 
133 | };
134 | 
135 | //---------------------------------------------------------------------------//
136 | 
137 | #endif
138 | 


--------------------------------------------------------------------------------
/src/text_element.h:
--------------------------------------------------------------------------------
  1 | //---------------------------------------------------------------------------//
  2 | //                                                                           //
  3 | //  PDFR TextElement header file                                             //
  4 | //                                                                           //
  5 | //  Copyright (C) 2018 - 2019 by Allan Cameron                               //
  6 | //                                                                           //
  7 | //  Licensed under the MIT license - see https://mit-license.org             //
  8 | //  or the LICENSE file in the project root directory                        //
  9 | //                                                                           //
 10 | //---------------------------------------------------------------------------//
 11 | 
 12 | #ifndef PDFR_TEXT_ELEMENT
 13 | 
 14 | //---------------------------------------------------------------------------//
 15 | 
 16 | #define PDFR_TEXT_ELEMENT
 17 | 
#include <cmath>
#include <cstdint>
#include <memory>
#include <stdexcept>
#include<string>
#include <vector>
#include "box.h"
 20 | 
 21 | class Font;
 22 | using Unicode = uint16_t;
 23 | 
 24 | //---------------------------------------------------------------------------//
 25 | // The "atom" of our output will be the TextElement. This is a class containing
 26 | // one or more glyphs as a vector of uint16_t (representing Unicode code points)
 27 | // along with its position, size, and the name of the font used to draw it.
 28 | // We will need to shuffle these around quite a lot in processing, so we use
 29 | // shared pointers to each TextElement to represent each text element. The
 30 | // pointers to text_elements are typedef'd as TextPointer for brevity.
 31 | 
 32 | //---------------------------------------------------------------------------//
 33 | // The TextElement is a class which contains information about each text
 34 | // element on a page including the actual unicode glyph(s), the position, the
 35 | // font and size of the character(s). It also contains a pair that acts as an
 36 | // address for the adjacent glyph which will be found during LetterGrouper's
 37 | // construction, and Boolean flags to indicate whether it is "consumed" when
 38 | // the glyphs are stuck together into words, as well as flags to indicate
 39 | // whether the element is at the left, right or centre of a column
 40 | 
 41 | class TextElement : public Box
 42 | {
 43 |   typedef std::shared_ptr<TextElement> TextPointer;
 44 | 
 45 |  public:
 46 |   TextElement(float left, float right, float top, float bottom,
 47 |               float size, std::shared_ptr<Font> font,
 48 |               std::vector<Unicode> glyphs)
 49 |     : Box(left, right, top, bottom), size_(size),
 50 |       font_(font), glyph_(std::move(glyphs)), join_(nullptr) {};
 51 | 
 52 |   // Inevitably, we need to define some "magic number" constants to define
 53 |   // how close together text elements have to be to clump together
 54 | 
 55 |   constexpr static float CLUMP_H = 0.01; // horizontal clumping, high = sticky
 56 |   constexpr static float CLUMP_V = 0.1;  // vertical clumping, high = sticky
 57 |   constexpr static float LINE_CLUMP = 0.7;
 58 |   constexpr static float MAX_WORD_GAP = 0.5;
 59 |   constexpr static float MAX_ALIGN_IGNORE = 0.0;
 60 | 
 61 |   inline void MakeLeftEdge()  { this->SetFlag(0x04); }
 62 |   inline void MakeRightEdge() { this->SetFlag(0x02); }
 63 |   inline void MakeCentred()   { this->SetFlag(0x06); }
 64 |   inline float GetSize() const override {return this->size_;}
 65 |   inline bool IsLeftEdge()  const { return this->HasFlag(0x04); }
 66 |   inline bool IsRightEdge() const { return this->HasFlag(0x02); }
 67 |   inline bool IsCentred()   const { return this->HasFlag(0x06); }
 68 | 
 69 |   inline void SetJoin(TextPointer element) { this->join_ = element;}
 70 |   inline TextPointer GetJoin()             { return this->join_; }
 71 |   inline bool HasJoin() const { if (join_) return true; else return false;}
 72 | 
 73 |   std::string GetFontName() const; // can't inline without including font.h
 74 |   inline std::vector<Unicode> GetGlyph() const { return this->glyph_;}
 75 |   inline void AddSpace() { glyph_.push_back(0x0020);         }
 76 | 
 77 |   inline void PopLastGlyph()
 78 |   {
 79 |     if (glyph_.empty()) throw std::runtime_error("Can't pop empty vector");
 80 |     else glyph_.pop_back();
 81 |   }
 82 | 
 83 |   inline bool operator ==(const TextElement& other) const
 84 |   {
 85 |     if (&other == this) return true;
 86 |     return (other.GetLeft()   == this->GetLeft()    &&
 87 |             other.GetBottom() == this->GetBottom()  &&
 88 |             other.GetTop()    == this->GetTop()     &&
 89 |             other.GetGlyph()  == this->GetGlyph()   );
 90 |   }
 91 | 
 92 |   inline bool IsAdjoiningLetter(const TextElement& other) const
 93 |   {
 94 |     if (&other == this) return false;
 95 |     return
 96 |       other.GetLeft() > GetLeft() &&
 97 |       abs(other.GetBottom() - GetBottom()) < (CLUMP_V * GetSize()) &&
 98 |       (
 99 |         abs(other.GetLeft() - GetRight()) < (CLUMP_H * GetSize()) ||
100 |         (other.GetLeft() < GetRight())
101 |       ) ;
102 |   }
103 | 
104 |   inline bool IsOnSameLineAs(const TextElement& other) const
105 |   {
106 |     if (&other == this) return true;
107 |     return
108 |     (other.GetBottom() - this->GetBottom() < LINE_CLUMP * this->GetSize()) &&
109 |     (this->GetBottom() - other.GetBottom() < LINE_CLUMP * this->GetSize());
110 |   }
111 | 
112 |   inline bool IsWayBeyond(const TextElement& other) const
113 |   {
114 |     if (&other == this) return false;
115 |     return GetLeft() - other.GetRight() > MAX_WORD_GAP * other.GetSize();
116 |   }
117 | 
118 |   inline bool CannotJoinLeftOf(const TextElement& other) const
119 |   {
120 |     if (&other == this) return true;
121 |     return
122 |     ( other.IsLeftEdge()  || other.IsCentred()  ||
123 |       this->IsRightEdge()   || this->IsCentred())   &&
124 |     (other.GetLeft() - this->GetRight()) > (MAX_ALIGN_IGNORE * GetSize());
125 |   }
126 | 
127 |   void MergeLetters(TextElement&);
128 |   bool IsElligibleToJoin(const TextElement&) const;
129 |   void JoinWords(TextElement&);
130 |   void ConcatenateUnicode(const std::vector<Unicode>&);
131 |   std::string Utf();
132 | 
133 | 
134 |  private:
135 |   float size_;                           // The font size
136 |   std::shared_ptr<Font> font_;           // Font used to draw text
137 |   std::vector<Unicode> glyph_;           // The actual Unicode glyphs encoded
138 |   std::shared_ptr<TextElement> join_;    // address of closest adjacent element
139 | };
140 | 
141 | 
142 | #endif
143 | 


--------------------------------------------------------------------------------
/src/graphicobject.h:
--------------------------------------------------------------------------------
  1 | //---------------------------------------------------------------------------//
  2 | //                                                                           //
  3 | //  PDFR GraphicObject header file                                           //
  4 | //                                                                           //
  5 | //  Copyright (C) 2018 - 2021 by Allan Cameron                               //
  6 | //                                                                           //
  7 | //  Licensed under the MIT license - see https://mit-license.org             //
  8 | //  or the LICENSE file in the project root directory                        //
  9 | //                                                                           //
 10 | //---------------------------------------------------------------------------//
 11 | 
 12 | #ifndef PDFR_GO
 13 | 
 14 | //---------------------------------------------------------------------------//
 15 | 
 16 | #define PDFR_GO
 17 | 
 18 | #include "utilities.h"
 19 | #include<string>
 20 | #include<vector>
 21 | #include<memory>
 22 | #include "text_element.h"
 23 | 
 24 | 
 25 | /*---------------------------------------------------------------------------*/
 26 | /* This is a header-only implementation of a GraphicObject class, which is used
 27 |  * to store information about shapes extracted from the page description
 28 |  * program.
 29 |  */
 30 | 
 31 | class GraphicObject
 32 | {
 33 | public:
 34 |   GraphicObject() : linewidth_(1),
 35 |   stroke_colour_({0, 0, 0}), is_stroked_(false),
 36 |   is_filled_(false), fill_colour_({0.5, 0.5, 0.5}) {};
 37 | 
 38 |   // Setters
 39 |   void SetLineWidth(float size) {this->linewidth_ = size;}
 40 |   void SetColour(std::vector<float> colour) {this->stroke_colour_ = colour;}
 41 |   void SetFillColour(std::vector<float> colour) {this->fill_colour_ = colour;}
 42 |   void SetStroke(bool visible) {this->is_stroked_ = visible;}
 43 |   void SetFilled(bool is_filled) {this->is_filled_ = is_filled;}
 44 | 
 45 |   // virtual functions allow type-specific behaviour in derived classes
 46 |   virtual void NewSubpath() {}
 47 |   virtual void SetX(std::vector<float> values) {}
 48 |   virtual void SetY(std::vector<float> values) {}
 49 |   virtual void CloseSubpath() {}
 50 |   virtual void AppendX(std::vector<float> value) {}
 51 |   virtual void AppendY(std::vector<float> value) {}
 52 |   virtual std::vector<float> GetX() {return {0};}
 53 |   virtual std::vector<float> GetY() {return {0};}
 54 |   virtual bool IsClosed() { return false;}
 55 |   virtual float Bottom()  { return 0;}
 56 |   virtual float Top()     { return 0;}
 57 |   virtual float Left()    { return 0;}
 58 |   virtual float Right()   { return 0;}
 59 |   virtual float Width()   { return 0;}
 60 |   virtual float Height()  { return 0;}
 61 |   virtual std::string GetText() {return "";}
 62 |   virtual float GetFontSize() {return 0;}
 63 |   virtual std::vector<int> GetSubpaths() {return {0};}
 64 | 
 65 |   // Getters
 66 | 
 67 |   virtual float GetLineWidth() {return this->linewidth_;}
 68 |   virtual std::vector<float> GetColour() {return this->stroke_colour_;}
 69 |   bool IsStroked() {return this->is_stroked_;}
 70 |   bool IsFilled() {return this->is_filled_;}
 71 |   std::vector<float> GetFillColour() {return this->fill_colour_;}
 72 | 
 73 | 
 74 |  private:
 75 |   float linewidth_;
 76 |   std::vector<float> stroke_colour_;
 77 |   bool is_stroked_;
 78 |   bool is_filled_;
 79 |   std::vector<float> fill_colour_;
 80 | 
 81 | };
 82 | 
 83 | /*---------------------------------------------------------------------------*/
 84 | 
 85 | class Path : public GraphicObject {
 86 |  public:
 87 |   Path(): path_x_({}), path_y_({}), current_subpath_(0), is_closed_({false}) {}
 88 | 
 89 |   void SetX(std::vector<float> values) {this->path_x_ = values;}
 90 |   void SetY(std::vector<float> values) {this->path_y_ = values;}
 91 | 
 92 |   void NewSubpath() {++current_subpath_;}
 93 |   void CloseSubpath() {
 94 |     is_closed_.back() = true;
 95 |     int pos = std::find(subpaths_.begin(), subpaths_.end(), current_subpath_) -
 96 |               subpaths_.begin();
 97 |     path_x_.push_back(path_x_[pos]);
 98 |     path_y_.push_back(path_y_[pos]);
 99 |     subpaths_.push_back(subpaths_.back());
100 |   }
101 | 
102 |   void SetSubpaths(std::vector<int> value) { subpaths_ = value;}
103 | 
104 |   void AppendX(std::vector<float> value) {
105 |     Concatenate(this->path_x_, {value});
106 |     while(subpaths_.size() < path_x_.size()){
107 |       subpaths_.push_back(current_subpath_);
108 |     }
109 |   }
110 | 
111 |   void AppendY(std::vector<float> value) {
112 |     Concatenate(this->path_y_, {value});
113 |     while(subpaths_.size() < path_x_.size()){
114 |       subpaths_.push_back(current_subpath_);
115 |     }
116 |   }
117 | 
118 |   std::vector<float> GetX() {return this->path_x_;}
119 |   std::vector<float> GetY() {return this->path_y_;}
120 |   bool IsClosed() { return this->is_closed_.back();}
121 | 
122 |   float Bottom()  { return *std::min_element(this->path_y_.begin(),
123 |                                              this->path_y_.end());}
124 |   float Top()     { return *std::max_element(this->path_y_.begin(),
125 |                                              this->path_y_.end());}
126 |   float Left()    { return *std::min_element(this->path_x_.begin(),
127 |                                              this->path_x_.end());}
128 |   float Right()   { return *std::max_element(this->path_x_.begin(),
129 |                                              this->path_x_.end());}
130 |   float Width()   { return this->Right() - this->Left();}
131 |   float Height()  { return this->Top() - this->Bottom();}
132 |   std::vector<int> GetSubpaths() {return subpaths_;}
133 | 
134 |  private:
135 |   std::vector<float> path_x_;
136 |   std::vector<float> path_y_;
137 |   int current_subpath_;
138 |   std::vector<int> subpaths_;
139 |   std::vector<bool> is_closed_;
140 |   };
141 | 
142 | /*---------------------------------------------------------------------------*/
143 | 
144 | class Text : public GraphicObject {
145 | 
146 |  public:
147 |   Text(std::shared_ptr<TextElement> text) : contents_(text) {}
148 |   std::string GetText() {return contents_->Utf();}
149 |   std::vector<float> GetColour() {return this->GetFillColour();}
150 |   std::vector<float> GetX() {return {contents_->GetLeft()};}
151 |   std::vector<float> GetY() {return {contents_->GetBottom()};}
152 |   float GetFontSize() {return contents_->GetSize();}
153 | 
154 | 
155 |  private:
156 |   std::shared_ptr<TextElement> contents_;
157 | };
158 | 
159 | #endif
160 | 


--------------------------------------------------------------------------------
/src/word_grouper.cpp:
--------------------------------------------------------------------------------
  1 | //---------------------------------------------------------------------------//
  2 | //                                                                           //
  3 | //  PDFR WordGrouper implementation file                                    //
  4 | //                                                                           //
  5 | //  Copyright (C) 2018 - 2019 by Allan Cameron                               //
  6 | //                                                                           //
  7 | //  Licensed under the MIT license - see https://mit-license.org             //
  8 | //  or the LICENSE file in the project root directory                        //
  9 | //                                                                           //
 10 | //---------------------------------------------------------------------------//
 11 | 
 12 | #include "word_grouper.h"
 13 | 
 14 | using namespace std;
 15 | 
 16 | //---------------------------------------------------------------------------//
 17 | // This "magic number" is an integer that specifies how many glyphs need to
 18 | // line up to infer an aligned column on the page.
 19 | 
 20 | constexpr int EDGECOUNT = 4;
 21 | 
 22 | //---------------------------------------------------------------------------//
 23 | // Constructor for WordGrouper class. This takes the output from LetterGrouper
 24 | // and finds its column edges, then joins elligible words together as long as
 25 | // they do not belong to different columns.
 26 | 
 27 | WordGrouper::WordGrouper(std::unique_ptr<TextBox> text_box)
 28 |   : text_box_(move(text_box))
 29 | {
 30 |   FindEdges_();
 31 |   AssignEdges_();
 32 |   FindRightMatch_();
 33 | };
 34 | 
 35 | //---------------------------------------------------------------------------//
 36 | // Makes a table of supplied vector of floats. Multiplies them by 10 and
 37 | // casts to int as a way of rounding to 1 decimal place. It then removes any
 38 | // keys whose counts are less than EDGECOUNT, so the remaining keys are the
 39 | // positions we wish to identify as possible edges. Since the maps we want to
 40 | // return are data members of the class, we need to pass the map we wish to
 41 | // create by reference.
 42 | 
 43 | void WordGrouper::Tabulate_(const vector<float>& supplied_vector,
 44 |                             unordered_map<int, size_t>& table   )
 45 | {
 46 |   // Take each member of the supplied vector
 47 |   for (const auto& element : supplied_vector)
 48 |   {
 49 |     // Multiply it by 10 and use it as a key in the map with value 1
 50 |     auto inserter = table.insert(pair<int, size_t>((int) 10 * element, 1));
 51 | 
 52 |     // If the key already exists in the map, increment the value by 1
 53 |     if (!inserter.second) inserter.first->second++;
 54 |   }
 55 | 
 56 |   // Now take each key in the resulting map
 57 |   for (auto key_value_pair = table.begin(); key_value_pair != table.end(); )
 58 |   {
 59 |     // if value is below the number needed to declare a column, delete it
 60 |     if (key_value_pair->second < EDGECOUNT)
 61 |     {
 62 |       table.erase(key_value_pair++);
 63 |     }
 64 |     else ++key_value_pair;
 65 |   }
 66 | }
 67 | 
 68 | //---------------------------------------------------------------------------//
 69 | // This uses the Tabulate function to find left, right and centre-aligned text
 70 | // elements on the page.
 71 | 
 72 | void WordGrouper::FindEdges_()
 73 | {
 74 |   // Create vectors of left and right edges of text elements
 75 |   vector<float> left, right, midvec;
 76 |   left.reserve(text_box_->size());
 77 |   right.reserve(text_box_->size());
 78 |   midvec.reserve(text_box_->size());
 79 | 
 80 |   for (auto& element : *text_box_)
 81 |   {
 82 |     left.push_back(element->GetLeft());
 83 |     right.push_back(element->GetRight());
 84 |     midvec.push_back((left.back() + right.back()) / 2);
 85 |   }
 86 | 
 87 | 
 88 |   // Use Tabulate to find left and right edges as well as midpoints
 89 |   Tabulate_(left,   left_edges_);
 90 |   Tabulate_(right,  right_edges_);
 91 |   Tabulate_(midvec, mids_);
 92 | }
 93 | 
 94 | //---------------------------------------------------------------------------//
 95 | // Now we need to "tell" each element whether it is a left, right or centre
 96 | // aligned element so it "knows" which side(s), if any, are eligible to join
 97 | // other elements
 98 | 
 99 | void WordGrouper::AssignEdges_()
100 | {
101 |   for (auto& item : *text_box_)
102 |   {
103 |     // Look-up keys: each x position is scaled by ten and truncated to an
104 |     // int; the centre is the mean of the two edges, hence the factor five
105 |     const int left_key   = static_cast<int>(item->GetLeft() * 10);
106 |     const int right_key  = static_cast<int>(item->GetRight() * 10);
107 |     const int centre_key =
108 |       static_cast<int>((item->GetRight() + item->GetLeft()) * 5);
109 | 
110 |     // A left edge still present in the table is not unique to this
111 |     // element, so it is assumed to be the edge of a column
112 |     if (left_edges_.count(left_key) > 0)
113 |       item->MakeLeftEdge();
114 | 
115 |     // Likewise, a non-unique right edge is assumed to be the edge of
116 |     // a column
117 |     if (right_edges_.count(right_key) > 0)
118 |       item->MakeRightEdge();
119 | 
120 |     // A non-unique centre value is assumed to indicate a centred
121 |     // column
122 |     if (mids_.count(centre_key) > 0)
123 |       item->MakeCentred();
124 |   }
125 | }
126 | 
127 | //---------------------------------------------------------------------------//
128 | // It's a bit naughty for a function to do two things instead of one, but these
129 | // two things are easier / quicker done in a single loop. Go through each text
130 | // item and check whether it is eligible for joining to another element. If it
131 | // is, find the most appropriate match to its right that is eligible and stick
132 | // the two together.
133 | 
134 | void WordGrouper::FindRightMatch_()
135 | {
136 |   // An empty text box is reported as an error
137 |   if (text_box_->empty()) throw runtime_error("empty data");
138 | 
139 |   for (auto element = text_box_->begin(); element != text_box_->end(); ++element)
140 |   {
141 |     // Elements already absorbed into another word cannot start a match
142 |     if ((*element)->IsConsumed()) continue;
143 | 
144 |     // Rescan after every join so the same element keeps absorbing words to
145 |     // its right; looping here means the iterator is never stepped backwards
146 |     bool joined = true;
147 |     while (joined)
148 |     {
149 |       joined = false;
150 |       for (auto other = element + 1; other != text_box_->end(); ++other)
151 |       {
152 |         if (!(*element)->IsElligibleToJoin(**other)) continue;
153 |         (*element)->JoinWords(**other);
154 |         joined = true;
155 |         break;
156 |       }
157 |     }
158 |   }
159 | }
160 | 
161 | //---------------------------------------------------------------------------//
162 | // Builds the output TextTable from the current contents of the text box
163 | TextTable WordGrouper::Out() const { return TextTable(*text_box_);}
164 | 


--------------------------------------------------------------------------------
/src/text_element.cpp:
--------------------------------------------------------------------------------
  1 | //---------------------------------------------------------------------------//
  2 | //                                                                           //
  3 | //  PDFR TextElement implementation file                                     //
  4 | //                                                                           //
  5 | //  Copyright (C) 2018 - 2019 by Allan Cameron                               //
  6 | //                                                                           //
  7 | //  Licensed under the MIT license - see https://mit-license.org             //
  8 | //  or the LICENSE file in the project root directory                        //
  9 | //                                                                           //
 10 | //---------------------------------------------------------------------------//
 11 | 
 12 | #include "utilities.h"
 13 | #include "box.h"
 14 | #include "font.h"
 15 | #include "text_element.h"
 16 | 
 17 | using namespace std;
 18 | 
 19 | //---------------------------------------------------------------------------//
 20 | 
 21 | void TextElement::MergeLetters(TextElement& matcher)
 22 | {
 23 |   // Append the matcher's code points to ours, then exchange the two
 24 |   // sequences so that the matcher ends up holding the combined text
 25 |   ConcatenateUnicode(matcher.glyph_);
 26 |   glyph_.swap(matcher.glyph_);
 27 | 
 28 |   // The matcher now begins where this element began
 29 |   matcher.SetLeft(GetLeft());
 30 | 
 31 |   // Its bottom is lowered to ours when ours has the smaller value
 32 |   const auto own_bottom = GetBottom();
 33 |   if (own_bottom < matcher.GetBottom()) matcher.SetBottom(own_bottom);
 34 | 
 35 |   // This element has been absorbed into the matcher, so it is marked
 36 |   // as consumed
 37 |   Consume();
 38 | }
 39 | 
 40 | //---------------------------------------------------------------------------//
 41 | 
 42 | bool TextElement::IsElligibleToJoin(const TextElement& other) const
 43 | {
 44 |   // Checks run in this order and the first failed check decides the answer
 45 |   if (other.IsConsumed())            return false;
 46 |   if (!other.IsBeyond(*this))        return false;
 47 |   if (!other.IsOnSameLineAs(*this))  return false;
 48 |   return !other.IsWayBeyond(*this) && !this->CannotJoinLeftOf(other);
 49 | }
 50 | 
 51 | //---------------------------------------------------------------------------//
 52 | 
 53 | void TextElement::JoinWords(TextElement& other)
 54 | {
 55 |   // The two words are always separated by at least one space
 56 |   glyph_.push_back(0x0020);
 57 | 
 58 |   // A gap wider than this element's size earns a second space
 59 |   const auto gap = other.GetLeft() - GetRight();
 60 |   if (gap > 1 * GetSize()) glyph_.push_back(0x0020);
 61 | 
 62 |   // Stick the other element's contents on after the spacing
 63 |   Concatenate(glyph_, other.GetGlyph());
 64 | 
 65 |   // Our right-hand extent, and right-edge flag if set, come from other
 66 |   SetRight(other.GetRight());
 67 |   if (other.IsRightEdge()) MakeRightEdge();
 68 | 
 69 |   // The top sits above our bottom by the larger of the two sizes
 70 |   const auto larger_size = max(GetSize(), other.GetSize());
 71 |   SetTop(larger_size + GetBottom());
 72 | 
 73 |   // The element on the right has been joined onto this one, so it is
 74 |   // marked as consumed
 75 |   other.Consume();
 76 | }
 77 | 
 78 | //---------------------------------------------------------------------------//
 79 | 
 80 | void TextElement::ConcatenateUnicode(const std::vector<Unicode>& other)
 81 | {
 82 |   Concatenate(this->glyph_, other); // glyph_ is passed as the first argument
 83 | }
 84 | 
 85 | /*--------------------------------------------------------------------------*/
 86 | // converts (16-bit) Unicode code points to multibyte utf-8 encoding.
 87 | 
 88 | string TextElement::Utf()
 89 | {
 90 |   std::string utf8 {}; // the encoded bytes are accumulated here
 91 |   for (auto& code_point : this->glyph_) // each stored code point in turn
 92 |   {
 93 |     if (code_point < 0x0080)
 94 |     {
 95 |       // One byte, 0xxxxxxx: values below 128 are just single-byte ASCII
 96 |       utf8.push_back(code_point & 0x007f);
 97 |     }
 98 |     else if (code_point < 0x0800)
 99 |     {
100 |       // Two bytes, 110xxxxx 10xxxxxx: a value in 128 - 2047 needs 11 bits,
101 |       // which are split into a leading group of 5 and a trailing group of
102 |       // 6. For example U+061f (decimal 1567) is 11000011111 in binary;
103 |       // its pieces 11000 and 011111 are appended to 110 and 10 to give
104 |       // 11011000 10011111, so its UTF-8 encoding is the sequence d8 9f.
105 | 
106 |       // byte with bits 110 and the first 5 bits of the code point
107 |       utf8.push_back((0x00c0 | ((code_point >> 6) & 0x001f)));
108 | 
109 |       // byte with bits 10 and the final 6 bits of the code point
110 |       utf8.push_back(0x0080 | (code_point & 0x003f));
111 |     }
112 |     else
113 |     {
114 |       // Values from 2048 (0x0800) up to the 16-bit maximum (0xffff) take
115 |       // three bytes, 1110xxxx 10xxxxxx 10xxxxxx, with each x being one of
116 |       // the 16 bits of the code point. The Latin f-ligatures are the
117 |       // exception: they are written out as their plain ASCII letters.
118 |       switch (code_point)
119 |       {
120 |         case 0xFB00: utf8 += "ff";  break;
121 |         case 0xFB01: utf8 += "fi";  break;
122 |         case 0xFB02: utf8 += "fl";  break;
123 |         case 0xFB03: utf8 += "ffi"; break;
124 |         case 0xFB04: utf8 += "ffl"; break;
125 | 
126 |         default:
127 |           // byte with bits 1110 and the first 4 bits of the code point
128 |           utf8.push_back(0x00e0 | ((code_point >> 12) & 0x000f));
129 | 
130 |           // byte with bits 10 and bits 5-10 of the code point
131 |           utf8.push_back(0x0080 | ((code_point >> 6) & 0x003f));
132 | 
133 |           // byte with bits 10 and the final 6 bits of the code point
134 |           utf8.push_back(0x0080 | ((code_point) & 0x003f));
135 |           break;
136 |       }
137 |     }
138 | 
139 |     // Higher code points exist and can be encoded in UTF-8, but the
140 |     // hex-strings in pdf seem to be two bytes wide at most, so they are
141 |     // not supported at present.
142 |   }
143 | 
144 |   return utf8;
145 | }
146 | 
147 | /*--------------------------------------------------------------------------*/
148 | // Although this method looks like it should be inlined, doing so would mean
149 | // having to include font.h in the header file
150 | 
151 | string TextElement::GetFontName() const
152 | {
153 |   return font_->GetFontName(); // forwarded to the object font_ points to
154 | }
155 | 


--------------------------------------------------------------------------------
/src/xref.h:
--------------------------------------------------------------------------------
  1 | //---------------------------------------------------------------------------//
  2 | //                                                                           //
  3 | //  PDFR XRef header file                                                    //
  4 | //                                                                           //
  5 | //  Copyright (C) 2018 - 2019 by Allan Cameron                               //
  6 | //                                                                           //
  7 | //  Licensed under the MIT license - see https://mit-license.org             //
  8 | //  or the LICENSE file in the project root directory                        //
  9 | //                                                                           //
 10 | //---------------------------------------------------------------------------//
 11 | 
 12 | #ifndef PDFR_XREF
 13 | 
 14 | //---------------------------------------------------------------------------//
 15 | 
 16 | #define PDFR_XREF
 17 | 
 18 | /* This is the third main header in the daisy-chain of #includes that builds up
 19 |  * the tools needed to read and parse pdf, after utilities.h and dictionary.h.
 20 |  * It also includes a couple of other headers which are needed to decrypt and
 21 |  * decode encrypted and compressed streams (streams.h and crypto.h)
 22 |  *
 23 |  * The cross reference table (XRef) is a data structure (or more accurately a
 24 |  * group of data structures) that forms part of the pdf file format and allows
 25 |  * for the rapid random access of the pdf objects from which a document is
 26 |  * comprised. At its simplest, this is a table containing the object number,
 27 |  * the generation number of the object, and the number of bytes from the start
 28 |  * of the file where that object is located.
 29 |  *
 30 |  * However, it is not always quite that simple. Firstly, documents can and do
 31 |  * have more than one XRef that lists different objects. Secondly, the XRef
 32 |  * can itself be a compressed stream which must be found and translated before
 33 |  * being read. This means the XRef class must have access to decryption and
 34 |  * decoding algorithms.
 35 |  *
 36 |  * Fortunately, the location of the start of an XRef table (as number of bytes
 37 |  * offset from the start of the file) is given right at the end of a file, just
 38 |  * before the %%EOF on the last line. It is thus simple to get to the start of
 39 |  * an XRef from this number. For a normal uncompressed XRef, this takes us to
 40 |  * the top of a table which is just read and parsed. At the end of the table is
 41 |  * a special dictionary which does not belong to any object. This is the
 42 |  * trailer dictionary. If there are other xrefs in the file, this tells us
 43 |  * where the next one is, and we can continue to hop around and read the xrefs
 44 |  * until none are left and we have a complete "roadmap" of where the objects
 45 |  * are in the file.
 46 |  *
 47 |  * If, however, the XRef is located in a stream, things get more complicated.
 48 |  * The stream belongs to an object, and the dictionary at the beginning of that
 49 |  * object doubles as the trailer dictionary. As well as being compressed, the
 50 |  * stream containing the XRef is usually encoded as a string of bytes which
 51 |  * then need to be interpreted using the algorithm normally used for
 52 |  * decompressing PNG files. This makes handling XRef streams complex enough to
 53 |  * warrant their own class. However, since this class only has to perform a part
 54 |  * of XRef implementation, it has no public interface and is therefore not
 55 |  * defined in this header file, but rather within xref.cpp
 56 | */
 57 | #include<string>
 58 | #include<vector>
 59 | #include<memory>
 60 | #include<unordered_map>
 61 | 
 62 | class Dictionary;
 63 | class Crypto;
 64 | class CharString;
 65 | 
 66 | /*---------------------------------------------------------------------------*/
 67 | // The main XRef data member is an unordered map with the key being the object
 68 | // number and the value being a struct of named ints as defined here
 69 | 
 70 | struct XRefRow
 71 | {
 72 |   int startbyte,  // Byte offset of the object from the start of the file
 73 |       stopbyte,   // Byte offset of the object's corresponding endobj marker
 74 |       in_object;  // Number of the object whose stream contains this one;
 75 | };                // has a value of 0 if the object is not in a stream
 76 | 
 77 | /*---------------------------------------------------------------------------*/
 78 | // The main XRef class definition. Since this is the main "skeleton" of the pdf
 79 | // which is used by other classes to negotiate & parse the pdf, and because it
 80 | // can be complex to construct, it is a fairly large and complex class.
 81 | //
 82 | // Where possible I have tried to delegate some of its work to other classes
 83 | // or subclasses, but even still it is a little unwieldy.
 84 | 
 85 | class XRef
 86 | {
 87 |  public:
 88 |   // Takes a shared pointer to the string holding the file
 89 |   XRef(std::shared_ptr<const std::string>);
 90 | 
 91 |   // Empty XRef constructor
 92 |   XRef() {}
 93 | 
 94 |   // Queries on the cross-reference data
 95 |   size_t GetObjectStartByte(int object_number) const
 96 |     { return GetRow_(object_number).startbyte; }
 97 |   size_t GetHoldingNumberOf(int object_number) const
 98 |     { return GetRow_(object_number).in_object; }
 99 |   size_t GetObjectEndByte(int)                     const; // Object end position
100 |   std::vector<int> GetAllObjectNumbers()           const; // All object numbers
101 |   CharString GetStreamLocation(int)                const; // Start/stop of stream
102 |   Dictionary GetTrailer()                          const; // Trailer dictionary
103 | 
104 |   // Encryption: true when a Crypto object is held
105 |   bool IsEncrypted() const { return encryption_ != nullptr; }
106 |   std::string Decrypt(std::string&, int, int)      const; // Decrypts a stream
107 |   std::string Decrypt(const CharString&, int, int) const;
108 | 
109 |   // Access to the file string itself
110 |   std::shared_ptr<const std::string> File() const { return file_string_; }
111 |   CharString GetCharString() const { return CharString(*file_string_); }
112 | 
113 |  private:
114 |   std::shared_ptr<const std::string> file_string_;  // Pointer to file string
115 |   std::unordered_map<int, XRefRow> xref_table_;     // Main data member
116 |   Dictionary trailer_dictionary_;                   // Main trailer dictionary
117 |   std::shared_ptr<Crypto> encryption_;              // Used for encrypted files
118 | 
119 |   // private methods
120 |   XRef& operator=(const XRef&);
121 |   int GetStreamLength_(const Dictionary&) const;
122 |   void LocateXRefs_();                         // Finds XRef locations
123 |   void ReadXRefStrings_(int);                  // Gets strings from XRef locations
124 |   void ReadXRefFromStream_(int);               // Uses xrefstream class to get XRef
125 |   void ReadXRefFromString_(const CharString&); // parses XRef directly
126 |   void CreateCrypto_();                        // Allows decryption of encrypted docs
127 |   const XRefRow& GetRow_(int) const;
128 | };
129 | 
130 | //---------------------------------------------------------------------------//
131 | 
132 | #endif
133 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | 
  2 | <!-- README.md is generated from README.Rmd. Please edit that file -->
  3 | 
  4 | # PDFR
  5 | 
  6 | <!-- badges: start -->
  7 | 
  8 | [![Lifecycle:
  9 | experimental](https://img.shields.io/badge/lifecycle-experimental-orange.svg)](https://lifecycle.r-lib.org/articles/stages.html#experimental)
 10 | [![License:
 11 | MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
 12 | <!-- badges: end -->
 13 | 
 14 | The goal of PDFR is to aid data scientists who need the ability to
 15 | extract data from files in pdf format. PDFR is a new C++ based R library
 16 | to extract usable text from portable document format (pdf) files.
 17 | 
 18 | The majority of the code base is written in C++ with a view to being
 19 | ported to other languages, but at present it is constructed to be built
 20 | as an R package.
 21 | 
 22 | ## Installation
 23 | 
 24 | You can install the development version of PDFR from
 25 | [GitHub](https://github.com/) with:
 26 | 
 27 | ``` r
 28 | # install.packages("pak")
 29 | pak::pkg_install("AllanCameron/PDFR")
 30 | ```
 31 | 
 32 | ## Usage
 33 | 
 34 | The main function used to extract all data from a pdf page to an R data
 35 | frame is `pdfpage()`. This accepts either the path to a pdf or a raw
 36 | data vector representing a pdf. For example, this is how you extract all
 37 | text from page 1 of the barcodes PDF from `pdfr_paths`:
 38 | 
 39 | ``` r
 40 | library(PDFR)
 41 | 
 42 | barcodes <- system.file("extdata", "barcodes.pdf", package = "PDFR")
 43 | pdfpage(barcodes, 1)
 44 | #>                               text  left right bottom   top    font size
 45 | #> 1                             None  53.5  74.4  774.2 782.2 Courier    8
 46 | #> 2                   Acute medicine 187.4 255.9  774.2 782.2 Courier    8
 47 | #> 3                                / 258.8 264.8  774.2 782.2 Courier    8
 48 | #> 4                             ward 267.8 288.6  774.2 782.2 Courier    8
 49 | #> 5                               11 291.6 303.5  774.2 782.2 Courier    8
 50 | #> 6 jean.cairney@ggc.scot.nhs.uk0141 318.3 470.1  774.2 782.2 Courier    8
 51 | #> 7                              211 473.0 490.9  774.2 782.2 Courier    8
 52 | #> 8                             5719 493.9 514.7  774.2 782.2 Courier    8
 53 | ```
 54 | 
 55 | ## Background
 56 | 
 57 | The current version is at an early stage of development. It will work
 58 | with most pdfs, but there are some unsupported features which may lead
 59 | to some pdfs producing runtime errors.
 60 | 
 61 | Documents encrypted using the standard method and which can be opened
 62 | without a password are supported. Password-based encryption is currently
 63 | unsupported.
 64 | 
 65 | If there are any suggestions for development please submit a feature
 66 | request, or let me know about pdfs that break the package.
 67 | 
 68 | ## Motivation
 69 | 
 70 | Extracting useful data from pdf is difficult for two reasons. Firstly,
 71 | the pdf format primarily consists of binary data, which is laid out in
 72 | such a way as to provide quick random access to pdf *objects* as
 73 | required by a pdf reader. The text elements as seen on the page are
 74 | usually encoded in a binary stream within the document. Even when the
 75 | binary stream is decoded, the text items exist as individual elements
 76 | within a page description program, which has to be parsed before the
 77 | text can be extracted. It is therefore not a trivial matter to extract
 78 | the “raw text” from a pdf file into a format in which it can be read by
 79 | R, though there exist some excellent tools that can do this quickly. In
 80 | particular,
 81 | [pdftools](https://ropensci.org/blog/2016/03/01/pdftools-and-jeroen/)
 82 | provides an R interface to some of Poppler’s pdf tools, and can quickly
 83 | and reliably extract text wholesale from pdf.
 84 | 
 85 | The second problem is that, unlike some other common file types used to
 86 | exchange information on the internet (e.g. html, xml, csv, JSON), the
 87 | raw text extracted from a pdf does not have a fixed structure to provide
 88 | semantic information about the data to allow it to be processed easily
 89 | by a data scientist.
 90 | 
 91 | The mismatch between the fact that humans can read data from pdfs so
 92 | easily yet the format is so difficult to convert into machine-readable
 93 | data is explained by the fact that humans use the structure of the page
 94 | layout to provide the semantic context to the data. When the structure
 95 | is lost (as it often is with copy and pasting from PDF), it becomes very
 96 | difficult for a human reader to interpret. The computer does not know
 97 | how to interpret the characters’ positions, so it cannot classify the
 98 | characters by semantics as a human reader (usually) can.
 99 | 
100 | The idea behind PDFR is to try to extract raw text then use the
101 | positioning and formatting data from the extracted text to reconstruct
102 | some of the semantic content that would otherwise be lost. For example,
103 | identifying and grouping letters into words, words into paragraphs or
104 | into tables.
105 | 
106 | Ultimately, to extract useful data, the user will need the option to
107 | control how and to what extent text elements are grouped. For example,
108 | they may need the fine control of having every letter’s position on the
109 | page (e.g. to accurately reconstruct a part of the document on a plot),
110 | or may wish to extract a corpus of plain text from a book as a set of
111 | paragraphs or even whole pages.
112 | 
113 | PDFR is written in C++ 11 and has no external dependencies, but makes
114 | extensive use of the C++ standard libraries. Rather than being based on
115 | an existing library such as [xpdf](https://www.xpdfreader.com/) or
116 | [Poppler](https://poppler.freedesktop.org/), it was written from scratch
117 | with the specific goal of making text extraction easier for R users.
118 | Most of the design is new, an attempt to implement the text extraction
119 | elements of the pdf standard [ISO
120 | 32000](https://www.iso.org/standard/51502.html), though it borrows some
121 | concepts from existing open-source libraries such as Poppler and
122 | [pdfjs](https://mozilla.github.io/pdf.js/).
123 | 
124 | Clearly, the package would not exist without the excellent
125 | [Rcpp](http://www.rcpp.org/) package. Much of the pdf parsing would take
126 | too long to do in R, but having the facility to write C++ extensions
127 | makes pdf parsing feasible, and even pretty quick in some cases.
128 | 
129 | ## Related projects
130 | 
131 | - [pdftools](https://github.com/ropensci/pdftools): Text Extraction,
132 |   Rendering and Converting of PDF Documents.
133 | - [qpdf](https://github.com/ropensci/qpdf): Content-preserving
134 |   transformations of PDF files such as split, combine,
135 |   and compress. This package interfaces directly to the ‘qpdf’ C++ API
136 |   and does not require any command line utilities.
137 | - [tabulizer](https://github.com/ropensci/tabulizer): Bindings for
138 |   Tabula PDF Table Extractor Library
139 | - [PDE](https://github.com/erikstricker/PDE): The PDE (Pdf Data
140 |   Extractor) allows the extraction of information and tables optionally
141 |   based on search words from PDF (Portable Document Format) files and
142 |   enables the visualization of the results, both by providing a
143 |   convenient user-interface.
144 | - [xmpdf](https://github.com/trevorld/r-xmpdf): Edit XMP metadata and
145 |   PDF bookmarks/documentation info.
146 | 


--------------------------------------------------------------------------------
/src/textbox.h:
--------------------------------------------------------------------------------
  1 | //---------------------------------------------------------------------------//
  2 | //                                                                           //
  3 | //  PDFR TextBox header file                                                 //
  4 | //                                                                           //
  5 | //  Copyright (C) 2018 - 2019 by Allan Cameron                               //
  6 | //                                                                           //
  7 | //  Licensed under the MIT license - see https://mit-license.org             //
  8 | //  or the LICENSE file in the project root directory                        //
  9 | //                                                                           //
 10 | //---------------------------------------------------------------------------//
 11 | 
 12 | #ifndef PDFR_TEXT_BOX
 13 | 
 14 | //---------------------------------------------------------------------------//
 15 | 
 16 | #define PDFR_TEXT_BOX
 17 | 
 18 | #include "text_element.h"
 19 | 
 20 | //---------------------------------------------------------------------------//
 21 | // We need to be able to process groups of text_elements together; for this we
 22 | // could just use a vector of TextPointer. However, we often need to know the
 23 | // bounding box of a group of text_elements. We can therefore define a TextBox
 24 | // as a struct with a Box and a vector of text_elements.
 25 | //
 26 | // This header file contains the definitions of the TextBox, TextTable and
 27 | // PageBox classes (TextElement comes from text_element.h). Most methods are
 28 | // straightforward and inlined; the more involved ones are only declared here.
 29 | 
 30 | //---------------------------------------------------------------------------//
 31 | // The TextBox will be the main data repository for our output. It inherits from
 32 | // Box and contains a vector of text_elements. To make it easy to work with, it
 33 | // contains functions that allow us to use it as if it was just a vector of
 34 | // text_elements. This allows for easy iteration.
 35 | 
 36 | class TextBox : public Box
 37 | {
 38 |   using TextPointer = std::shared_ptr<TextElement>;
 39 |   using TextBoxIterator = std::vector<TextPointer>::iterator;
 40 |   using TextBoxConstIterator = std::vector<TextPointer>::const_iterator;
 41 | 
 42 |  public:
 43 |   // Standard constructor - takes vector of TextElement pointers and the minbox
 44 |   TextBox(std::vector<TextPointer> text, Box box)
 45 |    : Box(box), data_(text) {}
 46 | 
 47 |   // Constructor from text and vector of floats representing a box
 48 |   TextBox(std::vector<TextPointer> text, std::vector<float> float_vector)
 49 |    : Box(float_vector), data_(text) {}
 50 | 
 51 |   // Constructor from individual elements
 52 |   TextBox(std::vector<TextPointer> text, float left, float right,
 53 |           float top,  float bottom)
 54 |    : Box(left, right, top, bottom), data_(text) {}
 55 | 
 56 |   // Converting constructor - keeps the Box and starts with no text elements
 57 |   TextBox(Box box):  Box(box) {}
 58 | 
 59 |   // Default constructor
 60 |   TextBox() = default;
 61 | 
 62 |   // Copy constructor
 63 |   TextBox(const TextBox& textbox) = default;
 64 | 
 65 |   // Copy assignment
 66 |   TextBox& operator=(const TextBox& textbox) = default;
 67 | 
 68 |   // Move assignment. std::swap on whole TextBoxes must not be used here: it
 69 |   // assigns through this very operator and so would recurse without end.
 70 |   TextBox& operator=(TextBox&& textbox) noexcept
 71 |   {
 72 |     if (this == &textbox) return *this;
 73 |     Box::operator=(textbox);    // the bounds are copied across
 74 |     data_.swap(textbox.data_);  // the elements change hands
 75 |     return *this;
 76 |   }
 77 | 
 78 |   // Returns the only element, with its bounds set to those of this TextBox.
 79 |   // Throws std::runtime_error unless exactly one element is held.
 80 |   std::shared_ptr<TextElement> CastToElement()
 81 |   {
 82 |     if (data_.empty())
 83 |       throw std::runtime_error("Can't cast empty TextBox to TextElement");
 84 |     if (data_.size() > 1)
 85 |       throw std::runtime_error("Can't cast multiple TextBoxes to TextElement");
 86 |     auto& element = data_[0];
 87 |     element->SetLeft(this->GetLeft());
 88 |     element->SetRight(this->GetRight());
 89 |     element->SetTop(this->GetTop());
 90 |     element->SetBottom(this->GetBottom());
 91 |     return element;
 92 |   }
 93 | 
 94 |   // Functions to copy the methods of vectors to access main data object
 95 |   inline TextBoxIterator begin() {return data_.begin(); }
 96 |   inline TextBoxIterator end()   {return data_.end(); }
 97 |   inline void erase(TextBoxIterator start, TextBoxIterator finish)
 98 |     { data_.erase(start, finish); }
 99 |   inline TextBoxConstIterator cbegin() const {return data_.cbegin(); }
100 |   inline TextBoxConstIterator cend() const {return data_.cend(); }
101 |   inline TextPointer& operator[](int index) { return data_[index]; }
102 |   inline TextPointer front() const {return data_.front(); }
103 |   inline TextPointer back() const { return data_.back(); }
104 |   inline size_t size() const { return data_.size(); }
105 |   inline bool empty() const { return data_.empty(); }
106 |   inline void push_back(TextPointer text_ptr) { data_.push_back(text_ptr);}
107 |   inline void clear() { data_.clear(); }
108 |   inline void resize(int new_size) { data_.resize(new_size); }
109 |   inline void SwapData(std::vector<TextPointer>& other)
110 |     { std::swap(data_, other); }
111 |   inline void emplace_back(TextPointer text_ptr) { data_.emplace_back(text_ptr);}
112 | 
113 |   void RemoveDuplicates();
114 | 
115 |   // Divides a TextBox into two
116 |   TextBox SplitIntoTopAndBottom(float divide_at_this_y_value);
117 |   TextBox SplitIntoLeftAndRight(float divide_at_this_x_value);
118 | 
119 |  private:
120 |   // The data member
121 |   std::vector<TextPointer> data_;
122 | };
123 | 
124 | //---------------------------------------------------------------------------//
125 | // This struct inherits from Box, and is created by feeding it a TextBox. It
126 | // converts the vector of text_elements (which is conceptually a vector of
127 | // data frame rows) into columns of the different data types.
128 | 
129 | class TextTable: public Box
130 | {
131 |  public:
132 |   TextTable(const TextBox&);  // Splits a TextBox's elements into columns
133 |   void Join(TextTable&);
134 |   std::vector<float>&       GetLefts()      { return lefts_;   }
135 |   std::vector<float>&       GetRights()     { return rights_;  }
136 |   std::vector<float>&       GetTops()       { return tops_;    }
137 |   std::vector<float>&       GetBottoms()    { return bottoms_; }
138 |   std::vector<float>&       GetSizes()      { return sizes_;   }
139 |   std::vector<std::string>& GetFontNames()  { return fonts_;   }
140 |   std::vector<std::string>& GetText()       { return text_;    }
141 | 
142 |  private:
143 |   std::vector<std::string> text_, fonts_;
144 |   std::vector<float> lefts_, rights_, bottoms_, tops_, sizes_;
145 | };
146 | 
147 | 
148 | //---------------------------------------------------------------------------//
149 | // PageBox class. This is a class containing multiple textboxes as well as a
150 | // 'naked' Box that gives the page dimensions
151 | 
152 | class PageBox : public Box
153 | {
154 |  public:
155 |   PageBox(const Box& box, std::vector<TextBox> text_boxes)
156 |     : Box(box), data_(text_boxes) {}
157 | 
158 |   inline TextBox& operator[](size_t i) { return data_[i];}
159 |   inline std::vector<TextBox>::iterator begin() { return data_.begin();}
160 |   inline std::vector<TextBox>::iterator end() { return data_.end();}
161 |   inline bool empty() const { return data_.empty();}
162 |   inline size_t size() const { return data_.size();}
163 |   inline void push_back(TextBox textbox) { data_.push_back(textbox);}
164 |   // Gathers one element from each non-empty box into a single TextBox with
165 |   // this page's bounds; boxes are visited by reference to avoid copying them
166 |   TextBox CastToTextBox()
167 |   {
168 |     TextBox result(static_cast<const Box&>(*this));
169 |     for (auto& box : data_)
170 |       if (!box.empty()) result.push_back(box.CastToElement());
171 |     return result;
172 |   }
173 | 
174 | private:
175 |   std::vector<TextBox> data_;
176 | };
177 | 
178 | #endif
179 | 


--------------------------------------------------------------------------------
/src/encoding.h:
--------------------------------------------------------------------------------
  1 | //---------------------------------------------------------------------------//
  2 | //                                                                           //
  3 | //  PDFR Encoding header file                                                //
  4 | //                                                                           //
  5 | //  Copyright (C) 2018 - 2019 by Allan Cameron                               //
  6 | //                                                                           //
  7 | //  Licensed under the MIT license - see https://mit-license.org             //
  8 | //  or the LICENSE file in the project root directory                        //
  9 | //                                                                           //
 10 | //---------------------------------------------------------------------------//
 11 | 
 12 | #ifndef PDFR_ENCODING
 13 | 
 14 | //---------------------------------------------------------------------------//
 15 | 
 16 | #define PDFR_ENCODING
 17 | 
 18 | /* This is the joint 6th in a series of daisy-chained headers that build up the
 19 |  * tools to read and parse pdfs. It is logically paired with glyphwidths.h
 20 |  * in that they both come after document.h and together form the basis for the
 21 |  * next step, which is font creation.
 22 |  *
 23 |  * The reason that font creation comes before page creation is that pages
 24 |  * include a list of their fonts in the page description header, and the
 25 |  * program needs to know what these are.
 26 |  *
 27 |  * There are three main parts of font creation pertinent to the task of text
 28 |  * extraction: identifying the font's name, working out the width of the glyphs,
 29 |  * and working out the correspondence between the characters in a pdf string and
 30 |  * the intended glyphs as Unicode code points. The latter of these tasks is
 31 |  * called encoding, and is fairly complex.
 32 |  *
 33 |  * The complexity arises because there are several different methods for
 34 |  * encoding fonts in pdf. First, a base encoding scheme can be declared, such as
 35 |  * WinAnsiEncoding or MacRomanEncoding. These encodings are stored as static
 36 |  * private data members of the class in the form of an unordered_map, though
 37 |  * they are defined in the chartounicode.cpp file rather than encoding.cpp to
 38 |  * improve code readability.
 39 |  *
 40 |  * Whether a base encoding is specified or not, the actual encoding used can
 41 |  * be modified, for example to include Unicode characters that are not
 42 |  * available in the base encoding's character set (a common example is the
 43 |  * glyph for the ligatures ff, fi or fl). This is done using an explicit
 44 |  * mapping of input characters ("code points") to standard glyph names. That
 45 |  * means the program needs to know all these glyph names and how to convert
 46 |  * them to Unicode. This is a very large mapping, and again is declared here as
 47 |  * a static member but defined in a separate source file (adobetounicode.h)
 48 |  *
 49 |  * The encoding may instead be specified in a CMap, which is a type of
 50 |  * raw char to Unicode mapping table that usually appears in a (compressed)
 51 |  * pdf object stream.
 52 |  *
 53 |  * The idea behind the encoding class is to use these methods as required to
 54 |  * produce a mapping for each font so that each code point encountered has a
 55 |  * Unicode interpretation. It keeps the implementation private and its interface
 56 |  * is limited to querying its main data member - an unordered map of input
 57 |  * characters (represented as 2-byte unsigned integers or uint16_t) to Unicode
 58 |  * characters (also represented as uint16_t). Since in most cases the input
 59 |  * characters are given as single bytes, these have to be recast as two-byte
 60 |  * uints for consistency to handle the odd cases when two-byte characters are
 61 |  * supplied in the strings (as is the case with "hexstrings" or ascii-encoded
 62 |  * multi-byte character strings).
 63 |  *
 64 |  * To make the code clearer, both RawChar and Unicode are typedef'd as synonyms
 65 |  * of uint16_t so we know at any time whether we are referring to input ("raw")
 66 |  * code points or output (Unicode) characters.
 67 |  */
 68 | 
 69 | #include<string>
 70 | #include<vector>
 71 | #include<unordered_map>
 72 | #include<memory>
 73 | #include<utility>
 74 | 
 75 | class Dictionary;
 76 | class Document;
 77 | using Unicode = uint16_t;
 78 | using RawChar = uint16_t;
 79 | 
 80 | //---------------------------------------------------------------------------//
 81 | // The encoding class comprises constructors which use private subroutines
 82 | // and large static maps to construct the main variable data member. The
 83 | // public interface is a simple RawChar in, Unicode out translator and a
 84 | // function to get all of the encoding (RawChar) keys
 85 | 
 86 | class Encoding
 87 | {
 88 |  public:
 89 |   // Constructor
 90 |   Encoding(Dictionary& font_dictionary,
 91 |            std::shared_ptr<Document> ptr_to_document);
 92 | 
 93 |   // Maps given raw code point to Unicode
 94 |   Unicode Interpret(const RawChar& code_point_to_be_interpreted);
 95 | 
 96 |   // This typedef shortens the name of the RawChar to Unicode lookup maps.
 97 |   typedef std::unordered_map<RawChar, Unicode> UnicodeMap;
 98 | 
 99 |   // Gets all available Raw chars that may be translated to Unicode in the map
100 |   std::shared_ptr<UnicodeMap> GetEncodingKeys();
101 | 
102 |  private:
103 |   // States used by parser to read "differences" entry in encoding dictionary
104 |   enum DifferencesState { NEWSYMB, NUM, NAME, STOP };
105 | 
106 |   // Data lookup tables - defined as static, which means only a single
107 |   // instance of each is created rather than a copy for each object.
108 |   // Note these maps are defined in adobetounicode.h and chartounicode.h
109 |   static const std::unordered_map<std::string, Unicode> adobe_to_unicode_;
110 |   static const UnicodeMap macroman_to_unicode_;
111 |   static const UnicodeMap winansi_to_unicode_;
112 |   static const UnicodeMap pdfdoc_to_unicode_;
113 | 
114 |   UnicodeMap encoding_map_;             // The main data member lookup
115 |   Dictionary& font_dictionary_;         // the main font dictionary
116 |   std::shared_ptr<Document> document_;  // pointer to the containing document
117 |   std::string base_encoding_;           // value of /BaseEncoding entry
118 | 
119 |   // The entries_ vector gives a pair of type : entry for each entity pushed
120 |   // onto the stack by the lexer. We therefore know whether we are dealing with
121 |   // a code point or a name when we parse the stack
122 |   std::vector<std::pair<DifferencesState, std::string>> entries_;
123 | 
124 |   // private member functions
125 | 
126 |   // uses lexer to parse /Differences entry
127 |   void ReadDifferences_(const std::string&);
128 | 
129 |   // finds encoding dictionary, gets /basencoding and /Differences entries
130 |   void ReadEncoding_();           // Tokenizer
131 |   void ReadDifferenceEntries_();  // Parser
132 | 
133 |   // parses CMap encoding ranges
134 |   void ProcessUnicodeRange_(std::vector<std::string>&);
135 | 
136 |   // parses CMap direct char-char conversion table
137 |   void ProcessUnicodeChars_(std::vector<std::string>&);
138 | 
139 |   // finds CMap if any and co-ordinates parsers to create mapping
140 |   void MapUnicode_();
141 | 
142 |   // Handles type 1 fonts
143 |   void HandleTypeOneFont_();
144 |   void ParseTypeOneFont_(std::string);
145 | 
146 |   // Helper function for parser
147 |   void Write_(DifferencesState& state_to_push_to_entries,
148 |               std::string& string_to_push_to_entries);
149 | };
150 | 
151 | //---------------------------------------------------------------------------//
152 | 
153 | #endif
154 | 


--------------------------------------------------------------------------------
/src/glyphwidths.h:
--------------------------------------------------------------------------------
  1 | //---------------------------------------------------------------------------//
  2 | //                                                                           //
  3 | //  PDFR GlyphWidths header file                                             //
  4 | //                                                                           //
  5 | //  Copyright (C) 2018 - 2019 by Allan Cameron                               //
  6 | //                                                                           //
  7 | //  Licensed under the MIT license - see https://mit-license.org             //
  8 | //  or the LICENSE file in the project root directory                        //
  9 | //                                                                           //
 10 | //---------------------------------------------------------------------------//
 11 | 
 12 | #ifndef PDFR_WIDTH
 13 | 
 14 | //---------------------------------------------------------------------------//
 15 | 
 16 | #define PDFR_WIDTH
 17 | 
 18 | /* This is the joint 6th in a series of daisy-chained headers that build up the
 19 |  * tools to read and parse pdfs. It is logically paired with encoding.h
 20 |  * in that they both come after document.h and together form the basis for the
 21 |  * next step, which is font creation.
 22 |  *
 23 |  * Calculating the width of each glyph is necessary for working out the spacing
 24 |  * between letters, words, paragraphs and other text elements. The glyph widths
 25 |  * in pdf are given in units of text space, where 1000 = 1 point = 1/72 inch in
 26 |  * 1-point font size.
 27 |  *
 28 |  * Getting the glyph widths is one of the more complex tasks in extracting text,
 29 |  * since there are various ways for pdf files to describe them. The most
 30 |  * explicit way is by listing the font widths at each code point in an array.
 31 |  * The array is preceded by the first code point that is being described,
 32 |  * then the array itself comprises numbers for the widths of sequential code
 33 |  * points. Often there are several consecutive arrays like this specifying
 34 |  * groups of sequential code points. Sometimes the entry is just an array of
 35 |  * widths, and the first code point is given separately in the font
 36 |  * dictionary. Sometimes there is a default width for missing glyphs. Sometimes
 37 |  * the width array is in the font dictionary; sometimes it is in a descendant
 38 |  * font dictionary; other times it is in an encoded stream; still other times
 39 |  * it comprises an entire non-dictionary object on its own.
 40 |  *
 41 |  * In older pdfs, the widths may not be specified at all if the font used is
 42 |  * one of 14 core fonts in the pdf specification. A conforming reader is
 43 |  * supposed to know the glyph widths for these fonts.
 44 |  *
 45 |  * The glyphwidth class attempts to work out the method used to describe
 46 |  * glyph widths and produce a map of the intended glyphs to their intended
 47 |  * widths, without bothering any other classes with its implementation.
 48 |  *
 49 |  * Among the tools it needs to do this, it requires navigating the document,
 50 |  * reading dictionaries and streams, and parsing a width description array.
 51 |  * It therefore needs the document.h header which wraps most of these
 52 |  * capabilities. The class defines its own lexer for interpreting the special
 53 |  * width arrays.
 54 |  *
 55 |  * It also needs a group of static objects listing the widths of each of the
 56 |  * characters used in the 'built-in' fonts used in pdfs. In theory, later
 57 |  * versions of pdf require specification of all glyph widths, but for back-
 58 |  * compatibility, the widths of the 14 core fonts still need to be defined.
 59 |  *
 60 |  * The widths are available as an open online resource from Adobe.
 61 |  *
 62 |  * To preserve encapsulation, this header is included only by the fonts
 63 |  * class. The fonts class merges its width map with the encoding map to
 64 |  * produce the glyphmap, which gives the intended Unicode code point and
 65 |  * width as a paired value for any given input character in a pdf string.
 66 |  */
 67 | 
 68 | //---------------------------------------------------------------------------//
 69 | 
 70 | #include<string>
 71 | #include<vector>
 72 | #include<unordered_map>
 73 | #include<memory>
 74 | 
 75 | class Dictionary;
 76 | class Document;
 77 | using Unicode = uint16_t;
 78 | using RawChar = uint16_t;
 79 | 
 80 | 
 81 | //---------------------------------------------------------------------------//
 82 | // The GlyphWidths class contains private methods to find the description of
 83 | // widths for each character in a font. It only makes sense to the font class,
 84 | // from whence it is created and accessed.
 85 | //
 86 | // The core font widths are declared static private because they are only
 87 | // needed by this class, and we don't want an extra copy of all of them if
 88 | // several fonts are created. This also prevents them polluting the global
 89 | // namespace.
 90 | 
 91 | class GlyphWidths
 92 | {
 93 |  public:
 94 |   // Constructor
 95 |   GlyphWidths(Dictionary& font_dictionary_ptr,
 96 |               std::shared_ptr<Document> document_ptr);
 97 | 
 98 |   // public methods
 99 |   float GetWidth(const RawChar& code_point);   // Get width of character code
100 |   std::vector<RawChar> WidthKeys();            // Returns all map keys
101 | 
102 |   inline bool WidthsAreForRaw() const { return width_is_pre_interpretation_; }
103 | 
104 |  private:
105 |   // This enum is used in the width array lexer
106 |   enum WidthState {NEWSYMB, READFIRSTCHAR, READSECONDCHAR,
107 |                    READWIDTH, INSUBARRAY, END};
108 | 
109 |   // private data
110 |   std::unordered_map<RawChar, float> width_map_;  // The main data member
111 |   Dictionary& font_dictionary_;                 // The font dictionary
112 |   std::shared_ptr<Document> document_;          // Pointer to document
113 |   std::string base_font_;                       // The base font (if any)
114 |   bool width_is_pre_interpretation_;            // Are widths for code points
115 |                                                 // pre- or post- translation?
116 |   // private methods
117 |   void ParseWidthArray_(const std::string&);    // Width lexer
118 |   void ReadCoreFont_();                         // Core font getter
119 |   void ParseDescendants_();                     // Gets descendant dictionary
120 |   void ParseWidths_();                          // Parses the width array
121 |   void ReadWidthTable_();                       // Co-ordinates construction
122 | 
123 | //-- The core fonts as defined in corefonts.cpp ------------------------------//
124 |                                                                               //
125 |   static const std::unordered_map<Unicode, float> courier_widths_;              //
126 |   static const std::unordered_map<Unicode, float> helvetica_widths_;            //
127 |   static const std::unordered_map<Unicode, float> helvetica_bold_widths_;       //
128 |   static const std::unordered_map<Unicode, float> symbol_widths_;               //
129 |   static const std::unordered_map<Unicode, float> times_bold_widths_;           //
130 |   static const std::unordered_map<Unicode, float> times_bold_italic_widths_;    //
131 |   static const std::unordered_map<Unicode, float> times_italic_widths_;         //
132 |   static const std::unordered_map<Unicode, float> times_roman_widths_;          //
133 |   static const std::unordered_map<Unicode, float> dingbats_widths_;             //
134 |                                                                               //
135 | //----------------------------------------------------------------------------//
136 | };
137 | 
138 | //---------------------------------------------------------------------------//
139 | 
140 | #endif
141 | 


--------------------------------------------------------------------------------
/src/object_class.cpp:
--------------------------------------------------------------------------------
  1 | //---------------------------------------------------------------------------//
  2 | //                                                                           //
  3 | //  PDFR Object implementation file                                          //
  4 | //                                                                           //
  5 | //  Copyright (C) 2018 - 2019 by Allan Cameron                               //
  6 | //                                                                           //
  7 | //  Licensed under the MIT license - see https://mit-license.org             //
  8 | //  or the LICENSE file in the project root directory                        //
  9 | //                                                                           //
 10 | //---------------------------------------------------------------------------//
 11 | 
 12 | #include "utilities.h"
 13 | #include "dictionary.h"
 14 | #include "streams.h"
 15 | #include "deflate.h"
 16 | #include "xref.h"
 17 | #include "object_class.h"
 18 | #include<iostream>
 19 | 
 20 | //---------------------------------------------------------------------------//
 21 | 
 22 | using namespace std;
 23 | 
 24 | //---------------------------------------------------------------------------//
 25 | // The main object creator class. It needs a pointer to the xref and a number
 26 | // representing the object's number as set out in the xref table.
 27 | 
 28 | Object::Object(shared_ptr<const XRef> xref, int object_number) :
 29 |   xref_(xref),
 30 |   object_number_(object_number),
 31 |   raw_stream_(),
 32 |   stream_index_(make_shared<unordered_map<int, pair<int, int>>>())
 33 | {
 34 |   // Find start and end of object
 35 |   size_t start = xref_->GetObjectStartByte(object_number_);
 36 |   size_t stop  = xref_->GetObjectEndByte(object_number_);
 37 | 
 38 |   if (xref_->File()->substr(start, 20).find("%") != string::npos)
 39 |   {
 40 |     start = xref_->File()->substr(start, 200).find("\n") + start;
 41 |   }
 42 | 
 43 |   // We check to see if the object has a header dictionary by finding '<<'
 44 |   if (xref_->File()->substr(start, 20).find("<<") == string::npos)
 45 |   {
 46 |     // No dictionary found - make blank dictionary for header
 47 |     header_ = Dictionary();
 48 | 
 49 |     // Finds start and length of contents
 50 |     size_t c_start = xref_->File()->find(" obj", start) + 4;
 51 |     raw_stream_ = {xref_->File()->c_str() + c_start, stop - c_start};
 52 |   }
 53 | 
 54 |   else // Else the object has a header dictionary
 55 |   {
 56 |     header_ = Dictionary(xref_->File(), start);
 57 |     // Find the stream (if any)
 58 |     raw_stream_ = xref_->GetStreamLocation(start);
 59 | 
 60 |     // The object may contain an object stream that needs unpacked
 61 |     if (header_["/Type"] == "/ObjStm")
 62 |     {
 63 |       // Get the object stream
 64 |       ReadStream_();
 65 | 
 66 |       // Index the objects in the stream
 67 |       IndexObjectStream_();
 68 |     }
 69 |   }
 70 | }
 71 | 
 72 | //---------------------------------------------------------------------------//
 73 | // Object streams start with a group of integers representing the object
 74 | // numbers and the byte offset of each object relative to the stream. This
 75 | // method reads the objects and their positions in the stream, indexing them
 76 | // for later retrieval.
 77 | 
 78 | void Object::IndexObjectStream_()
 79 | {
 80 |   // Get the first character that is not a digit or space
 81 |   int startbyte = stream_.find_first_not_of("\n\r\t 0123456789");
 82 | 
 83 |   // Now get the substring with the objects proper...
 84 |   string stream_string(stream_.begin() + startbyte, stream_.end());
 85 | 
 86 |   // ...and the substring with the registration numbers...
 87 |   string index_string(stream_.begin(), stream_.begin() + startbyte - 1);
 88 | 
 89 |   // extract these numbers to a vector
 90 |   vector<int> index = ParseInts(index_string);
 91 | 
 92 |   // If this is empty, something has gone wrong.
 93 |   if (index.empty()) throw runtime_error("Couldn't parse object stream");
 94 | 
 95 |   // We now set up a loop that determines which numbers are object numbers and
 96 |   // which are byte offsets
 97 |   for (size_t byte_length, i = 1; i < index.size(); i += 2)
 98 |   {
 99 |     if (i == (index.size() - 1)) byte_length = stream_string.size() - index[i];
100 |     else byte_length = index[i + 2] - index[i];
101 |     auto&& index_pair = make_pair(index[i] + startbyte, byte_length);
102 |     (*stream_index_)[index[i - 1]] = index_pair;
103 |   }
104 | }
105 | 
106 | /*---------------------------------------------------------------------------*/
107 | // The constructor for in-stream objects. This is called automatically by the
108 | // main object constructor if the main object constructor determines that the
109 | // requested object lies inside the stream of another object
110 | 
111 | Object::Object(shared_ptr<Object> holder, int object_number):
112 |   xref_(holder->xref_),
113 |   object_number_(object_number),
114 |   raw_stream_()
115 | {
116 |   auto finder = holder->stream_index_->find(object_number_);
117 |   if (finder == holder->stream_index_->end())
118 |   {
119 |     throw runtime_error("Object not found in stream");
120 |   }
121 | 
122 |   auto index_position = finder->second.first;
123 |   auto index_length   = finder->second.second;
124 |   auto stream_string  = holder->stream_.substr(index_position, index_length);
125 | 
126 |   // Most stream objects consist of just a dictionary
127 |   if (stream_string[0] == '<')
128 |   {
129 |     header_ = Dictionary(make_shared<string>(stream_string));
130 |     stream_ = "";             // stream objects don't have their own stream
131 |   }
132 |   else // The object is not a dictionary - maybe just an array or int etc
133 |   {
134 |     header_ = Dictionary();// empty header
135 |     stream_ = stream_string;  // Call the contents a stream for ease
136 | 
137 |     // Annoyingly, some "objects" in an object stream are just pointers
138 |     // to other objects. This is pointless but does happen and needs to
139 |     // be handled by recursively calling the constructor
140 |     if (stream_.size() < 15 && stream_.find(" R", 0) < 15)
141 |     {
142 |       size_t new_number = ParseReferences(stream_)[0];
143 |       size_t holder = xref_->GetHoldingNumberOf(new_number);
144 |       if (holder == 0) *this = Object(xref_, new_number);
145 |       else *this = Object(make_shared<Object>(xref_, holder), new_number);
146 |       this->object_number_ = object_number;
147 |     }
148 |   }
149 | }
150 | 
151 | /*---------------------------------------------------------------------------*/
152 | // Simple public getter for the header dictionary
153 | 
154 | Dictionary& Object::GetDictionary()
155 | {
156 |   return header_;
157 | }
158 | 
159 | /*---------------------------------------------------------------------------*/
160 | // We have to create the stream on the fly when it is needed rather than
161 | // calculating and storing all the streams upon document creation
162 | 
163 | string& Object::GetStream()
164 | {
165 |   // If the stream has not already been processed, do it now
166 |   if (stream_.empty()) ReadStream_();
167 |   return stream_;
168 | }
169 | 
170 | /*---------------------------------------------------------------------------*/
171 | // We will keep all stream processing in one place for easier debugging and
172 | // future development
173 | 
174 | void Object::ReadStream_()
175 | {
176 | 
177 |   string filters = header_["/Filter"];
178 |   bool is_flatedecode = filters.find("/FlateDecode") != string::npos;
179 | 
180 |   // Decrypt if necessary
181 |   if (xref_->IsEncrypted())
182 |   {
183 |     stream_ = xref_->Decrypt(raw_stream_, object_number_, 0);
184 |     if (is_flatedecode) stream_ = FlateDecode(&stream_);
185 |   }
186 |   else
187 |   {
188 |     if (is_flatedecode) stream_ = FlateDecode(raw_stream_);
189 |   }
190 | }
191 | 
192 | 


--------------------------------------------------------------------------------