├── PULL_REQUEST_TEMPLATE.md ├── src ├── .gitignore ├── test-runner.cpp ├── DCTDecode.h ├── external │ └── Profiler.h ├── letter_grouper.h ├── deflate.h ├── line_grouper.h ├── graphicsstate.h ├── box.cpp ├── object_class.h ├── word_grouper.h ├── streams.cpp ├── streams.h ├── tokenizer.h ├── charstring.cpp ├── whitespace.h ├── line_grouper.cpp ├── font.h ├── charstring.h ├── matrix.h ├── document.h ├── page.h ├── font.cpp ├── textbox.cpp ├── crypto.h ├── text_element.h ├── graphicobject.h ├── word_grouper.cpp ├── text_element.cpp ├── xref.h ├── textbox.h ├── encoding.h ├── glyphwidths.h └── object_class.cpp ├── LICENSE ├── tests ├── testthat.R └── testthat │ ├── test-cpp.R │ └── test-pdrf.R ├── inst └── extdata │ ├── gg.pdf │ ├── tex.pdf │ ├── adobe.pdf │ ├── leeds.pdf │ ├── luck.pdf │ ├── rcpp.pdf │ ├── sams.pdf │ ├── barcodes.pdf │ ├── pdfinfo.pdf │ ├── chestpain.pdf │ └── testreader.pdf ├── .gitignore ├── TODO.md ├── NEWS.md ├── .Rbuildignore ├── R ├── catch-routine-registration.R ├── PDFR-package.R ├── data.R ├── utils.R └── RcppExports.R ├── man ├── pdfdoc.Rd ├── pdfboxes.Rd ├── run_testthat_tests.Rd ├── get_xref.Rd ├── getpagestring.Rd ├── draw_glyph.Rd ├── pdfgraphics.Rd ├── get_object.Rd ├── pdfgrobs.Rd ├── getglyphmap.Rd ├── pdfpage.Rd ├── pdfr_paths.Rd ├── pdfplot.Rd └── PDFR-package.Rd ├── PDFR.Rproj ├── .github └── ISSUE_TEMPLATE │ └── feature_request.md ├── NAMESPACE ├── DESCRIPTION ├── LICENSE.md ├── CONTRIBUTING.md ├── CODE_OF_CONDUCT.md ├── codemeta.json ├── README.Rmd └── README.md /PULL_REQUEST_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | JUST ASK 2 | -------------------------------------------------------------------------------- /src/.gitignore: -------------------------------------------------------------------------------- 1 | *.o 2 | *.so 3 | *.dll 4 | -------------------------------------------------------------------------------- /LICENSE: 
-------------------------------------------------------------------------------- 1 | YEAR: 2023 2 | COPYRIGHT HOLDER: Allan Cameron 3 | -------------------------------------------------------------------------------- /tests/testthat.R: -------------------------------------------------------------------------------- 1 | library(testthat) 2 | library(PDFR) 3 | 4 | test_check("PDFR") 5 | -------------------------------------------------------------------------------- /inst/extdata/gg.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AllanCameron/PDFR/HEAD/inst/extdata/gg.pdf -------------------------------------------------------------------------------- /inst/extdata/tex.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AllanCameron/PDFR/HEAD/inst/extdata/tex.pdf -------------------------------------------------------------------------------- /inst/extdata/adobe.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AllanCameron/PDFR/HEAD/inst/extdata/adobe.pdf -------------------------------------------------------------------------------- /inst/extdata/leeds.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AllanCameron/PDFR/HEAD/inst/extdata/leeds.pdf -------------------------------------------------------------------------------- /inst/extdata/luck.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AllanCameron/PDFR/HEAD/inst/extdata/luck.pdf -------------------------------------------------------------------------------- /inst/extdata/rcpp.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AllanCameron/PDFR/HEAD/inst/extdata/rcpp.pdf 
-------------------------------------------------------------------------------- /inst/extdata/sams.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AllanCameron/PDFR/HEAD/inst/extdata/sams.pdf -------------------------------------------------------------------------------- /inst/extdata/barcodes.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AllanCameron/PDFR/HEAD/inst/extdata/barcodes.pdf -------------------------------------------------------------------------------- /inst/extdata/pdfinfo.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AllanCameron/PDFR/HEAD/inst/extdata/pdfinfo.pdf -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .Rproj.user 2 | .Rhistory 3 | .RData 4 | .Ruserdata 5 | src/*.o 6 | src/*.so 7 | src/*.dll 8 | -------------------------------------------------------------------------------- /inst/extdata/chestpain.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AllanCameron/PDFR/HEAD/inst/extdata/chestpain.pdf -------------------------------------------------------------------------------- /inst/extdata/testreader.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AllanCameron/PDFR/HEAD/inst/extdata/testreader.pdf -------------------------------------------------------------------------------- /tests/testthat/test-cpp.R: -------------------------------------------------------------------------------- 1 | context("C++") 2 | test_that("Catch unit tests pass", { 3 | expect_cpp_tests_pass("PDFR") 4 | }) 5 | 
-------------------------------------------------------------------------------- /TODO.md: -------------------------------------------------------------------------------- 1 | # TODO 2 | 3 | - Group glyphs together into paragraphs, tables etc 4 | - Complete documentation 5 | - Submit to CRAN 6 | 7 | -------------------------------------------------------------------------------- /NEWS.md: -------------------------------------------------------------------------------- 1 | # PDFR 0.1.0 2 | 3 | * Added a `NEWS.md` file to track changes to the package. 4 | * Refactor and update documentation to allow PDFR to pass `devtools::check()` with no errors, warnings, or notes (@elipousson, #4). 5 | -------------------------------------------------------------------------------- /src/test-runner.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Please do not edit this file -- it ensures that your package will export a 3 | * 'run_testthat_tests()' C routine that can be used to run the Catch unit tests 4 | * available in your package. 
5 | */ 6 | #define TESTTHAT_TEST_RUNNER 7 | #include 8 | -------------------------------------------------------------------------------- /.Rbuildignore: -------------------------------------------------------------------------------- 1 | ^.*\.Rproj$ 2 | ^\.Rproj\.user$ 3 | README.md 4 | TODO.md 5 | CONTRIBUTING.md 6 | CODE_OF_CONDUCT.md 7 | PULL_REQUEST_TEMPLATE.md 8 | inflate_method.md 9 | working.R 10 | .gitignore 11 | ^\.github 12 | headerMap.txt 13 | ^data-raw$ 14 | ^LICENSE\.md$ 15 | ^codemeta\.json$ 16 | ^README\.Rmd$ 17 | -------------------------------------------------------------------------------- /R/catch-routine-registration.R: -------------------------------------------------------------------------------- 1 | # This dummy function definition is included with the package to ensure that 2 | # 'tools::package_native_routine_registration_skeleton()' generates the required 3 | # registration info for the 'run_testthat_tests' symbol. 4 | (function() { 5 | .Call("run_testthat_tests", PACKAGE = "PDFR") 6 | }) 7 | -------------------------------------------------------------------------------- /R/PDFR-package.R: -------------------------------------------------------------------------------- 1 | #' @useDynLib PDFR, .registration = TRUE 2 | #' @keywords internal 3 | "_PACKAGE" 4 | 5 | ## usethis namespace: start 6 | #' @importFrom Rcpp evalCpp 7 | #' @importFrom cli cli_abort 8 | #' @importFrom rlang is_character is_true is_false is_raw check_installed 9 | #' has_length abort caller_env 10 | ## usethis namespace: end 11 | NULL 12 | -------------------------------------------------------------------------------- /man/pdfdoc.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/pdrf.R 3 | \name{pdfdoc} 4 | \alias{pdfdoc} 5 | \title{pdfdoc} 6 | \usage{ 7 | pdfdoc(pdf) 8 | } 9 | \arguments{ 10 | \item{pdf}{a valid pdf file location} 11 | } 12 | 
\value{ 13 | a data frame of all text elements in a document 14 | } 15 | \description{ 16 | Returns contents of all pdf pages 17 | } 18 | \examples{ 19 | pdfdoc(pdfr_paths$leeds) 20 | } 21 | -------------------------------------------------------------------------------- /PDFR.Rproj: -------------------------------------------------------------------------------- 1 | Version: 1.0 2 | 3 | RestoreWorkspace: Default 4 | SaveWorkspace: Default 5 | AlwaysSaveHistory: Default 6 | 7 | EnableCodeIndexing: Yes 8 | UseSpacesForTab: Yes 9 | NumSpacesForTab: 2 10 | Encoding: UTF-8 11 | 12 | RnwWeave: Sweave 13 | LaTeX: pdfLaTeX 14 | 15 | AutoAppendNewline: Yes 16 | StripTrailingWhitespace: Yes 17 | 18 | BuildType: Package 19 | PackageUseDevtools: Yes 20 | PackageInstallArgs: --no-multiarch --with-keep.source 21 | PackageRoxygenize: rd,collate,namespace 22 | -------------------------------------------------------------------------------- /man/pdfboxes.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/pdrf.R 3 | \name{pdfboxes} 4 | \alias{pdfboxes} 5 | \title{pdfboxes} 6 | \usage{ 7 | pdfboxes(pdf, pagenum) 8 | } 9 | \arguments{ 10 | \item{pdf}{a valid pdf file location} 11 | 12 | \item{pagenum}{the page number to be plotted} 13 | } 14 | \value{ 15 | a ggplot 16 | } 17 | \description{ 18 | Plots the bounding boxes of text elements from a page as a ggplot. 
19 | } 20 | \examples{ 21 | pdfboxes(pdfr_paths$leeds, 1) 22 | } 23 | -------------------------------------------------------------------------------- /man/run_testthat_tests.Rd: -------------------------------------------------------------------------------- 1 | \docType{data} 2 | \name{run_testthat_tests} 3 | \alias{run_testthat_tests} 4 | \title{A tool used for symbol registration} 5 | \format{A list of 4 fields 6 | \describe{ 7 | \item{name}{run_testthat_tests} 8 | \item{address}{a pointer to this symbol} 9 | \item{dll}{the compiled file where the symbol is contained} 10 | \item{numParameters}{no parameters} 11 | }} 12 | \usage{ 13 | run_testthat_tests 14 | } 15 | \description{ 16 | A registered native symbol used in testing 17 | } 18 | \keyword{tests} 19 | -------------------------------------------------------------------------------- /man/get_xref.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/pdrf.R 3 | \name{get_xref} 4 | \alias{get_xref} 5 | \title{Get a pdf's xref table as an R dataframe} 6 | \usage{ 7 | get_xref(pdf) 8 | } 9 | \arguments{ 10 | \item{pdf}{a valid pdf file location or raw data vector} 11 | } 12 | \value{ 13 | a data frame showing the bytewise positions of each object in the pdf 14 | } 15 | \description{ 16 | Get a pdf's xref table as an R dataframe 17 | } 18 | \examples{ 19 | get_xref(pdfr_paths$leeds) 20 | } 21 | -------------------------------------------------------------------------------- /man/getpagestring.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/pdrf.R 3 | \name{getpagestring} 4 | \alias{getpagestring} 5 | \title{pagestring} 6 | \usage{ 7 | getpagestring(pdf, page) 8 | } 9 | \arguments{ 10 | \item{pdf}{a valid pdf file location} 11 | 12 | \item{page}{the page number to be extracted} 
13 | } 14 | \value{ 15 | a single string containing the page description program 16 | } 17 | \description{ 18 | Returns contents of a pdf page description program 19 | } 20 | \examples{ 21 | getpagestring(pdfr_paths$leeds, 1) 22 | } 23 | -------------------------------------------------------------------------------- /man/draw_glyph.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/pdrf.R 3 | \name{draw_glyph} 4 | \alias{draw_glyph} 5 | \title{draw_glyph} 6 | \usage{ 7 | draw_glyph(fontfile, glyph) 8 | } 9 | \arguments{ 10 | \item{fontfile}{a raw vector representing a font file} 11 | 12 | \item{glyph}{the character to be drawn. Can be text or an integer} 13 | } 14 | \value{ 15 | no return 16 | } 17 | \description{ 18 | Draws glyphs from a truetype font as grid grobs 19 | } 20 | \examples{ 21 | \dontrun{ 22 | if(interactive()){ 23 | # ttf <- "raw vector with font file" 24 | draw_glyph(ttf, "a") 25 | } 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Suggest an idea for this project 4 | 5 | --- 6 | 7 | **Is your feature request related to a problem? Please describe.** 8 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] 9 | 10 | **Describe the solution you'd like** 11 | A clear and concise description of what you want to happen. 12 | 13 | **Describe alternatives you've considered** 14 | A clear and concise description of any alternative solutions or features you've considered. 15 | 16 | **Additional context** 17 | Add any other context or screenshots about the feature request here. 
18 | -------------------------------------------------------------------------------- /man/pdfgraphics.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/pdrf.R 3 | \name{pdfgraphics} 4 | \alias{pdfgraphics} 5 | \title{pdfgraphics} 6 | \usage{ 7 | pdfgraphics(file, pagenum, scale = 1) 8 | } 9 | \arguments{ 10 | \item{file}{a valid pdf file location} 11 | 12 | \item{pagenum}{the page number to be plotted} 13 | 14 | \item{scale}{Scale used for linewidth and text size. Passed to 15 | `ggplot2::geom_text()` size parameter as scale * size/3} 16 | } 17 | \value{ 18 | a ggplot 19 | } 20 | \description{ 21 | Plots the graphical elements of a pdf page as a ggplot 22 | } 23 | \examples{ 24 | pdfgraphics(pdfr_paths$leeds, 1) 25 | 26 | } 27 | -------------------------------------------------------------------------------- /man/get_object.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/pdrf.R 3 | \name{get_object} 4 | \alias{get_object} 5 | \title{Get the contents of a pdf object} 6 | \usage{ 7 | get_object(pdf, number) 8 | } 9 | \arguments{ 10 | \item{pdf}{a valid pdf file location} 11 | 12 | \item{number}{the object number} 13 | } 14 | \value{ 15 | a named vector of the dictionary and stream of the pdf object 16 | } 17 | \description{ 18 | Returns a list consisting of a named vector representing key:value pairs 19 | in a specified object. It also contains any stream data associated with 20 | the object. 
21 | } 22 | \examples{ 23 | get_object(pdfr_paths$leeds, 1) 24 | } 25 | -------------------------------------------------------------------------------- /man/pdfgrobs.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/pdrf.R 3 | \name{pdfgrobs} 4 | \alias{pdfgrobs} 5 | \title{pdfgrobs} 6 | \usage{ 7 | pdfgrobs(file_name, pagenum, scale = dev.size()[2]/10, enc = "UTF-8") 8 | } 9 | \arguments{ 10 | \item{file_name}{a valid pdf file location} 11 | 12 | \item{pagenum}{the page number to be plotted} 13 | 14 | \item{scale}{Document scale. Defaults to `dev.size()[2]/10`} 15 | 16 | \item{enc}{Document encoding. Defaults to "UTF-8"} 17 | } 18 | \value{ 19 | invisibly returns grobs as well as drawing them 20 | } 21 | \description{ 22 | Plots the graphical elements of a pdf page as grobs 23 | } 24 | \examples{ 25 | pdfgrobs(pdfr_paths$leeds, 1) 26 | } 27 | -------------------------------------------------------------------------------- /man/getglyphmap.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/pdrf.R 3 | \name{getglyphmap} 4 | \alias{getglyphmap} 5 | \title{Return map of glyphs from a page} 6 | \usage{ 7 | getglyphmap(pdf, page = 1) 8 | } 9 | \arguments{ 10 | \item{pdf}{a valid pdf file location} 11 | 12 | \item{page}{the page number from which to extract glyphs} 13 | } 14 | \value{ 15 | a dataframe of all entries of font encoding tables with width mapping 16 | } 17 | \description{ 18 | Used mainly for debugging, this function returns an R dataframe, one row for 19 | each byte that may be used as a glyph. It shows the unicode number of 20 | each interpreted glyph, as well as its width in text space. 
21 | } 22 | \examples{ 23 | getglyphmap(pdfr_paths$leeds, 1) 24 | } 25 | -------------------------------------------------------------------------------- /man/pdfpage.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/pdrf.R 3 | \name{pdfpage} 4 | \alias{pdfpage} 5 | \title{pdfpage} 6 | \usage{ 7 | pdfpage(pdf, page = 1, atomic = FALSE, table_only = TRUE) 8 | } 9 | \arguments{ 10 | \item{pdf}{a valid pdf file location} 11 | 12 | \item{page}{the page number to be extracted} 13 | 14 | \item{atomic}{a boolean - should each letter treated individually?} 15 | 16 | \item{table_only}{a boolean - return data frame alone, as opposed to list} 17 | } 18 | \value{ 19 | a list containing data frames 20 | } 21 | \description{ 22 | Returns contents of a pdf page 23 | } 24 | \examples{ 25 | 26 | head(pdfpage(pdfr_paths$leeds, page = 1)) 27 | 28 | head(pdfpage(pdfr_paths$chestpain, page = c(1:2))) 29 | 30 | } 31 | -------------------------------------------------------------------------------- /man/pdfr_paths.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/data.R 3 | \docType{data} 4 | \name{pdfr_paths} 5 | \alias{pdfr_paths} 6 | \title{Paths to test pdfs} 7 | \format{ 8 | A list of 9 pdf files 9 | \describe{ 10 | \item{barcodes}{a pdf constructed in Rstudio} 11 | \item{chestpain}{a flow-chart for chest pain management} 12 | \item{pdfinfo}{information about the pdf format} 13 | \item{adobe}{an official adobe document} 14 | \item{leeds}{a table-rich local government document} 15 | \item{sams}{a document based on svg} 16 | \item{testreader}{a simple pdf test} 17 | \item{tex}{a simple tex test} 18 | \item{rcpp}{a CRAN package vignette} 19 | } 20 | } 21 | \usage{ 22 | pdfr_paths 23 | } 24 | \description{ 25 | A list of paths to locally stored test 
pdfs 26 | } 27 | \keyword{datasets} 28 | -------------------------------------------------------------------------------- /man/pdfplot.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/pdrf.R 3 | \name{pdfplot} 4 | \alias{pdfplot} 5 | \title{pdfplot} 6 | \usage{ 7 | pdfplot(pdf, page = 1, atomic = FALSE, boxes = FALSE, textsize = 1) 8 | } 9 | \arguments{ 10 | \item{pdf}{a valid pdf file location} 11 | 12 | \item{page}{the page number to be plotted} 13 | 14 | \item{atomic}{a boolean - should each letter treated individually?} 15 | 16 | \item{boxes}{Show the calculated text bounding boxes} 17 | 18 | \item{textsize}{the scale of the text to be shown} 19 | } 20 | \value{ 21 | a ggplot 22 | } 23 | \description{ 24 | Plots the text elements from a page as a ggplot. 25 | The aim is not a complete pdf rendering but to help identify elements of 26 | interest in the data frame of text elements to convert to data points. 27 | } 28 | \examples{ 29 | pdfplot(pdfr_paths$leeds, 1) 30 | } 31 | -------------------------------------------------------------------------------- /man/PDFR-package.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/PDFR-package.R 3 | \docType{package} 4 | \name{PDFR-package} 5 | \alias{PDFR} 6 | \alias{PDFR-package} 7 | \title{PDFR: Extract Text From PDFs In An R Friendly Way} 8 | \description{ 9 | Extracts text from PDF into an R dataframe giving the content, size, position and font of any text elements. This information can then be manipulated in R. 
10 | } 11 | \seealso{ 12 | Useful links: 13 | \itemize{ 14 | \item \url{https://github.com/AllanCameron/PDFR} 15 | \item Report bugs at \url{https://github.com/AllanCameron/PDFR/issues} 16 | } 17 | 18 | } 19 | \author{ 20 | \strong{Maintainer}: Allan Cameron \email{Allan.Cameron@nhs.net} [copyright holder] 21 | 22 | Other contributors: 23 | \itemize{ 24 | \item Eli Pousson \email{eli.pousson@gmail.com} (\href{https://orcid.org/0000-0001-8280-1706}{ORCID}) [contributor] 25 | } 26 | 27 | } 28 | \keyword{internal} 29 | -------------------------------------------------------------------------------- /NAMESPACE: -------------------------------------------------------------------------------- 1 | # Generated by roxygen2: do not edit by hand 2 | 3 | export(draw_glyph) 4 | export(get_object) 5 | export(get_xref) 6 | export(getglyphmap) 7 | export(getpagestring) 8 | export(pdfboxes) 9 | export(pdfdoc) 10 | export(pdfgraphics) 11 | export(pdfgrobs) 12 | export(pdfpage) 13 | export(pdfplot) 14 | export(pdfr_paths) 15 | importFrom(Rcpp,evalCpp) 16 | importFrom(cli,cli_abort) 17 | importFrom(grDevices,dev.size) 18 | importFrom(grDevices,rgb) 19 | importFrom(grid,gpar) 20 | importFrom(grid,grid.draw) 21 | importFrom(grid,grid.newpage) 22 | importFrom(grid,grid.path) 23 | importFrom(grid,grid.rect) 24 | importFrom(grid,pushViewport) 25 | importFrom(grid,viewport) 26 | importFrom(rlang,abort) 27 | importFrom(rlang,caller_env) 28 | importFrom(rlang,check_installed) 29 | importFrom(rlang,has_length) 30 | importFrom(rlang,is_character) 31 | importFrom(rlang,is_false) 32 | importFrom(rlang,is_raw) 33 | importFrom(rlang,is_true) 34 | useDynLib(PDFR, .registration = TRUE) 35 | -------------------------------------------------------------------------------- /DESCRIPTION: -------------------------------------------------------------------------------- 1 | Type: Package 2 | Package: PDFR 3 | Title: Extract Text From PDFs In An R Friendly Way 4 | Version: 0.1.0 5 | Authors@R: c( 6 | 
person("Allan", "Cameron", , "Allan.Cameron@nhs.net", role = c("aut", "cre", "cph")), 7 | person("Eli", "Pousson", , "eli.pousson@gmail.com", role = "ctb", 8 | comment = c(ORCID = "0000-0001-8280-1706")) 9 | ) 10 | Maintainer: Allan Cameron <Allan.Cameron@nhs.net> 11 | Description: Extracts text from PDF into an R dataframe giving the 12 | content, size, position and font of any text elements. This 13 | information can then be manipulated in R. 14 | License: MIT + file LICENSE 15 | URL: https://github.com/AllanCameron/PDFR 16 | BugReports: https://github.com/AllanCameron/PDFR/issues 17 | Depends: 18 | R (>= 2.10) 19 | Imports: 20 | cli, 21 | grDevices, 22 | grid, 23 | Rcpp, 24 | rlang 25 | Suggests: 26 | ggplot2, 27 | testthat 28 | LinkingTo: 29 | Rcpp, 30 | testthat 31 | Encoding: UTF-8 32 | LazyData: true 33 | RoxygenNote: 7.2.3 34 | StagedInstall: no 35 | SystemRequirements: C++11 36 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | # MIT License 2 | 3 | Copyright (c) 2023 Allan Cameron 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /src/DCTDecode.h: -------------------------------------------------------------------------------- 1 | //---------------------------------------------------------------------------// 2 | // // 3 | // PDFR DCTDecode header file // 4 | // // 5 | // Copyright (C) 2018 - 2021 by Allan Cameron // 6 | // // 7 | // Licensed under the MIT license - see https://mit-license.org // 8 | // or the LICENSE file in the project root directory // 9 | // // 10 | //---------------------------------------------------------------------------// 11 | 12 | #ifndef PDFR_DCT 13 | 14 | //---------------------------------------------------------------------------// 15 | 16 | #define PDFR_DCT 17 | 18 | #include "streams.h" 19 | 20 | class DCTDecode : public Stream 21 | { 22 | public: 23 | DCTDecode(const std::string* input) : Stream(*input) {}; 24 | DCTDecode(const CharString& input) : Stream(input) {}; 25 | }; 26 | 27 | //---------------------------------------------------------------------------// 28 | 29 | #endif 30 | -------------------------------------------------------------------------------- /tests/testthat/test-pdrf.R: -------------------------------------------------------------------------------- 1 | context("test-pdrf") 2 | pdfpage(pdfr_paths[[1]], 1, FALSE, FALSE)$Elements -> barcodes 3 | pdfpage(pdfr_paths[[2]], 1, FALSE, FALSE)$Elements -> chestpain 4 | pdfpage(pdfr_paths[[3]], 1, FALSE, FALSE)$Elements -> pdfinfo 5 | pdfpage(pdfr_paths[[4]], 1, FALSE, FALSE)$Elements -> adobe 6 | pdfpage(pdfr_paths[[5]], 1, FALSE, FALSE)$Elements -> leeds 7 | pdfpage(pdfr_paths[[6]], 1, FALSE, FALSE)$Elements -> sams 8 | 
pdfpage(pdfr_paths[[7]], 1, FALSE, FALSE)$Elements -> testreader 9 | pdfpage(pdfr_paths[[8]], 3, FALSE, FALSE)$Elements -> tex 10 | pdfpage(pdfr_paths[[9]], 1, FALSE, FALSE)$Elements -> rcpp 11 | 12 | test_that("Encoding works", 13 | { 14 | expect_match(chestpain$text[1], "ACUTE CARDIAC CHEST PAIN GUIDELINES") 15 | }) 16 | 17 | test_that("Ligatures are properly encoded", 18 | { 19 | expect_match(paste(tex$text, collapse = " "), "fi") 20 | }) 21 | 22 | test_that("Widths are non-zero", 23 | { 24 | expect_gt(min(testreader$right - testreader$left), 95) 25 | }) 26 | 27 | test_that("Whole document can be parsed", 28 | { 29 | expect_silent(pdfdoc(pdfr_paths[[2]])) 30 | }) 31 | 32 | test_that("Multiple pages can be parsed", 33 | { 34 | expect_silent(pdfpage(pdfr_paths[[2]], c(1:2))) 35 | }) 36 | 37 | test_that("Errors as expected", 38 | { 39 | expect_error(pdfpage(2, c(1:2))) 40 | }) 41 | -------------------------------------------------------------------------------- /R/data.R: -------------------------------------------------------------------------------- 1 | ##---------------------------------------------------------------------------## 2 | #' Paths to test pdfs 3 | #' 4 | #' A list of paths to locally stored test pdfs 5 | #' 6 | #' @format A list of 9 pdf files 7 | #' \describe{ 8 | #' \item{barcodes}{a pdf constructed in Rstudio} 9 | #' \item{chestpain}{a flow-chart for chest pain management} 10 | #' \item{pdfinfo}{information about the pdf format} 11 | #' \item{adobe}{an official adobe document} 12 | #' \item{leeds}{a table-rich local government document} 13 | #' \item{sams}{a document based on svg} 14 | #' \item{testreader}{a simple pdf test} 15 | #' \item{tex}{a simple tex test} 16 | #' \item{rcpp}{a CRAN package vignette} 17 | #' } 18 | #' @export 19 | ##---------------------------------------------------------------------------## 20 | 21 | pdfr_paths <- list( 22 | barcodes = system.file("extdata", "barcodes.pdf", package = "PDFR"), 23 | chestpain = 
system.file("extdata", "chestpain.pdf", package = "PDFR"), 24 | pdfinfo = system.file("extdata", "pdfinfo.pdf", package = "PDFR"), 25 | adobe = system.file("extdata", "adobe.pdf", package = "PDFR"), 26 | leeds = system.file("extdata", "leeds.pdf", package = "PDFR"), 27 | sams = system.file("extdata", "sams.pdf", package = "PDFR"), 28 | testreader = system.file("extdata", "testreader.pdf", package = "PDFR"), 29 | tex = system.file("extdata", "tex.pdf", package = "PDFR"), 30 | rcpp = system.file("extdata", "rcpp.pdf", package = "PDFR") 31 | ) 32 | -------------------------------------------------------------------------------- /R/utils.R: -------------------------------------------------------------------------------- 1 | utils::globalVariables( 2 | c( 3 | "X", "Y", "bottom", "box", "fill", "left", "midx", "midy", "poly", "right", 4 | "size", "stroke", "text", "top", "xmax", "xmin", "ymax", "ymin" 5 | ) 6 | ) 7 | 8 | #' Error if pdf is not a valid input 9 | #' 10 | #' @param pdf Object to check 11 | #' @keywords internal 12 | #' @noRd 13 | check_pdf <- function(pdf, call = caller_env()) { 14 | if (any( 15 | c( 16 | !is_raw(pdf) && is_false(is_character(pdf)), 17 | is_character(pdf) && is_min_length(pdf), 18 | is_character(pdf) && !is_pdf_fileext(pdf[1]) 19 | ) 20 | )) { 21 | cli_abort( 22 | "{.arg pdf} must be a single path to a valid pdf file or a raw vector 23 | string, not {.obj_type_friendly {pdf}}.", 24 | call = call 25 | ) 26 | } 27 | } 28 | 29 | #' Does x end with a PDF file extension? 30 | #' 31 | #' @param x Object to check for PDF file extension. 32 | #' @inheritParams base::grepl 33 | #' @keywords internal 34 | #' @noRd 35 | is_pdf_fileext <- function(x, ignore.case = TRUE) { 36 | grepl("[.]pdf$", x, ignore.case = ignore.case) 37 | } 38 | 39 | #' Does x contain a file separator character? 40 | #' 41 | #' @param x Object to check for a file separator character 42 | #' @param fsep File separator character. 
Defaults to `.Platform$file.sep` 43 | #' @keywords internal 44 | #' @noRd 45 | is_fsep_path <- function(x, fsep = .Platform$file.sep) { 46 | grepl(fsep, x) 47 | } 48 | 49 | #' Does x have at least the minimum length? 50 | #' 51 | #' @param x Object to check for minimum length. 52 | #' @param n Minimum length to return `TRUE`. 53 | #' @keywords internal 54 | #' @noRd 55 | is_min_length <- function(x, n = 2) { 56 | length(x) >= n 57 | } 58 | -------------------------------------------------------------------------------- /src/external/Profiler.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | #ifdef PROFILER_PDFR 10 | #define PROFC_NODE(name) \ 11 | static ProfileNode __node##__LINE__(name); \ 12 | TheNodeList::Instance().AddNode(&__node##__LINE__); \ 13 | ScopedTimer __timer##__LINE__(std::bind( \ 14 | &ProfileNode::Accumulate, &__node##__LINE__, std::placeholders::_1)); 15 | 16 | class ProfileNode { 17 | public: 18 | explicit ProfileNode(const std::string& name) : name_(name), count_(0) { 19 | } 20 | void Accumulate(std::chrono::microseconds us) { 21 | the_lock_.lock(); 22 | count_++; 23 | elapsed_us_ += us; 24 | the_lock_.unlock(); 25 | } 26 | void Print() { 27 | printf( 28 | "%-25s %10d %10dms %10dus\n", name_.c_str(), count_, 29 | static_cast(std::chrono::duration_cast( 30 | elapsed_us_).count()), 31 | static_cast(elapsed_us_.count() / count_)); 32 | } 33 | 34 | private: 35 | std::string name_; 36 | int count_; 37 | std::chrono::microseconds elapsed_us_; 38 | std::mutex the_lock_; 39 | }; 40 | 41 | class ScopedTimer { 42 | public: 43 | explicit ScopedTimer(std::function callback) 44 | : callback_(callback) { 45 | start_ = std::chrono::system_clock::now(); 46 | } 47 | ~ScopedTimer() { 48 | auto end = std::chrono::system_clock::now(); 49 | auto elapsed = end - start_; 50 | callback_(std::chrono::duration_cast(elapsed)); 51 | } 52 | 
ScopedTimer(const ScopedTimer&) = delete; 53 | ScopedTimer& operator=(const ScopedTimer&) = delete; 54 | 55 | private: 56 | std::function callback_; 57 | std::chrono::time_point start_; 58 | }; 59 | 60 | class TheNodeList { 61 | public: 62 | void AddNode(ProfileNode* node) { 63 | nodes_.insert(node); 64 | } 65 | ~TheNodeList() { 66 | Print(); 67 | } 68 | void printNodeList() {Print();} 69 | static TheNodeList& Instance() { 70 | static TheNodeList nodes; 71 | return nodes; 72 | } 73 | void endprofiler(){this->Print();} 74 | void Print() { 75 | printf("--------------------------------------------------------------\n"); 76 | printf("name count elapsed us/call\n"); 77 | for (auto node : nodes_) node->Print(); 78 | } 79 | 80 | private: 81 | std::set nodes_; 82 | }; 83 | #endif 84 | 85 | #ifndef PROFILER_PDFR 86 | #define PROFC_NODE(name) ((void)0); 87 | #endif 88 | -------------------------------------------------------------------------------- /R/RcppExports.R: -------------------------------------------------------------------------------- 1 | # Generated by using Rcpp::compileAttributes() -> do not edit by hand 2 | # Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 3 | 4 | .get_xref <- function(file_name) { 5 | .Call(`_PDFR_GetXrefFromString`, file_name) 6 | } 7 | 8 | .get_xrefraw <- function(raw_file) { 9 | .Call(`_PDFR_GetXrefFromRaw`, raw_file) 10 | } 11 | 12 | .get_obj <- function(file_name, object_number) { 13 | .Call(`_PDFR_GetObjectFromString`, file_name, object_number) 14 | } 15 | 16 | .get_objraw <- function(raw_file, object_number) { 17 | .Call(`_PDFR_GetObjectFromRaw`, raw_file, object_number) 18 | } 19 | 20 | .pdfpage <- function(file_name, page_number, each_glyph) { 21 | .Call(`_PDFR_GetPdfPageFromString`, file_name, page_number, each_glyph) 22 | } 23 | 24 | .pdfpageraw <- function(raw_file, page_number, atoms) { 25 | .Call(`_PDFR_GetPdfPageFromRaw`, raw_file, page_number, atoms) 26 | } 27 | 28 | .getglyphmap <- function(file_name, page_number) { 29 
| .Call(`_PDFR_GetGlyphMap`, file_name, page_number) 30 | } 31 | 32 | .pagestring <- function(file_name, page_number) { 33 | .Call(`_PDFR_GetPageStringFromString`, file_name, page_number) 34 | } 35 | 36 | .pagestringraw <- function(raw_file, page_number) { 37 | .Call(`_PDFR_GetPageStringFromRaw`, raw_file, page_number) 38 | } 39 | 40 | .pdfdoc <- function(file_name) { 41 | .Call(`_PDFR_GetPdfDocumentFromString`, file_name) 42 | } 43 | 44 | .pdfdocraw <- function(file_name) { 45 | .Call(`_PDFR_GetPdfDocumentFromRaw`, file_name) 46 | } 47 | 48 | .pdfboxesString <- function(file_name, page_number) { 49 | .Call(`_PDFR_GetPdfBoxesFromString`, file_name, page_number) 50 | } 51 | 52 | .pdfboxesRaw <- function(file_name, page_number) { 53 | .Call(`_PDFR_GetPdfBoxesFromRaw`, file_name, page_number) 54 | } 55 | 56 | .GetPaths <- function(file_name, page_number) { 57 | .Call(`_PDFR_GetPaths`, file_name, page_number) 58 | } 59 | 60 | .GetGrobs <- function(file_name, page_number) { 61 | .Call(`_PDFR_GetGrobs`, file_name, page_number) 62 | } 63 | 64 | ReadFontTable <- function(raw) { 65 | .Call(`_PDFR_ReadFontTable`, raw) 66 | } 67 | 68 | GetFontFileHeader <- function(raw) { 69 | .Call(`_PDFR_GetFontFileHeader`, raw) 70 | } 71 | 72 | GetFontFileCMap <- function(raw) { 73 | .Call(`_PDFR_GetFontFileCMap`, raw) 74 | } 75 | 76 | GetFontFileMaxp <- function(raw) { 77 | .Call(`_PDFR_GetFontFileMaxp`, raw) 78 | } 79 | 80 | GetFontFileLoca <- function(raw) { 81 | .Call(`_PDFR_GetFontFileLoca`, raw) 82 | } 83 | 84 | GetFontFileGlyph <- function(raw, glyph) { 85 | .Call(`_PDFR_GetFontFileGlyph`, raw, glyph) 86 | } 87 | 88 | GetFontFilePostTable <- function(raw) { 89 | .Call(`_PDFR_GetFontFilePostTable`, raw) 90 | } 91 | 92 | GetFontFileNameTable <- function(raw) { 93 | .Call(`_PDFR_GetFontFileNameTable`, raw) 94 | } 95 | 96 | GetFontFileOS2Table <- function(raw) { 97 | .Call(`_PDFR_GetFontFileOS2Table`, raw) 98 | } 99 | 100 | .stopCpp <- function() { 101 | 
invisible(.Call(`_PDFR_stopCpp`)) 102 | } 103 | 104 | -------------------------------------------------------------------------------- /src/letter_grouper.h: -------------------------------------------------------------------------------- 1 | //----------------------------------------------------------------------------// 2 | // // 3 | // PDFR LetterGrouper header file // 4 | // // 5 | // Copyright (C) 2018 - 2019 by Allan Cameron // 6 | // // 7 | // Licensed under the MIT license - see https://mit-license.org // 8 | // or the LICENSE file in the project root directory // 9 | // // 10 | //----------------------------------------------------------------------------// 11 | 12 | #ifndef PDFR_LGROUPER 13 | 14 | //----------------------------------------------------------------------------// 15 | 16 | #define PDFR_LGROUPER 17 | 18 | /* The LetterGrouper class co-ordinates the grouping together of words. In 19 | * terms of program structure, this comes directly after the parser step that 20 | * reads the page description program. The goal of this class is to clump 21 | * adjoining glyphs to form strings. Mostly, these will form words, but if 22 | * actual spaces are included as glyphs then grouped strings of words will be 23 | * included in the output. 24 | * 25 | * This is the first step of a "meet-in-the-middle" document reconstruction, 26 | * which will use these strings as the atoms from which to form structures such 27 | * as paragraphs, headers and tables. 28 | */ 29 | 30 | #include "textbox.h" 31 | 32 | //----------------------------------------------------------------------------// 33 | // The LetterGrouper class contains a constructor, an output map of results, 34 | // and a method for passing out the minimum text bounding box found in page 35 | // construction. Its private methods are used only in construction of the 36 | // output. 
The main private member is a map of vectors of TextElements, each 37 | // vector representing all glyphs in one of 256 equally sized cells on the page. 38 | // Each glyph is addressable by two numbers - the grid number and the position 39 | // of the glyph in the cell's vector. 40 | 41 | class LetterGrouper 42 | { 43 | public: 44 | using TextPointer = std::shared_ptr; 45 | // constructor. 46 | LetterGrouper(std::unique_ptr); 47 | 48 | // Passes text elements to WordGrouper for further construction if needed 49 | std::unique_ptr Output(); 50 | TextTable Out(); // output table to interface if ungrouped words needed 51 | 52 | private: 53 | // A copy of the parser output used to create grid 54 | std::unique_ptr text_box_; 55 | 56 | // Main data member - a 16 x 16 grid of cells, each with a TextPointer vector 57 | std::unordered_map> grid_; 58 | 59 | // private methods 60 | void MakeGrid_(); // Assigns glyphs to a 16 x 16 grid 61 | void CompareCells_(); // Co-ordinates matching between cells 62 | void MatchRight_(TextPointer, uint8_t); // Compares all glyphs in cell 63 | void Merge_(); // Joins matching glyphs together 64 | }; 65 | 66 | //----------------------------------------------------------------------------// 67 | 68 | #endif 69 | -------------------------------------------------------------------------------- /src/deflate.h: -------------------------------------------------------------------------------- 1 | //---------------------------------------------------------------------------// 2 | // // 3 | // PDFR Deflate header file // 4 | // // 5 | // Copyright (C) 2018 - 2019 by Allan Cameron // 6 | // // 7 | // Licensed under the MIT license - see https://mit-license.org // 8 | // or the LICENSE file in the project root directory // 9 | // // 10 | //---------------------------------------------------------------------------// 11 | 12 | #ifndef PDFR_DEFLATE 13 | 14 | //---------------------------------------------------------------------------// 15 | 16 | #define 
PDFR_DEFLATE 17 | 18 | #include 19 | #include "streams.h" 20 | 21 | std::string FlateDecode(std::string* message); 22 | std::string FlateDecode(const CharString& message); 23 | 24 | //---------------------------------------------------------------------------// 25 | // This class reinvents the wheel in an attempt to free the library from 26 | // dependencies. It is a full implementation of Deflate decompression. It uses 27 | // hash maps for storing and looking up Huffman trees and inherits from Stream 28 | // to give it an easy interface to the underlying stream. Only the constructor 29 | // is public. 30 | 31 | class Deflate : public Stream 32 | { 33 | public: 34 | // String and byte-vector constructors. The latter converts to a string. 35 | Deflate(const std::string*); 36 | Deflate(const CharString&); 37 | 38 | private: 39 | bool is_last_block_; // Flag so decompressor knows when to stop 40 | 41 | // The fixed literal and distance maps are used if compression used a 42 | // fixed dictionary. Usually this only happens with short messages. 43 | static const std::unordered_map fixed_literal_map_; 44 | static const std::unordered_map fixed_distance_map_; 45 | 46 | // If we come across a length code or a distance code, we need to know 47 | // how many extra bytes to read. This is looked up in these tables. 48 | static const std::vector length_table_; 49 | static const std::vector distance_table_; 50 | 51 | // Whether it's fixed or dynamic compression, we want to end up with a literal 52 | // and distance map that we can look up.
53 | std::unordered_map literal_map_; 54 | std::unordered_map distance_map_; 55 | 56 | void CheckHeader_(); // Read first two bytes to ensure valid 57 | void ReadBlock_(); // Co-ordinates reading of a single block 58 | void BuildDynamicCodeTable_(); // Builds lookup tables for each block 59 | void ReadCodes_(); // Actual reading of compressed data 60 | void HandlePointer_(uint32_t); // Deals with length & distance pointers 61 | 62 | // Finds the next code in the input stream using given lookup table 63 | uint32_t ReadCode_(std::unordered_map&); 64 | 65 | // Creates a Huffman tree from a vector of bit lengths. 66 | std::unordered_map 67 | Huffmanize_(const std::vector&); 68 | }; 69 | 70 | 71 | #endif 72 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing 2 | 3 | I am an amateur, with little time to spend on this project and no experience whatsoever in C++. If you are interested in contributing, I would be delighted to hear from you: please [get in touch](mailto:Allan.Cameron@nhs.net). 4 | 5 | To keep the C++ codebase consistent, I will declare some coding conventions here. This is mostly for my own benefit, but I would ask any contributors to keep to these conventions too where possible. 6 | 7 | # Conventions 8 | 9 | ## Naming conventions 10 | - All variable names are written in `snake_case` with no capitals. 11 | - Aim for descriptive names over saving horizontal space, e.g. `temporary_byte_vector` is better than `tmpvec` 12 | - Prefer named iterators in a loop rather than `i`; e.g. `for (size_t entry = 0; entry < table.size(); ++entry)` unless there is no meaningful name to apply. 13 | - All private data members are suffixed with a single underscore: `private_member_` 14 | - All function / method names are written in `CamelCase`. 15 | - Class, struct, enum and type names are written in `CamelCase`.
16 | - Suffix private methods with an underscore - `MyPrivateMethod_();` 17 | - Use descriptive variable names in class method declarations, as these will help document the class user interface. Private methods don't always need a variable name, or can be short descriptive names if preferred. 18 | 19 | The following code block demonstrates most of these naming conventions: 20 | 21 | ```cpp 22 | //---------------------------------------------------------------------------// 23 | // Method to make things OK 24 | 25 | std::string MakeEverythingOK(std::string input_string) 26 | { 27 | std::string ok_suffix = " is OK"; 28 | input_string.append(ok_suffix); 29 | return input_string; 30 | } 31 | 32 | //---------------------------------------------------------------------------// 33 | // Make the data member OK 34 | 35 | void MyClass::MakeAnOKMember_(const std::string& input_string) 36 | { 37 | ok_data_member_ = MakeEverythingOK(input_string); 38 | } 39 | 40 | //---------------------------------------------------------------------------// 41 | ``` 42 | 43 | 44 | ## Comments 45 | - Every file begins with the MIT license header 46 | - Most comments should have the single-line `//` format rather than `/* Multi-line */` type. An exception can be made for large introductory comments explaining the rationale for a class at the top of a header file, just below the license. 47 | - Prefer verbose comments, even though the naming rules should make the code largely self-commenting. It takes less time to understand what's going on if things are well commented. Of course, we want to avoid being silly with the likes of 48 | ```cpp 49 | return result; // Returns the result 50 | ``` 51 | but the general rule is, if it is quicker to understand it with a comment, the comment goes in. 52 | 53 | ## Layout 54 | - Indentation is in [Allman style](https://en.wikipedia.org/wiki/Indentation_style#Allman_style). Yes it wastes vertical space, but I just find it more readable. 
55 | - Indentation is with two spaces. No tabs allowed. 56 | - The maximum line width is 80 characters. No exceptions. 57 | - All function definitions are separated by an 80-character comment line as shown in the snippet above, with a brief description commented below, a line break, then the function, followed by a line break then the next comment line. 58 | - Class definitions are declared public members first, then private members. 59 | - The keywords `public:` and `private:` in a class definition get a single space indentation. 60 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Contributor Covenant Code of Conduct 2 | 3 | ## Our Pledge 4 | 5 | In the interest of fostering an open and welcoming environment, we as 6 | contributors and maintainers pledge to making participation in our project and 7 | our community a harassment-free experience for everyone, regardless of age, body 8 | size, disability, ethnicity, sex characteristics, gender identity and expression, 9 | level of experience, education, socio-economic status, nationality, personal 10 | appearance, race, religion, or sexual identity and orientation.
11 | 12 | ## Our Standards 13 | 14 | Examples of behavior that contributes to creating a positive environment 15 | include: 16 | 17 | * Using welcoming and inclusive language 18 | * Being respectful of differing viewpoints and experiences 19 | * Gracefully accepting constructive criticism 20 | * Focusing on what is best for the community 21 | * Showing empathy towards other community members 22 | 23 | Examples of unacceptable behavior by participants include: 24 | 25 | * The use of sexualized language or imagery and unwelcome sexual attention or 26 | advances 27 | * Trolling, insulting/derogatory comments, and personal or political attacks 28 | * Public or private harassment 29 | * Publishing others' private information, such as a physical or electronic 30 | address, without explicit permission 31 | * Other conduct which could reasonably be considered inappropriate in a 32 | professional setting 33 | 34 | ## Our Responsibilities 35 | 36 | Project maintainers are responsible for clarifying the standards of acceptable 37 | behavior and are expected to take appropriate and fair corrective action in 38 | response to any instances of unacceptable behavior. 39 | 40 | Project maintainers have the right and responsibility to remove, edit, or 41 | reject comments, commits, code, wiki edits, issues, and other contributions 42 | that are not aligned to this Code of Conduct, or to ban temporarily or 43 | permanently any contributor for other behaviors that they deem inappropriate, 44 | threatening, offensive, or harmful. 45 | 46 | ## Scope 47 | 48 | This Code of Conduct applies both within project spaces and in public spaces 49 | when an individual is representing the project or its community. Examples of 50 | representing a project or community include using an official project e-mail 51 | address, posting via an official social media account, or acting as an appointed 52 | representative at an online or offline event. 
Representation of a project may be 53 | further defined and clarified by project maintainers. 54 | 55 | ## Enforcement 56 | 57 | Instances of abusive, harassing, or otherwise unacceptable behavior may be 58 | reported by contacting the project team at Allan.Cameron@nhs.net. All 59 | complaints will be reviewed and investigated and will result in a response that 60 | is deemed necessary and appropriate to the circumstances. The project team is 61 | obligated to maintain confidentiality with regard to the reporter of an incident. 62 | Further details of specific enforcement policies may be posted separately. 63 | 64 | Project maintainers who do not follow or enforce the Code of Conduct in good 65 | faith may face temporary or permanent repercussions as determined by other 66 | members of the project's leadership. 67 | 68 | ## Attribution 69 | 70 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, 71 | available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html 72 | 73 | [homepage]: https://www.contributor-covenant.org 74 | 75 | For answers to common questions about this code of conduct, see 76 | https://www.contributor-covenant.org/faq 77 | -------------------------------------------------------------------------------- /src/line_grouper.h: -------------------------------------------------------------------------------- 1 | //---------------------------------------------------------------------------// 2 | // // 3 | // PDFR LineGrouper header file // 4 | // // 5 | // Copyright (C) 2018 - 2019 by Allan Cameron // 6 | // // 7 | // Licensed under the MIT license - see https://mit-license.org // 8 | // or the LICENSE file in the project root directory // 9 | // // 10 | //---------------------------------------------------------------------------// 11 | 12 | #ifndef PDFR_LINEGOUPER 13 | 14 | //---------------------------------------------------------------------------// 15 | 16 | #define PDFR_LINEGOUPER 17 | 18 
| #include "textbox.h" 19 | 20 | 21 | //---------------------------------------------------------------------------// 22 | /* The LineGrouper class takes the output of the whitespace class, which is 23 | * a vector of TextBoxes - that is, a box containing a vector of text elements. 24 | * What we want is to change this so that we have a 1:1 correspondence between 25 | * boxes and text elements, but for the text elements to be joined-up, logical 26 | * components of the document such as paragraphs, headers, table entries and so 27 | * on. 28 | * 29 | * This requires a few different processes. First, we need to arrange all the 30 | * text elements in the boxes into the correct "reading order". Since we have 31 | * already split elements by whitespace, this should be a simple matter of 32 | * sorting top to bottom and left to right. 33 | * 34 | * Secondly, we need to determine whether there are logical breaks between 35 | * the lines of text, or whether they are supposed to join together. We do this 36 | * by taking clues such as the size of line spacing and the alignment of text to 37 | * spot paragraph breaks. 38 | * 39 | * Thirdly, we need to work out how lines are meant to be joined together. 40 | * Usually, they should be joined with a space. However, if a line is to be 41 | * joined to the one below but already ends in a space or ends in a hyphen, 42 | * it should be joined without a space. 
43 | * 44 | * The LineGrouper class modifies the std::vector class, so we only 45 | * need to pass a pointer to this 46 | * 47 | */ 48 | 49 | class LineGrouper 50 | { 51 | public: 52 | using TextPointer = std::shared_ptr; 53 | 54 | // Constructor takes the PageBox produced by the whitespace step 55 | LineGrouper(PageBox page_box_from_whitespace); 56 | 57 | // Returns the stored PageBox cast to a single TextBox 58 | inline TextBox Output() { return text_boxes_.CastToTextBox();} 59 | 60 | private: 61 | void FindBreaks_(TextBox&); // Identifies paragraph breaks 62 | void LineEndings_(TextBox&); // Adjusts line endings to facilitate pasting 63 | void PasteLines_(TextBox&); // Pastes TextElements in the TextBoxes together 64 | 65 | // Defines the reading order for elements in a text box. If an element is 66 | // higher than another, it comes before it. If it is at the same height but 67 | // to the left of the other element, it comes before it. In all other cases, 68 | // it comes afterwards. 69 | struct ReadingOrder_ 70 | { 71 | bool operator() (const TextPointer& row1, const TextPointer& row2) const 72 | { 73 | if (row1->GetBottom() > row2->GetBottom() ) return true; 74 | if (row1->GetBottom() == row2->GetBottom() && 75 | row1->GetLeft() < row2->GetLeft() ) return true; 76 | return false; 77 | } 78 | }; 79 | 80 | // private data member 81 | PageBox text_boxes_; 82 | }; 83 | 84 | 85 | //---------------------------------------------------------------------------// 86 | 87 | #endif 88 | -------------------------------------------------------------------------------- /src/graphicsstate.h: -------------------------------------------------------------------------------- 1 | //---------------------------------------------------------------------------// 2 | // // 3 | // PDFR GraphicsState header file // 4 | // // 5 | // Copyright (C) 2018 - 2021 by Allan Cameron // 6 | // // 7 | // Licensed under the MIT license - see https://mit-license.org // 8 | // or the LICENSE file in the
project root directory // 9 | // // 10 | //---------------------------------------------------------------------------// 11 | 12 | #ifndef PDFR_GS 13 | 14 | //---------------------------------------------------------------------------// 15 | 16 | #define PDFR_GS 17 | 18 | #include "matrix.h" 19 | #include "page.h" 20 | #include "graphicobject.h" 21 | 22 | /*---------------------------------------------------------------------------*/ 23 | 24 | class TextState 25 | { 26 | public: 27 | float tc, // Character spacing 28 | tw, // Word spacing 29 | th, // Horizontal scaling 30 | tl, // Text leading 31 | tfs, // Font size 32 | trise; // Text rise 33 | std::string tf; // Font name 34 | int tmode; // Text printing mode 35 | std::shared_ptr current_font; 36 | 37 | TextState() : tc(0), tw(0), th(100), tl(0), 38 | tfs(0), trise(0), tf(""), tmode(0) {} 39 | }; 40 | 41 | //---------------------------------------------------------------------------// 42 | 43 | class GraphicsState 44 | { 45 | public: 46 | Matrix CTM; 47 | Path clipping_path; 48 | std::vector colour_space_stroke, 49 | colour_space_fill; 50 | std::vector colour, 51 | fill; 52 | TextState text_state; 53 | Matrix tm_state, 54 | td_state; 55 | float line_width; 56 | int line_cap, 57 | line_join; 58 | float miter_limit; 59 | std::string rendering_intent; 60 | bool stroke_adjustment; 61 | std::vector dash_array; 62 | std::vector blending_mode; 63 | std::string soft_mask; 64 | float alpha_constant; 65 | bool alpha_source; 66 | 67 | GraphicsState(std::shared_ptr p) : 68 | CTM(Matrix()), clipping_path(Path()), 69 | colour_space_stroke({"/DeviceGray"}), 70 | colour_space_fill({"/DeviceGray"}), 71 | colour({0, 0, 0}), fill({0, 0, 0}), 72 | text_state(TextState()), tm_state(Matrix()), 73 | td_state(Matrix()), line_width(1), 74 | line_cap(0), line_join(0), miter_limit(10.0), 75 | rendering_intent("/RelativeColorimetric"), 76 | stroke_adjustment(false), 77 | dash_array({0}), 78 | blending_mode({"Normal"}), soft_mask("None"), 79 
| alpha_constant(1.0), alpha_source(false) 80 | { 81 | std::shared_ptr b = p->GetMinbox(); 82 | clipping_path.SetX({b->GetLeft(), b->GetLeft(), b->GetRight(), 83 | b->GetRight(), b->GetLeft()}); 84 | clipping_path.SetY({b->GetBottom(), b->GetTop(), b->GetTop(), 85 | b->GetBottom(), b->GetBottom()}); 86 | } 87 | 88 | }; 89 | 90 | //---------------------------------------------------------------------------// 91 | 92 | #endif 93 | 94 | 95 | -------------------------------------------------------------------------------- /src/box.cpp: -------------------------------------------------------------------------------- 1 | //---------------------------------------------------------------------------// 2 | // // 3 | // PDFR box implementation file // 4 | // // 5 | // Copyright (C) 2018 - 2019 by Allan Cameron // 6 | // // 7 | // Licensed under the MIT license - see https://mit-license.org // 8 | // or the LICENSE file in the project root directory // 9 | // // 10 | //---------------------------------------------------------------------------// 11 | 12 | 13 | #include "box.h" 14 | using namespace std; 15 | 16 | //---------------------------------------------------------------------------// 17 | // Every vertex of the final polygon surrounding each text element contains 18 | // information about its position. However, to "connect" the vertices so as to 19 | // arrange them in clockwise order, it also needs to know which direction 20 | // the incoming and outgoing edges are "pointing". We do this by working out 21 | // for each vertex whether there is whitespace immediately to the NorthWest, 22 | // NorthEast, SouthEast and SouthWest of the vertex. 
These are recorded as the 23 | // four lowest order bits in a single "flags" byte; thus a vertex that had 24 | // whitespace to the NorthWest and SouthWest (as it would if it lay along the 25 | // middle of the left edge of a text polygon), would have its flags set to 26 | // 1001 in binary (or 0x9 in hexadecimal, or a byte value of 0x09 provided we 27 | // mask the flag byte with & 0x0f). Since we know that a masked flag value of 28 | // 0x09 must represent a point lying on a left edge, its "incoming" edge must 29 | // be travelling North (since we are concerned with clockwise ordering), and 30 | // its outgoing edge must also be pointing North. To make all this clearer we 31 | // want each Vertex to specify its incoming and outgoing directions. We can 32 | // therefore just look up the four lowest order bits in each Vertex's flags 33 | // using this unordered map to get the implied direction based on the 34 | // surrounding whitespace. 35 | 36 | unordered_map> Vertex::arrows_ = 37 | { 38 | {0x00, {None, None}}, {0x01, {North, West}}, {0x02, {West, South}}, 39 | {0x03, {West, West}}, {0x04, {South, East}}, {0x05, {None, None}}, 40 | {0x06, {South, South}}, {0x07, {South, West}}, {0x08, {East, North}}, 41 | {0x09, {North, North}}, {0x0A, {None, None}}, {0x0B, {West, North}}, 42 | {0x0C, {East, East}}, {0x0D, {North, East}}, {0x0E, {East, South}}, 43 | {0x0F, {None, None}} 44 | }; 45 | 46 | //---------------------------------------------------------------------------// 47 | // Create a vertex from a given corner of the box 48 | // (0 = top-left, 1 = top-right, 2 = bottom-left, 3 = bottom-right) 49 | // Note, the given vertex is automatically flagged as being impinged at the 50 | // correct compass direction 51 | 52 | shared_ptr Box::GetVertex(int corner) 53 | { 54 | switch (corner) 55 | { 56 | case 0 : return std::make_shared(left_, top_, 0x02); 57 | case 1 : return std::make_shared(right_, top_, 0x01); 58 | case 2 : return std::make_shared(left_, bottom_, 0x04);
59 | case 3 : return std::make_shared(right_, bottom_, 0x08); 60 | default: return std::make_shared(0, 0, 0); 61 | } 62 | return std::make_shared (0, 0, 0); 63 | } 64 | 65 | //---------------------------------------------------------------------------// 66 | // Marks a box's impingement on a given vertex. This records whether moving 67 | // an arbitrarily small distance in a given direction from the vertex will 68 | // place one inside the current box. 69 | 70 | void Box::RecordImpingementOn(Vertex& corner) 71 | { 72 | if (IsNorthWestOf(corner)) corner.SetFlags(0x08); 73 | if (IsNorthEastOf(corner)) corner.SetFlags(0x04); 74 | if (IsSouthEastOf(corner)) corner.SetFlags(0x02); 75 | if (IsSouthWestOf(corner)) corner.SetFlags(0x01); 76 | } 77 | -------------------------------------------------------------------------------- /src/object_class.h: -------------------------------------------------------------------------------- 1 | //---------------------------------------------------------------------------// 2 | // // 3 | // PDFR Object header file // 4 | // // 5 | // Copyright (C) 2018 - 2019 by Allan Cameron // 6 | // // 7 | // Licensed under the MIT license - see https://mit-license.org // 8 | // or the LICENSE file in the project root directory // 9 | // // 10 | //---------------------------------------------------------------------------// 11 | 12 | #ifndef PDFR_OBJECT 13 | 14 | //---------------------------------------------------------------------------// 15 | 16 | #define PDFR_OBJECT 17 | 18 | /* This is the fourth header file in a daisy-chain of main headers which builds 19 | * up an interface for parsing pdf files. It comes directly after xref.h and is 20 | * the last step before the main document class is declared. 21 | * 22 | * The object class comprises the data and functions needed to represent a pdf 23 | * object. 
Each Object object is made of two main items of data: a 24 | * dictionary (which can be empty), and a pair of size_t indicating the offset 25 | * of the stream's start and stop. The reason we don't just build the stream 26 | * is that decryption and deflation of large streams is computationally 27 | * expensive, and we should only do it on request. As an object may be requested 28 | * more than once however, if we have gone to the trouble of calculating the 29 | * stream, it is stored as a private data member. 30 | * 31 | * Of course, for objects to have this memory of their state, they need to 32 | * stay in scope from creation until the program exits. This is done by keeping 33 | * a vector of retrieved objects in the document class, which persists through 34 | * the lifetime of the program. 35 | * 36 | * The job of finding the object, parsing its dictionary and decoding its stream 37 | * is abstracted away using this class, so that pdf objects can be directly 38 | * interrogated for key:value pairs and their streams can be parsed as plain 39 | * text where appropriate. 
This means that logical structures such as pages, 40 | * fonts and form objects can be built by interfacing directly with pdf objects 41 | * rather than indirectly through byte offsets and binary streams 42 | */ 43 | 44 | #include "xref.h" 45 | 46 | //---------------------------------------------------------------------------// 47 | 48 | class Object 49 | { 50 | public: 51 | // Get pdf object from a given object number 52 | Object(std::shared_ptr xref_ptr, int object_number); 53 | 54 | // Get stream object from inside the holding object, given object number 55 | Object(std::shared_ptr holding_object_ptr, int object_number); 56 | 57 | // Default constructor 58 | Object() = delete; 59 | 60 | // Returns an Object's stream as a string 61 | std::string& GetStream(); 62 | 63 | // Returns an Object's Dictionary 64 | Dictionary& GetDictionary(); 65 | 66 | friend std::ostream& operator<<(std::ostream& os, const Object& obj); 67 | 68 | private: 69 | std::shared_ptr xref_; // Pointer to creating xref 70 | int object_number_; // The object knows its own number 71 | Dictionary header_; // The object's dictionary 72 | std::string stream_; // The object's stream or contents 73 | CharString raw_stream_; // Start position and length of stream 74 | 75 | // A lookup of start / stop positions of the objects within an object stream 76 | std::shared_ptr>> stream_index_; 77 | 78 | // private methods 79 | void IndexObjectStream_(); 80 | void ReadStream_(); 81 | }; 82 | 83 | //---------------------------------------------------------------------------// 84 | 85 | inline std::ostream& operator<<(std::ostream& os, Object& obj) 86 | { 87 | os << obj.GetDictionary() << "\n\nStream:\n" << obj.GetStream(); 88 | return os; 89 | } 90 | 91 | #endif 92 | -------------------------------------------------------------------------------- /src/word_grouper.h: -------------------------------------------------------------------------------- 1 | 
//---------------------------------------------------------------------------// 2 | // // 3 | // PDFR WordGrouper header file // 4 | // // 5 | // Copyright (C) 2018 - 2019 by Allan Cameron // 6 | // // 7 | // Licensed under the MIT license - see https://mit-license.org // 8 | // or the LICENSE file in the project root directory // 9 | // // 10 | //---------------------------------------------------------------------------// 11 | 12 | #ifndef PDFR_WGROUPER 13 | 14 | //---------------------------------------------------------------------------// 15 | 16 | #define PDFR_WGROUPER 17 | 18 | /* The word grouper takes all of the words stuck together by the letter grouper 19 | * and attempts to join them into lines of text. It does this primarily by 20 | * identifying whether two adjacent words are close enough to be joined by a 21 | * single space character. 22 | * 23 | * There are a few caveats to this. Often text will be in columns, and we don't 24 | * want words at the right edge of one column to join to words in the adjacent 25 | * column if they are close together. The word grouper attempts to prevent this 26 | * by identifying words on the page whose left edges are aligned. If several 27 | * words have matching left edges, then they probably form a left-aligned 28 | * column. Any word with its left edge on a left-aligned column should not be 29 | * allowed to be joined to a word to its right. 30 | * 31 | * This isn't perfect, since we may get false positives, when words 32 | * coincidentally line up within a body of text. The higher we stipulate the 33 | * number of words that must be aligned to count as a column, the less likely 34 | * this is to happen, but we will then run the risk of false negatives, where 35 | * adjacent columns get stuck together. Therefore, the more left edges we find 36 | * and the higher the likelihood of a column being present, the smaller the gap 37 | * that is allowed to be bridged.
38 | * 39 | * We carry out a similar process for right-aligned and centre-aligned text. 40 | * Right-aligned text is intolerant of anything to the left joining and centre- 41 | * aligned text is intolerant of left or right joins. 42 | */ 43 | 44 | #include "textbox.h" 45 | 46 | //---------------------------------------------------------------------------// 47 | // The word grouper class takes a pointer to a letter grouper object in its 48 | // constructor. It makes a table of the x values of the left, right and centre 49 | // points of each word, and uses these to infer which word pairs are eligible 50 | // for sticking together. 51 | 52 | class WordGrouper 53 | { 54 | public: 55 | // Constructor - takes the main textbox as output from LetterGrouper 56 | WordGrouper(std::unique_ptr output_from_lettergrouper); 57 | 58 | // Output individual text elements for next phase of layout analysis 59 | std::unique_ptr Output() { return std::move(text_box_); } 60 | 61 | // Output text elements with sizes, fonts, positions to API 62 | TextTable Out() const; 63 | 64 | private: 65 | // Make a table of values in a vector of floats rounded to one decimal place 66 | void Tabulate_(const std::vector&, std::unordered_map&); 67 | 68 | // Use tabulate function to find likely left, right or mid-aligned columns 69 | void FindEdges_(); 70 | 71 | // Tell the text elements whether they form an edge or not 72 | void AssignEdges_(); 73 | 74 | // Join eligible adjacent glyphs together and merge their properties 75 | void FindRightMatch_(); 76 | 77 | // private data members 78 | std::unordered_map left_edges_, // The tables of edges 79 | right_edges_, 80 | mids_; 81 | std::unique_ptr text_box_; // The main data member 82 | }; 83 | 84 | //---------------------------------------------------------------------------// 85 | 86 | #endif 87 | -------------------------------------------------------------------------------- /src/streams.cpp:
-------------------------------------------------------------------------------- 1 | //---------------------------------------------------------------------------// 2 | // // 3 | // PDFR Streams implementation file // 4 | // // 5 | // Copyright (C) 2018 - 2019 by Allan Cameron // 6 | // // 7 | // Licensed under the MIT license - see https://mit-license.org // 8 | // or the LICENSE file in the project root directory // 9 | // // 10 | //---------------------------------------------------------------------------// 11 | 12 | /* Streams are normally compressed in PDFs, and the majority appear to be 13 | * compressed in DEFLATE format. I have used inheritance here with the Stream 14 | * class playing the role of base class and the various types of compression 15 | * having their own derived classes, so that the interface remains standardized 16 | * and new classes for each type of compression could be added as needed. 17 | * 18 | * The Stream Class itself is effectively an abstract class. Its constructor 19 | * is protected so it can only be called by the derived class constructors.
20 | */ 21 | 22 | #include 23 | #include 24 | #include 25 | #include 26 | #include 27 | #include 28 | #include "streams.h" 29 | 30 | using namespace std; 31 | 32 | 33 | Stream::Stream(const string* input) : input_(*input), 34 | output_(std::string()), 35 | input_position_(input_.begin()), 36 | output_position_(output_.begin()), 37 | unconsumed_bits_(0), 38 | unconsumed_bit_value_(0) {} 39 | 40 | Stream::Stream(const CharString& input) : input_(input), 41 | output_(std::string()), 42 | input_position_(input_.begin()), 43 | output_position_(output_.begin()), 44 | unconsumed_bits_(0), 45 | unconsumed_bit_value_(0) {} 46 | 47 | /*---------------------------------------------------------------------------*/ 48 | 49 | uint32_t Stream::GetByte() 50 | { 51 | if (input_position_ == input_.end()) return 256; 52 | return (uint8_t) *input_position_++; 53 | } 54 | 55 | 56 | /*---------------------------------------------------------------------------*/ 57 | 58 | uint32_t Stream::PeekByte() 59 | { 60 | uint32_t result = GetByte(); 61 | --input_position_; 62 | return result; 63 | } 64 | 65 | /*---------------------------------------------------------------------------*/ 66 | 67 | void Stream::Reset() 68 | { 69 | input_position_ = input_.begin(); 70 | output_.clear(); 71 | output_position_ = output_.begin(); 72 | unconsumed_bit_value_ = 0; 73 | unconsumed_bits_ = 0; 74 | } 75 | 76 | /*---------------------------------------------------------------------------*/ 77 | 78 | uint32_t Stream::GetBits(uint32_t n_bits) 79 | { 80 | uint32_t value_read = unconsumed_bit_value_; 81 | uint8_t bits_read = unconsumed_bits_; 82 | 83 | while (bits_read < n_bits) 84 | { 85 | uint32_t new_byte = GetByte(); 86 | if (new_byte == 256) throw runtime_error("Unexpected end of stream"); 87 | value_read |= new_byte << bits_read; 88 | bits_read += 8; 89 | } 90 | 91 | uint32_t result = value_read & ((1 << n_bits) - 1); 92 | unconsumed_bit_value_ = value_read >> n_bits; 93 | bits_read -= n_bits; 94 | 
unconsumed_bits_ = bits_read; 95 | return result; 96 | } 97 | 98 | /*---------------------------------------------------------------------------*/ 99 | 100 | uint32_t Stream::BitFlip(uint32_t value, uint32_t n_bits) 101 | { 102 | uint32_t result = 0; 103 | for(uint32_t i = 1; i <= n_bits; ++i) 104 | { 105 | result = (result << 1) | (value & 1); 106 | value >>= 1; 107 | } 108 | return result; 109 | } 110 | -------------------------------------------------------------------------------- /codemeta.json: -------------------------------------------------------------------------------- 1 | { 2 | "@context": "https://doi.org/10.5063/schema/codemeta-2.0", 3 | "@type": "SoftwareSourceCode", 4 | "identifier": "PDFR", 5 | "description": "Extracts text from PDF into an R dataframe giving the content, size, position and font of any text elements. This information can then be manipulated in R.", 6 | "name": "PDFR: Extract Text From PDFs In An R Friendly Way", 7 | "codeRepository": "https://github.com/AllanCameron/PDFR", 8 | "issueTracker": "https://github.com/AllanCameron/PDFR/issues", 9 | "license": "https://spdx.org/licenses/MIT", 10 | "version": "0.1.0", 11 | "programmingLanguage": { 12 | "@type": "ComputerLanguage", 13 | "name": "R", 14 | "url": "https://r-project.org" 15 | }, 16 | "runtimePlatform": "R version 4.2.0 Patched (2022-05-23 r82396)", 17 | "author": [ 18 | { 19 | "@type": "Person", 20 | "givenName": "Allan", 21 | "familyName": "Cameron", 22 | "email": "Allan.Cameron@nhs.net" 23 | } 24 | ], 25 | "contributor": [ 26 | { 27 | "@type": "Person", 28 | "givenName": "Eli", 29 | "familyName": "Pousson", 30 | "email": "eli.pousson@gmail.com", 31 | "@id": "https://orcid.org/0000-0001-8280-1706" 32 | } 33 | ], 34 | "copyrightHolder": [ 35 | { 36 | "@type": "Person", 37 | "givenName": "Allan", 38 | "familyName": "Cameron", 39 | "email": "Allan.Cameron@nhs.net" 40 | } 41 | ], 42 | "maintainer": [ 43 | { 44 | "@type": "Person", 45 | "givenName": "Allan", 46 | "familyName": 
"Cameron", 47 | "email": "Allan.Cameron@nhs.net" 48 | } 49 | ], 50 | "softwareSuggestions": [ 51 | { 52 | "@type": "SoftwareApplication", 53 | "identifier": "ggplot2", 54 | "name": "ggplot2", 55 | "provider": { 56 | "@id": "https://cran.r-project.org", 57 | "@type": "Organization", 58 | "name": "Comprehensive R Archive Network (CRAN)", 59 | "url": "https://cran.r-project.org" 60 | }, 61 | "sameAs": "https://CRAN.R-project.org/package=ggplot2" 62 | }, 63 | { 64 | "@type": "SoftwareApplication", 65 | "identifier": "testthat", 66 | "name": "testthat", 67 | "provider": { 68 | "@id": "https://cran.r-project.org", 69 | "@type": "Organization", 70 | "name": "Comprehensive R Archive Network (CRAN)", 71 | "url": "https://cran.r-project.org" 72 | }, 73 | "sameAs": "https://CRAN.R-project.org/package=testthat" 74 | } 75 | ], 76 | "softwareRequirements": { 77 | "1": { 78 | "@type": "SoftwareApplication", 79 | "identifier": "R", 80 | "name": "R", 81 | "version": ">= 2.10" 82 | }, 83 | "2": { 84 | "@type": "SoftwareApplication", 85 | "identifier": "cli", 86 | "name": "cli", 87 | "provider": { 88 | "@id": "https://cran.r-project.org", 89 | "@type": "Organization", 90 | "name": "Comprehensive R Archive Network (CRAN)", 91 | "url": "https://cran.r-project.org" 92 | }, 93 | "sameAs": "https://CRAN.R-project.org/package=cli" 94 | }, 95 | "3": { 96 | "@type": "SoftwareApplication", 97 | "identifier": "grDevices", 98 | "name": "grDevices" 99 | }, 100 | "4": { 101 | "@type": "SoftwareApplication", 102 | "identifier": "grid", 103 | "name": "grid" 104 | }, 105 | "5": { 106 | "@type": "SoftwareApplication", 107 | "identifier": "Rcpp", 108 | "name": "Rcpp", 109 | "provider": { 110 | "@id": "https://cran.r-project.org", 111 | "@type": "Organization", 112 | "name": "Comprehensive R Archive Network (CRAN)", 113 | "url": "https://cran.r-project.org" 114 | }, 115 | "sameAs": "https://CRAN.R-project.org/package=Rcpp" 116 | }, 117 | "6": { 118 | "@type": "SoftwareApplication", 119 | "identifier": 
"rlang", 120 | "name": "rlang", 121 | "provider": { 122 | "@id": "https://cran.r-project.org", 123 | "@type": "Organization", 124 | "name": "Comprehensive R Archive Network (CRAN)", 125 | "url": "https://cran.r-project.org" 126 | }, 127 | "sameAs": "https://CRAN.R-project.org/package=rlang" 128 | }, 129 | "SystemRequirements": "C++11" 130 | }, 131 | "fileSize": "46223.28KB", 132 | "readme": "https://github.com/AllanCameron/PDFR/blob/master/README.md", 133 | "developmentStatus": "https://lifecycle.r-lib.org/articles/stages.html#experimental", 134 | "keywords": ["pdf-format", "pdf", "extract-text", "data-scientists"] 135 | } 136 | -------------------------------------------------------------------------------- /src/streams.h: -------------------------------------------------------------------------------- 1 | //---------------------------------------------------------------------------// 2 | // // 3 | // PDFR Streams header file // 4 | // // 5 | // Copyright (C) 2018 - 2019 by Allan Cameron // 6 | // // 7 | // Licensed under the MIT license - see https://mit-license.org // 8 | // or the LICENSE file in the project root directory // 9 | // // 10 | //---------------------------------------------------------------------------// 11 | 12 | #ifndef PDFR_STREAMS 13 | 14 | //---------------------------------------------------------------------------// 15 | 16 | #define PDFR_STREAMS 17 | 18 | /* Streams in pdf files are usually made up of a sequence of non-ascii bytes 19 | * intended to represent raw data. When they occur they are always part of a pdf 20 | * object, which will always start with a <>. At the end of the 21 | * dictionary, after the closing brackets, comes the keyword 'stream', usually 22 | * (?always) followed by two whitespace bytes: \r and \n. The data then begins. 23 | * The end of the stream is declared by the sequence (\r\nendstream). 
24 | * 25 | * The data can represent many different things including pictures, fonts, 26 | * annotations and postscript-type page descriptions. For the purposes of text 27 | * extraction, it is mainly the latter we are interested in. 28 | * 29 | * The raw data in the stream is almost always compressed, so needs to be 30 | * decompressed before being processed. That is the purpose of the stream class. 31 | * 32 | * At present, only the flatedecode decompression algorithm is implemented. 33 | * I have yet to find a pdf file that uses anything else for page description 34 | * to allow testing. 35 | * 36 | * The possible stream types are: 37 | * 38 | * Ascii85Stream, 39 | * AsciiHexStream, 40 | * DecodeStream, 41 | * FlateStream, 42 | * NullStream, 43 | * PredictorStream, 44 | * RunLengthStream, 45 | * StreamsSequenceStream, 46 | * StringStream, 47 | * LZWStream 48 | * 49 | * This header is required by the xref class, as it needs to be able to deflate 50 | * xrefstreams. 51 | */ 52 | 53 | #include "utilities.h" 54 | 55 | //---------------------------------------------------------------------------// 56 | // The Stream class is the base class for the different streams used in pdfs. 57 | // It provides a unified interface, with an input string, an output string, 58 | // and an iterator for each. It allows for consumption of individual bytes or 59 | // even for bits within bytes, while keeping track of its reading position and 60 | // signalling when the end of a stream has been reached without throwing. 61 | 62 | class Stream 63 | { 64 | // The constructors are protected to make this an abstract class. 
65 | protected: 66 | Stream(const std::string*); 67 | Stream(const CharString&); 68 | 69 | public: 70 | std::string Output(){return output_;} // Getter for output 71 | uint32_t GetByte(); // Consumes next byte 72 | uint32_t PeekByte(); // Looks but doesn't consume 73 | void Reset(); // Returns stream to start 74 | uint32_t GetBits(uint32_t n); // Get next n bits 75 | uint32_t BitFlip(uint32_t value, uint32_t); // Reverses bit order 76 | 77 | // Appends byte to output and advances iterator 78 | void WriteOutput(uint8_t byte) 79 | { 80 | output_.append(1, (char) byte); 81 | output_position_ = output_.end(); 82 | } 83 | 84 | // Writes a repeat sequence from earlier in the ouput to the end of the 85 | // output. Used in Deflate and LZW. 86 | void AppendPrevious(uint32_t distance, uint32_t len) 87 | { 88 | for (unsigned i = 0; i < len; ++i) 89 | WriteOutput(*(output_position_ - distance)); 90 | } 91 | 92 | void SetExpansionRatio(uint8_t r) {output_.reserve(input_.size() * r);} 93 | void ShrinkToFit() { output_.shrink_to_fit();} 94 | char GetOutput(){return *output_position_++;} 95 | 96 | uint64_t GetEightBytes(); 97 | 98 | private: 99 | CharString input_; // The input string 100 | std::string output_; // The output string 101 | const char* input_position_; // Input iterator 102 | std::string::const_iterator output_position_; // Output iterator 103 | uint8_t unconsumed_bits_; // Bit iterator 104 | uint32_t unconsumed_bit_value_; // Keeps track of unused bits 105 | }; 106 | 107 | #endif 108 | 109 | -------------------------------------------------------------------------------- /src/tokenizer.h: -------------------------------------------------------------------------------- 1 | //---------------------------------------------------------------------------// 2 | // // 3 | // PDFR tokenizer header file // 4 | // // 5 | // Copyright (C) 2018 - 2019 by Allan Cameron // 6 | // // 7 | // Licensed under the MIT license - see https://mit-license.org // 8 | // or the LICENSE file 
in the project root directory // 9 | // // 10 | //---------------------------------------------------------------------------// 11 | 12 | #ifndef PDFR_TOKEN 13 | 14 | //---------------------------------------------------------------------------// 15 | 16 | #define PDFR_TOKEN 17 | 18 | /* The tokenizer class represents the last of our dealings with the actual 19 | * pdf file. After this stage, we have a complete description of the text on 20 | * the page including the size and position of every correctly-encoded glyph. 21 | * The subsequent steps will use only this data to try to reconstruct useful 22 | * semantic information from the text position in an attempt to provide useable 23 | * data, and to output the result to a variety of formats. 24 | * 25 | * The tokenizer class is used to read page description programs from the 26 | * page contents objects (and form xobjects). Rather than using regex to do 27 | * this (which is extremely slow), we use a custom-built lexer. This takes the 28 | * page program as a text string and goes through each character, identifying 29 | * tokens as it goes and storing them in a buffer until it can be decided what 30 | * type of token it has read. It switches state according to a finite set of 31 | * rules so that it knows when to pass the buffer to the parser for 32 | * parsing. 33 | * 34 | * Its interface is very simple - create the object by feeding it a string and 35 | * a pointer to the graphics state. It will tokenize the string and send it 36 | * to the parser for parsing 37 | * 38 | * It has a number of private members because it is a fairly complex lexer and 39 | * is easier to maintain as a collection of functions that pass private members 40 | * around, rather than one huge hairball function. 41 | */ 42 | 43 | #include "parser.h" 44 | 45 | //---------------------------------------------------------------------------// 46 | // The Tokenizer class. 
It has a simple interface of one constructor and one 47 | // getter for the result. The private members allow for passing of state 48 | // between member functions during the instruction set creation. 49 | 50 | class Tokenizer 51 | { 52 | public: 53 | // Constructor. Takes a string pointer to the page description program 54 | // and a fresh Parser object 55 | Tokenizer(const std::string& input_string, Parser* parser); 56 | 57 | private: 58 | // Enumerates the types of characters that can alter state differently 59 | 60 | Reader it_; 61 | Token::TokenState state_; // Current Tokenizer state 62 | Parser* interpreter_; // The Parser instructions are sent to 63 | static std::string in_loop_; // Prevents an infinite loop 64 | 65 | // const member functions 66 | char GetChar() const {return it_.GetChar();} 67 | CharType GetCharType() const {return it_.GetCharType();} 68 | bool empty() const {return it_.empty();} 69 | 70 | // private methods 71 | void NewSymbolState_(); //--------//--------------------------------------- 72 | void ResourceState_(); // 73 | void IdentifierState_(); // 74 | void NumberState_(); // These private member functions handle 75 | void StringState_(); // the various states of the lexer, 76 | void ArrayState_(); // responding variously to each character 77 | void EscapeState_(); // they come across to build the result 78 | void HexStringState_(); // 79 | void DictionaryState_(); // 80 | void WaitState_(); //--------//--------------------------------------- 81 | 82 | // Frequently used helper functions to update buffer and state 83 | void PushBuffer_(const Token::TokenState, const Token::TokenState); 84 | void HandleXObject_(); 85 | 86 | // Some simple inlined helpers 87 | void NewToken_(const Token::TokenState T) {it_.Clear(); state_ = T;} 88 | 89 | void Skip_() { ++it_; it_.Clear(); } 90 | 91 | void HandleLAB_() 92 | { 93 | Skip_(); 94 | if (GetChar() == '<') state_ = Token::DICT; 95 | else state_ = Token::HEXSTRING; 96 | } 97 | 98 | void 
HandleLCB_() 99 | { 100 | Skip_(); 101 | if (GetChar() == '\\') EscapeState_(); 102 | else state_ = Token::STRING; 103 | } 104 | }; 105 | 106 | //---------------------------------------------------------------------------// 107 | 108 | #endif 109 | -------------------------------------------------------------------------------- /src/charstring.cpp: -------------------------------------------------------------------------------- 1 | //---------------------------------------------------------------------------// 2 | // // 3 | // PDFR CharString implementation file // 4 | // // 5 | // Copyright (C) 2018 - 2019 by Allan Cameron // 6 | // // 7 | // Licensed under the MIT license - see https://mit-license.org // 8 | // or the LICENSE file in the project root directory // 9 | // // 10 | //---------------------------------------------------------------------------// 11 | 12 | #include "charstring.h" 13 | #include 14 | 15 | /*--------------------------------------------------------------------------*/ 16 | // Returns a pointer to the beginning of the first instance of a target 17 | // character literal in a CharString 18 | 19 | const char* CharString::find(const char* target) const 20 | { 21 | int first_char = -1; 22 | size_t target_index = 0; 23 | 24 | for (auto it = this->begin(); it != this->end(); ++it) 25 | { 26 | if (*it == *(target + target_index)) 27 | { 28 | if (first_char == -1) first_char = it - this->begin(); 29 | ++target_index; 30 | } 31 | else 32 | { 33 | if (*(target) == *it) 34 | { 35 | first_char = it - this->begin(); 36 | target_index = 1; 37 | } 38 | else 39 | { 40 | first_char = -1; 41 | target_index = 0; 42 | } 43 | } 44 | if (*(target + target_index) == '\0') return this->begin() + first_char; 45 | } 46 | return this->end(); 47 | } 48 | 49 | const char* CharString::find(const CharString& target) const 50 | { 51 | int first_char = -1; 52 | size_t target_index = 0; 53 | 54 | for (auto it = this->begin(); it != this->end(); ++it) 55 | { 56 | if (*it == 
target[target_index]) 57 | { 58 | if (first_char == -1) first_char = it - this->begin(); 59 | ++target_index; 60 | } 61 | else 62 | { 63 | if (*(target.begin()) == *it) 64 | { 65 | first_char = it - this->begin(); 66 | target_index = 1; 67 | } 68 | else 69 | { 70 | first_char = -1; 71 | target_index = 0; 72 | } 73 | } 74 | if (target[target_index] == '\0') return this->begin() + first_char; 75 | } 76 | return this->end(); 77 | } 78 | 79 | /*--------------------------------------------------------------------------*/ 80 | 81 | std::ostream& operator<<(std::ostream& os, const CharString& cs) 82 | { 83 | for(auto it = cs.begin(); it != cs.end(); ++it) os << *it; 84 | return os; 85 | } 86 | 87 | /*--------------------------------------------------------------------------*/ 88 | // A CharString matches a C-string only if all characters in the two strings 89 | // match, not including the C-string's terminal nul character. 90 | 91 | bool CharString::operator==(const char* cstring) const 92 | { 93 | if (length_ == 0) return false; 94 | for (size_t i = 0; i < length_; ++i) 95 | { 96 | if (*(begin_ + i) != *(cstring + i)) return false; 97 | if (*(cstring + i) == '\0') return false; 98 | if (length_ - i == 1 && *(cstring + i + 1) != '\0') return false; 99 | } 100 | return true; 101 | } 102 | 103 | /*--------------------------------------------------------------------------*/ 104 | 105 | bool CharString::operator==(const CharString& other) const 106 | { 107 | if (length_ != other.length_) return false; 108 | if (begin_ == other.begin_) return true; 109 | for (size_t i = 0; i < length_; ++i) 110 | { 111 | if (*(begin_ + i) != other[i]) return false; 112 | } 113 | return true; 114 | } 115 | 116 | /*--------------------------------------------------------------------------*/ 117 | 118 | bool CharString::operator==(const std::string& stdstring) const 119 | { 120 | if (length_ != stdstring.size()) return false; 121 | for (size_t i = 0; i < length_; ++i) 122 | { 123 | if (*(begin_ + 
i) != stdstring[i]) return false; 124 | } 125 | return true; 126 | } 127 | 128 | /*--------------------------------------------------------------------------*/ 129 | 130 | CharString CharString::substr(size_t start, size_t length) const 131 | { 132 | if (start >= this->size()) 133 | { 134 | throw std::runtime_error("Invalid substring range in CharString::substr"); 135 | } 136 | 137 | if (start + length > this->size()) 138 | { 139 | length = this->size() - start; 140 | } 141 | 142 | return CharString(this->begin(), start, start + length); 143 | } 144 | 145 | CharString CharString::CarveOut(const char* left, const char* right) const 146 | { 147 | size_t leftsize = 0; 148 | while(*(leftsize + left)) ++leftsize; 149 | const char* newstart = find(left); 150 | if (newstart == end()) return *this; else newstart += leftsize; 151 | CharString leftchunk(newstart, end() - newstart); 152 | const char* newend = leftchunk.find(right); 153 | return CharString(newstart, newend - newstart); 154 | } 155 | -------------------------------------------------------------------------------- /src/whitespace.h: -------------------------------------------------------------------------------- 1 | //---------------------------------------------------------------------------// 2 | // // 3 | // PDFR whitespace header file // 4 | // // 5 | // Copyright (C) 2018 - 2019 by Allan Cameron // 6 | // // 7 | // Licensed under the MIT license - see https://mit-license.org // 8 | // or the LICENSE file in the project root directory // 9 | // // 10 | //---------------------------------------------------------------------------// 11 | 12 | #ifndef PDFR_WSPACE 13 | 14 | //---------------------------------------------------------------------------// 15 | 16 | #define PDFR_WSPACE 17 | 18 | /* This class's job is to take the output of the parser and to carry out the 19 | * first stage of page segmentation. It does this by dividing the page into 20 | * a large number of tall vertical strips. 
Any strips that encounter an 21 | * obstruction (i.e. one or more glyphs) as they go from the top to the bottom 22 | * of the page are divided so they do not overlap the glyphs. Thus, if there 23 | * are n rows of text that the strip would otherwise cross, the strip is 24 | * divided into n + 1 segments. 25 | * 26 | * Once the strips are all calculated, they will cover all the significant empty 27 | * spaces (henceforth whitespace) in a document, leaving islands of text content 28 | * uncovered. These islands are physically, and usually logically, related. 29 | * 30 | * However, there is some work to be done to identify the islands in question. 31 | * Firstly, we need to ensure that contiguous whitespace is joined together as 32 | * far as possible. This is done by looking to the right of each strip segment. 33 | * If the strip immediately to the right has the same top and bottom value, then 34 | * it is joined to the strip to the left by reducing its left value to the 35 | * same as the test strip, then flagging the test strip for deletion. 36 | * 37 | * The procedure may also leave small holes in the text islands due to 38 | * whitespace between words and lines. We remove these based on size criteria. 39 | * 40 | * Once we have our final set of whitespace boxes, we look at each vertex in 41 | * each whitespace box to determine which quadrants contain whitespace. None 42 | * should contain zero or four quadrants, and they should all lie on either a 43 | * page margin or the margin of a text island. 44 | * 45 | * The point of doing this is that we can use this information to draw a line 46 | * clockwise around the edge of each island by identifying the configuration 47 | * of whitespace around each vertex. Once we have drawn the polygons defining 48 | * each island, we can then assign glyphs to be inside one of these polygons. 49 | * This gives us a group of page segments along with the glyphs they contain.
50 | * We can then use this information to group letters and words together, 51 | * establish a reading order and attempt classification of text elements based 52 | * on size, shape, position and order on the page. 53 | */ 54 | 55 | #include "textbox.h" 56 | 57 | //---------------------------------------------------------------------------// 58 | // The whitespace class takes a word grouper as an argument in its constructor 59 | // and from that uses a sequence of helper functions to construct its final 60 | // output, which is a vector of WS_box containing the text boxes for a page. 61 | 62 | class Whitespace 63 | { 64 | public: 65 | using TextPointer = std::shared_ptr; 66 | // constructor 67 | Whitespace(std::unique_ptr ouput_from_wordgrouper); 68 | 69 | // Output the text element groups directly 70 | PageBox Output(); 71 | 72 | // Output the final text box co-ordinates 73 | std::vector WSBoxOut() const; 74 | 75 | private: 76 | //The main output is a collection of pairs of text boxes with their elements 77 | std::unique_ptr text_box_; // A copy of word grouper's output 78 | std::unordered_map>> polygonmap_; 79 | std::vector boxes_; // Used in construction AND output 80 | std::vector> vertices_; // Used to make polygons 81 | static const size_t DIVISIONS_ = 200; // number of strips used for whitespace 82 | 83 | void PageDimensions_(); // Gets page margins 84 | void CleanAndSortBoxes_(); // Helper to remove Boxes flagged for deletion 85 | void MakeStrips_(); // Cover the whitespace with tall thin strips 86 | void MergeStrips_(); // merge adjacent strips into boxes 87 | void RemoveSmall_(); // remove insufficiently tall boxes 88 | void MakeVertices_(); // use Boxes to find vertices of polygons 89 | void TidyVertices_(); // identify and remove the unneeded vertices 90 | void TracePolygons_(); // trace around polygons by following vertices 91 | void MakePolygonMap_(); // map polygons to size_t keys 92 | void PolygonMax_(); // find bounding boxes of polygons 93 | void 
RemoveEngulfed_(); // remove boxes within other boxes 94 | }; 95 | 96 | //---------------------------------------------------------------------------// 97 | 98 | #endif 99 | -------------------------------------------------------------------------------- /src/line_grouper.cpp: -------------------------------------------------------------------------------- 1 | //----------------------------------------------------------------------------// 2 | // // 3 | // PDFR LineGrouper implementation file // 4 | // // 5 | // Copyright (C) 2018 - 2019 by Allan Cameron // 6 | // // 7 | // Licensed under the MIT license - see https://mit-license.org // 8 | // or the LICENSE file in the project root directory // 9 | // // 10 | //----------------------------------------------------------------------------// 11 | 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include "line_grouper.h" 17 | 18 | //----------------------------------------------------------------------------// 19 | 20 | using namespace std; 21 | 22 | //----------------------------------------------------------------------------// 23 | // The LineGrouper constructor takes the WordGrouper output and goes through all 24 | // of its text boxes. If the elements within each box can be grouped together 25 | // into a single logical component, then they are glued together into a logical 26 | // unit. Otherwise, the box is split vertically. 27 | 28 | LineGrouper::LineGrouper(PageBox text_boxes) 29 | : text_boxes_(text_boxes) 30 | { 31 | size_t i = 0; 32 | 33 | // If there are no textboxes, there is nothing to do. 34 | if ( !text_boxes_.empty()) 35 | { 36 | while (i < text_boxes_.size()) 37 | { 38 | // There is no point processing a textbox with 0 or 1 elements. 39 | if (text_boxes_[i].size() < 2){++i; continue;} 40 | 41 | // Ensures the text elements are in the correct reading order in the box. 
42 | sort(text_boxes_[i].begin(), text_boxes_[i].end(), ReadingOrder_()); 43 | 44 | // Finds logical breaks within the text box and splits if needed. 45 | FindBreaks_(text_boxes_[i]); 46 | 47 | // After splitting, there may only be 1 element left in the box. 48 | if (text_boxes_[i].size() < 2){++i; continue;} 49 | 50 | // Makes sure the lines have correct final character before being pasted 51 | LineEndings_(text_boxes_[i]); 52 | 53 | // Pastes the text elements together 54 | PasteLines_(text_boxes_[i++]); 55 | } 56 | } 57 | }; 58 | 59 | //----------------------------------------------------------------------------// 60 | // Since the TextElements are now sorted by reading order, we can compare 61 | // consecutive elements in a textbox to work out whether they belong to the 62 | // same logical group. If they don't, then we call SplitBox_ to seperate them. 63 | // 64 | // This method identifies whether a new line is indented compared the previous 65 | // line. 66 | 67 | void LineGrouper::FindBreaks_(TextBox& text_box) 68 | { 69 | // For each TextElement in the TextBox 70 | for (size_t i = 1; i < text_box.size(); ++i) 71 | { 72 | if (text_box[i]->GetBottom() < text_box[i - 1]->GetBottom() && // Below 73 | text_box[i]->GetLeft() - text_box[i - 1]->GetLeft() > 0.1) // To left 74 | { 75 | auto slice_at = text_box[i - 1]->GetBottom(); 76 | auto&& new_box = text_box.SplitIntoTopAndBottom(slice_at); 77 | if (!new_box.empty()) text_boxes_.push_back(new_box); 78 | break; 79 | } 80 | } 81 | } 82 | 83 | //----------------------------------------------------------------------------// 84 | // To join lines together properly, we normally want to add a space to seperate 85 | // the word ending the line above and the first word of the line below. However, 86 | // we don't want to add an extra whitespace if the line already ends in one. 
87 | // Furthermore, we don't want to add a space between the two fragments of a 88 | // hyphenated word, and instead we should just remove the hyphen. 89 | // 90 | // This method handles these various possibilities 91 | 92 | void LineGrouper::LineEndings_(TextBox& text_box) 93 | { 94 | // For each element in the TextBox 95 | for (size_t i = 0; i < text_box.size() - 1; ++i) 96 | { 97 | auto& element = text_box[i]; 98 | switch (element->GetGlyph().back()) 99 | { 100 | case 0x0020: break; 101 | case 0x00A0: break; 102 | case 0x002d: element->PopLastGlyph(); break; 103 | case 0x2010: element->PopLastGlyph(); break; 104 | case 0x2011: element->PopLastGlyph(); break; 105 | case 0x2012: element->PopLastGlyph(); break; 106 | case 0x2013: element->PopLastGlyph(); break; 107 | case 0x2014: element->PopLastGlyph(); break; 108 | case 0x2015: element->PopLastGlyph(); break; 109 | default: element->AddSpace(); 110 | } 111 | } 112 | } 113 | 114 | //----------------------------------------------------------------------------// 115 | // Combines the text elements into a single element with the textbox 116 | 117 | void LineGrouper::PasteLines_(TextBox& text_box) 118 | { 119 | for (auto& element : text_box) 120 | { 121 | if (&element != &(text_box[0])) 122 | { 123 | text_box[0]->ConcatenateUnicode(element->GetGlyph()); 124 | } 125 | } 126 | text_box.resize(1); 127 | } 128 | 129 | 130 | -------------------------------------------------------------------------------- /src/font.h: -------------------------------------------------------------------------------- 1 | //---------------------------------------------------------------------------// 2 | // // 3 | // PDFR Font header file // 4 | // // 5 | // Copyright (C) 2018 - 2019 by Allan Cameron // 6 | // // 7 | // Licensed under the MIT license - see https://mit-license.org // 8 | // or the LICENSE file in the project root directory // 9 | // // 10 | //---------------------------------------------------------------------------// 11 | 12 
| #ifndef PDFR_FONT 13 | 14 | //---------------------------------------------------------------------------// 15 | 16 | #define PDFR_FONT 17 | 18 | /* This is the seventh step of a daisy-chain of headers comprising the PDFR 19 | * program as described in headerMap.txt. It #includes the two files that 20 | * comprise the 6th step - encoding.h and glyphwidths.h. 21 | * 22 | * Most of the hard work in creating fonts has been done in the previous step - 23 | * working out which glyphs are intended by an input pdf string, and what size 24 | * those glyphs should be when printed. 25 | * 26 | * The job of the Font class is therefore to co-ordinate the process of font 27 | * creation by using these other two classes, then combining their results into 28 | * a structure that I have called a glyphmap. This is a map which maps any 29 | * raw character from the pdf input to a pair indicating the intended Unicode 30 | * output glyph and glyph width for that character code. 31 | * 32 | * Its public interface includes constructors which require a document pointer, 33 | * the font dictionary and the ID of the font, used as shorthand in the 34 | * page dictionary. 35 | * 36 | * The remainder of the public members are: a getter for the actual font name; 37 | * an enumerator of all the RawChars mapped in the glyphmap, and a function to 38 | * safely interrogate the glyphmap, returning a vector of paired Unicode code 39 | * points and integer widths for each glyph, given an input vector of RawChars. 40 | * 41 | */ 42 | 43 | #include 44 | #include 45 | #include 46 | #include 47 | #include 48 | #include "truetype.h" 49 | 50 | class Dictionary; 51 | class Document; 52 | using Unicode = uint16_t; 53 | using RawChar = uint16_t; 54 | 55 | 56 | //---------------------------------------------------------------------------// 57 | // The GlyphMap is the main data member of the Font class. 
//---------------------------------------------------------------------------//
// The GlyphMap is the main data member of the Font class. Although it is
// constructed from standard library components, it needs a shorthand name.
// GlyphData is the {Unicode, width} pair stored for one character code, and
// GlyphMap is the lookup table from a raw character code to its GlyphData.

typedef std::pair GlyphData;
typedef std::unordered_map GlyphMap;

//---------------------------------------------------------------------------//
// Each Font object is created and stored as an object in a pdf page, as this
// is how the pdf is logically organised. However, its public methods are
// called by other classes, which use Font objects to interpret pdf strings.
//
// Note that the font dictionary is held by reference, so the Dictionary passed
// to the constructor must outlive the Font.

class Font
{
 public:
  // Constructor. Stores the document pointer, a reference to the font
  // dictionary and the font's id, then reads the font name, builds the glyph
  // map and looks for an embedded font file (see font.cpp).
  Font(std::shared_ptr document_ptr,
       Dictionary& font_dictionary_ptr,
       const std::string& id);

  // public methods
  std::string GetFontName();   // Gets the actual PostScript Font name
  std::vector GetGlyphKeys();  // Returns vector of all mapped RawChars

  // The most important public method is MapRawChar, which takes a vector of
  // uint16_t representing raw character codes, and returns a vector of pairs
  // containing the Unicode interpretation and its associated width. Raw
  // character codes that are not present in the glyph map contribute no entry
  // to the result (see the implementation in font.cpp).
  std::vector MapRawChar(const std::vector& raw_chars);

 private:
  // private data members
  std::shared_ptr document_;     // - Pointer to the containing document
  Dictionary& font_dictionary_;  // - The main font dictionary (not owned)
  std::string font_id_,          // - The name of the font as used in PDF
              font_name_,        // - The actual name of the font
              fontfile_;         // - The bytes making up the font
  GlyphMap glyph_map_;           // - Main data member, mapping RawChar
                                 //   to a {Unicode, width} pair.

  // Parsed form of fontfile_. The constructor only assigns this when
  // fontfile_ is non-empty, so it may be null.
  std::shared_ptr font_data_;

  // The three GetGlyphPath overloads read the glyph for character code `ch`
  // from font_data_ and convert it to a path via AsPath, forwarding the given
  // scale (and, in the first overload, offset) arguments.
  // NOTE(review): all three dereference font_data_ without a null check, and
  // font_data_ is null when no font file was found - confirm that callers
  // guard against this.
  std::vector GetGlyphPath(RawChar ch, float x_scale, float y_scale,
                           float x_offset, float y_offset) {
    return font_data_->ReadGlyf(ch).AsPath(x_scale, y_scale,
                                           x_offset, y_offset);
  }

  // As above, with separate x and y scales and no offset arguments.
  std::vector GetGlyphPath(RawChar ch, float x_scale, float y_scale) {
    return font_data_->ReadGlyf(ch).AsPath(x_scale, y_scale);
  }

  // As above, using the same scale on both axes.
  std::vector GetGlyphPath(RawChar ch, float scale) {
    return font_data_->ReadGlyf(ch).AsPath(scale, scale);
  }

  // private methods
  void ReadFontName_();    // Finds the postscript font name
  void MakeGlyphTable_();  // Co-ordinates font construction
  void GetFontFile_();     // Gets TTF data
};
It is effectively 24 | // a glorified struct{const char* string, size_t length;} with member functions 25 | // such as operator[](), size(), begin(), end(), find() and substr() that 26 | // function as one might expect. It can be created from a std::string, a 27 | // const char* or a string literal, and can be compared directly for equality 28 | // against each of these using operator==(). It has its own output stream 29 | // method so it can be written directly to the console. 30 | // 31 | // Although it doesn't own any other resources, it will only remain valid as 32 | // long as the string to which it points exists, and this can make it 33 | // problematic unless care is taken to ensure its lifetime falls strictly 34 | // within the lifetime of the pointed-to string. 35 | // 36 | // It is used in PDFR because the entire pdf file is read into the free store as 37 | // an std::string and sits there for the duration of the parsing process. It 38 | // is therefore a safe and efficient tool for this job. 39 | // 40 | // This class is a wheel that has been reinvented many times, not least by the 41 | // C++17 addition of string_view. My guess is that string_view is much more 42 | // efficient, safe and portable than this class, but isn't available in C++11. 43 | // I have tried to give the member functions the same names as those in 44 | // string_view so that the code base can be easily upgraded in the future. 
// Most of the methods are inlined, and only those that are a bit more complex
// such as the find() and substr() methods are defined separately in the
// implementation file.
//
// A CharString never owns its characters. It must not outlive the string it
// was created from; in particular, constructing one from a temporary
// std::string leaves it dangling as soon as that temporary is destroyed.

class CharString
{
 public:
  // There are several ways to construct a CharString:

  // Give it a pointer, a starting offset and an endpoint. If the endpoint
  // lies before the start, the view is empty rather than wrapping round to a
  // huge length.
  CharString(const char* ptr, size_t start, size_t end) :
    begin_(ptr + start), length_(end > start ? end - start : 0) {}

  // Or just a pointer and a length
  CharString(const char* ptr, size_t length) : begin_(ptr), length_(length) {}

  // Or just a pointer to a zero-terminated string. A null pointer gives an
  // empty view instead of being dereferenced.
  CharString(const char* ptr) :
    begin_(ptr),
    length_(ptr ? std::char_traits<char>::length(ptr) : 0) {}

  // Or an std::string
  CharString(const std::string& s) : begin_(s.c_str()), length_(s.size()) {}

  // Or an std::string with a starting offset. An offset beyond the end of
  // the string is clamped, giving an empty view positioned at the end.
  CharString(const std::string& str, size_t start) :
    begin_(str.c_str() + (start < str.size() ? start : str.size())),
    length_(start < str.size() ? str.size() - start : 0) {}

  // Or another CharString. Copying and moving are both just a copy of the
  // pointer and the length.
  CharString(const CharString&) = default;
  CharString(CharString&&) noexcept = default;
  CharString& operator=(const CharString& chunk) = default;
  CharString& operator=(CharString&& chunk) noexcept = default;

  // Empty constructor
  CharString() : begin_(nullptr), length_(0) {}

  // The comparators are separately defined
  bool operator==(const CharString& other) const;
  bool operator==(const std::string& string) const;
  bool operator==(const char* cstring) const;

  // Find and substr also require separate definition
  const char* find(const char* target) const;
  const char* find(const CharString& target) const;
  CharString substr(size_t start, size_t length) const;
  CharString CarveOut(const char* left, const char* right) const;

  // The basic reading operations are all inlined

  // Unchecked element access; index must lie within [0, size())
  char operator[](int index) const {return *(begin_ + index);}

  // Copies the viewed characters into a new std::string
  std::string AsString() const
  {
    return empty() ? std::string() : std::string(begin_, length_);
  }

  const char* begin() const {return begin_;}
  const char* end() const {return begin_ + length_;}

  // Last character of the view, or '\0' if the view is empty
  char back() const {return empty() ? '\0' : *(end() - 1);}

  bool empty() const {return length_ == 0;}
  size_t size() const {return length_;}
  const char* find(const std::string& str) const {return find(str.c_str());}
  bool contains(const char* target) const {return find(target) != end();}
  bool contains(const std::string& target) const
  {
    return find(target) != end();
  }

 private:
  const char* begin_;   // First character of the view (not owned)
  size_t length_;       // Number of characters in the view
};

// Declaration for output stream interface doesn't need to be a member
std::ostream& operator<<(std::ostream& os, const CharString& charstring);

// Create a string literal CharString directly, e.g. "stream"_cs. A literal
// operator for string literals must take (const char*, size_t); the
// single-pointer form only matches numeric literals.
inline CharString operator""_cs(const char* cstr, size_t length)
{
  return CharString(cstr, length);
}
// The matrices all use floating point numbers and are all 3 x 3. Although we
// could just model them with a length 9 array of floats, it makes things a bit
// easier to just define a 3 x 3 float matrix here. That way, we can easily
// add or multiply two matrices using '+' and '*' instead of calling named
// functions. This is despite the fact that the underlying data member is
// a std::array anyway.
//
// The nine elements are stored in the order in which they are written in the
// page description program, i.e. element (row r, column c) of the matrix shown
// above lives at index 3 * r + c.

class Matrix
{
 public:
  // The default constructor returns a 3 x 3 identity matrix
  Matrix(): data_{{1.0f, 0.0f, 0.0f,
                   0.0f, 1.0f, 0.0f,
                   0.0f, 0.0f, 1.0f}} {}

  // We can create a Matrix directly from a length-9 array of floats
  Matrix(std::array<float, 9> float_array): data_(float_array) {}

  // This constructor takes a vector of 6 strings representing floats and
  // turns them into a 3 x 3 matrix as specified by the pdf page descriptor.
  // Throws std::runtime_error if fewer than 6 strings are supplied; any
  // exception raised by std::stof on a malformed number propagates unchanged.
  Matrix(const std::vector<std::string>& string_vector)
  {
    if (string_vector.size() < 6)
    {
      throw std::runtime_error("Can't create Matrix with fewer than 6 floats");
    }

    data_ = {std::stof(string_vector[0]), std::stof(string_vector[1]), 0.0f,
             std::stof(string_vector[2]), std::stof(string_vector[3]), 0.0f,
             std::stof(string_vector[4]), std::stof(string_vector[5]), 1.0f};
  }

  // Copying and assignment are left to the compiler: the only data member is
  // a std::array, which copies element by element.

  // Operator overload of '*': returns the product of two matrices without
  // modifying either operand. See operator*= for the exact element formula.
  Matrix operator*(const Matrix& other) const
  {
    Matrix result(*this);
    result *= other;
    return result;
  }

  // Transforms this matrix into the product of *this and other, and returns
  // a reference to *this. With the storage order described above, element
  // 3 * r + c of the result is
  //
  //   sum over k of  this[3 * k + c] * other[3 * r + k]
  //
  // which is the row-major matrix product (other x this). The operand order
  // matters: the operation is not commutative.
  Matrix& operator*=(const Matrix& other)
  {
    std::array<float, 9> new_data {};

    // Use indices to fill by loop
    for (size_t i = 0; i < 9; ++i)
    {
      const size_t column    = i % 3;        // column picked from *this
      const size_t row_start = 3 * (i / 3);  // start of row picked from other

      new_data[i] = (data_[column + 0] * other.data_[row_start + 0] +
                     data_[column + 3] * other.data_[row_start + 1] +
                     data_[column + 6] * other.data_[row_start + 2] );
    }

    // Every element is computed from the old values before any is replaced,
    // so `m *= m` is safe.
    data_ = new_data;
    return *this;
  }

  // Overloaded + operator returns the element-by-element addition of Matrices
  Matrix operator+(const Matrix& other) const
  {
    Matrix result(*this);
    result += other;
    return result;
  }

  // Transforms *this into *this + other using element-by-element addition,
  // and returns a reference to *this
  Matrix& operator+=(const Matrix& other)
  {
    for (size_t element = 0; element < 9; ++element)
    {
      this->data_[element] += other.data_[element];
    }
    return *this;
  }

  // Gets a reference to an element of the data member (unchecked; index must
  // be in the range 0 to 8)
  float& operator[](size_t index)
  {
    return data_[index];
  }

  // Read-only element access, so that const matrices can be inspected
  const float& operator[](size_t index) const
  {
    return data_[index];
  }

  // Applies the matrix to the point (x, y) and returns the transformed
  // {x, y} pair: x' = a*x + c*y + e and y' = b*x + d*y + f, for a matrix
  // written "a b c d e f" in the page description program.
  std::array<float, 2> transformXY(float x, float y) const
  {
    std::array<float, 2> result = {data_[0] * x + data_[3] * y + data_[6],
                                   data_[1] * x + data_[4] * y + data_[7]};
    return result;
  }

 private:
  std::array<float, 9> data_ {};  // The actual data member
};
//---------------------------------------------------------------------------// 146 | 147 | #endif 148 | -------------------------------------------------------------------------------- /src/document.h: -------------------------------------------------------------------------------- 1 | //---------------------------------------------------------------------------// 2 | // // 3 | // PDFR Document header file // 4 | // // 5 | // Copyright (C) 2018 - 2019 by Allan Cameron // 6 | // // 7 | // Licensed under the MIT license - see https://mit-license.org // 8 | // or the LICENSE file in the project root directory // 9 | // // 10 | //---------------------------------------------------------------------------// 11 | 12 | #ifndef PDFR_DOCUMENT 13 | 14 | //---------------------------------------------------------------------------// 15 | 16 | #define PDFR_DOCUMENT 17 | 18 | /* This is the fifth header file in a daisy-chain of headers that build up the 19 | * tools required to parse pdfs. It follows just after the definition of the 20 | * object_class. 21 | * 22 | * The tools already in place have abstracted away decryption, decompression, 23 | * bytewise navigation of the file and parsing of dictionaries. The job of the 24 | * Document class is therefore to act as an interface to use the pdf objects 25 | * from which we build up logical structures such as fonts, xobjects and pages. 26 | * 27 | * The previous classes have been encapsulated as far as possible to be able to 28 | * work in isolation with minimal knowledge of each other. The Document class 29 | * in contrast acts as a creator, container and user of these objects. 30 | * 31 | * Each Document will have one and only one xref class. Instead of a pointer to 32 | * the xref as in other classes, the xref is actually a data member of the 33 | * Document class. PDF objects are created and stored in a map for easy access. 
34 | * The file string is stored here and any other class that needs to read the 35 | * file accesses a pointer to the filestring held in the Document class. 36 | * 37 | * The Document class is therefore self-contained, in that after the initial 38 | * step of reading in the file, it has everything it needs to build up its 39 | * own components and interface. The logical PDF structures we go on to build 40 | * only need to know about the Document class, and can use it as the interface 41 | * they need. They "see" the pdf as a random access collection of numbered 42 | * objects with key:value dictionaries and uncompressed streams without being 43 | * concerned about how that is implemented. 44 | * 45 | * The Document also needs to have an outline of its own logical structure, 46 | * in terms of the pages it contains and where they are located. Part of the 47 | * task of Document creation is therefore to count and locate the objects 48 | * that act as page descriptors. It does this by finding the catalog 49 | * dictionary and then following pointers to dictionaries that contain 50 | * individual page headers. There is then a "getter" function for other classes 51 | * to access the dictionary pertaining to a particular page 52 | */ 53 | 54 | #include 55 | #include 56 | #include 57 | #include 58 | 59 | class Dictionary; 60 | class XRef; 61 | class Object; 62 | 63 | //---------------------------------------------------------------------------// 64 | // The public interface of the Document class comprises constructors and three 65 | // member functions - one to return any object from the pdf, one to retrieve 66 | // a specific page header and one to list the page header object numbers. 
// Document holds the complete file contents, the cross-reference table, a
// cache of the pdf objects created so far and the object numbers of the page
// headers. Both constructors store the file contents and then call
// BuildDocument_ to set up the remaining members.

class Document
{
 public:
  // Constructor to create Document from file path. The file is read into
  // file_string_ via GetFile.
  // NOTE(review): GetFile is not declared in this header - presumably it is
  // provided by a header included earlier in the chain; confirm.
  Document(const std::string& file_path)
    : file_string_(GetFile(file_path))
  { BuildDocument_(); }

  // Constructor to create Document from raw data. The bytes are copied into
  // file_string_.
  Document(const std::vector& byte_vector)
    : file_string_(std::string(byte_vector.begin(), byte_vector.end()))
  { BuildDocument_(); }


  // Gets a pointer to the Object specified by object_number. If the object has
  // previously been accessed, it will retrieve a pointer from the Object cache.
  // If it has not been accessed before, it will first create it. If the object
  // is inside an object stream, it will automatically add the holding object to
  // the cache as well.
  std::shared_ptr GetObject(int object_number);

  // Returns the main header dictionary for page specified by page_number
  Dictionary GetPageHeader(size_t page_number);

  // Accesses the private member containing object numbers of all page headers.
  // The vector is returned by value, so the caller receives its own copy.
  std::vector GetPageObjectNumbers() const {return page_object_numbers_;};

 private:
  // Full contents of file. Declared const, so it is fixed once the
  // constructor has run.
  const std::string file_string_;
  std::shared_ptr xref_;               // Pointer to creating XRef object
  std::vector page_object_numbers_;    // The object numbers of page headers

  // This map holds Object pointers. Since some objects may be read
  // multiple times, it is best to store them when they are first created,
  // then return the stored object on request rather than creating a new
  // instance of the object every time it is requested.
  std::unordered_map > object_cache_;

  void BuildDocument_();  // The constructors use this as a common pathway

  // This function effectively builds the pages tree.
  std::vector ExpandKids_(const std::vector& object_numbers);
};
35 | * 36 | * The public interface is more substantial with the Page class than with other 37 | * classes. The reason for this is that some of the data held by the Page class 38 | * may be useful to the end user rather than just being abstractions accessed 39 | * by other classes. Some of the downstream classes will also needs members of 40 | * the interface however - the parser class needs to access the fonts, 41 | * page contents and Xobjects for example. 42 | */ 43 | 44 | #include "font.h" 45 | #include "dictionary.h" 46 | #include "object_class.h" 47 | #include 48 | 49 | class Box; 50 | 51 | //---------------------------------------------------------------------------// 52 | 53 | class Page 54 | { 55 | public: 56 | // Constructor 57 | Page(std::shared_ptr document_pointer, int page_number); 58 | 59 | // Move constructor 60 | Page(Page&& other_page) noexcept {*this = std::move(other_page);} 61 | 62 | // lvalue assignment operator 63 | Page& operator=(const Page& other_page) 64 | { 65 | *this = other_page; 66 | return *this; 67 | } 68 | 69 | // rvalue assignment operator 70 | Page& operator=(Page&& other_page) noexcept 71 | { 72 | *this = std::move(other_page); 73 | return *this; 74 | } 75 | 76 | // Returns PostScript font names 77 | std::vector GetFontNames(); 78 | 79 | // Returns page description program 80 | const std::string& GetPageContents(); 81 | 82 | // Returns a pointer to the contents of an XObject used by the page 83 | std::shared_ptr GetXObject(const std::string& x_object_name); 84 | 85 | // Returns a pointer to the Font object from a given font name 86 | std::shared_ptr GetFont(const std::string& font_name); 87 | 88 | // Returns a Box object describing the page's bounding box. 89 | std::shared_ptr GetMinbox() const { return minbox_;} 90 | 91 | // Since the font map is a static object, it should be cleared at the end 92 | // of processing any particular document. Important! 
93 | void ClearFontMap() { fontmap_.clear(); }; 94 | 95 | // Allows a dictionary to be returned either directly or via reference 96 | Dictionary FollowToDictionary(Dictionary&, const std::string&); 97 | 98 | std::list> SubXobjects(int xobj_num); 99 | 100 | private: 101 | std::shared_ptr document_; // Pointer to main document 102 | int page_number_; // [Zero-indexed] page number 103 | Dictionary header_, // The page's header dictionary 104 | resources_, // Resource sub-dictionary 105 | fonts_; // Font sub-dictionary 106 | std::shared_ptr minbox_; // Page bounding Box 107 | std::string content_string_; // The page PostScript program 108 | float rotate_; // Page rotation in degrees 109 | 110 | // A map of Xobject strings, which are fragments of page description programs 111 | std::unordered_map xobjects_; 112 | 113 | // The actual storage container for fonts, mapped to their pdf names 114 | static std::unordered_map> fontmap_; 115 | 116 | // private methods 117 | void ReadXObjects_(); // Write form XObjects to the xobject map 118 | void ReadBoxes_(); // Store bounding boxes and calculate the smallest 119 | void ReadHeader_(); // Find the correct header dictionary in document 120 | void ReadResources_(); // Obtain the resource dictionary 121 | void ReadFonts_(); // Get font dictionary and build fontmap 122 | void ReadContents_(); // find content objects to Write contentstring 123 | 124 | // Gets the leaf nodes of a content tree 125 | std::vector ExpandContents_(std::vector); 126 | }; 127 | 128 | //---------------------------------------------------------------------------// 129 | 130 | #endif 131 | -------------------------------------------------------------------------------- /src/font.cpp: -------------------------------------------------------------------------------- 1 | //---------------------------------------------------------------------------// 2 | // // 3 | // PDFR Font implementation file // 4 | // // 5 | // Copyright (C) 2018 - 2019 by Allan Cameron // 6 | 
// // 7 | // Licensed under the MIT license - see https://mit-license.org // 8 | // or the LICENSE file in the project root directory // 9 | // // 10 | //---------------------------------------------------------------------------// 11 | 12 | #include "utilities.h" 13 | #include "dictionary.h" 14 | #include "glyphwidths.h" 15 | #include "encoding.h" 16 | #include "font.h" 17 | #include "object_class.h" 18 | #include "document.h" 19 | 20 | //---------------------------------------------------------------------------// 21 | 22 | using namespace std; 23 | 24 | /*---------------------------------------------------------------------------*/ 25 | // The Font constructor simply initializes the private data members, calls 26 | // getFontName() to get the postscript font title, and then makeGlyphTable() 27 | // to create the main data member 28 | 29 | Font::Font(shared_ptr document_ptr, 30 | Dictionary& font_dictionary, 31 | const string& font_id) 32 | : document_(document_ptr), 33 | font_dictionary_(font_dictionary), 34 | font_id_(font_id) 35 | { 36 | ReadFontName_(); 37 | MakeGlyphTable_(); 38 | GetFontFile_(); 39 | if(fontfile_.size() > 0) font_data_ = std::make_shared(fontfile_); 40 | } 41 | 42 | /*---------------------------------------------------------------------------*/ 43 | // Obtains the font's relevant TrueType file 44 | 45 | void Font::GetFontFile_() 46 | { 47 | if(font_dictionary_.ContainsReferences("/FontDescriptor")) 48 | { 49 | int descriptor_ref = font_dictionary_.GetReference("/FontDescriptor"); 50 | std::shared_ptr descriptor = document_->GetObject(descriptor_ref); 51 | if(descriptor->GetDictionary().ContainsReferences("/FontFile2")) 52 | { 53 | int fontfile_ref = descriptor->GetDictionary().GetReference("/FontFile2"); 54 | std::shared_ptr font_obj = document_->GetObject(fontfile_ref); 55 | fontfile_ = font_obj->GetStream(); 56 | } 57 | } 58 | } 59 | 60 | /*---------------------------------------------------------------------------*/ 61 | // Obtains the 
font's PostScript name from the font dictionary 62 | 63 | void Font::ReadFontName_() 64 | { 65 | // Reads /BaseFont entry 66 | string base_font(font_dictionary_.GetString("/BaseFont")); 67 | 68 | if (base_font.size() > 7 && base_font[7] == '+') 69 | { 70 | font_name_ = base_font.substr(8, base_font.size() - 8); 71 | } 72 | else 73 | { 74 | font_name_ = base_font.substr(1, base_font.size() - 1); 75 | } 76 | } 77 | 78 | /*---------------------------------------------------------------------------*/ 79 | // Most of the work asked of an object of the Font class will be to provide 80 | // interpretations of raw character codes, in terms of the actual glyphs and 81 | // their sizes intended by the document. This public method allows a vector 82 | // of raw characters to be interpreted. It returns a vector of the same length 83 | // as the input vector, containing a pair of {Unicode glyph, width} at each 84 | // position 85 | 86 | vector> Font::MapRawChar(const vector& raw_vector) 87 | { 88 | vector> result; 89 | result.reserve(raw_vector.size()); 90 | 91 | for (const auto& raw_char : raw_vector) 92 | { 93 | auto finder = glyph_map_.find(raw_char); 94 | if (finder != glyph_map_.end()) 95 | { 96 | result.push_back(finder->second); 97 | } 98 | } 99 | 100 | return result; 101 | } 102 | 103 | /*---------------------------------------------------------------------------*/ 104 | // The Font class subcontracts most of the work of its own construction out to 105 | // the encoding and glyphwidth classes. 
This private method co-ordinates the 106 | // building of the glyphmap using these two component classes 107 | 108 | void Font::MakeGlyphTable_() 109 | { 110 | // Create Encoding object 111 | Encoding encodings(font_dictionary_, document_); 112 | 113 | // Create glyphwidth object 114 | GlyphWidths widths(font_dictionary_, document_); 115 | 116 | // get all the mapped RawChars from the Encoding object 117 | auto encoding_map = encodings.GetEncodingKeys(); 118 | 119 | // We need to know whether the width code points refer to the width of raw 120 | // character codes or to the final Unicode translations 121 | 122 | // If the widths refer to RawChar code points, map every RawChar to a width 123 | if (widths.WidthsAreForRaw()) 124 | { 125 | for (auto& key_value_pair : *encoding_map) 126 | { 127 | auto& key = key_value_pair.first; 128 | glyph_map_[key] = make_pair(encodings.Interpret(key), 129 | widths.GetWidth(key)); 130 | } 131 | } 132 | // Otherwise widths refer to Unicode glyphs, so map each to a width 133 | else 134 | { 135 | for (auto& key_value_pair : *encoding_map) 136 | { 137 | auto& key = key_value_pair.first; 138 | glyph_map_[key] = make_pair(encodings.Interpret(key), 139 | widths.GetWidth(encodings.Interpret(key))); 140 | } 141 | } 142 | } 143 | 144 | /*---------------------------------------------------------------------------*/ 145 | // Public getter for FontName 146 | 147 | std::string Font::GetFontName() 148 | { 149 | return font_name_; 150 | } 151 | 152 | /*---------------------------------------------------------------------------*/ 153 | // Public getter for the keys of the glyphmap, needed to output the map from 154 | // the program if required 155 | 156 | std::vector Font::GetGlyphKeys() 157 | { 158 | return GetKeys(glyph_map_); 159 | } 160 | -------------------------------------------------------------------------------- /src/textbox.cpp: -------------------------------------------------------------------------------- 1 | 
//---------------------------------------------------------------------------// 2 | // // 3 | // PDFR TextBox implementation file // 4 | // // 5 | // Copyright (C) 2018 - 2019 by Allan Cameron // 6 | // // 7 | // Licensed under the MIT license - see https://mit-license.org // 8 | // or the LICENSE file in the project root directory // 9 | // // 10 | //---------------------------------------------------------------------------// 11 | 12 | #include "utilities.h" 13 | #include "box.h" 14 | #include "font.h" 15 | #include "textbox.h" 16 | 17 | using namespace std; 18 | 19 | //---------------------------------------------------------------------------// 20 | // Converts TextBox to TextTable, copying the fields of every element that has not been consumed 21 | TextTable::TextTable(const TextBox& text_box): 22 | Box((Box) text_box) 23 | { 24 | for (auto ptr = text_box.cbegin(); ptr != text_box.cend(); ++ptr) 25 | { 26 | auto& element = *ptr; 27 | if (!element->IsConsumed()) 28 | { 29 | this->text_.push_back(element->Utf()); 30 | this->lefts_.push_back(element->GetLeft()); 31 | this->bottoms_.push_back(element->GetBottom()); 32 | this->rights_.push_back(element->GetRight()); 33 | this->fonts_.push_back(element->GetFontName()); 34 | this->tops_.push_back(element->GetTop()); 35 | this->sizes_.push_back(element->GetSize()); 36 | } 37 | } 38 | } 39 | 40 | //---------------------------------------------------------------------------// 41 | // Flags every later element that compares equal to an earlier unconsumed one as consumed (pairwise O(n^2) scan) 42 | void TextBox::RemoveDuplicates() 43 | { 44 | for (auto this_row = data_.begin(); this_row != data_.end(); ++this_row) 45 | { 46 | if ((*this_row)->IsConsumed()) continue; 47 | for (auto other_row = this_row; other_row != data_.end(); ++other_row) 48 | { 49 | if (other_row == this_row) continue; 50 | 51 | if (**other_row == **this_row) 52 | { 53 | (*other_row)->Consume(); 54 | } 55 | } 56 | } 57 | } 58 | 59 | //---------------------------------------------------------------------------// 60 | // Join another text table to this one. Every per-element column, including sizes_, is appended so that all columns keep the same length 61 | 62 | void TextTable::Join(TextTable& other) 63 | { 64 | 
this->Merge(other); 65 | Concatenate(this->text_, other.text_); 66 | Concatenate(this->lefts_, other.lefts_); 67 | Concatenate(this->bottoms_, other.bottoms_); 68 | Concatenate(this->rights_, other.rights_); 69 | Concatenate(this->fonts_, other.fonts_); 70 | Concatenate(this->tops_, other.tops_); 71 | Concatenate(this->sizes_, other.sizes_); 72 | } 73 | 74 | //----------------------------------------------------------------------------// 75 | // Divides a TextBox into two by a horizontal line given as a y value. Returns the lower box, or an empty TextBox if no split is made (presumably the elements are already ordered top to bottom - TODO confirm) 76 | 77 | TextBox TextBox::SplitIntoTopAndBottom(float top_edge) 78 | { 79 | if (this->empty()) return TextBox(); // Don't split the box if it's empty 80 | 81 | // Lambda to find elements whose top edge is below the cutoff 82 | auto FindLower = [&](TextPointer text_ptr) -> bool 83 | { return text_ptr->GetTop() < top_edge; }; 84 | 85 | // Gets an iterator to the first element below the cutoff 86 | auto split_at = find_if(this->begin(), this->end(), FindLower); 87 | 88 | // We won't split the box if all or none of the elements would be moved 89 | // to a new box 90 | if (split_at == this->begin() || split_at == this->end()) return TextBox(); 91 | 92 | // Create a new textbox using a vector of all elements below the cutoff 93 | // and a copy of this box's Box base 94 | std::vector lower_contents {split_at, this->end()}; 95 | auto lower = TextBox(std::move(lower_contents), (Box) *this); 96 | 97 | // Now we can erase the lower elements we have just copied from the upper box 98 | this->erase(split_at, this->end()); 99 | 100 | // We also need to readjust the margins of our bounding boxes based on their 101 | // new contents 102 | this->SetBottom(this->back()->GetBottom()); 103 | lower.SetTop(lower.front()->GetTop()); 104 | 105 | // The upper box has been changed in place; the lower box is returned 106 | return lower; 107 | } 108 | 109 | //----------------------------------------------------------------------------// 110 | // Divides a TextBox into two by a vertical line given as an x value. Returns the right-hand box, or an empty TextBox if no split is made 111 | 112 | TextBox
TextBox::SplitIntoLeftAndRight(float left_edge) 113 | { 114 | if (this->empty()) return TextBox(); // Don't split the box if it's empty 115 | 116 | // This lambda defines a TextPointer sort from left to right 117 | auto LeftSort = [ ](const TextPointer& a, const TextPointer& b) -> bool 118 | { return a->GetLeft() < b->GetLeft(); }; 119 | // The sort is done in place, so the new order persists even if no split is made 120 | std::stable_sort(this->begin(), this->end(), LeftSort); 121 | 122 | // Lambda to find elements whose left edge is at or beyond the cutoff. With the elements sorted by left edge, a "< left_edge" test could only ever match the first element or none, so no split was possible 123 | auto FindLeftMost = [&](TextPointer text_ptr) -> bool 124 | { return text_ptr->GetLeft() >= left_edge; }; 125 | 126 | // Gets an iterator to the first element right of the cutoff 127 | auto split_at = find_if(this->begin(), this->end(), FindLeftMost); 128 | 129 | // We won't split the box if all or none of the elements would be moved 130 | // to a new box 131 | if (split_at == this->begin() || split_at == this->end()) return TextBox(); 132 | 133 | // Create a new textbox using a vector of all elements right of the cutoff 134 | // and a copy of this box's Box base 135 | std::vector rightmost_contents {split_at, this->end()}; 136 | auto rightmost = TextBox(std::move(rightmost_contents), (Box) *this); 137 | 138 | // Now we can erase the right-hand elements we have just copied from this box 139 | this->erase(split_at, this->end()); 140 | 141 | // We also need to readjust the margins of our bounding boxes based on their 142 | // new contents 143 | this->SetRight(this->back()->GetRight()); 144 | rightmost.SetLeft(rightmost.front()->GetLeft()); 145 | 146 | // The left box has been changed in place; the right box is returned 147 | return rightmost; 148 | } 149 | -------------------------------------------------------------------------------- /README.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | output: github_document 3 | --- 4 | 5 | 6 | 7 | ```{r, include = FALSE} 8 | knitr::opts_chunk$set( 9 | collapse = TRUE, 10 | comment = "#>", 11 | fig.path = "man/figures/README-", 12 
| out.width = "100%" 13 | ) 14 | ``` 15 | 16 | # PDFR 17 | 18 | 19 | [![Lifecycle: experimental](https://img.shields.io/badge/lifecycle-experimental-orange.svg)](https://lifecycle.r-lib.org/articles/stages.html#experimental) 20 | [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT) 21 | 22 | 23 | The goal of PDFR is to aid data scientists who need the ability to extract data from files in pdf format. PDFR is a new C++ based R library to extract usable text from portable document format (pdf) files. 24 | 25 | The majority of the code base is written in C++ with a view to being ported to other languages, but at present it is constructed to be built as an R package. 26 | 27 | ## Installation 28 | 29 | You can install the development version of PDFR from [GitHub](https://github.com/) with: 30 | 31 | ``` r 32 | # install.packages("pak") 33 | pak::pkg_install("AllanCameron/PDFR") 34 | ``` 35 | 36 | ## Usage 37 | 38 | The main function used to extract all data from a pdf page to an R data frame is `pdfpage()`. This accepts either the path to a pdf or a raw data vector representing a pdf. For example, this is how you extract all text from page 1 of the barcodes PDF from `pdfr_paths`: 39 | 40 | ```{r} 41 | library(PDFR) 42 | 43 | barcodes <- system.file("extdata", "barcodes.pdf", package = "PDFR") 44 | pdfpage(barcodes, 1) 45 | ``` 46 | 47 | ## Background 48 | 49 | The current version is at an early stage of development. It will work with most pdfs, but there are some unsupported features which may lead to some pdfs producing runtime errors. 50 | 51 | Documents encrypted using the standard method and which can be opened without a password are supported. Password-based encryption is currently unsupported. 52 | 53 | If there are any suggestions for development please submit a feature request, or let me know about pdfs that break the package. 
54 | 55 | ## Motivation 56 | 57 | Extracting useful data from pdf is difficult for two reasons. Firstly, the pdf format primarily consists of binary data, which is laid out in such a way as to provide quick random access to pdf *objects* as required by a pdf reader. The text elements as seen on the page are usually encoded in a binary stream within the document. Even when the binary stream is decoded, the text items exist as individual elements within a page description program, which has to be parsed before the text can be extracted. It is therefore not a trivial matter to extract the "raw text" from a pdf file into a format in which it can be read by R, though there exist some excellent tools that can do this quickly. In particular, [pdftools](https://ropensci.org/blog/2016/03/01/pdftools-and-jeroen/) provides an R interface to some of Poppler's pdf tools, and can quickly and reliably extract text wholesale from pdf. 58 | 59 | The second problem is that, unlike some other common file types used to exchange information on the internet (e.g. html, xml, csv, JSON), the raw text extracted from a pdf does not have a fixed structure to provide semantic information about the data to allow it to be processed easily by a data scientist. 60 | 61 | The mismatch between the fact that humans can read data from pdfs so easily yet the format is so difficult to convert into machine-readable data is explained by the fact that humans use the structure of the page layout to provide the semantic context to the data. When the structure is lost (as it often is with copy and pasting from PDF), it becomes very difficult for a human reader to interpret. The computer does not know how to interpret the characters' positions, so it cannot classify the characters by semantics as a human reader (usually) can. 
62 | 63 | The idea behind PDFR is to try to extract raw text, then use the positioning and formatting data from the extracted text to reconstruct some of the semantic content that would otherwise be lost. For example, identifying and grouping letters into words, words into paragraphs or into tables. 64 | 65 | Ultimately, to extract useful data, the user will need the option to control how and to what extent text elements are grouped. For example, they may need the fine control of having every letter's position on the page (e.g. to accurately reconstruct a part of the document on a plot), or may wish to extract a corpus of plain text from a book as a set of paragraphs or even whole pages. 66 | 67 | PDFR is written in C++11 and has no external dependencies, but makes extensive use of the C++ standard libraries. Rather than being based on an existing library such as [xpdf](https://www.xpdfreader.com/) or [Poppler](https://poppler.freedesktop.org/), it was written from scratch with the specific goal of making text extraction easier for R users. Most of the design is new, an attempt to implement the text extraction elements of the pdf standard [ISO 32000](https://www.iso.org/standard/51502.html), though it borrows some concepts from existing open-source libraries such as Poppler and [pdfjs](https://mozilla.github.io/pdf.js/). 68 | 69 | Clearly, the package would not exist without the excellent [Rcpp](http://www.rcpp.org/) package. Much of the pdf parsing would take too long to do in R, but having the facility to write C++ extensions makes pdf parsing feasible, and even pretty quick in some cases. 70 | 71 | ## Related projects 72 | 73 | - [pdftools](https://github.com/ropensci/pdftools): Text Extraction, Rendering and Converting of PDF Documents. 74 | - [qpdf](https://github.com/ropensci/qpdf): Content-preserving transformations of PDF files such as split, combine, and compress.
This package interfaces directly to the ‘qpdf’ C++ API and does not require any command line utilities. 75 | - [tabulizer](https://github.com/ropensci/tabulizer): Bindings for Tabula PDF Table Extractor Library 76 | - [PDE](https://github.com/erikstricker/PDE): The PDE (Pdf Data Extractor) allows the extraction of information and tables optionally based on search words from PDF (Portable Document Format) files and enables the visualization of the results, both by providing a convenient user-interface. 77 | - [xmpdf](https://github.com/trevorld/r-xmpdf): Edit XMP metadata and PDF bookmarks/documentation info. 78 | -------------------------------------------------------------------------------- /src/crypto.h: -------------------------------------------------------------------------------- 1 | //---------------------------------------------------------------------------// 2 | // // 3 | // PDFR crypto header file // 4 | // // 5 | // Copyright (C) 2018 by Allan Cameron // 6 | // // 7 | // Permission is hereby granted, free of charge, to any person obtaining // 8 | // a copy of this software and associated documentation files // 9 | // (the "Software"), to deal in the Software without restriction, including // 10 | // without limitation the rights to use, copy, modify, merge, publish, // 11 | // distribute, sublicense, and/or sell copies of the Software, and to // 12 | // permit persons to whom the Software is furnished to do so, subject to // 13 | // the following conditions: // 14 | // // 15 | // The above copyright notice and this permission notice shall be included // 16 | // in all copies or substantial portions of the Software. // 17 | // // 18 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS // 19 | // OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF // 20 | // MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
// 21 | // IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY // 22 | // CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, // 23 | // TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE // 24 | // SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. // 25 | // // 26 | //---------------------------------------------------------------------------// 27 | 28 | #ifndef PDFR_CRYPTO 29 | 30 | //---------------------------------------------------------------------------// 31 | 32 | #define PDFR_CRYPTO 33 | 34 | /* This header file includes the declaration of a class which contains the 35 | * algorithms needed to decrypt protected pdfs. 36 | * 37 | * This only applies to situations in which a password is not required to 38 | * open the file. It allows reading of pdfs in which the ability to copy and 39 | * paste, save or modify the file have been disabled by the owner but it can 40 | * still be opened and read by anyone without a user password. 41 | * 42 | * Most pdfs will open without the need for decryption, but some (such as the 43 | * ISO 32000 pdf reference document itself) are useless without the ability to 44 | * decrypt. 45 | * 46 | * Decryption is quite well encapsulated here. The implementation of decryption 47 | * is left to private member functions. The decryption itself is called only 48 | * when an object stream is extracted at the point of pdf object creation and 49 | * is accessed via a wrapper function in the xref class. The public interface 50 | * is a single function to decrypt a stream given the raw stream, the object 51 | * number and the generation number of the pdf object in which the stream 52 | * resides.
53 | */ 54 | 55 | #include 56 | #include 57 | #include // Needed for md5mix function 58 | #include 59 | #include "charstring.h" 60 | class Dictionary; 61 | 62 | 63 | //---------------------------------------------------------------------------// 64 | // The md5 algorithm makes use of 4-byte unsigned numbers (uint32_t). 65 | // To shorten the name and make it explicit what we are talking about I have 66 | // typedef'd uint32_t as FourBytes 67 | 68 | typedef uint32_t FourBytes; 69 | 70 | //---------------------------------------------------------------------------// 71 | // Class definition for crypto 72 | 73 | class Crypto 74 | { 75 | public: 76 | // Constructors 77 | Crypto(const Dictionary& encryption_dictionary, 78 | const Dictionary& trailer_dictionary); 79 | 80 | // This is the main decryption function which is also the public interface for 81 | // the class. It takes the raw stream, the object and generation numbers then 82 | // returns the decrypted stream. 83 | std::string DecryptStream(const std::string& stream_to_be_decoded, 84 | int object_number, 85 | int object_generation_number) const; 86 | // Overload taking the stream as a CharString rather than a std::string 87 | std::string DecryptStream(const CharString&, int, int) const; 88 | 89 | 90 | private: 91 | // private data members. The two Dictionary members are references, not copies, so the referenced dictionaries must outlive this object 92 | const Dictionary& encryption_dictionary_; 93 | const Dictionary& trailer_; 94 | int revision_; 95 | std::vector filekey_; 96 | static const std::vector default_user_password_; 97 | static const std::vector md5_table; 98 | static const std::vector> mixarray; 99 | 100 | // Chops FourBytes into 4 bytes 101 | std::vector ChopLong_(FourBytes) const; 102 | 103 | // Return permission flags for file 104 | std::vector ReadPermissions_(const std::string&); 105 | 106 | // Helper function for md5 107 | void Md5Mix_(int, std::deque&, std::vector&) const; 108 | 109 | // Gives md5 hash of a vector of raw bytes 110 | std::vector Md5_(const std::vector&) const; 111 | 112 | // Gives md5 hash of a string (as bytes) 113 | std::vector Md5_(const 
std::string&) const; 114 | 115 | // Gives rc4 cipher of message:key pair, given key and message. Returns nothing: the result is presumably written into the non-const first argument - TODO confirm which argument is the message 116 | void Rc4_(std::vector&, const std::vector&) const; 117 | 118 | // Gets /O and /U cipher 119 | std::vector ReadPassword_(const std::string&); 120 | 121 | // Constructs file key 122 | void ReadFileKey_(); 123 | 124 | // Checks file key (revision 2) 125 | void CheckKeyR2_(); 126 | 127 | // Checks file key (revision 3) 128 | void CheckKeyR3_(); 129 | 130 | // Ensure the ID is read correctly whether hex or plain bytes 131 | std::vector ParseID_(const std::string&); 132 | 133 | }; 134 | 135 | //---------------------------------------------------------------------------// 136 | 137 | #endif 138 | -------------------------------------------------------------------------------- /src/text_element.h: -------------------------------------------------------------------------------- 1 | //---------------------------------------------------------------------------// 2 | // // 3 | // PDFR TextElement header file // 4 | // // 5 | // Copyright (C) 2018 - 2019 by Allan Cameron // 6 | // // 7 | // Licensed under the MIT license - see https://mit-license.org // 8 | // or the LICENSE file in the project root directory // 9 | // // 10 | //---------------------------------------------------------------------------// 11 | 12 | #ifndef PDFR_TEXT_ELEMENT 13 | 14 | //---------------------------------------------------------------------------// 15 | 16 | #define PDFR_TEXT_ELEMENT 17 | 18 | #include 19 | #include "box.h" 20 | 21 | class Font; 22 | using Unicode = uint16_t; 23 | 24 | //---------------------------------------------------------------------------// 25 | // The "atom" of our output will be the TextElement. This is a class containing 26 | // one or more glyphs as a vector of uint16_t (representing Unicode code points) 27 | // along with its position, size, and the name of the font used to draw it. 
28 | // We will need to shuffle these around quite a lot in processing, so we use 29 | // shared pointers to each TextElement to represent each text element. The 30 | // pointers to text_elements are typedef'd as TextPointer for brevity. 31 | 32 | //---------------------------------------------------------------------------// 33 | // The TextElement is a class which contains information about each text 34 | // element on a page including the actual unicode glyph(s), the position, the 35 | // font and size of the character(s). It also holds a shared pointer (join_) to 36 | // the adjacent glyph which will be found during LetterGrouper's 37 | // construction, and Boolean flags to indicate whether it is "consumed" when 38 | // the glyphs are stuck together into words, as well as flags to indicate 39 | // whether the element is at the left, right or centre of a column 40 | 41 | class TextElement : public Box 42 | { 43 | typedef std::shared_ptr TextPointer; 44 | 45 | public: 46 | TextElement(float left, float right, float top, float bottom, 47 | float size, std::shared_ptr font, 48 | std::vector glyphs) 49 | : Box(left, right, top, bottom), size_(size), 50 | font_(font), glyph_(std::move(glyphs)), join_(nullptr) {}; 51 | 52 | // Inevitably, we need to define some "magic number" constants to define 53 | // how close together text elements have to be to clump together 54 | // All of these are multiplied by a font size (GetSize()) in the predicates below 55 | constexpr static float CLUMP_H = 0.01; // horizontal clumping, high = sticky 56 | constexpr static float CLUMP_V = 0.1; // vertical clumping, high = sticky 57 | constexpr static float LINE_CLUMP = 0.7; // bottom-edge tolerance in IsOnSameLineAs 58 | constexpr static float MAX_WORD_GAP = 0.5; // gap threshold in IsWayBeyond 59 | constexpr static float MAX_ALIGN_IGNORE = 0.0; // gap threshold in CannotJoinLeftOf 60 | // NOTE(review): 0x06 == 0x04 | 0x02, so the centred flag shares its bits with the left and right edge flags - confirm against Box::SetFlag / HasFlag that this is intended 61 | inline void MakeLeftEdge() { this->SetFlag(0x04); } 62 | inline void MakeRightEdge() { this->SetFlag(0x02); } 63 | inline void MakeCentred() { this->SetFlag(0x06); } 64 | inline float GetSize() const override {return this->size_;} 65 | inline bool IsLeftEdge() const { return 
this->HasFlag(0x04); } 66 | inline bool IsRightEdge() const { return this->HasFlag(0x02); } 67 | inline bool IsCentred() const { return this->HasFlag(0x06); } 68 | 69 | inline void SetJoin(TextPointer element) { this->join_ = element;} 70 | inline TextPointer GetJoin() { return this->join_; } 71 | inline bool HasJoin() const { if (join_) return true; else return false;} 72 | 73 | std::string GetFontName() const; // can't inline without including font.h 74 | inline std::vector GetGlyph() const { return this->glyph_;} 75 | inline void AddSpace() { glyph_.push_back(0x0020); } 76 | 77 | inline void PopLastGlyph() 78 | { 79 | if (glyph_.empty()) throw std::runtime_error("Can't pop empty vector"); 80 | else glyph_.pop_back(); 81 | } 82 | // Equality compares left, bottom and top positions and the glyphs; right edge, size and font are not compared 83 | inline bool operator ==(const TextElement& other) const 84 | { 85 | if (&other == this) return true; 86 | return (other.GetLeft() == this->GetLeft() && 87 | other.GetBottom() == this->GetBottom() && 88 | other.GetTop() == this->GetTop() && 89 | other.GetGlyph() == this->GetGlyph() ); 90 | } 91 | // NOTE(review): abs() is unqualified and applied to floats - confirm a floating-point overload is selected rather than the int version 92 | inline bool IsAdjoiningLetter(const TextElement& other) const 93 | { 94 | if (&other == this) return false; 95 | return 96 | other.GetLeft() > GetLeft() && 97 | abs(other.GetBottom() - GetBottom()) < (CLUMP_V * GetSize()) && 98 | ( 99 | abs(other.GetLeft() - GetRight()) < (CLUMP_H * GetSize()) || 100 | (other.GetLeft() < GetRight()) 101 | ) ; 102 | } 103 | // True when the two bottom edges differ by less than LINE_CLUMP times this element's size 104 | inline bool IsOnSameLineAs(const TextElement& other) const 105 | { 106 | if (&other == this) return true; 107 | return 108 | (other.GetBottom() - this->GetBottom() < LINE_CLUMP * this->GetSize()) && 109 | (this->GetBottom() - other.GetBottom() < LINE_CLUMP * this->GetSize()); 110 | } 111 | // True when this element starts more than MAX_WORD_GAP times the other's size to the right of the other's right edge 112 | inline bool IsWayBeyond(const TextElement& other) const 113 | { 114 | if (&other == this) return false; 115 | return GetLeft() - other.GetRight() > MAX_WORD_GAP * other.GetSize(); 116 | } 117 | 118 | inline bool CannotJoinLeftOf(const TextElement& other) const 119 | { 120 | if (&other == 
this) return true; 121 | return 122 | ( other.IsLeftEdge() || other.IsCentred() || 123 | this->IsRightEdge() || this->IsCentred()) && 124 | (other.GetLeft() - this->GetRight()) > (MAX_ALIGN_IGNORE * GetSize()); 125 | } 126 | // The following members are declared here and defined out of line 127 | void MergeLetters(TextElement&); 128 | bool IsElligibleToJoin(const TextElement&) const; 129 | void JoinWords(TextElement&); 130 | void ConcatenateUnicode(const std::vector&); 131 | std::string Utf(); 132 | 133 | 134 | private: 135 | float size_; // The font size 136 | std::shared_ptr font_; // Font used to draw text 137 | std::vector glyph_; // The actual Unicode glyphs encoded 138 | std::shared_ptr join_; // address of closest adjacent element 139 | }; 140 | 141 | 142 | #endif 143 | -------------------------------------------------------------------------------- /src/graphicobject.h: -------------------------------------------------------------------------------- 1 | //---------------------------------------------------------------------------// 2 | // // 3 | // PDFR GraphicObject header file // 4 | // // 5 | // Copyright (C) 2018 - 2021 by Allan Cameron // 6 | // // 7 | // Licensed under the MIT license - see https://mit-license.org // 8 | // or the LICENSE file in the project root directory // 9 | // // 10 | //---------------------------------------------------------------------------// 11 | 12 | #ifndef PDFR_GO 13 | 14 | //---------------------------------------------------------------------------// 15 | 16 | #define PDFR_GO 17 | 18 | #include "utilities.h" 19 | #include 20 | #include 21 | #include 22 | #include "text_element.h" 23 | 24 | 25 | /*---------------------------------------------------------------------------*/ 26 | /* This is a header-only implementation of a GraphicObject class, which is used 27 | * to store information about shapes extracted from the page description 28 | * program. 
29 | */ 30 | // Base class holding the line width, stroke and fill state shared by all graphic objects 31 | class GraphicObject 32 | { 33 | public: 34 | GraphicObject() : linewidth_(1), 35 | stroke_colour_({0, 0, 0}), is_stroked_(false), 36 | is_filled_(false), fill_colour_({0.5, 0.5, 0.5}) {}; 37 | virtual ~GraphicObject() = default; // polymorphic base: lets derived objects be destroyed safely through a base pointer 38 | // Setters 39 | void SetLineWidth(float size) {this->linewidth_ = size;} 40 | void SetColour(std::vector colour) {this->stroke_colour_ = colour;} 41 | void SetFillColour(std::vector colour) {this->fill_colour_ = colour;} 42 | void SetStroke(bool visible) {this->is_stroked_ = visible;} 43 | void SetFilled(bool is_filled) {this->is_filled_ = is_filled;} 44 | 45 | // virtual functions allow type-specific behaviour in derived classes; the base versions are no-ops or return placeholder values (0, {0}, "" or false) 46 | virtual void NewSubpath() {} 47 | virtual void SetX(std::vector values) {} 48 | virtual void SetY(std::vector values) {} 49 | virtual void CloseSubpath() {} 50 | virtual void AppendX(std::vector value) {} 51 | virtual void AppendY(std::vector value) {} 52 | virtual std::vector GetX() {return {0};} 53 | virtual std::vector GetY() {return {0};} 54 | virtual bool IsClosed() { return false;} 55 | virtual float Bottom() { return 0;} 56 | virtual float Top() { return 0;} 57 | virtual float Left() { return 0;} 58 | virtual float Right() { return 0;} 59 | virtual float Width() { return 0;} 60 | virtual float Height() { return 0;} 61 | virtual std::string GetText() {return "";} 62 | virtual float GetFontSize() {return 0;} 63 | virtual std::vector GetSubpaths() {return {0};} 64 | 65 | // Getters 66 | 67 | virtual float GetLineWidth() {return this->linewidth_;} 68 | virtual std::vector GetColour() {return this->stroke_colour_;} 69 | bool IsStroked() {return this->is_stroked_;} 70 | bool IsFilled() {return this->is_filled_;} 71 | std::vector GetFillColour() {return this->fill_colour_;} 72 | 73 | 74 | private: 75 | float linewidth_; 76 | std::vector stroke_colour_; 77 | bool is_stroked_; 78 | bool is_filled_; 79 | std::vector fill_colour_; 80 | 81 | }; 82 | 83 | 
/*---------------------------------------------------------------------------*/ 84 | 85 | class Path : public GraphicObject { 86 | public: 87 | Path(): path_x_({}), path_y_({}), current_subpath_(0), is_closed_({false}) {} 88 | 89 | void SetX(std::vector values) {this->path_x_ = values;} 90 | void SetY(std::vector values) {this->path_y_ = values;} 91 | 92 | void NewSubpath() {++current_subpath_;} 93 | void CloseSubpath() { 94 | is_closed_.back() = true; 95 | int pos = std::find(subpaths_.begin(), subpaths_.end(), current_subpath_) - 96 | subpaths_.begin(); 97 | path_x_.push_back(path_x_[pos]); 98 | path_y_.push_back(path_y_[pos]); 99 | subpaths_.push_back(subpaths_.back()); 100 | } 101 | 102 | void SetSubpaths(std::vector value) { subpaths_ = value;} 103 | 104 | void AppendX(std::vector value) { 105 | Concatenate(this->path_x_, {value}); 106 | while(subpaths_.size() < path_x_.size()){ 107 | subpaths_.push_back(current_subpath_); 108 | } 109 | } 110 | 111 | void AppendY(std::vector value) { 112 | Concatenate(this->path_y_, {value}); 113 | while(subpaths_.size() < path_x_.size()){ 114 | subpaths_.push_back(current_subpath_); 115 | } 116 | } 117 | 118 | std::vector GetX() {return this->path_x_;} 119 | std::vector GetY() {return this->path_y_;} 120 | bool IsClosed() { return this->is_closed_.back();} 121 | 122 | float Bottom() { return *std::min_element(this->path_y_.begin(), 123 | this->path_y_.end());} 124 | float Top() { return *std::max_element(this->path_y_.begin(), 125 | this->path_y_.end());} 126 | float Left() { return *std::min_element(this->path_x_.begin(), 127 | this->path_x_.end());} 128 | float Right() { return *std::max_element(this->path_x_.begin(), 129 | this->path_x_.end());} 130 | float Width() { return this->Right() - this->Left();} 131 | float Height() { return this->Top() - this->Bottom();} 132 | std::vector GetSubpaths() {return subpaths_;} 133 | 134 | private: 135 | std::vector path_x_; 136 | std::vector path_y_; 137 | int current_subpath_; 138 | 
std::vector subpaths_; 139 | std::vector is_closed_; 140 | }; 141 | 142 | /*---------------------------------------------------------------------------*/ 143 | 144 | class Text : public GraphicObject { 145 | 146 | public: 147 | Text(std::shared_ptr text) : contents_(text) {} 148 | std::string GetText() {return contents_->Utf();} 149 | std::vector GetColour() {return this->GetFillColour();} 150 | std::vector GetX() {return {contents_->GetLeft()};} 151 | std::vector GetY() {return {contents_->GetBottom()};} 152 | float GetFontSize() {return contents_->GetSize();} 153 | 154 | 155 | private: 156 | std::shared_ptr contents_; 157 | }; 158 | 159 | #endif 160 | -------------------------------------------------------------------------------- /src/word_grouper.cpp: -------------------------------------------------------------------------------- 1 | //---------------------------------------------------------------------------// 2 | // // 3 | // PDFR WordGrouper implementation file // 4 | // // 5 | // Copyright (C) 2018 - 2019 by Allan Cameron // 6 | // // 7 | // Licensed under the MIT license - see https://mit-license.org // 8 | // or the LICENSE file in the project root directory // 9 | // // 10 | //---------------------------------------------------------------------------// 11 | 12 | #include "word_grouper.h" 13 | 14 | using namespace std; 15 | 16 | //---------------------------------------------------------------------------// 17 | // This "magic number" is an integer that specifies how many glyphs need to 18 | // line up to infer an aligned column on the page. 19 | 20 | constexpr int EDGECOUNT = 4; 21 | 22 | //---------------------------------------------------------------------------// 23 | // Constructor for WordGrouper class. This takes the output from LetterGrouper 24 | // and finds its column edges, then joins elligible words together as long as 25 | // they do not belong to different columns. 
26 | 27 | WordGrouper::WordGrouper(std::unique_ptr text_box) 28 | : text_box_(move(text_box)) 29 | { 30 | FindEdges_(); 31 | AssignEdges_(); 32 | FindRightMatch_(); 33 | }; 34 | 35 | //---------------------------------------------------------------------------// 36 | // Makes a table of supplied vector of floats. Multiplies them by 10 and 37 | // converts to int, which truncates (rather than rounds) to 1 decimal place. It then removes any 38 | // keys whose counts are less than EDGECOUNT, so the remaining keys are the 39 | // positions we wish to identify as possible edges. Since the maps we want to 40 | // return are data members of the class, we need to pass the map we wish to 41 | // create by reference. 42 | 43 | void WordGrouper::Tabulate_(const vector& supplied_vector, 44 | unordered_map& table ) 45 | { 46 | // Take each member of the supplied vector 47 | for (const auto& element : supplied_vector) 48 | { 49 | // Multiply it by 10 and use it as a key in the map with value 1. NOTE(review): the (int) cast binds to the literal 10, not the product; the product is presumably narrowed to an int key by the pair - confirm the key type 50 | auto inserter = table.insert(pair((int) 10 * element, 1)); 51 | 52 | // If the key already exists in the map, increment the value by 1 53 | if (!inserter.second) inserter.first->second++; 54 | } 55 | 56 | // Now take each key in the resulting map 57 | for (auto key_value_pair = table.begin(); key_value_pair != table.end(); ) 58 | { 59 | // if value is below the number needed to declare a column, delete it (post-increment keeps the loop iterator valid across the erase) 60 | if (key_value_pair->second < EDGECOUNT) 61 | { 62 | table.erase(key_value_pair++); 63 | } 64 | else ++key_value_pair; 65 | } 66 | } 67 | 68 | //---------------------------------------------------------------------------// 69 | // This uses the Tabulate function to find left, right and centre-aligned text 70 | // elements on the page. 
71 | 72 | void WordGrouper::FindEdges_() 73 | { 74 | // Create vectors of left and right edges of text elements 75 | vector left, right, midvec; 76 | left.reserve(text_box_->size()); 77 | right.reserve(text_box_->size()); 78 | midvec.reserve(text_box_->size()); 79 | 80 | for (auto& element : *text_box_) 81 | { 82 | left.push_back(element->GetLeft()); 83 | right.push_back(element->GetRight()); 84 | midvec.push_back((left.back() + right.back()) / 2); 85 | } 86 | 87 | 88 | // Use Tabulate to find left and right edges as well as midpoints 89 | Tabulate_(left, left_edges_); 90 | Tabulate_(right, right_edges_); 91 | Tabulate_(midvec, mids_); 92 | } 93 | 94 | //---------------------------------------------------------------------------// 95 | // Now we need to "tell" each element whether it is a left, right or centre 96 | // aligned element so it "knows" which side(s), if any, are eligible to join 97 | // other elements 98 | 99 | void WordGrouper::AssignEdges_() 100 | { 101 | for (auto& element : *text_box_) 102 | { 103 | int left_int = element->GetLeft() * 10; 104 | int right_int = element->GetRight() * 10; 105 | int mid_int = (element->GetRight() + element->GetLeft()) * 5; 106 | 107 | // Non-unique left edge - assume column edge 108 | if (left_edges_.find(left_int) != left_edges_.end()) 109 | { 110 | element->MakeLeftEdge(); 111 | } 112 | 113 | // Non-unique right edge - assume column edge 114 | if (right_edges_.find(right_int) != right_edges_.end()) 115 | { 116 | element->MakeRightEdge(); 117 | } 118 | 119 | // Non-unique centre value - assume centred column 120 | if (mids_.find(mid_int) != mids_.end()) 121 | { 122 | element->MakeCentred(); 123 | } 124 | } 125 | } 126 | 127 | //---------------------------------------------------------------------------// 128 | // It's a bit naughty for a function to do two things instead of one, but these 129 | // two things are easier / quicker done in a single loop. 
Go through each text 130 | // item and check whether it is elligible for joining to another element. If it 131 | // is, find the most appropriate match to its right that is elligible and stick 132 | // the two together. 133 | 134 | void WordGrouper::FindRightMatch_() 135 | { 136 | // Handle empty data 137 | if (text_box_->empty()) throw runtime_error("empty data"); 138 | 139 | for (auto element = text_box_->begin(); element != text_box_->end(); ++element) 140 | { 141 | // Check the row is elligible for matching 142 | if ((*element)->IsConsumed()) continue; 143 | 144 | // If elligible, check every other word for the best match 145 | for (auto other = element; other != text_box_->end(); ++other) 146 | { 147 | // Don't match against itself 148 | if (element == other) continue; 149 | 150 | // These TextElement functions are quite complex in themselves 151 | if ((*element)->IsElligibleToJoin(**other)) 152 | { 153 | (*element)->JoinWords(**other); 154 | --element; // Keep matching same element until no other matches found 155 | break; 156 | } 157 | } 158 | } 159 | } 160 | 161 | //---------------------------------------------------------------------------// 162 | 163 | TextTable WordGrouper::Out() const { return TextTable(*text_box_);} 164 | -------------------------------------------------------------------------------- /src/text_element.cpp: -------------------------------------------------------------------------------- 1 | //---------------------------------------------------------------------------// 2 | // // 3 | // PDFR TextElement implementation file // 4 | // // 5 | // Copyright (C) 2018 - 2019 by Allan Cameron // 6 | // // 7 | // Licensed under the MIT license - see https://mit-license.org // 8 | // or the LICENSE file in the project root directory // 9 | // // 10 | //---------------------------------------------------------------------------// 11 | 12 | #include "utilities.h" 13 | #include "box.h" 14 | #include "font.h" 15 | #include "text_element.h" 16 | 17 | 
using namespace std; 18 | 19 | //---------------------------------------------------------------------------// 20 | 21 | void TextElement::MergeLetters(TextElement& matcher) 22 | { 23 | // paste the left glyph to the right glyph 24 | this->ConcatenateUnicode(matcher.glyph_); 25 | 26 | // make the right glyph now contain both glyphs 27 | swap(matcher.glyph_, this->glyph_); 28 | 29 | // make the right glyph now start where the left glyph started 30 | matcher.SetLeft(this->GetLeft()); 31 | 32 | // Ensure bottom is the lowest value of the two glyphs 33 | if (this->GetBottom() < matcher.GetBottom()) 34 | matcher.SetBottom(this->GetBottom()); 35 | 36 | // The checked glyph is now consumed - move to the next 37 | this->Consume(); 38 | } 39 | 40 | //---------------------------------------------------------------------------// 41 | 42 | bool TextElement::IsElligibleToJoin(const TextElement& other) const 43 | { 44 | return !other.IsConsumed() && 45 | other.IsBeyond(*this) && 46 | other.IsOnSameLineAs(*this) && 47 | !other.IsWayBeyond(*this) && 48 | !this->CannotJoinLeftOf(other) ; 49 | } 50 | 51 | //---------------------------------------------------------------------------// 52 | 53 | void TextElement::JoinWords(TextElement& other) 54 | { 55 | // This element is elligible for joining - start by adding a space to it 56 | this->glyph_.push_back(0x0020); 57 | 58 | // If the gap is wide enough, add two spaces 59 | if (other.GetLeft() - this->GetRight() > 1 * this->GetSize()) 60 | { 61 | this->glyph_.push_back(0x0020); 62 | } 63 | 64 | // Stick contents together 65 | Concatenate(this->glyph_, other.GetGlyph()); 66 | 67 | // The rightmost glyph's right edge properties are also copied over 68 | this->SetRight(other.GetRight()); 69 | if (other.IsRightEdge()) this->MakeRightEdge(); 70 | 71 | // The word will take up the size of its largest glyph 72 | this->SetTop(max(this->GetSize(), other.GetSize()) + this->GetBottom()); 73 | 74 | // The element on the right is now consumed 75 | 
other.Consume(); 76 | } 77 | 78 | //---------------------------------------------------------------------------// 79 | 80 | void TextElement::ConcatenateUnicode(const std::vector& other) 81 | { 82 | Concatenate(glyph_, other); 83 | } 84 | 85 | /*--------------------------------------------------------------------------*/ 86 | // converts (16-bit) Unicode code points to multibyte utf-8 encoding. 87 | 88 | string TextElement::Utf() 89 | { 90 | std::string result_string {}; // empty string for results 91 | for (auto& point : this->glyph_) // for each uint16_t in the input vector... 92 | { 93 | // values less than 128 are just single-byte ASCII 94 | if (point < 0x0080) 95 | { 96 | result_string.push_back(point & 0x007f); 97 | continue; 98 | } 99 | 100 | // values of 128 - 2047 are two bytes. The first byte starts 110xxxxx 101 | // and the second starts 10xxxxxx. The remaining 11 x's are filled with the 102 | // 11 bits representing a number between 128 and 2047. e.g. Unicode point 103 | // U+061f (decimal 1567) is 11000011111 in 11 bits of binary, which we split 104 | // into length-5 and length-6 pieces 11000 and 011111. These are appended on 105 | // to 110 and 10 respectively to give the 16-bit number 110 11000 10 011111, 106 | // which as two bytes is 11011000 10011111 or d8 9f. Thus the UTF-8 107 | // encoding for character U+061f is the two-byte sequence d8 9f. 108 | if (point > 0x007f && point < 0x0800) 109 | { 110 | // construct byte with bits 110 and first 5 bits of unicode point number 111 | result_string.push_back((0x00c0 | ((point >> 6) & 0x001f))); 112 | 113 | // construct byte with bits 10 and final 6 bits of unicode point number 114 | result_string.push_back(0x0080 | (point & 0x003f)); 115 | continue; 116 | } 117 | 118 | // Unicode values between 2048 (0x0800) and the maximum uint16_t value 119 | // (65535 or 0xffff) are given by 16 bits split over three bytes in the 120 | // following format: 1110xxxx 10xxxxxx 10xxxxxx. 
Each x here takes one of 121 | // the 16 bits representing 2048 - 65535. 122 | if (point > 0x07ff) 123 | { 124 | // First we specifically change ligatures to appropriate Ascii values 125 | if (point == 0xFB00) {result_string += "ff"; continue;} 126 | if (point == 0xFB01) {result_string += "fi"; continue;} 127 | if (point == 0xFB02) {result_string += "fl"; continue;} 128 | if (point == 0xFB03) {result_string += "ffi"; continue;} 129 | if (point == 0xFB04) {result_string += "ffl"; continue;} 130 | 131 | // construct byte with 1110 and first 4 bits of unicode point number 132 | result_string.push_back(0x00e0 | ((point >> 12) & 0x000f)); 133 | 134 | // construct byte with 10 and bits 5-10 of unicode point number 135 | result_string.push_back(0x0080 | ((point >> 6) & 0x003f)); 136 | 137 | // construct byte with bits 10 and final 6 bits of unicode point number 138 | result_string.push_back(0x0080 | ((point) & 0x003f)); 139 | } 140 | // Although higher Unicode points are defined and can be encoded in utf8, 141 | // the hex-strings in pdf seem to be two bytes wide at most. These are 142 | // therefore not supported at present. 
143 | } 144 | return result_string; 145 | } 146 | 147 | /*--------------------------------------------------------------------------*/ 148 | // Although this method looks like it should be inlined, doing so would mean 149 | // having to include font.h in the header file 150 | 151 | string TextElement::GetFontName() const 152 | { 153 | return this->font_->GetFontName(); 154 | } 155 | -------------------------------------------------------------------------------- /src/xref.h: -------------------------------------------------------------------------------- 1 | //---------------------------------------------------------------------------// 2 | // // 3 | // PDFR XRef header file // 4 | // // 5 | // Copyright (C) 2018 - 2019 by Allan Cameron // 6 | // // 7 | // Licensed under the MIT license - see https://mit-license.org // 8 | // or the LICENSE file in the project root directory // 9 | // // 10 | //---------------------------------------------------------------------------// 11 | 12 | #ifndef PDFR_XREF 13 | 14 | //---------------------------------------------------------------------------// 15 | 16 | #define PDFR_XREF 17 | 18 | /* This is the third main header in the daisy-chain of #includes that builds up 19 | * the tools needed to read and parse pdf, after utilities.h and dictionary.h. 20 | * It also includes a couple of other headers which are needed to decrypt and 21 | * decode encrypted and compressed streams (streams.h and crypto.h) 22 | * 23 | * The cross reference table (XRef) is a data structure (or more accurately a 24 | * group of data structures) that forms part of the pdf file format and allows 25 | * for the rapid random access of the pdf objects from which a document is 26 | * comprised. At its simplest, this is a table containing the object number, 27 | * the generation number of the object, and the number of bytes from the start 28 | * of the file where that object is located. 29 | * 30 | * However, it is not always quite that simple. 
Firstly, documents can and do 31 | * have more than one XRef that lists different objects. Secondly, the XRef 32 | * can itself be a compressed stream which must be found and translated before 33 | * being read. This means the XRef class must have access to decryption and 34 | * decoding algorithms. 35 | * 36 | * Fortunately, the location of the start of an XRef table (as number of bytes 37 | * offset from the start of the file) is given right at the end of a file, just 38 | * before the %%EOF on the last line. It is thus simple to get to the start of 39 | * an XRef from this number. For a normal uncompressed XRef, this takes us to 40 | * the top of a table which is just read and parsed. At the end of the table is 41 | * a special dictionary which does not belong to any object. This is the 42 | * trailer dictionary. If there are other xrefs in the file, this tells us 43 | * where the next one is, and we can continue to hop around and read the xrefs 44 | * until none are left and we have a complete "roadmap" of where the objects 45 | * are in the file. 46 | * 47 | * If, however, the XRef is located in a stream, things get more complicated. 48 | * The stream belongs to an object, and the dictionary at the beginning of that 49 | * object doubles as the trailer dictionary. As well as being compressed, the 50 | * stream containing the XRef is usually encoded as a string of bytes which 51 | * then need to be interpreted using the algorithm normally used for 52 | * decompressing PNG files. This makes handling XRef streams complex enough to 53 | * warrant their own class. 
However, since this class only has to perform a part 54 | * of XRef implementation, it has no public interface and is therefore not 55 | * defined in this header file, but rather within xref.cpp 56 | */ 57 | #include 58 | #include 59 | #include 60 | #include 61 | 62 | class Dictionary; 63 | class Crypto; 64 | class CharString; 65 | 66 | /*---------------------------------------------------------------------------*/ 67 | // The main XRef data member is an unordered map with the key being the object 68 | // number and the value being a struct of named ints as defined here 69 | 70 | struct XRefRow 71 | { 72 | int startbyte, // Byte offset of the object from the start of the file 73 | stopbyte, // The offset of the corresponding endobj marker 74 | in_object; // If this object is stored inside a stream, the number of 75 | }; // the object holding it. Has value of 0 if the object is not in a stream 76 | 77 | /*---------------------------------------------------------------------------*/ 78 | // The main XRef class definition. Since this is the main "skeleton" of the pdf 79 | // which is used by other classes to negotiate & parse the pdf, and because it 80 | // can be complex to construct, it is a fairly large and complex class. 81 | // 82 | // Where possible I have tried to delegate some of its work to other classes 83 | // or subclasses, but even so it is a little unwieldy.
84 | 85 | class XRef 86 | { 87 | public: 88 | XRef(std::shared_ptr); 89 | 90 | // Empty XRef constructor 91 | XRef(){}; 92 | 93 | // public methods 94 | Dictionary GetTrailer() const; // Gets trailer dictionary 95 | size_t GetObjectEndByte(int) const; // Gets object end position 96 | std::vector GetAllObjectNumbers() const; // Gets all object numbers 97 | CharString GetStreamLocation(int) const; // Gets start/stop of stream 98 | std::string Decrypt(std::string&, int, int) const; // Decrypts a stream 99 | std::string Decrypt(const CharString&, int, int) const; 100 | 101 | std::shared_ptr File() const { return file_string_;} 102 | 103 | CharString GetCharString() const { return CharString(*file_string_);} 104 | 105 | bool IsEncrypted() const { if(encryption_) return true; else return false; } 106 | 107 | size_t GetObjectStartByte(int object_number) const 108 | { return GetRow_(object_number).startbyte; } 109 | 110 | size_t GetHoldingNumberOf(int object_number) const 111 | { return GetRow_(object_number).in_object; } 112 | 113 | private: 114 | std::shared_ptr file_string_; // Pointer to file string 115 | std::unordered_map xref_table_; // Main data member 116 | Dictionary trailer_dictionary_; // Main trailer dictionary 117 | std::shared_ptr encryption_; // Used for encrypted files 118 | 119 | // private methods 120 | XRef& operator=(const XRef&); 121 | int GetStreamLength_(const Dictionary&) const; 122 | void LocateXRefs_(); // Finds XRef locations 123 | void ReadXRefStrings_(int); // Gets strings from XRef locations 124 | void ReadXRefFromStream_(int); // Uses xrefstream class to get XRef 125 | void ReadXRefFromString_(const CharString&); // parses XRef directly 126 | void CreateCrypto_(); // Allows decryption of encrypted docs 127 | const XRefRow& GetRow_(int) const; 128 | }; 129 | 130 | //---------------------------------------------------------------------------// 131 | 132 | #endif 133 | 
-------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | # PDFR 5 | 6 | 7 | 8 | [![Lifecycle: 9 | experimental](https://img.shields.io/badge/lifecycle-experimental-orange.svg)](https://lifecycle.r-lib.org/articles/stages.html#experimental) 10 | [![License: 11 | MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT) 12 | 13 | 14 | The goal of PDFR is to aid data scientists who need the ability to 15 | extract data from files in pdf format. PDFR is a new C++ based R library 16 | to extract usable text from portable document format (pdf) files. 17 | 18 | The majority of the code base is written in C++ with a view to being 19 | ported to other languages, but at present it is constructed to be built 20 | as an R package. 21 | 22 | ## Installation 23 | 24 | You can install the development version of PDFR from 25 | [GitHub](https://github.com/) with: 26 | 27 | ``` r 28 | # install.packages("pak") 29 | pak::pkg_install("AllanCameron/PDFR") 30 | ``` 31 | 32 | ## Usage 33 | 34 | The main function used to extract all data from a pdf page to an R data 35 | frame is `pdfpage()`. This accepts either the path to a pdf or a raw 36 | data vector representing a pdf. 
For example, this is how you extract all 37 | text from page 1 of the barcodes PDF from `pdfr_paths`: 38 | 39 | ``` r 40 | library(PDFR) 41 | 42 | barcodes <- system.file("extdata", "barcodes.pdf", package = "PDFR") 43 | pdfpage(barcodes, 1) 44 | #> text left right bottom top font size 45 | #> 1 None 53.5 74.4 774.2 782.2 Courier 8 46 | #> 2 Acute medicine 187.4 255.9 774.2 782.2 Courier 8 47 | #> 3 / 258.8 264.8 774.2 782.2 Courier 8 48 | #> 4 ward 267.8 288.6 774.2 782.2 Courier 8 49 | #> 5 11 291.6 303.5 774.2 782.2 Courier 8 50 | #> 6 jean.cairney@ggc.scot.nhs.uk0141 318.3 470.1 774.2 782.2 Courier 8 51 | #> 7 211 473.0 490.9 774.2 782.2 Courier 8 52 | #> 8 5719 493.9 514.7 774.2 782.2 Courier 8 53 | ``` 54 | 55 | ## Background 56 | 57 | The current version is at an early stage of development. It will work 58 | with most pdfs, but there are some unsupported features which may lead 59 | to some pdfs producing runtime errors. 60 | 61 | Documents encrypted using the standard method and which can be opened 62 | without a password are supported. Password-based encryption is currently 63 | unsupported. 64 | 65 | If there are any suggestions for development please submit a feature 66 | request, or let me know about pdfs that break the package. 67 | 68 | ## Motivation 69 | 70 | Extracting useful data from pdf is difficult for two reasons. Firstly, 71 | the pdf format primarily consists of binary data, which is laid out in 72 | such a way as to provide quick random access to pdf *objects* as 73 | required by a pdf reader. The text elements as seen on the page are 74 | usually encoded in a binary stream within the document. Even when the 75 | binary stream is decoded, the text items exist as individual elements 76 | within a page description program, which has to be parsed before the 77 | text can be extracted. 
It is therefore not a trivial matter to extract 78 | the “raw text” from a pdf file into a format in which it can be read by 79 | R, though there exist some excellent tools that can do this quickly. In 80 | particular, 81 | [pdftools](https://ropensci.org/blog/2016/03/01/pdftools-and-jeroen/) 82 | provides an R interface to some of Poppler’s pdf tools, and can quickly 83 | and reliably extract text wholesale from pdf. 84 | 85 | The second problem is that, unlike some other common file types used to 86 | exchange information on the internet (e.g. html, xml, csv, JSON), the 87 | raw text extracted from a pdf does not have a fixed structure to provide 88 | semantic information about the data to allow it to be processed easily 89 | by a data scientist. 90 | 91 | The mismatch between the fact that humans can read data from pdfs so 92 | easily yet the format is so difficult to convert into machine-readable 93 | data is explained by the fact that humans use the structure of the page 94 | layout to provide the semantic context to the data. When the structure 95 | is lost (as it often is with copy and pasting from PDF), it becomes very 96 | difficult for a human reader to interpret. The computer does not know 97 | how to interpret the characters’ positions, so it cannot classify the 98 | characters by semantics as a human reader (usually) can. 99 | 100 | The idea behind PDFR is to try to extract raw text then use the 101 | positioning and formatting data from the extracted text to reconstruct 102 | some of the semantic content that would otherwise be lost. For example, 103 | identifying and grouping letters into words, words into paragraphs or 104 | into tables. 105 | 106 | Ultimately, to extract useful data, the user will need the option to 107 | control how and to what extent text elements are grouped. For example, 108 | they may need the fine control of having every letter’s position on the 109 | page (e.g. 
to accurately reconstruct a part of the document on a plot), 110 | or may wish to extract a corpus of plain text from a book as a set of 111 | paragraphs or even whole pages. 112 | 113 | PDFR is written in C++11 and has no external dependencies, but makes 114 | extensive use of the C++ standard libraries. Rather than being based on 115 | an existing library such as [xpdf](https://www.xpdfreader.com/) or 116 | [Poppler](https://poppler.freedesktop.org/), it was written from scratch 117 | with the specific goal of making text extraction easier for R users. 118 | Most of the design is new, an attempt to implement the text extraction 119 | elements of the pdf standard [ISO 120 | 32000](https://www.iso.org/standard/51502.html), though it borrows some 121 | concepts from existing open-source libraries such as Poppler and 122 | [pdfjs](https://mozilla.github.io/pdf.js/). 123 | 124 | Clearly, the package would not exist without the excellent 125 | [Rcpp](http://www.rcpp.org/) package. Much of the pdf parsing would take 126 | too long to do in R, but having the facility to write C++ extensions 127 | makes pdf parsing feasible, and even pretty quick in some cases. 128 | 129 | ## Related projects 130 | 131 | - [pdftools](https://github.com/ropensci/pdftools): Text Extraction, 132 | Rendering and Converting of PDF Documents. 133 | - [qpdf](https://github.com/ropensci/qpdf): Content-preserving 134 | transformations of PDF files such as split, combine, 135 | and compress. This package interfaces directly to the ‘qpdf’ C++ API 136 | and does not require any command line utilities.
137 | - [tabulizer](https://github.com/ropensci/tabulizer): Bindings for 138 | Tabula PDF Table Extractor Library 139 | - [PDE](https://github.com/erikstricker/PDE): The PDE (Pdf Data 140 | Extractor) allows the extraction of information and tables optionally 141 | based on search words from PDF (Portable Document Format) files and 142 | enables the visualization of the results, both by providing a 143 | convenient user-interface. 144 | - [xmpdf](https://github.com/trevorld/r-xmpdf): Edit XMP metadata and 145 | PDF bookmarks/documentation info. 146 | -------------------------------------------------------------------------------- /src/textbox.h: -------------------------------------------------------------------------------- 1 | //---------------------------------------------------------------------------// 2 | // // 3 | // PDFR TextBox header file // 4 | // // 5 | // Copyright (C) 2018 - 2019 by Allan Cameron // 6 | // // 7 | // Licensed under the MIT license - see https://mit-license.org // 8 | // or the LICENSE file in the project root directory // 9 | // // 10 | //---------------------------------------------------------------------------// 11 | 12 | #ifndef PDFR_TEXT_BOX 13 | 14 | //---------------------------------------------------------------------------// 15 | 16 | #define PDFR_TEXT_BOX 17 | 18 | #include "text_element.h" 19 | 20 | //---------------------------------------------------------------------------// 21 | // We need to be able to process groups of text_elements together; for this we 22 | // could just use a vector of TextPointer. However, we often need to know the 23 | // bounding box of a group of text_elements. We can therefore define a TextBox 24 | // as a struct with a Box and a vector of text_elements. 25 | // 26 | // This header file contains the definitions of the TextElement, TextPointer and 27 | // TextBox classes. 
Most of their methods are straightforward and inlined, but 28 | // some of the more involved methods are described in text_element.cpp 29 | 30 | //---------------------------------------------------------------------------// 31 | // The TextBox will be the main data repository for our output. It inherits from 32 | // Box and contains a vector of text_elements. To make it easy to work with, it 33 | // contains functions that allow us to use it as if it was just a vector of 34 | // text_elements. This allows for easy iteration. 35 | 36 | class TextBox : public Box 37 | { 38 | using TextPointer = std::shared_ptr; 39 | using TextBoxIterator = std::vector::iterator; 40 | using TextBoxConstIterator = std::vector::const_iterator; 41 | 42 | public: 43 | // Standard constructor - takes vector of TextElement pointers and the minbox 44 | TextBox(std::vector text, Box box) 45 | : Box(box), data_(text) {} 46 | 47 | // Constructor from text and vector of floats representing a box 48 | TextBox(std::vector text, std::vector float_vector) 49 | : Box(float_vector), data_(text) {} 50 | 51 | // Constructor from individual elements 52 | TextBox(std::vector text, float left, float right, 53 | float top, float bottom) 54 | : Box(left, right, top, bottom), data_(text) {} 55 | 56 | // Assignment constructor 57 | TextBox(Box box): Box(box) {} 58 | 59 | // Default constructor 60 | TextBox() = default; 61 | 62 | // Copy contructor 63 | TextBox(const TextBox& textbox) = default; 64 | 65 | // Lvalue assignment constructor 66 | TextBox& operator=(const TextBox& textbox) = default; 67 | 68 | // Rvalue assignment constructor 69 | TextBox& operator=(TextBox&& textbox) noexcept { 70 | std::swap(textbox, *this); return *this;} 71 | 72 | std::shared_ptr CastToElement() 73 | { 74 | if (data_.size() > 1) 75 | { 76 | throw std::runtime_error("Can't cast multiple TextBoxes to TextElement"); 77 | } 78 | auto& element = data_[0]; 79 | element->SetLeft(this->GetLeft()); 80 | 
element->SetRight(this->GetRight()); 81 | element->SetTop(this->GetTop()); 82 | element->SetBottom(this->GetBottom()); 83 | return element; 84 | } 85 | 86 | // Functions to copy the methods of vectors to access main data object 87 | inline TextBoxIterator begin() {return data_.begin(); } 88 | inline TextBoxIterator end() {return data_.end(); } 89 | inline void erase(TextBoxIterator start, TextBoxIterator finish) 90 | { 91 | data_.erase(start, finish); 92 | } 93 | inline TextBoxConstIterator cbegin() const {return data_.cbegin(); } 94 | inline TextBoxConstIterator cend() const {return data_.cend(); } 95 | inline TextPointer& operator[](int index) { return data_[index]; } 96 | inline TextPointer front() const {return data_.front(); } 97 | inline TextPointer back() const { return data_.back(); } 98 | inline size_t size() const { return data_.size(); } 99 | inline bool empty() const { return data_.empty(); } 100 | inline void push_back(TextPointer text_ptr) { data_.push_back(text_ptr);} 101 | inline void clear() { data_.clear(); } 102 | inline void resize(int new_size) { data_.resize(new_size); } 103 | inline void SwapData(std::vector& other) 104 | { 105 | std::swap(data_, other); 106 | } 107 | 108 | inline void emplace_back(TextPointer text_ptr) 109 | { 110 | data_.emplace_back(text_ptr); 111 | } 112 | 113 | void RemoveDuplicates(); 114 | 115 | // Divides a TextBox into two 116 | TextBox SplitIntoTopAndBottom(float divide_at_this_y_value); 117 | TextBox SplitIntoLeftAndRight(float divide_at_this_x_value); 118 | 119 | private: 120 | // The data member 121 | std::vector data_; 122 | }; 123 | 124 | //---------------------------------------------------------------------------// 125 | // This struct inherits from Box, and is created by feeding it a TextBox. It 126 | // converts the vector of text_elements (which is conceptually a vector of 127 | // data frame rows) into columns of the different data types. 
128 | 129 | class TextTable: public Box 130 | { 131 | public: 132 | TextTable(const TextBox&); 133 | void Join(TextTable&); 134 | inline std::vector& GetLefts() { return this->lefts_; } 135 | inline std::vector& GetRights() { return this->rights_; } 136 | inline std::vector& GetTops() { return this->tops_; } 137 | inline std::vector& GetBottoms() { return this->bottoms_;} 138 | inline std::vector& GetSizes() { return this->sizes_; } 139 | inline std::vector& GetFontNames() { return this->fonts_; } 140 | inline std::vector& GetText() { return this->text_; } 141 | 142 | private: 143 | std::vector text_, fonts_; 144 | std::vector lefts_, rights_, bottoms_, tops_, sizes_; 145 | }; 146 | 147 | 148 | //---------------------------------------------------------------------------// 149 | // PageBox class. This is a class containing multiple textboxes as well as a 150 | // 'naked' Box that gives the page dimensions 151 | 152 | class PageBox : public Box 153 | { 154 | public: 155 | PageBox(const Box& box, std::vector text_boxes) 156 | : Box(box), data_(text_boxes) {} 157 | 158 | inline TextBox& operator[](size_t i) { return data_[i];} 159 | inline std::vector::iterator begin() { return data_.begin();} 160 | inline std::vector::iterator end() { return data_.end();} 161 | inline bool empty() const { return data_.empty();} 162 | inline size_t size() const { return data_.size();} 163 | inline void push_back(TextBox textbox) { data_.push_back(textbox);} 164 | TextBox CastToTextBox() 165 | { 166 | auto result = TextBox((Box) *this); 167 | for (auto box : data_) 168 | { 169 | if(!box.empty()) result.push_back(box.CastToElement()); 170 | } 171 | return result; 172 | } 173 | 174 | private: 175 | std::vector data_; 176 | }; 177 | 178 | #endif 179 | -------------------------------------------------------------------------------- /src/encoding.h: -------------------------------------------------------------------------------- 1 | 
//---------------------------------------------------------------------------// 2 | // // 3 | // PDFR Encoding header file // 4 | // // 5 | // Copyright (C) 2018 - 2019 by Allan Cameron // 6 | // // 7 | // Licensed under the MIT license - see https://mit-license.org // 8 | // or the LICENSE file in the project root directory // 9 | // // 10 | //---------------------------------------------------------------------------// 11 | 12 | #ifndef PDFR_ENCODING 13 | 14 | //---------------------------------------------------------------------------// 15 | 16 | #define PDFR_ENCODING 17 | 18 | /* This is the joint 6th in a series of daisy-chained headers that build up the 19 | * tools to read and parse pdfs. It is logically paired with glyphwidths.h 20 | * in that they both come after document.h and together form the basis for the 21 | * next step, which is font creation. 22 | * 23 | * The reason that font creation comes before page creation is that pages 24 | * include a list of their fonts in the page description header, and the 25 | * program needs to know what these are. 26 | * 27 | * There are three main parts of font creation pertinent to the task of text 28 | * extraction: identifying the font's name, working out the width of the glyphs, 29 | * and working out the correspondence between the characters in a pdf string and 30 | * the intended glyphs as Unicode code points. The latter of these tasks is 31 | * called encoding, and is fairly complex. 32 | * 33 | * The complexity arises because there are several different methods for 34 | * encoding fonts in pdf. First, a base encoding scheme can be declared, such as 35 | * WinAnsiEncoding or MacRomanEncoding. These encodings are stored as static 36 | * private data members of the class in the form of an unordered_map, though 37 | * they are defined in the chartounicode.cpp file rather than encoding.cpp to 38 | * improve code readability. 
39 | * 40 | * Whether a base encoding is specified or not, the actual encoding used can 41 | * be modified, for example to include Unicode characters that are not 42 | * available in the base encoding's character set (a common example is the 43 | * glyph for the ligatures ff, fi or fl). This is done using an explicit 44 | * mapping of input characters ("code points") to standard glyph names. That 45 | * means the program needs to know all these glyph names and how to convert 46 | * them to Unicode. This is a very large mapping, and again is declared here as 47 | * a static member but defined in a separate source file (adobetounicode.h) 48 | * 49 | * The encoding may instead be specified in a CMap, which is a type of 50 | * raw char to Unicode mapping table that usually appears in a (compressed) 51 | * pdf object stream. 52 | * 53 | * The idea behind the encoding class is to use these methods as required to 54 | * produce a mapping for each font so that each code point encountered has a 55 | * Unicode interpretation. It keeps the implementation private and its interface 56 | * is limited to querying its main data member - an unordered map of input 57 | * characters (represented as 2-byte unsigned integers or uint16_t) to Unicode 58 | * characters (also represented as uint16_t). Since in most cases the input 59 | * characters are given as single bytes, these have to be recast as two-byte 60 | * uints for consistency to handle the odd cases when two-byte characters are 61 | * supplied in the strings (as is the case with "hexstrings" or ascii-encoded 62 | * multi-byte character strings). 63 | * 64 | * To make the code clearer, both RawChar and Unicode are typedef'd as synonyms 65 | * of uint16_t so we know at any time whether we are referring to input ("raw") 66 | * code points or output (Unicode) characters.
 */

#include
#include
#include
#include
#include

class Dictionary;
class Document;
using Unicode = uint16_t;
using RawChar = uint16_t;

//---------------------------------------------------------------------------//
// The encoding class comprises constructors which use private subroutines
// and large static maps to construct the main variable data member. The
// public interface is a simple RawChar in, Unicode out translator and a
// function to get all of the encoding (RawChar) keys
//
// NOTE(review): in this copy of the header the #include targets above and the
// template arguments of std::shared_ptr, std::unordered_map, std::vector and
// std::pair below are missing (they appear to have been stripped). They are
// left exactly as found and must be restored before this will compile.

class Encoding
{
 public:
  // Constructor. Takes the font's dictionary by reference and a shared
  // pointer to the document.
  Encoding(Dictionary& font_dictionary,
           std::shared_ptr ptr_to_document);

  // Maps given raw code point to Unicode
  Unicode Interpret(const RawChar& code_point_to_be_interpreted);

  // This typedef shortens the name of the RawChar to Unicode lookup maps.
  typedef std::unordered_map UnicodeMap;

  // Gets all available Raw chars that may be translated to Unicode in the map
  std::shared_ptr GetEncodingKeys();

 private:
  // States used by parser to read "differences" entry in encoding dictionary
  enum DifferencesState { NEWSYMB, NUM, NAME, STOP };

  // Data lookup tables - defined as static, which means only a single
  // instance of each is created rather than a copy for each object.
  // Note these maps are defined in adobetounicode.h and chartounicode.h
  static const std::unordered_map adobe_to_unicode_;
  static const UnicodeMap macroman_to_unicode_;
  static const UnicodeMap winansi_to_unicode_;
  static const UnicodeMap pdfdoc_to_unicode_;

  UnicodeMap encoding_map_;        // The main data member lookup

  // the main font dictionary. This is a reference member.
  // NOTE(review): presumably bound to the constructor's font_dictionary
  // argument, which would then have to outlive this object - confirm.
  Dictionary& font_dictionary_;
  std::shared_ptr document_;       // pointer to the containing document
  std::string base_encoding_;      // value of /BaseEncoding entry

  // The entries_ vector gives a pair of type : entry for each entity pushed
  // onto the stack by the lexer. We therefore know whether we are dealing with
  // a code point or a name when we parse the stack
  std::vector> entries_;

  // private member functions

  // uses lexer to parse /Differences entry
  void ReadDifferences_(const std::string&);

  // finds encoding dictionary, gets /BaseEncoding and /Differences entries
  void ReadEncoding_();            // Tokenizer
  void ReadDifferenceEntries_();   // Parser

  // parses CMap encoding ranges
  void ProcessUnicodeRange_(std::vector&);

  // parses CMap direct char-char conversion table
  void ProcessUnicodeChars_(std::vector&);

  // finds CMap if any and co-ordinates parsers to create mapping
  void MapUnicode_();

  // Handles type 1 fonts
  void HandleTypeOneFont_();
  void ParseTypeOneFont_(std::string);

  // Helper function for parser. Both arguments are taken by non-const
  // reference.
  void Write_(DifferencesState& state_to_push_to_entries,
              std::string& string_to_push_to_entries);
};

//---------------------------------------------------------------------------//

#endif

--------------------------------------------------------------------------------
/src/glyphwidths.h:
--------------------------------------------------------------------------------
1 | //---------------------------------------------------------------------------// 2 | // // 3 | // PDFR GlyphWidths header file // 4 | // // 5 | // Copyright (C) 2018 - 2019 by Allan Cameron // 6 | // // 7 | // Licensed under the MIT license - see https://mit-license.org // 8 | // or the LICENSE file in the project root directory // 9 | // // 10 | //---------------------------------------------------------------------------// 11 | 12 | #ifndef PDFR_WIDTH 13 | 14 | //---------------------------------------------------------------------------// 15 | 16 | #define PDFR_WIDTH 17 | 18 | /* This is the joint 6th in a series of daisy-chained headers that build up the 19 | * tools to read and parse pdfs. It is logically paired with encoding.h 20 | * in that they both come after document.h and together form the basis for the 21 | * next step, which is font creation. 22 | * 23 | * Calculating the width of each glyph is necessary for working out the spacing 24 | * between letters, words, paragraphs and other text elements. The glyph widths 25 | * in pdf are given in units of text space, where 1000 = 1 point = 1/72 inch in 26 | * 1-point font size. 27 | * 28 | * Getting the glyph widths is one of the more complex tasks in extracting text, 29 | * since there are various ways for pdf files to describe them. The most 30 | * explicit way is by listing the font widths at each code point in an array. 31 | * The array is preceded by the first code point that is being described, 32 | * then the array itself comprises numbers for the widths of sequential code 33 | * points. Often there are several consecutive arrays like this specifying 34 | * groups of sequential code points. Sometimes the entry is just an array of 35 | * widths, and the first code point is given separately in the font 36 | * dictionary. Sometimes there is a default width for missing glyphs.
Sometimes 37 | * the width array is in the font dictionary; sometimes it is in a descendant 38 | * font dictionary; other times it is in an encoded stream; still other times 39 | * it comprises an entire non-dictionary object on its own. 40 | * 41 | * In older pdfs, the widths may not be specified at all if the font used is 42 | * one of 14 core fonts in the pdf specification. A conforming reader is 43 | * supposed to know the glyph widths for these fonts. 44 | * 45 | * The glyphwidth class attempts to work out the method used to describe 46 | * glyph widths and produce a map of the intended glyphs to their intended 47 | * widths, without bothering any other classes with its implementation. 48 | * 49 | * Among the tools it needs to do this, it requires navigating the document, 50 | * reading dictionaries and streams, and parsing a width description array. 51 | * It therefore needs the document.h header which wraps most of these 52 | * capabilities. The class defines its own lexer for interpreting the special 53 | * width arrays. 54 | * 55 | * It also needs a group of static objects listing the widths of each of the 56 | * characters used in the 'built-in' fonts used in pdfs. In theory, later 57 | * versions of pdf require specification of all glyph widths, but for back- 58 | * compatibility, the widths of the 14 core fonts still need to be defined. 59 | * 60 | * The widths are available as an open online resource from Adobe. 61 | * 62 | * To preserve encapsulation, this header is included only by the fonts 63 | * class. The fonts class merges its width map with the encoding map to 64 | * produce the glyphmap, which gives the intended Unicode code point and 65 | * width as a paired value for any given input character in a pdf string. 
 */

//---------------------------------------------------------------------------//

#include
#include
#include
#include

class Dictionary;
class Document;
using Unicode = uint16_t;
using RawChar = uint16_t;


//---------------------------------------------------------------------------//
// The GlyphWidths class contains private methods to find the description of
// widths for each character in a font. It only makes sense to the font class,
// from whence it is created and accessed.
//
// The core font widths are declared static private because they are only
// needed by this class, and we don't want an extra copy of all of them if
// several fonts are created. This also prevents them polluting the global
// namespace.
//
// NOTE(review): in this copy of the header the #include targets above and the
// template arguments of std::shared_ptr, std::vector and std::unordered_map
// below are missing (they appear to have been stripped). They are left exactly
// as found and must be restored before this will compile.

class GlyphWidths
{
 public:
  // Constructor. Despite its name, font_dictionary_ptr is a reference, not a
  // pointer; the document is passed as a shared pointer.
  GlyphWidths(Dictionary& font_dictionary_ptr,
              std::shared_ptr document_ptr);

  // public methods
  float GetWidth(const RawChar& code_point);  // Get width of character code
  std::vector WidthKeys();                    // Returns all map keys

  // Returns the width_is_pre_interpretation_ flag described below
  inline bool WidthsAreForRaw() const { return width_is_pre_interpretation_; }

 private:
  // This enum is used in the width array lexer
  enum WidthState {NEWSYMB, READFIRSTCHAR, READSECONDCHAR,
                   READWIDTH, INSUBARRAY, END};

  // private data
  std::unordered_map width_map_;        // The main data member
  Dictionary& font_dictionary_;         // The font dictionary (held by
                                        // reference, not owned)
  std::shared_ptr document_;            // Pointer to document
  std::string base_font_;               // The base font (if any)
  bool width_is_pre_interpretation_;    // Are widths for code points
                                        // pre- or post- translation?
  // private methods
  void ParseWidthArray_(const std::string&);  // Width lexer
  void ReadCoreFont_();                       // Core font getter
  void ParseDescendants_();                   // Gets descendant dictionary
  void ParseWidths_();                        // Parses the width array
  void ReadWidthTable_();                     // Co-ordinates construction

// Nine static width tables are declared in the box below.
// NOTE(review): presumably the remaining core fonts reuse one of these tables
// - confirm in corefonts.cpp.
//-- The core fonts as defined in corefonts.cpp ------------------------------//
                                                                              //
  static const std::unordered_map courier_widths_;                            //
  static const std::unordered_map helvetica_widths_;                          //
  static const std::unordered_map helvetica_bold_widths_;                     //
  static const std::unordered_map symbol_widths_;                             //
  static const std::unordered_map times_bold_widths_;                         //
  static const std::unordered_map times_bold_italic_widths_;                  //
  static const std::unordered_map times_italic_widths_;                       //
  static const std::unordered_map times_roman_widths_;                        //
  static const std::unordered_map dingbats_widths_;                           //
                                                                              //
//----------------------------------------------------------------------------//
};

//---------------------------------------------------------------------------//

#endif

--------------------------------------------------------------------------------
/src/object_class.cpp:
--------------------------------------------------------------------------------
//---------------------------------------------------------------------------//
//                                                                           //
//  PDFR Object implementation file                                          //
//                                                                           //
//  Copyright (C) 2018 - 2019 by Allan Cameron                               //
//                                                                           //
//  Licensed under the MIT license - see https://mit-license.org             //
//  or the LICENSE file in the project root directory                        //
//                                                                           //
//---------------------------------------------------------------------------//

#include "utilities.h"
#include "dictionary.h"
#include "streams.h"
#include "deflate.h"
#include "xref.h"
#include "object_class.h"
#include

//---------------------------------------------------------------------------// 21 | 22 | using namespace std; 23 | 24 | //---------------------------------------------------------------------------// 25 | // The main object creator class. It needs a pointer to the xref and a number 26 | // representing the object's number as set out in the xref table. 27 | 28 | Object::Object(shared_ptr xref, int object_number) : 29 | xref_(xref), 30 | object_number_(object_number), 31 | raw_stream_(), 32 | stream_index_(make_shared>>()) 33 | { 34 | // Find start and end of object 35 | size_t start = xref_->GetObjectStartByte(object_number_); 36 | size_t stop = xref_->GetObjectEndByte(object_number_); 37 | 38 | if (xref_->File()->substr(start, 20).find("%") != string::npos) 39 | { 40 | start = xref_->File()->substr(start, 200).find("\n") + start; 41 | } 42 | 43 | // We check to see if the object has a header dictionary by finding '<<' 44 | if (xref_->File()->substr(start, 20).find("<<") == string::npos) 45 | { 46 | // No dictionary found - make blank dictionary for header 47 | header_ = Dictionary(); 48 | 49 | // Finds start and length of contents 50 | size_t c_start = xref_->File()->find(" obj", start) + 4; 51 | raw_stream_ = {xref_->File()->c_str() + c_start, stop - c_start}; 52 | } 53 | 54 | else // Else the object has a header dictionary 55 | { 56 | header_ = Dictionary(xref_->File(), start); 57 | // Find the stream (if any) 58 | raw_stream_ = xref_->GetStreamLocation(start); 59 | 60 | // The object may contain an object stream that needs unpacked 61 | if (header_["/Type"] == "/ObjStm") 62 | { 63 | // Get the object stream 64 | ReadStream_(); 65 | 66 | // Index the objects in the stream 67 | IndexObjectStream_(); 68 | } 69 | } 70 | } 71 | 72 | //---------------------------------------------------------------------------// 73 | // Object streams start with a group of integers representing the object 74 | // numbers and the byte offset of each object relative to the stream. 
This 75 | // method reads the objects and their positions in the stream, indexing them 76 | // for later retrieval. 77 | 78 | void Object::IndexObjectStream_() 79 | { 80 | // Get the first character that is not a digit or space 81 | int startbyte = stream_.find_first_not_of("\n\r\t 0123456789"); 82 | 83 | // Now get the substring with the objects proper... 84 | string stream_string(stream_.begin() + startbyte, stream_.end()); 85 | 86 | // ...and the substring with the registration numbers... 87 | string index_string(stream_.begin(), stream_.begin() + startbyte - 1); 88 | 89 | // extract these numbers to a vector 90 | vector index = ParseInts(index_string); 91 | 92 | // If this is empty, something has gone wrong. 93 | if (index.empty()) throw runtime_error("Couldn't parse object stream"); 94 | 95 | // We now set up a loop that determines which numbers are object numbers and 96 | // which are byte offsets 97 | for (size_t byte_length, i = 1; i < index.size(); i += 2) 98 | { 99 | if (i == (index.size() - 1)) byte_length = stream_string.size() - index[i]; 100 | else byte_length = index[i + 2] - index[i]; 101 | auto&& index_pair = make_pair(index[i] + startbyte, byte_length); 102 | (*stream_index_)[index[i - 1]] = index_pair; 103 | } 104 | } 105 | 106 | /*---------------------------------------------------------------------------*/ 107 | // The constructor for in-stream objects. 
This is called automatically by the 108 | // main object constructor if the main object constructor determines that the 109 | // requested object lies inside the stream of another object 110 | 111 | Object::Object(shared_ptr holder, int object_number): 112 | xref_(holder->xref_), 113 | object_number_(object_number), 114 | raw_stream_() 115 | { 116 | auto finder = holder->stream_index_->find(object_number_); 117 | if (finder == holder->stream_index_->end()) 118 | { 119 | throw runtime_error("Object not found in stream"); 120 | } 121 | 122 | auto index_position = finder->second.first; 123 | auto index_length = finder->second.second; 124 | auto stream_string = holder->stream_.substr(index_position, index_length); 125 | 126 | // Most stream objects consist of just a dictionary 127 | if (stream_string[0] == '<') 128 | { 129 | header_ = Dictionary(make_shared(stream_string)); 130 | stream_ = ""; // stream objects don't have their own stream 131 | } 132 | else // The object is not a dictionary - maybe just an array or int etc 133 | { 134 | header_ = Dictionary();// empty header 135 | stream_ = stream_string; // Call the contents a stream for ease 136 | 137 | // Annoyingly, some "objects" in an object stream are just pointers 138 | // to other objects. 
This is pointless but does happen and needs to 139 | // be handled by recursively calling the constructor 140 | if (stream_.size() < 15 && stream_.find(" R", 0) < 15) 141 | { 142 | size_t new_number = ParseReferences(stream_)[0]; 143 | size_t holder = xref_->GetHoldingNumberOf(new_number); 144 | if (holder == 0) *this = Object(xref_, new_number); 145 | else *this = Object(make_shared(xref_, holder), new_number); 146 | this->object_number_ = object_number; 147 | } 148 | } 149 | } 150 | 151 | /*---------------------------------------------------------------------------*/ 152 | // Simple public getter for the header dictionary 153 | 154 | Dictionary& Object::GetDictionary() 155 | { 156 | return header_; 157 | } 158 | 159 | /*---------------------------------------------------------------------------*/ 160 | // We have to create the stream on the fly when it is needed rather than 161 | // calculating and storing all the streams upon document creation 162 | 163 | string& Object::GetStream() 164 | { 165 | // If the stream has not already been processed, do it now 166 | if (stream_.empty()) ReadStream_(); 167 | return stream_; 168 | } 169 | 170 | /*---------------------------------------------------------------------------*/ 171 | // We will keep all stream processing in one place for easier debugging and 172 | // future development 173 | 174 | void Object::ReadStream_() 175 | { 176 | 177 | string filters = header_["/Filter"]; 178 | bool is_flatedecode = filters.find("/FlateDecode") != string::npos; 179 | 180 | // Decrypt if necessary 181 | if (xref_->IsEncrypted()) 182 | { 183 | stream_ = xref_->Decrypt(raw_stream_, object_number_, 0); 184 | if (is_flatedecode) stream_ = FlateDecode(&stream_); 185 | } 186 | else 187 | { 188 | if (is_flatedecode) stream_ = FlateDecode(raw_stream_); 189 | } 190 | } 191 | 192 | --------------------------------------------------------------------------------