├── .Rbuildignore
├── .gitignore
├── .travis.yml
├── DESCRIPTION
├── LICENSE
├── LICENSE.md
├── Makefile
├── NAMESPACE
├── NEWS.md
├── R
    └── word_count.R
├── README.md
├── inst
    └── CITATION
├── man
    └── word_count.Rd
├── server.R
├── tests
    ├── testthat-pdfcount.R
    └── testthat
    │   ├── test.tex
    │   └── tests.R
└── ui.R


/.Rbuildignore:
--------------------------------------------------------------------------------
 1 | ^\.github.?
 2 | ^\.travis\.yml$
 3 | ^appveyor\.yml$
 4 | ^travis-tool\.sh$
 5 | ^Makefile$
 6 | ^README\.Rmd$
 7 | ^README\.html$
 8 | ^README_files$
 9 | ^README_files/.+$
10 | ^CONTRIBUTING\.md$
11 | ^LICENSE\.md$
12 | ^inst/doc/.+\.log$
13 | ^inst/doc/.+\.Rmd$
14 | ^inst/rsconnect/.+$
15 | ^figure$
16 | ^figure/.+$
17 | ^cache/.+$
18 | ^docs$
19 | ^docs/.+$
20 | ^data-raw$
21 | ^data-raw/.+$
22 | ^revdep.?
23 | ^ignore$
24 | ^.+\.aux$
25 | ^.+\.bbl$
26 | ^.+\.blg$
27 | ^.+\.dvi$
28 | ^.+\.log$
29 | ^.+\.out$
30 | ^.+\.pdf$
31 | ^.+\.sty$
32 | ^.+\.tex$
33 | server\.R
34 | ui\.R
35 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | rsconnect/*
2 | tests/testthat/*.aux
3 | tests/testthat/*.log
4 | tests/testthat/*.pdf
5 | 


--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
 1 | language: r
 2 | sudo: false
 3 | cache: packages
 4 | 
 5 | addons:
 6 |   apt:
 7 |     packages:
 8 |       - libpoppler-cpp-dev
 9 | 
10 | after_success:
11 | - R -q -e 'library("covr");codecov()'
12 | 


--------------------------------------------------------------------------------
/DESCRIPTION:
--------------------------------------------------------------------------------
 1 | Package: pdfcount
 2 | Type: Package
 3 | Title: Word Counts for PDF Documents
 4 | Description: Count words in a PDF in R or via a Shiny application.
 5 | License: MIT + file LICENSE
 6 | Version: 0.1.4
 7 | Date: 2018-08-30
 8 | Authors@R: c(person("Thomas J.", "Leeper",
 9 |                     role = c("aut", "cre"), 
10 |                     email = "thosjleeper@gmail.com",
11 |                     comment = c(ORCID = "0000-0003-4097-6326")))
12 | URL: https://leeper.shinyapps.io/pdfcount/
13 | BugReports: https://github.com/leeper/pdfcount/issues
14 | Imports:
15 |   pdftools,
16 |   dplyr,
17 |   tidytext
18 | Suggests:
19 |   testthat,
20 |   tools,
21 |   shiny,
22 |   plotly
23 | ByteCompile: true
24 | RoxygenNote: 6.1.0
25 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | YEAR: 2018
2 | COPYRIGHT HOLDER: Thomas J. Leeper
3 | 


--------------------------------------------------------------------------------
/LICENSE.md:
--------------------------------------------------------------------------------
 1 | Copyright (c) 2018 Thomas J. Leeper
 2 | 
 3 | Permission is hereby granted, free of charge, to any person obtaining a copy
 4 | of this software and associated documentation files (the "Software"), to deal
 5 | in the Software without restriction, including without limitation the rights
 6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 7 | copies of the Software, and to permit persons to whom the Software is
 8 | furnished to do so, subject to the following conditions:
 9 | 
10 | The above copyright notice and this permission notice shall be included in all
11 | copies or substantial portions of the Software.
12 | 
13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
19 | SOFTWARE.
20 | 


--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
 1 | # package name is taken from the directory containing this Makefile
 2 | pkg = $(shell basename $(CURDIR))
 3 | 
 4 | all: build
 5 | 
 6 | # re-run roxygen (devtools::document()) whenever anything under R/ changes
 7 | NAMESPACE: R/*
 8 | 	Rscript -e "devtools::document()"
 9 | 
10 | README.html: README.md
11 | 	pandoc -o README.html README.md
12 | 
13 | ../$(pkg)*.tar.gz: DESCRIPTION NAMESPACE README.md
14 | 	cd ../ && R CMD build $(pkg)
15 | 
16 | build: ../$(pkg)*.tar.gz
17 | 
18 | check: ../$(pkg)*.tar.gz
19 | 	cd ../ && R CMD check $(pkg)*.tar.gz
20 | 	rm ../$(pkg)*.tar.gz
21 | 
22 | install: ../$(pkg)*.tar.gz
23 | 	cd ../ && R CMD INSTALL $(pkg)*.tar.gz
24 | 	rm ../$(pkg)*.tar.gz
25 | 
26 | shiny: NAMESPACE
27 | 	Rscript -e "shiny::runApp()"
28 | 
29 | # the R expression is single-quoted so the shell keeps its inner double quotes
30 | deploy: NAMESPACE
31 | 	Rscript -e 'rsconnect::deployApp(appFiles = c("ui.R", "server.R", "R/word_count.R"), forceUpdate = TRUE)'
32 | 


--------------------------------------------------------------------------------
/NAMESPACE:
--------------------------------------------------------------------------------
1 | # Generated by roxygen2: do not edit by hand
2 | 
3 | export(word_count)
4 | import(dplyr)
5 | import(pdftools)
6 | import(tidytext)
7 | 


--------------------------------------------------------------------------------
/NEWS.md:
--------------------------------------------------------------------------------
 1 | # pdfcount 0.1.4
 2 | 
 3 | * Added a test suite. (#3)
 4 | 
 5 | # pdfcount 0.1.3
 6 | 
 7 | * Change plot to interactive plotly graphics.
 8 | * Improve `page` parsing to reduce failures and clarify page sequencing on UI.
 9 | 
10 | # pdfcount 0.1.2
11 | 
12 | * Add plot of per-page word counts.
13 | 
14 | # pdfcount 0.1.1
15 | 
16 | * Initial release
17 | 
18 | 


--------------------------------------------------------------------------------
/R/word_count.R:
--------------------------------------------------------------------------------
  1 | #' @title Word Count a PDF
  2 | #' @description Obtain a Word Count from a PDF
  3 | #' @param document A file path specifying a PDF document.
  4 | #' @param pages Optionally, an integer vector specifying a subset of pages to count from. Negative values serve as negative subsets. Missing values, zeros, and page numbers beyond the end of the document are ignored.
  5 | #' @param count_numbers A logical specifying whether to count numbers as words.
  6 | #' @param count_captions A logical specifying whether to count lines beginning with \dQuote{Table} or \dQuote{Figure} in word count.
  7 | #' @param count_equations A logical specifying whether to count lines ending with \dQuote{([Number])} in word count.
  8 | #' @param split_hyphenated A logical specifying whether to split hyphenated words or expressions as separate words.
  9 | #' @param split_urls A logical specifying whether to split URLs into multiple words when counting.
 10 | #' @param verbose A logical specifying whether to be verbose. If \code{TRUE}, the page and word counts are printed to the console and the result is returned invisibly. If \code{FALSE}, the result is visible.
 11 | #' @return A data frame with two columns, one specifying page and the other specifying word count for that page.
 12 | #' @details This is useful for obtaining a word count for a LaTeX-compiled PDF. Counting words in the tex source is a likely undercount (due to missing citations, cross-references, and parenthetical citations). Counting words from the PDF is likely an overcount (due to hyphenation issues, URLs, ligatures, tables and figures, and various other things). This function tries to obtain a word count from the PDF while accounting for some of the sources of overcounting.
 13 | #' 
 14 | #' It is often desirable to have word counts excluding tables and figures. A solution on TeX StackExchange (\url{https://tex.stackexchange.com/a/352394/30039}) provides guidance on how to exclude tables and figures (or any arbitrary LaTeX environment) from a compiled document, which may be useful before attempting to word count the PDF.
 15 | #' 
 16 | #' @author Thomas J. Leeper <thosjleeper@gmail.com>
 17 | #' @examples
 18 | #' \dontrun{
 19 | #' # "R-intro.pdf" manual
 20 | #' rintro <- file.path(Sys.getenv("R_HOME"), "doc", "manual", "R-intro.pdf")
 21 | #' 
 22 | #' # Online service at http://www.montereylanguages.com/pdf-word-count-online-free-tool.html
 23 | #' # claims the word count to be 36,530 words
 24 | #' 
 25 | #' # Microsoft Word (PDF conversion) word count is 36,869 words
 26 | #' 
 27 | #' word_count(rintro)      # all pages (105 pages, 37870 words)
 28 | #' word_count(rintro, 1:3) # pages 1-3
 29 | #' word_count(rintro, -1)  # skip first page
 30 | #' }
 31 | #' @import pdftools
 32 | #' @import dplyr
 33 | #' @import tidytext
 34 | #' @export
 35 | word_count <-
 36 | function(
 37 |   document,
 38 |   pages = NULL,
 39 |   count_numbers = TRUE,
 40 |   count_captions = FALSE,
 41 |   count_equations = FALSE,
 42 |   split_hyphenated = FALSE,
 43 |   split_urls = FALSE,
 44 |   verbose = getOption("verbose", FALSE)
 45 | ) {
 46 |     
 47 |     # import: one character string per page
 48 |     char <- pdftools::pdf_text(document)
 49 |     
 50 |     # handle URLs
 51 |     ## unnest_tokens() splits URLs by default into multiple tokens
 52 |     if (!isTRUE(split_urls)) {
 53 |         # borrowed from: https://stackoverflow.com/a/8234912/2338862
 54 |         url_regex <- "((([A-Za-z]{3,9}:(?:\\/\\/)?)(?:[-;:&=+$,\\w]+@)?[A-Za-z0-9.-]+|(?:www.|[-;:&=+$,\\w]+@)[A-Za-z0-9.-]+)((?:\\/[\\+~%\\/.\\w-_]*)?\\??(?:[-\\\\+=&;%@.\\w_]*)#?(?:[\\w]*))?)"
 55 |         char <- gsub(url_regex, "URL", char, perl = TRUE)
 56 |     }
 57 |     
 58 |     # cleanup hyphenations across line breaks
 59 |     char <- gsub("-\n", "", char)
 60 |     
 61 |     # handle hyphenated words
 62 |     ## unnest_tokens() splits hyphenated words by default into multiple tokens
 63 |     if (!isTRUE(split_hyphenated)) {
 64 |         char <- gsub("(?<=.)-(?=.)", "", char, perl = TRUE)
 65 |     }
 66 |     
 67 |     # subset pages
 68 |     all_pages <- seq_along(char)
 69 |     if (!is.null(pages)) {
 70 |         ## drop missing values; zeros fall into neither group below
 71 |         pages <- pages[!is.na(pages)]
 72 |         pos <- pages[pages > 0]
 73 |         neg <- pages[pages < 0]
 74 |         ## with no inclusions requested, start from all pages
 75 |         to_count <- rep(length(pos) == 0L, length(char))
 76 |         ## ignore out-of-range pages: assigning past the end would grow
 77 |         ## to_count and yield NA pages when subsetting
 78 |         to_count[pos[pos <= length(char)]] <- TRUE
 79 |         to_count[abs(neg[abs(neg) <= length(char)])] <- FALSE
 80 |         ## subset
 81 |         char <- char[to_count]
 82 |         pages <- all_pages[to_count]
 83 |     } else {
 84 |         pages <- all_pages
 85 |     }
 86 |     
 87 |     # tidy lines
 88 |     txt_df <- data.frame(page = pages, text = char, stringsAsFactors = FALSE)
 89 |     tidy_lines <- tokenize_to_lines(txt_df)
 90 |     
 91 |     # remove likely figure/table captions
 92 |     if (!isTRUE(count_captions)) {
 93 |         ## alternation is grouped so "^" and the number apply to both words; leading layout whitespace is allowed
 94 |         tidy_lines <- tidy_lines[!grepl("^[[:space:]]*([Ff]igure|[Tt]able) [[:digit:]]+ ?[.:,] ?", tidy_lines$line), ]
 95 |     }
 96 |     
 97 |     # remove likely equations
 98 |     if (!isTRUE(count_equations)) {
 99 |         tidy_lines <- tidy_lines[!grepl(" +\\([[:digit:]]+\\)$", tidy_lines$line), ]
100 |     }
101 |     
102 |     # tidy words
103 |     tidy_words <- tokenize_to_words(tidy_lines)
104 |     
105 |     # handle numbers
106 |     if (!isTRUE(count_numbers)) {
107 |         suppressWarnings(tidy_words$number <- as.numeric(tidy_words$word))
108 |         tidy_words <- tidy_words[is.na(tidy_words$number), ]
109 |         tidy_words$number <- NULL
110 |     }
111 |     
112 |     # count and, if verbose, message() the count
113 |     if (isTRUE(verbose)) {
114 |         message(sprintf("Document with %d %s and %d %s",
115 |                         nrow(txt_df),
116 |                         ngettext(nrow(txt_df), "page", "pages"),
117 |                         nrow(tidy_words),
118 |                         ngettext(nrow(tidy_words), "word", "words")))
119 |     }
120 |     
121 |     # construct page-level data frame of counts to return
122 |     ## n() is namespace-qualified so that this also works when the file is
123 |     ## source()d without dplyr attached (as server.R does)
124 |     out <- dplyr::ungroup(dplyr::summarize(dplyr::group_by(tidy_words, page), words = dplyr::n()))
125 |     
126 |     if (isTRUE(verbose)) {
127 |         invisible(out)
128 |     } else {
129 |         out
130 |     }
131 | }
132 | 
133 | # Expand the `text` column of `dat` into a `line` column via tidytext's
134 | # "lines" tokenizer; the remaining columns are carried along by unnest_tokens().
135 | tokenize_to_lines <- function(dat)
136 |     tidytext::unnest_tokens(dat, line, text, token = "lines")
137 | 
138 | # Expand the `line` column of `dat` into a `word` column using tidytext's
139 | # default tokenizer; drop = FALSE keeps the original `line` column as well.
140 | tokenize_to_words <- function(dat)
141 |     tidytext::unnest_tokens(dat, word, line, drop = FALSE)
142 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # pdfcount
 2 | 
 3 | [![CRAN](https://www.r-pkg.org/badges/version/pdfcount)](https://cran.r-project.org/package=pdfcount)
 4 | ![Downloads](https://cranlogs.r-pkg.org/badges/pdfcount)
 5 | [![Travis Build Status](https://travis-ci.org/leeper/pdfcount.png?branch=master)](https://travis-ci.org/leeper/pdfcount)
 6 | [![codecov.io](https://codecov.io/github/leeper/pdfcount/coverage.svg?branch=master)](https://codecov.io/github/leeper/pdfcount?branch=master)
 7 | 
 8 | This is a simple package and shiny app designed to count words in a PDF document, for example generated by LaTeX.
 9 | 
10 | You can use the package locally:
11 | 
12 | ```R
13 | # install
14 | remotes::install_github("leeper/pdfcount")
15 | 
16 | # load
17 | library("pdfcount")
18 | 
19 | # count
20 | rintro <- file.path(Sys.getenv("R_HOME"), "doc", "manual", "R-intro.pdf")
21 | word_count(rintro)
22 | ```
23 | 
24 | Or, you can use it as a shiny app. A demo runs live at: https://leeper.shinyapps.io/pdfcount/
25 | 
26 | 


--------------------------------------------------------------------------------
/inst/CITATION:
--------------------------------------------------------------------------------
 1 | citHeader("To cite package 'pdfcount' in publications use:")
 2 | 
 3 | # `meta` holds the package DESCRIPTION fields; the year comes from its Date
 4 | year <- sub(".*(2[[:digit:]]{3})-.*", "\\1", meta$Date, perl = TRUE)
 5 | vers <- paste("R package version", meta$Version)
 6 | 
 7 | # bibentry()/person() replace the old-style citEntry()/personList() calls
 8 | bibentry(bibtype = "Manual",
 9 |          title = "pdfcount: Word Counts for PDF Documents",
10 |          author = person("Thomas J.", "Leeper"),
11 |          year = year,
12 |          note = vers,
13 |          textVersion =
14 |          paste0("Thomas J. Leeper (", year,
15 |                 "). pdfcount: Word Counts for PDF Documents. ", vers, "."))
16 | 


--------------------------------------------------------------------------------
/man/word_count.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/word_count.R
 3 | \name{word_count}
 4 | \alias{word_count}
 5 | \title{Word Count a PDF}
 6 | \usage{
 7 | word_count(document, pages = NULL, count_numbers = TRUE,
 8 |   count_captions = FALSE, count_equations = FALSE,
 9 |   split_hyphenated = FALSE, split_urls = FALSE,
10 |   verbose = getOption("verbose", FALSE))
11 | }
12 | \arguments{
13 | \item{document}{A file path specifying a PDF document.}
14 | 
15 | \item{pages}{Optionally, an integer vector specifying a subset of pages to count from. Negative values serve as negative subsets.}
16 | 
17 | \item{count_numbers}{A logical specifying whether to count numbers as words.}
18 | 
19 | \item{count_captions}{A logical specifying whether to count lines beginning with \dQuote{Table} or \dQuote{Figure} in word count.}
20 | 
21 | \item{count_equations}{A logical specifying whether to count lines ending with \dQuote{([Number])} in word count.}
22 | 
23 | \item{split_hyphenated}{A logical specifying whether to split hyphenated words or expressions as separate words.}
24 | 
25 | \item{split_urls}{A logical specifying whether to split URLs into multiple words when counting.}
26 | 
27 | \item{verbose}{A logical specifying whether to be verbose. If \code{TRUE}, the page and word counts are printed to the console and the result is returned invisibly. If \code{FALSE}, the result is visible.}
28 | }
29 | \value{
30 | A data frame with two columns, one specifying page and the other specifying word count for that page.
31 | }
32 | \description{
33 | Obtain a Word Count from a PDF
34 | }
35 | \details{
36 | This is useful for obtaining a word count for a LaTeX-compiled PDF. Counting words in the tex source is a likely undercount (due to missing citations, cross-references, and parenthetical citations). Counting words from the PDF is likely an overcount (due to hyphenation issues, URLs, ligatures, tables and figures, and various other things). This function tries to obtain a word count from the PDF while accounting for some of the sources of overcounting.
37 | 
38 | It is often desirable to have word counts excluding tables and figures. A solution on TeX StackExchange (\url{https://tex.stackexchange.com/a/352394/30039}) provides guidance on how to exclude tables and figures (or any arbitrary LaTeX environment) from a compiled document, which may be useful before attempting to word count the PDF.
39 | }
40 | \examples{
41 | \dontrun{
42 | # "R-intro.pdf" manual
43 | rintro <- file.path(Sys.getenv("R_HOME"), "doc", "manual", "R-intro.pdf")
44 | 
45 | # Online service at http://www.montereylanguages.com/pdf-word-count-online-free-tool.html
46 | # claims the word count to be 36,530 words
47 | 
48 | # Microsoft Word (PDF conversion) word count is 36,869 words
49 | 
50 | word_count(rintro)      # all pages (105 pages, 37870 words)
51 | word_count(rintro, 1:3) # pages 1-3
52 | word_count(rintro, -1)  # skip first page
53 | }
54 | }
55 | \author{
56 | Thomas J. Leeper <thosjleeper@gmail.com>
57 | }
58 | 


--------------------------------------------------------------------------------
/server.R:
--------------------------------------------------------------------------------
 1 | ## --------
 2 | ## server.R
 3 | ## --------
 4 | 
 5 | library("shiny")
 6 | library("ggplot2")
 7 | library("plotly")
 8 | source("R/word_count.R")
 9 | 
10 | # Parse a page specification such as "1:3,5:7" or "-1" into an integer vector
11 | # WITHOUT evaluating it as R code: the text comes straight from the browser,
12 | # so eval(parse()) would let any visitor run arbitrary code on the server.
13 | # Accepts comma-separated integers and `from:to` ranges (whitespace ignored);
14 | # returns NULL, meaning "all pages", for empty or unrecognized input.
15 | parse_pages <- function(spec) {
16 |     if (!is.character(spec) || length(spec) != 1L || is.na(spec)) {
17 |         return(NULL)
18 |     }
19 |     tokens <- strsplit(gsub("[[:space:]]+", "", spec), ",", fixed = TRUE)[[1L]]
20 |     tokens <- tokens[nzchar(tokens)]
21 |     # at most 5 digits per number keeps as.integer() and seq() well-behaved
22 |     valid <- grepl("^-?[0-9]{1,5}(:-?[0-9]{1,5})?$", tokens)
23 |     if (length(tokens) == 0L || !all(valid)) {
24 |         return(NULL)
25 |     }
26 |     unlist(lapply(strsplit(tokens, ":", fixed = TRUE), function(bounds) {
27 |         bounds <- as.integer(bounds)
28 |         seq(bounds[1L], bounds[length(bounds)])
29 |     }))
30 | }
31 | 
32 | # Shiny server logic: counts words in the uploaded PDF and renders the total,
33 | # a heading, and a per-page bar chart.
34 | server <- function(input, output) {
35 |     count <- reactive({
36 |         infile <- input$infile
37 |         if (is.null(infile)) {
38 |             return(NULL)
39 |         }
40 |         
41 |         # count words
42 |         word_count(infile$datapath,
43 |                    pages = parse_pages(input$pages),
44 |                    count_numbers = input$count_numbers,
45 |                    count_captions = input$count_captions,
46 |                    count_equations = input$count_equations,
47 |                    split_hyphenated = input$split_hyphenated,
48 |                    split_urls = input$split_urls,
49 |                    verbose = FALSE)
50 |     })
51 |     
52 |     # total word count
53 |     output$grand_total <- renderText({
54 |         counts <- count()
55 |         if (is.null(counts)) {
56 |             return("")
57 |         }
58 |         paste("Total Word Count:", sum(counts$words, na.rm = TRUE))
59 |     })
60 |     
61 |     # text to display when showing per-page word counts
62 |     output$page_counts <- renderText({
63 |         counts <- count()
64 |         if (is.null(counts)) {
65 |             return("")
66 |         }
67 |         paste("Word Count by Page:")
68 |     })
69 |     
70 |     # graph of word counts by page
71 |     output$barplot <- renderPlotly({
72 |         counts <- count()
73 |         # with no rows, max() below would be -Inf and seq_len() would fail
74 |         if (is.null(counts) || nrow(counts) == 0L) {
75 |             return(NULL)
76 |         }
77 |         
78 |         # plot
79 |         ggplot(counts, aes(x = page, y = words)) +
80 |           xlab("Page") +
81 |           ylab("Words per page") +
82 |           scale_x_reverse(breaks = seq_len(max(counts$page, na.rm = TRUE))) + 
83 |           geom_col(na.rm = TRUE) +
84 |           coord_flip() +
85 |           theme_minimal()
86 |     })
87 | }
88 | 


--------------------------------------------------------------------------------
/tests/testthat-pdfcount.R:
--------------------------------------------------------------------------------
1 | library("testthat")  # test framework
2 | library("pdfcount")  # package under test
3 | test_check("pdfcount")  # run the package's testthat suite
4 | 


--------------------------------------------------------------------------------
/tests/testthat/test.tex:
--------------------------------------------------------------------------------
 1 | \documentclass{article}
 2 | 
 3 | \begin{document}
 4 | 
 5 | \title{Two-word Title: Three-word article subtitle}
 6 | \author{Two Words}
 7 | \date{}
 8 | 
 9 | \maketitle
10 | 
11 | \clearpage
12 | 
13 | \abstract{Page has five words.}
14 | 
15 | \clearpage
16 | 
17 | Page three has five words.
18 | 
19 | \clearpage
20 | 
21 | Page 4 has a table:
22 | 
23 | \begin{tabular}{lll}
24 | Column One & Column Two & Column Three \\
25 | 1 & A & B \\
26 | 2 & C & D \\
27 | 3 & E & F \\
28 | \end{tabular}
29 | 
30 | \clearpage
31 | 
32 | Page five has a footnote.\footnote{Page has nine words.}
33 | 
34 | \clearpage
35 | 
36 | Page six has an equation:
37 | 
38 | \begin{equation}
39 | Y = \alpha + \beta X
40 | \end{equation}
41 | 
42 | \end{document}
43 | 


--------------------------------------------------------------------------------
/tests/testthat/tests.R:
--------------------------------------------------------------------------------
 1 | context("Test word_count()")
 2 | # build the PDF fixture; without a LaTeX toolchain, fall through to the guard below
 3 | try(tools::texi2pdf("test.tex"), silent = TRUE)
 4 | 
 5 | if ("test.pdf" %in% dir()) {
 6 | 
 7 |     # count all words, including numbers
 8 |     wc <- word_count("test.pdf")
 9 |     
10 |     # count all words, excluding numbers
11 |     wc_no_numbers <- word_count("test.pdf", count_numbers = FALSE)
12 | 
13 |     test_that("Test word_count()", {
14 |         
15 |         # class
16 |         expect_true(inherits(wc, "data.frame"), label = "word_count() returns data frame")
17 |         
18 |         # nrow()
19 |         expect_true(nrow(wc) == 6L, label = "word_count() returns correct rows")
20 |         
21 |         # ncol() & names()
22 |         expect_true(ncol(wc) == 2L, label = "word_count() returns correct columns")
23 |         expect_true(identical(names(wc), c("page", "words")), label = "word_count() returns correct column names")
24 |         
25 |         # correct counts -- NOTE(review): all.equal()'s `tolerance` is relative, so 1L accepts large deviations; confirm whether +/- 1 word was intended
26 |         expect_true(all.equal(wc$words, c(8, 6, 6, 21, 12, 6), tolerance = 1L),
27 |                     label = "word_count() returns correct word counts, with numbers")
28 |         expect_true(all.equal(wc_no_numbers$words, c(7, 6, 5, 16, 9, 5), tolerance = 1L),
29 |                     label = "word_count() returns correct word counts, without numbers")
30 | 
31 |     })
32 | 
33 | }
34 | 


--------------------------------------------------------------------------------
/ui.R:
--------------------------------------------------------------------------------
 1 | ## -----
 2 | ## ui.R
 3 | ## -----
 4 | 
 5 | library("shiny")
 6 | library("plotly")
 7 | 
 8 | # sidebar: upload, page selection, and counting options
 9 | controls <- column(
10 |   3,
11 |   strong(p("Upload your file here:")),
12 |   fileInput("infile", label = NULL, accept = "application/pdf"),
13 |   textInput("pages", "Page numbers (e.g., 1:3,5:7):", "", width = "70%"),
14 |   strong(p("Additional options:")),
15 |   checkboxInput("count_numbers", "Count numbers?", TRUE),
16 |   checkboxInput("count_captions", "Count table/figure captions?", FALSE),
17 |   checkboxInput("count_equations", "Count equation lines?", FALSE),
18 |   checkboxInput("split_hyphenated", "Split hyphenated words?", FALSE),
19 |   checkboxInput("split_urls", "Tokenize URLs?", FALSE)
20 | )
21 | 
22 | # main area: total, per-page heading, and interactive bar chart
23 | results <- column(
24 |   9,
25 |   strong(textOutput("grand_total")),
26 |   p(""),
27 |   strong(textOutput("page_counts")),
28 |   plotlyOutput("barplot", width = "100%", height = "500px")
29 | )
30 | 
31 | # footer: copyright, license, and source links
32 | credits <- tags$footer(
33 |   tags$p(
34 |     "Copyright Thomas J. Leeper (2018).",
35 |     a("MIT-licensed.", href = "https://opensource.org/licenses/MIT"),
36 |     "Source code and R package available from: ",
37 |     a("https://github.com/leeper/pdfcount", href = "https://github.com/leeper/pdfcount")
38 |   )
39 | )
40 | 
41 | # assemble the page
42 | ui <- fluidPage(
43 |   h2("Count Words in a PDF Document", style = "text-align:center;"),
44 |   br(),
45 |   fluidRow(controls, results),
46 |   br(),
47 |   credits
48 | )
49 | 


--------------------------------------------------------------------------------