├── .github └── workflows │ └── rhub.yaml ├── .gitignore ├── .travis.yml ├── DESCRIPTION ├── LICENSE ├── LICENSE.md ├── NAMESPACE ├── NEWS.md ├── R ├── data.R ├── ggram.R ├── methods.R ├── ngram.R ├── ngrami.R ├── ngramr-package.R ├── ngramw.R ├── sysdata.rda ├── themes.R └── utilities.R ├── README.md ├── cran-comments.md ├── man ├── chunk.Rd ├── corpuses.Rd ├── figures │ ├── archy.png │ └── hacker.png ├── ggram.Rd ├── hacker.Rd ├── ngram.Rd ├── ngrami.Rd ├── ngramr.Rd ├── ngramw.Rd ├── print.ngram.Rd └── theme_google.Rd ├── testme └── tests ├── results.txt ├── testthat.R └── testthat └── test-ngramr.R /.github/workflows/rhub.yaml: -------------------------------------------------------------------------------- 1 | # R-hub's generic GitHub Actions workflow file. It's canonical location is at 2 | # https://github.com/r-hub/actions/blob/v1/workflows/rhub.yaml 3 | # You can update this file to a newer version using the rhub2 package: 4 | # 5 | # rhub::rhub_setup() 6 | # 7 | # It is unlikely that you need to modify this file manually. 8 | 9 | name: R-hub 10 | run-name: "${{ github.event.inputs.id }}: ${{ github.event.inputs.name || format('Manually run by {0}', github.triggering_actor) }}" 11 | 12 | on: 13 | workflow_dispatch: 14 | inputs: 15 | config: 16 | description: 'A comma separated list of R-hub platforms to use.' 17 | type: string 18 | default: 'linux,windows,macos' 19 | name: 20 | description: 'Run name. You can leave this empty now.' 21 | type: string 22 | id: 23 | description: 'Unique ID. You can leave this empty now.' 
24 | type: string 25 | 26 | jobs: 27 | 28 | setup: 29 | runs-on: ubuntu-latest 30 | outputs: 31 | containers: ${{ steps.rhub-setup.outputs.containers }} 32 | platforms: ${{ steps.rhub-setup.outputs.platforms }} 33 | 34 | steps: 35 | # NO NEED TO CHECKOUT HERE 36 | - uses: r-hub/actions/setup@v1 37 | with: 38 | config: ${{ github.event.inputs.config }} 39 | id: rhub-setup 40 | 41 | linux-containers: 42 | needs: setup 43 | if: ${{ needs.setup.outputs.containers != '[]' }} 44 | runs-on: ubuntu-latest 45 | name: ${{ matrix.config.label }} 46 | strategy: 47 | fail-fast: false 48 | matrix: 49 | config: ${{ fromJson(needs.setup.outputs.containers) }} 50 | container: 51 | image: ${{ matrix.config.container }} 52 | 53 | steps: 54 | - uses: r-hub/actions/checkout@v1 55 | - uses: r-hub/actions/platform-info@v1 56 | with: 57 | token: ${{ secrets.RHUB_TOKEN }} 58 | job-config: ${{ matrix.config.job-config }} 59 | - uses: r-hub/actions/setup-deps@v1 60 | with: 61 | token: ${{ secrets.RHUB_TOKEN }} 62 | job-config: ${{ matrix.config.job-config }} 63 | - uses: r-hub/actions/run-check@v1 64 | with: 65 | token: ${{ secrets.RHUB_TOKEN }} 66 | job-config: ${{ matrix.config.job-config }} 67 | 68 | other-platforms: 69 | needs: setup 70 | if: ${{ needs.setup.outputs.platforms != '[]' }} 71 | runs-on: ${{ matrix.config.os }} 72 | name: ${{ matrix.config.label }} 73 | strategy: 74 | fail-fast: false 75 | matrix: 76 | config: ${{ fromJson(needs.setup.outputs.platforms) }} 77 | 78 | steps: 79 | - uses: r-hub/actions/checkout@v1 80 | - uses: r-hub/actions/setup-r@v1 81 | with: 82 | job-config: ${{ matrix.config.job-config }} 83 | token: ${{ secrets.RHUB_TOKEN }} 84 | - uses: r-hub/actions/platform-info@v1 85 | with: 86 | token: ${{ secrets.RHUB_TOKEN }} 87 | job-config: ${{ matrix.config.job-config }} 88 | - uses: r-hub/actions/setup-deps@v1 89 | with: 90 | job-config: ${{ matrix.config.job-config }} 91 | token: ${{ secrets.RHUB_TOKEN }} 92 | - uses: r-hub/actions/run-check@v1 93 | with: 94 | 
job-config: ${{ matrix.config.job-config }} 95 | token: ${{ secrets.RHUB_TOKEN }} 96 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # OS X 2 | .DS_Store 3 | 4 | # History files 5 | .Rhistory 6 | 7 | # Example code in package build process 8 | *-Ex.R 9 | .Rproj.user 10 | .RData 11 | .Rprofile 12 | .Renviron 13 | *.Rproj 14 | google/* 15 | .Rbuildignore 16 | CRAN-SUBMISSION 17 | working/* 18 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | # R for travis: see documentation at https://docs.travis-ci.com/user/languages/r 2 | 3 | language: r 4 | 5 | r: 6 | - release 7 | - devel 8 | 9 | cache: packages 10 | 11 | r_packages: 12 | - covr 13 | 14 | after_success: 15 | - Rscript -e 'library(covr); codecov()' 16 | -------------------------------------------------------------------------------- /DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: ngramr 2 | Type: Package 3 | Title: Retrieve and Plot Google n-Gram Data 4 | Version: 1.10.0 5 | Date: 2025-01-10 6 | Authors@R: c( 7 | person("Sean", "Carmody", email = "seancarmody@gmail.com", role = c("aut", "cre", "cph")) 8 | ) 9 | Maintainer: Sean Carmody 10 | Description: Retrieve and plot word frequencies through time from the "Google 11 | Ngram Viewer" . 
12 | Depends: 13 | R (>= 4.0.0) 14 | Imports: 15 | httr, 16 | rlang, 17 | curl, 18 | dplyr (>= 1.0.3), 19 | cli, 20 | tibble, 21 | tidyr, 22 | rjson, 23 | stringr, 24 | ggplot2, 25 | scales, 26 | xml2, 27 | textutils 28 | URL: https://github.com/seancarmody/ngramr 29 | BugReports: https://github.com/seancarmody/ngramr/issues 30 | License: MIT + file LICENSE 31 | RoxygenNote: 7.3.2 32 | Roxygen: list(markdown = TRUE) 33 | Encoding: UTF-8 34 | Suggests: 35 | testthat 36 | Language: en-AU 37 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | YEAR: 2023 2 | COPYRIGHT HOLDER: Sean Carmody 3 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | # MIT License 2 | 3 | Copyright (c) 2022 Sean Carmody 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /NAMESPACE: -------------------------------------------------------------------------------- 1 | # Generated by roxygen2: do not edit by hand 2 | 3 | S3method("[",ngram) 4 | S3method(print,ngram) 5 | export(chunk) 6 | export(corpuses) 7 | export(ggram) 8 | export(hacker) 9 | export(ngram) 10 | export(ngrami) 11 | export(ngramw) 12 | export(theme_google) 13 | import(dplyr) 14 | import(ggplot2) 15 | import(tidyr) 16 | importFrom(rlang,.data) 17 | -------------------------------------------------------------------------------- /NEWS.md: -------------------------------------------------------------------------------- 1 | # ngramr 1.10.0 2 | 3 | * NOTE: this is a major release and removes some functionality 4 | * Update for new corpuses 5 | * Remove the add_count option (no data provided by Google) 6 | * Remove drop_corpus (Google no longer supports the :corpus operator for the current corpus) 7 | 8 | # ngramr 1.9.1-1.9.3 9 | 10 | * Fix package after more changes to the Google Ngram Viewer website 11 | 12 | # ngramr 1.9.0 13 | 14 | * Fix package after latest changes to the Google Ngram Viewer website 15 | 16 | # ngramr 1.8.3 17 | 18 | * Improved error handling 19 | 20 | # ngramr 1.8.2 21 | 22 | * Suppress testing of all examples that make internet calls (fail gracefully) 23 | 24 | # ngramr 1.8.1 25 | 26 | * Handle offline state 27 | * Skip testing if offline 28 | 29 | # ngramr 1.8.0 30 | 31 | * Incremented version to reflect that 1.7.7 was a major release 32 | * Rolled back use of |> for compatibility with earlier versions of R 33 | 34 | # ngramr 1.7.7 35 | 36 | * Update for changes in ngram viewer website 37 | 
* New corpus names (e.g. eng_2019 changed to en_2019) 38 | 39 | # ngramr 1.7.6 40 | 41 | * Drop use of lifecycle badges 42 | * Add markdown format NEWS file 43 | 44 | # ngramr 1.7.5 45 | 46 | * Tidied fromJSON call 47 | * Started to use lifecycle in documentation (ngrami) 48 | 49 | # ngramr 1.7.4 50 | 51 | * Imposed version dependency for dplyr to ensure relocate available 52 | 53 | # ngramr 1.7.3 54 | 55 | * Updated documentation to provide details of return values 56 | 57 | # ngramr 1.7.2 58 | 59 | * Change download code to use 'url' to ensure code works behind a proxy server 60 | * Addressed CRAN submission requirements 61 | 62 | # ngramr 1.7.1 63 | 64 | * Change year_start default to 1800 in documentation 65 | 66 | # ngramr 1.7.0 67 | 68 | * Comprehensive refactor of underlying code 69 | * More robust error/warning handling 70 | * Dropped the "tag" argument from ngram functions 71 | 72 | # ngramr 1.6.5 73 | 74 | * Fix case_sensitive attribute 75 | 76 | # ngramr 1.6.4 77 | 78 | * Fix error in corpus count dataset 79 | 80 | # ngramr 1.6.0 81 | 82 | * Update to address issue (#26) resulting from change in the format of Google Ngram Viewer webpage 83 | 84 | # ngramr 1.5.0 85 | 86 | * Incorporated pull [changes #22, @seancarmody](https://github.com/seancarmody/ngramr/pull/22) 87 | * Make wildcard searches expand to all terms 88 | * Error out on server answer "Please try again later." 1a655f3 89 | * Fix setting default corpus. 0b22dc4 90 | * scale functions: do not explicitly set name, allow overwrite. 3a21061 91 | * Allow passing through additional parameters to ngram_single. 
ac6b1cc 92 | * For wildcard searches, drop the cumulated (All) column 93 | * Added travis-ci testing 94 | 95 | # ngramr 1.4.5 96 | 97 | * Fixed problems with (some) advanced operators 98 | 99 | 100 | # ngramr 1.4.4 101 | 102 | * Removed debugging from ngrami 103 | 104 | # ngramr 1.4.3 105 | 106 | * Fixed the Pulser bug 107 | 108 | # ngramr 1.4.2 109 | 110 | * Fix accented character encoding problem on Windows 111 | 112 | # ngramr 1.4.1 113 | 114 | * Improve ssl handling (refer Hadley's comment here: http://www.statsravingmad.com/blog/statistics/a-tiny-rcurl-headache/) 115 | 116 | # ngramr 1.4.0 117 | 118 | Google has switched to SSL for the N-gram viewer and the format of the web-pages has 119 | changed. This means that earlier versions of the package are completely broken. This 120 | release fixes this major problem. 121 | 122 | # ngramr 1.3.2 123 | 124 | * Add README.md to .Rbuildignore to remove from CRAN 125 | 126 | # ngramr 1.3.1 127 | 128 | * Fix count for n-grams with n>1, including a "fudge" for 2012 corpuses 129 | 130 | # ngramr 1.3.0 131 | 132 | * Add option to display long-form corpus name 133 | * Warn about smoothing >0 for geoms other than "line" 134 | * Tidy documentation for print.ngram 135 | * ngram and ngrami return S3 class "ngram" 136 | * Format print for ngram objects 137 | * ggram can take either a list of phrases or an ngram object 138 | 139 | # ngramr 1.2.4 140 | 141 | * Add option to relabel y-axis 142 | * Add word counts option to ngram 143 | * Change ggplot2 and scales from Requires to Suggests 144 | 145 | # ngramr 1.2.3 146 | 147 | * Prevent use of complex operators in case insensitive searches 148 | * Warn about character substitution 149 | 150 | # ngramr 1.2.2 151 | 152 | * CRAN release version 153 | * More efficient handling of escaped Unicode (thanks Hadley http://stackoverflow.com/a/17787736/1543437) 154 | * Fix package checking problems associated with plyr 155 | 156 | # ngramr 1.2.1 157 | 158 | * Tidy Google theme 159 | 160 | # 
ngramr 1.2.0 161 | 162 | * First semi-official release. All future development moved to the 'develop' branch. 163 | * Allow case insensitive plotting with ggram 164 | * Avoid reshape/reshape2 conflicts (thanks to Francois Briatte) 165 | * Pass arbitrary geoms to `ggram` 166 | * New function `ngramw` to return results in "wide" format 167 | * Removed `wide` option from `ggram` and `ggrami` 168 | * Better handling of legends when `ignore_case = TRUE` 169 | * Error trapping long phrase lists 170 | * Google theme option 171 | 172 | # ngramr 1.1 173 | 174 | * Added plot wrapper ggram 175 | * Detect invalid corpus names 176 | 177 | # ngramr 1.0 178 | 179 | * Initial release of the ngramr package -------------------------------------------------------------------------------- /R/data.R: -------------------------------------------------------------------------------- 1 | #' Sample n-gram data 2 | #' 3 | #' Frequency data for the phrases "hacker", "programmer", from 1950 to 2008. 4 | #' 5 | #' @docType data 6 | #' @usage hacker 7 | #' @name hacker 8 | #' @format a 236 x 4 ngram data frame 9 | #' @keywords datasets 10 | #' @export 11 | #' 12 | NULL 13 | 14 | #' Google n-gram corpus information 15 | #' 16 | #' Details of the various corpuses available through the Google n-gram tool 17 | #' 18 | #' @docType data 19 | #' @usage corpuses 20 | #' @name corpuses 21 | #' @format 44 x 6 ngram data frame 22 | #' @keywords datasets 23 | #' @export 24 | #' 25 | NULL 26 | -------------------------------------------------------------------------------- /R/ggram.R: -------------------------------------------------------------------------------- 1 | #' Plot n-gram frequencies 2 | #' 3 | #' \code{ggram} downloads data from the Google Ngram Viewer website and 4 | #' plots it in \code{ggplot2} style. 5 | #' 6 | #' @param phrases vector of phrases. Alternatively, phrases can be an ngram 7 | #' object returned by \code{\link{ngram}} or \code{\link{ngrami}}. 
8 | #' @param ignore_case logical, indicating whether the frequencies are case 9 | #' insensitive. 10 | #' Default is \code{FALSE}. 11 | #' @param geom the ggplot2 geom used to plot the data; defaults to "line" 12 | #' @param geom_options list of additional parameters passed to the ggplot2 geom. 13 | #' @param lab y-axis label. Defaults to "Frequency". 14 | #' @param google_theme use a Google Ngram-style plot theme. 15 | #' @param ... additional parameters passed to \code{ngram} 16 | #' @details 17 | #' Google generated two datasets drawn from digitised books in the Google 18 | #' books collection. One was generated in July 2009, the second in July 2012. 19 | #' Google will update these datasets as book scanning continues. 20 | #' 21 | #' @examples 22 | #' \donttest{library(ggplot2) 23 | #' ggram(c("hacker", "programmer"), year_start = 1950) 24 | #' 25 | #' # Changing the geom. 26 | #' ggram(c("cancer", "fumer", "cigarette"), 27 | #' year_start = 1900, 28 | #' corpus = "fr-2012", 29 | #' smoothing = 0, 30 | #' geom = "step") 31 | #' 32 | #' # Passing more options. 33 | #' ggram(c("cancer", "smoking", "tobacco"), 34 | #' year_start = 1900, 35 | #' corpus = "en-fiction-2012", 36 | #' geom = "point", 37 | #' smoothing = 0, 38 | #' geom_options = list(alpha = .5)) + 39 | #' stat_smooth(method="loess", se = FALSE, formula = y ~ x) 40 | #' 41 | #' # Setting the layers manually. 42 | #' ggram(c("cancer", "smoking", "tobacco"), 43 | #' year_start = 1900, 44 | #' corpus = "en-fiction-2012", 45 | #' smoothing = 0, 46 | #' geom = NULL) + 47 | #' stat_smooth(method="loess", se=FALSE, span = 0.3, formula = y ~ x) 48 | #' 49 | #' # Setting the legend placement on a long query and using the Google theme. 50 | #' # Example taken from a post by Ben Zimmer at Language Log. 
51 | #' p <- c("((The United States is + The United States has) / The United States)", 52 | #' "((The United States are + The United States have) / The United States)") 53 | #' ggram(p, year_start = 1800, google_theme = TRUE) + 54 | #' theme(legend.direction="vertical") 55 | #' 56 | #' # Pass ngram data rather than phrases 57 | #' ggram(hacker) + facet_wrap(~ Corpus) 58 | #'} 59 | #' @export 60 | 61 | ggram <- function(phrases, ignore_case = FALSE, 62 | geom = "line", geom_options = list(), lab = NA, 63 | google_theme = FALSE, ...) { 64 | if ("ngram" %in% class(phrases)) { 65 | ng <- phrases 66 | } else { 67 | if (ignore_case) { 68 | ng <- ngrami(phrases, ...) 69 | } else { 70 | ng <- ngram(phrases, ...) 71 | } 72 | } 73 | if (is.null(ng)) { 74 | message("Unable to plot: no data returned") 75 | return(invisible(NULL)) 76 | } 77 | if (is.character(geom) && 78 | !(geom %in% c("area", "line")) && attr(ng, "smoothing") > 0) { 79 | warning("ngram data is smoothed. Consider setting smoothing = 0.") 80 | } 81 | if (!"Year" %in% names(ng)) stop("No ngram data returned") 82 | ng <- within(ng, Year <- as.Date(paste(Year, 1, 1, sep = "-"))) 83 | p <- ggplot(data = ng, 84 | aes_string(x = "Year", y = "Frequency", 85 | colour = "Phrase", fill = "Phrase", 86 | label = "Phrase")) 87 | if (!inherits(geom, "character")) geom <- NULL 88 | if (!is.null(geom)) p <- p + do.call(stat_identity, 89 | c(geom = geom, geom_options)) 90 | p <- p + labs(x = NULL) 91 | if (google_theme) { 92 | # Google Ngram palette. 
93 | p <- p + 94 | scale_colour_google() + 95 | scale_fill_google() + 96 | theme_google() + labs(y = NULL, colour = NULL) + 97 | scale_x_date(expand = c(0, 0)) + 98 | scale_y_continuous(expand = c(0, 0), labels = scales::percent) 99 | } else { 100 | p <- p + 101 | scale_colour_discrete("") + 102 | scale_fill_discrete("") + 103 | scale_y_continuous(labels = scales::percent) 104 | } 105 | if (!is.na(lab)) p <- p + labs(y = lab) 106 | return(p) 107 | } 108 | -------------------------------------------------------------------------------- /R/methods.R: -------------------------------------------------------------------------------- 1 | #' Print n-gram contents 2 | #' 3 | #' @param x ngram object as returned by \code{link{ngram}} 4 | #' @param rows number of rows to print. Default is 6. 5 | #' @param ... additional parameters passed to default print method. 6 | #' @export 7 | #' @method print ngram 8 | #' @examples 9 | #' \donttest{x <- ngram(c("hacker", "programmer"), year_start = 1950) 10 | #' print(x) 11 | #' } 12 | 13 | print.ngram <- function(x, rows=6, ...) { 14 | df <- x 15 | class(df) <- class(df)[-1] 16 | np.rows <- dim(df)[1] - rows 17 | 18 | if (all(c("Phrase", "Corpus", "Year") %in% names(x))) { 19 | cli::cat_line("# Ngram data table", col = "green") 20 | cli::cat_line("# Phrases:\t\t", paste(levels(x$Phrase), collapse = ", ")) 21 | cli::cat_line("# Case-sensitive:\t", attributes(x)$case_sensitive) 22 | cli::cat_line("# Corpuses:\t\t", paste(levels(x$Corpus), collapse = ", ")) 23 | cli::cat_line("# Smoothing:\t\t", attributes(x)$smoothing) 24 | cli::cat_line("# Years:\t\t", min(x$Year), "-", max(x$Year)) 25 | cat("\n") 26 | } 27 | 28 | print(utils::head(as.data.frame(df), rows)) 29 | if (np.rows > 0) { 30 | cli::cat_line(cli::cli_text(cli::col_grey("# ... with {np.rows} more row{?s}"))) 31 | } 32 | invisible(x) 33 | } 34 | 35 | #' @export 36 | `[.ngram` <- function(x, ...) { 37 | class(x) <- class(x)[-1] 38 | x <- x[...] 
39 | if (all(c("Phrase", "Corpus", "Year") %in% names(x))) class(x) <- c("ngram", class(x)) 40 | return(x) 41 | } 42 | -------------------------------------------------------------------------------- /R/ngram.R: -------------------------------------------------------------------------------- 1 | #' Get n-gram frequencies 2 | #' 3 | #' `ngram` downloads data from the Google Ngram Viewer website and 4 | #' returns it in a tibble. 5 | #' 6 | #' @param phrases vector of phrases, with a maximum of 12 items 7 | #' @param corpus Google corpus to search (see Details for possible values) 8 | #' @param year_start start year, default is 1800. Data available back to 1500. 9 | #' @param year_end end year, default is 2008 10 | #' @param smoothing smoothing parameter, default is 3 11 | #' @param case_ins Logical indicating whether to force a case insensitive search. 12 | #' Default is `FALSE`. 13 | #' @param aggregate Sum up the frequencies for ngrams associated with wildcard 14 | #' or case insensitive searches. Default is `FALSE`. 15 | #' @param count Default is `FALSE`. 16 | #' @param drop_parent Drop the parent phrase associated with a wildcard 17 | #' or case-insensitive search. Default is `FALSE`. 18 | #' @param drop_all Delete the suffix "(All)" from aggregated case-insensitive 19 | #' searches. Default is `FALSE`. 20 | #' @param type Include the Google return type (e.g. NGRAM, NGRAM_COLLECTION, 21 | #' EXPANSION) from result set. Default is `FALSE`. 22 | #' @return `ngram` returns an object of class "`ngram`", 23 | #' which is a tidyverse `tibble` enriched with attributes reflecting 24 | #' some of the parameters used in the Ngram Viewer query. 25 | #' @details 26 | #' Google generated two datasets drawn from digitised books in the Google 27 | #' Books collection. One was generated in July 2009, the second in July 2012 28 | #' and the third in 2019. Google is expected to update these datasets as book 29 | #' scanning continues. 
30 | #' 31 | #' This function provides the annual frequency of words or phrases, known 32 | #' as n-grams, in a sub-collection or "corpus" taken from the Google Books 33 | #' collection.The search across the corpus is case-sensitive. 34 | #' 35 | #' If the function is unable to retrieve data from the Google Ngram Viewer 36 | #' site (either because of access issues or if the format of Google's site 37 | #' has changed) a NULL result is returned and messages are printed to the 38 | #' console but no errors or warnings are raised (this is to align with 39 | #' CRAN package policies). 40 | #' 41 | #' Below is a list of available corpora. Note that the data for the 2012 42 | #' corpuses only extends to 2009. 43 | #' \tabular{ll}{ 44 | #' \bold{Corpus} \tab \bold{Corpus Name}\cr 45 | #' en-US-2019\tab American English 2019\cr 46 | #' en-US-2012\tab American English 2012\cr 47 | #' en-US-2009\tab American English 2009\cr 48 | #' en-GB-2019\tab British English 2019\cr 49 | #' en-GB-2012\tab British English 2012\cr 50 | #' en-GB-2009\tab British English 2009\cr 51 | #' zh-Hans-2019\tab Chinese 2019\cr 52 | #' zh-Hans-2012\tab Chinese 2012\cr 53 | #' zh-Hans-2009\tab Chinese 2009\cr 54 | #' en-2019\tab English 2019\cr 55 | #' en-2012\tab English 2012\cr 56 | #' en-2009\tab English 2009\cr 57 | #' en-fiction-2019\tab English Fiction 2019\cr 58 | #' en-fiction-2012\tab English Fiction 2012\cr 59 | #' en-fiction-2009\tab English Fiction 2009\cr 60 | #' en-1M-2009\tab English One Million\cr 61 | #' fr-2019\tab French 2019\cr 62 | #' fr-2012\tab French 2012\cr 63 | #' fr-2009\tab French 2009\cr 64 | #' de-2019\tab German 2019\cr 65 | #' de-2012\tab German 2012\cr 66 | #' de-2009\tab German 2009\cr 67 | #' iw-2019\tab Hebrew 2019\cr 68 | #' iw-2012\tab Hebrew 2012\cr 69 | #' iw-2009\tab Hebrew 2009\cr 70 | #' es-2019\tab Spanish 2019\cr 71 | #' es-2012\tab Spanish 2012\cr 72 | #' es-2009\tab Spanish 2009\cr 73 | #' ru-2019\tab Russian 2019\cr 74 | #' ru-2012\tab Russian 2012\cr 
75 | #' ru-2009\tab Russian 2009\cr
76 | #' it-2019\tab Italian 2019\cr
77 | #' it-2012\tab Italian 2012\cr
78 | #' }
79 | #'
80 | #' The Google Million is a sub-collection of Google Books. All are in
81 | #' English with dates ranging from 1500 to 2008.
82 | #' No more than about 6,000 books were chosen from any one year, which
83 | #' means that all of the scanned books from early years are present,
84 | #' and books from later years are randomly sampled. The random samplings
85 | #' reflect the subject distributions for the year (so there are more
86 | #' computer books in 2000 than 1980).
87 | #'
88 | #' See \url{http://books.google.com/ngrams/info} for the full Ngram syntax.
89 | #' @examples
90 | #' \donttest{ngram(c("mouse", "rat"), year_start = 1950)
91 | #' ngram(c("blue_ADJ", "red_ADJ"))
92 | #' ngram(c("_START_ President Roosevelt", "_START_ President Truman"), year_start = 1920)
93 | #' }
94 | #' @export
95 |
96 | ngram <- function(phrases, corpus = "en", year_start = 1800,
97 |                   year_end = 2022, smoothing = 3, case_ins=FALSE,
98 |                   aggregate = FALSE, count = FALSE,
99 |                   drop_parent = FALSE, drop_all = FALSE, type = FALSE) {
100 |   #if (!curl::has_internet()) {stop("Unable to access internet.")}
101 |   phrases <- ngram_check_phrases(phrases)
102 |   # Loop over corpuses
103 |   dfs <- lapply(corpus, function(corp) ngram_single(phrases, corpus = corp,
104 |                                                     year_start = year_start,
105 |                                                     year_end = year_end,
106 |                                                     smoothing = smoothing,
107 |                                                     case_ins = case_ins))
108 |   ng <- bind_rows(dfs)
109 |   if (length(ng) == 0) return(NULL)
110 |   class(ng) <- c("ngram", class(ng))
111 |   ng <- truncate_years(ng)
112 |   if (aggregate) {
113 |     ng <- filter(ng, .data$type != "EXPANSION")
114 |   } else {
115 |     ng <- filter(ng, .data$type %in% c("NGRAM", "EXPANSION"))
116 |   }
117 |   # Note: removed a stray print(ng) debugging statement here; results are
118 |   # displayed via the print.ngram S3 method, not as a side effect of retrieval.
119 |   if (drop_parent || all(ng$Parent == "")) ng$Parent <- NULL
120 |   if (drop_all) {
121 |     ng <- mutate(ng,
122 |                  Phrase = if_else(type == "CASE_INSENSITIVE",
122 |
stringr::str_replace(.data$Phrase, "\\s*\\(All\\)\\z", ""), 123 | .data$Phrase)) 124 | } 125 | #ng <- select(ng, -"clean") 126 | attr(ng, "smoothing") <- smoothing 127 | attr(ng, "case_sensitive") <- !case_ins 128 | ng$Corpus <- as.factor(ng$Corpus) 129 | ng$Phrase <- as.factor(ng$Phrase) 130 | if (type) ng$Type <- ng$type 131 | ng$type <- NULL 132 | return(ng) 133 | } 134 | 135 | ngram_single <- function(phrases, corpus, year_start, year_end, 136 | smoothing, case_ins) { 137 | if (!(corpus %in% corpuses$Shorthand)) {warning(paste(corpus, "not a valid corpus. Defaulting to en-2019."))} 138 | #corpus <- get_corpus_n(corpus) 139 | query <- as.list(environment()) 140 | if (case_ins) query["case_insensitive"] <- "true" 141 | query$phrases <- NULL 142 | query$case_ins <- NULL 143 | ng_url <- ngram_url(phrases, query) 144 | html <- ngram_fetch_xml(ng_url) 145 | if (is.null(html)){ 146 | ng <- NULL 147 | } else { 148 | ng <- ngram_fetch_data(html) 149 | warnings <- ngram_check_warnings(html) 150 | show_warnings(warnings) 151 | } 152 | return(ng) 153 | } 154 | 155 | ngram_check_phrases <- function(phrases){ 156 | stopifnot(is.character(phrases)) 157 | phrases <- phrases[phrases != ""] 158 | if (length(phrases) == 0) stop("No valid phrases provided.") 159 | if (!all(check_balanced(phrases))) stop("mis-matched parentheses") 160 | if (length(phrases) > 12) { 161 | phrases <- phrases[1:12] 162 | warning("Maximum number of phrases exceeded: only using first 12.") 163 | } 164 | return(phrases) 165 | } 166 | 167 | ngram_fetch_xml <- function(url) { 168 | # retrieve data from Google Ngram Viewer site 169 | # no errors or warnings generated on fail, only messages 170 | try_get <- function(x, ...) 
{ 171 | tryCatch( 172 | httr::GET(url = x, httr::timeout(3), ...), 173 | error = function(e) conditionMessage(e), 174 | warning = function(w) conditionMessage(w) 175 | ) 176 | } 177 | is_response <- function(x) { 178 | class(x) == "response" 179 | } 180 | 181 | # first check internet connection 182 | if (!curl::has_internet()) { 183 | message("No internet connection.") 184 | return(invisible(NULL)) 185 | } 186 | # then try for timeout problems 187 | resp <- try_get(url) 188 | if (!is_response(resp)) { 189 | message("Please check Google's Ngram Viewer site is up.") 190 | message(resp) 191 | return(invisible(NULL)) 192 | } 193 | # then stop if status > 400 194 | if (httr::http_error(resp)) { 195 | message("Please check Google's Ngram Viewer site is up.") 196 | httr::message_for_status(resp) 197 | return(invisible(NULL)) 198 | } 199 | return(xml2::read_html(resp)) 200 | } 201 | 202 | ngram_check_warnings <- function(html) { 203 | node <- xml2::xml_find_first(html, "//div[@id='warning-area']") 204 | warnings <- list() 205 | if (length(node) > 0) { 206 | for (n in xml2::xml_find_all(node, "div")) { 207 | type <- xml2::xml_text(xml2::xml_find_first(n, "mwc-icon")) 208 | msg <- stringr::str_trim(xml2::xml_text(xml2::xml_find_first(n, "span"))) 209 | msg <- stringr::str_replace_all(msg, "\\s+", " ") 210 | msg <- stringr::str_replace(msg, "No valid ngrams to plot!", "No valid ngrams retrieved!") 211 | warnings <- c(warnings, list(list(type = type, message = msg))) 212 | } 213 | } 214 | return(warnings) 215 | } 216 | 217 | ngram_fetch_data <- function(html) { 218 | data <- tryCatch( 219 | { 220 | if (is.null(html)) { 221 | NULL 222 | } else { 223 | corpus <- xml2::xml_find_first(html, "//select[@id='form-corpus']/option") 224 | corpus <- xml2::xml_attr(corpus, "value") 225 | if (grepl("^[0-9]+$", corpus, perl = TRUE)) { 226 | corpus <- get_corpus_text(as.numeric(corpus)) 227 | } 228 | script <- xml2::xml_find_all(html, "//div[@id='chart']/following::script")[1] 229 | json <- 
xml2::xml_text(script) 230 | json <- stringr::str_split(json, "\n")[[1]] 231 | json <- json[json != ''] 232 | json <- stringr::str_squish(json) 233 | years <- xml2::xml_find_all(html, "//div[@id='chart']/following::script")[2] 234 | years <- xml2::xml_text(years) 235 | years <- stringr::str_split(years, "\n")[[1]] 236 | years <- grep('drawD3Chart', years, value = TRUE) 237 | years <- as.integer(stringr::str_split(grep("drawD3Chart", years, value = TRUE), ",")[[1]][2:3]) 238 | data <- rjson::fromJSON(json) 239 | if (length(data) == 0) return(NULL) 240 | data <- lapply(data, 241 | function(x) tibble::add_column(tibble::as_tibble(x), 242 | Year = seq.int(years[1], years[2]))) 243 | data <- bind_rows(data) 244 | data <- mutate(data, ngram = textutils::HTMLdecode(data$ngram), Corpus = corpus) 245 | data <- relocate(data, "Year", "ngram", "timeseries", "Corpus") 246 | data <- rename(data, Phrase = "ngram", Frequency = "timeseries", Parent = "parent") 247 | data 248 | } 249 | }, 250 | error=function(cond) { 251 | message("Error parsing ngram data, please contact package maintainer.") 252 | message("Here's the original error message:") 253 | message(cond) 254 | message("\nError occurred in the following code:") 255 | message(conditionCall(cond)) 256 | return(NULL) 257 | }, 258 | warning=function(cond) { 259 | message("Warning generated when parsing ngram data.") 260 | message("Here's the original warning message:") 261 | message(cond) 262 | return(NULL) 263 | }, 264 | finally = {} 265 | ) 266 | return(data) 267 | } 268 | 269 | ngram_url <- function(phrases, query=character()) { 270 | url <- "https://books.google.com/ngrams/graph" 271 | n <- length(phrases) 272 | for (i in 1:n) { 273 | if (grepl("\\+|/", phrases[i])) phrases[i] <- paste0("(", phrases[i], ")") 274 | p <- phrases[i] 275 | if (!(Encoding(p) %in% c("unknown", "UTF-8"))) { 276 | phrases[i] <- iconv(p, Encoding(p), "UTF-8") 277 | } 278 | } 279 | phrases <- paste(curl::curl_escape(stringr::str_trim(phrases)), 280 
| collapse = "%2c") 281 | if (phrases == "") stop("No valid phrases provided.") 282 | url <- paste0(url, "?content=", phrases) 283 | if (length(query) > 0) url <- httr::modify_url(url, query = query) 284 | url <- gsub("%28", "(", url) 285 | url <- gsub("%29", ")", url) 286 | url <- gsub("%20", "+", url) 287 | return(url) 288 | } 289 | 290 | check_balanced <- function(x) { 291 | # Check parenthesis are appropriately balanced (i.e. every open is closed) 292 | sapply(x, function(str) { 293 | str <- gsub("[^\\(\\)]", "", str) 294 | str <- strsplit(str, "")[[1]] 295 | str <- ifelse(str == "(", 1, -1) 296 | all(cumsum(str) >= 0) && sum(str) == 0 297 | }) 298 | } 299 | 300 | show_warnings <- function(warnings){ 301 | if (length(warnings) > 0) { 302 | for (w in warnings) { 303 | warning(w$message, call. = FALSE) 304 | } 305 | } 306 | } 307 | 308 | get_corpus_n <- function(corpus, default = "en-2019"){ 309 | stopifnot(is.character(corpus)) 310 | n <- corpuses[corpus, "Number"] 311 | if (any(is.na(n)) && !is.na(default)) { 312 | if (is.character(default)) default <- get_corpus_n(default) 313 | stopifnot(default %in% corpuses$Number) 314 | invalid <- paste(corpus[is.na(n)], collapse = ", ") 315 | warning(paste0("Unknown corpus ", invalid, ". Using default corpus instead."), call. = FALSE) 316 | n[is.na(n)] <- default 317 | } 318 | return(n) 319 | } 320 | 321 | get_corpus_text <- function(n, default = NA){ 322 | stopifnot(is.numeric(n)) 323 | text <- row.names(corpuses)[match(n, corpuses$Number)] 324 | if (any(is.na(text)) && !is.na(default)) { 325 | if (is.numeric(default)) default <- get_corpus_text(default) 326 | stopifnot(default %in% row.names(corpuses)) 327 | invalid <- paste(n[is.na(text)], collapse = ", ") 328 | warning(paste0("Unknown corpus ", invalid, ". Using default corpus instead."), call. 
= FALSE) 329 | text[is.na(text)] <- default 330 | } 331 | return(text) 332 | } 333 | 334 | truncate_years <- function(ngram){ 335 | stopifnot(class(ngram)[1] == "ngram") 336 | ngram$Corpus <- as.character(ngram$Corpus) 337 | ngram <- left_join(ngram, select(corpuses, 338 | "Shorthand", 339 | "Last.Year"), 340 | by = c("Corpus" = "Shorthand")) 341 | ngram <- filter(ngram, .data$Year <= .data$Last.Year) 342 | ngram$Last.Year <- NULL 343 | return(ngram) 344 | } 345 | -------------------------------------------------------------------------------- /R/ngrami.R: -------------------------------------------------------------------------------- 1 | #' Get n-gram frequencies (case insensitive version) 2 | #' 3 | #' @param phrases vector of phrases 4 | #' @param aggregate sum up each of the terms 5 | #' @param ... remaining parameters passed to ngram 6 | #' @description 7 | #' This function is a simple wrapper of `ngram` for case insensitive searches. 8 | #' @export 9 | 10 | ngrami <- function(phrases, aggregate = TRUE, ...){ 11 | ngram(phrases, aggregate = aggregate, case_ins = TRUE, drop_all = TRUE, ...) 12 | } 13 | -------------------------------------------------------------------------------- /R/ngramr-package.R: -------------------------------------------------------------------------------- 1 | #' ngramr: Dig into the Google Ngram Viewer using R 2 | #' 3 | #' @description 4 | #' The \href{http://books.google.com/ngrams}{Google Books Ngram Viewer} 5 | #' allows you to enter a list of phrases and then displays a graph showing 6 | #' how often the phrases have occurred in a corpus of books 7 | #' (e.g., "British English", "English Fiction", "French") over time. 8 | #' The underlying data is hidden in web page, embedded in some Javascript. 9 | #' 10 | #' This package extracts the data and provides it in the form of an R dataframe.
11 | #' 12 | #' The key function is \code{ngram} which, given a collection of 13 | #' phrases, returns a dataframe containing the frequencies by year. 14 | #' 15 | #' The code is based on the \code{getNgrams.py} Python script available on 16 | #' \href{https://web.archive.org/web/20221129120802/https://www.culturomics.org/}{Culturomics Code} 17 | #' written by Jean-Baptiste Michel. The Culturomics website doesn't 18 | #' exist anymore but can still be found 19 | #' \href{https://web.archive.org/web/20221129220150/https://www.culturomics.org/Resources/get-ngrams}{on archive.org} 20 | #' and is worth exploring. 21 | #' 22 | #' Note that compared to the 2009 versions, the 2012 and 2019 versions have 23 | #' larger numbers of books, improved OCR, improved library and publisher 24 | #' metadata. The 2012 and 2019 corpuses also don't form ngrams that cross 25 | #' sentence boundaries, and do form ngrams across page boundaries and 26 | #' support part of speech tagging, unlike the 2009 versions. 27 | #' 28 | #' Like the Google Ngram Viewer website itself, this package is aimed at 29 | #' quick inquiries into the usage of small sets of phrases. 30 | #' 31 | #' Please respect the terms of service of the Google Books Ngram Viewer while 32 | #' using this code. This code is meant to help viewers retrieve data behind 33 | #' a few queries, not bang at Google's servers with dozens of queries. 34 | #' The complete dataset can be 35 | #' \href{https://storage.googleapis.com/books/ngrams/books/datasetsv3.html}{downloaded here}. 36 | #' 37 | #' @references 38 | #' Michel, Jean-Baptiste, et al. "Quantitative analysis of culture using 39 | #' millions of digitized books." \emph{Science} 331, No. 6014 (2011): 176--182.
40 | #' 41 | #' @keywords internal 42 | #' @import dplyr tidyr ggplot2 43 | #' @importFrom rlang .data 44 | #' @docType package 45 | #' @name ngramr 46 | #' @aliases ngramr ngramr-package 47 | "_PACKAGE" 48 | -------------------------------------------------------------------------------- /R/ngramw.R: -------------------------------------------------------------------------------- 1 | #' Get n-gram frequencies ("wide" format) 2 | #' 3 | #' @param phrases vector of phrases 4 | #' @param ignore_case ignore case of phrases (i.e. call \code{ngrami} 5 | #' rather than \code{ngram}). Default value is \code{FALSE}. 6 | #' @param ... remaining parameters passed to \code{ngram} 7 | #' @export 8 | 9 | ngramw <- function(phrases, ignore_case=FALSE, ...) { 10 | if ("ngram" %in% class(phrases)) { 11 | ng <- phrases 12 | } else { 13 | ng <- if (ignore_case) ngrami(phrases, ...) else ngram(phrases, ...) 14 | } 15 | if (is.null(ng)) return(NULL) 16 | ng <- pivot_wider(ng, names_from = "Phrase", values_from = "Frequency") 17 | return(ng) 18 | } 19 | -------------------------------------------------------------------------------- /R/sysdata.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/seancarmody/ngramr/32f11bea7531db06afb7c8c9d40a80be15663227/R/sysdata.rda -------------------------------------------------------------------------------- /R/themes.R: -------------------------------------------------------------------------------- 1 | #' Google Ngram theme for ggplot2 2 | #' 3 | #' @param ... additional parameters to pass to \code{theme} 4 | #' 5 | #' @details 6 | #' Use a Google Ngram-style plot theme. 7 | #' 8 | #' @export 9 | 10 | theme_google <- function(...) 
{ 11 | theme(panel.border = element_rect(colour = "grey", size = 0.2, fill = NA), 12 | panel.background = element_rect(fill = NA), 13 | axis.line = element_line(colour = "black", size = 0.3), 14 | panel.grid.major = element_line(colour = "grey", size = 0.2), 15 | panel.grid.minor = element_blank(), 16 | legend.position = "top", 17 | legend.direction = "horizontal", 18 | legend.box = "vertical", 19 | legend.key = element_rect(fill = NA), 20 | axis.text = element_text(colour = "black"), 21 | axis.ticks = element_blank(), ...) 22 | } 23 | 24 | scale_colour_google <- function(...) { 25 | palette <- c("#264EC0", "#D22310", "#FC8608", "#168713", "#850086", 26 | "#1086B9", "#D22B63", "#559D05", "#A71B23", "#21436F", 27 | "#852D86", "#219B86") 28 | scale_colour_manual(..., values = palette) 29 | } 30 | 31 | scale_fill_google <- function(...) { 32 | palette <- c("#264EC0", "#D22310", "#FC8608", "#168713", "#850086", 33 | "#1086B9", "#D22B63", "#559D05", "#A71B23", "#21436F", 34 | "#852D86", "#219B86") 35 | scale_fill_manual(..., values = palette) 36 | } 37 | -------------------------------------------------------------------------------- /R/utilities.R: -------------------------------------------------------------------------------- 1 | #' Chunk a vector or list 2 | #' 3 | #' \code{chunk} takes a vector (or list) and returns a list of chunks 4 | #' which all have lengths (approximately) equal to a specified value. 5 | #' 6 | #' @param x vector of list 7 | #' @param len target length of chunks 8 | #' @param n number of chunks 9 | #' 10 | #' @details 11 | #' If \code{n} is specified, \code{len} is ignored and \code{chunk} returns 12 | #' a list of length \code{n} of "chunks" of \code{x}. Otherwise 13 | #' \code{n} is calculated to break the vector into chunks which are 14 | #' each approximately of length \code{len}. If both \code{len} and 15 | #' \code{n} are unspecified, \code{chunk} simply returns \code{x}. 
16 | #' @examples 17 | #' chunk(letters, 10) 18 | #' chunk(LETTERS, n = 3) 19 | #' 20 | #' @export 21 | 22 | chunk <- function(x, len = NULL, n = NULL) { 23 | if (is.null(len) & is.null(n)) return(x) 24 | if (is.null(len)) len <- ceiling(length(x) / n) 25 | if (is.null(n)) n <- ceiling(length(x) / len) 26 | if (len >= length(x)) { 27 | return(x) 28 | } else { 29 | return(split(x, cut(seq_along(x), n, labels = FALSE))) 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # ngramr - R package to query the Google Ngram Viewer 2 | 3 | 4 | [![CRAN 5 | status](https://www.r-pkg.org/badges/version/ngramr)](https://cran.r-project.org/package=ngramr) 6 | [![DOI](https://zenodo.org/badge/11216907.svg)](https://zenodo.org/badge/latestdoi/11216907) 7 | [![Build Status](https://app.travis-ci.com/seancarmody/ngramr.svg?branch=master)](https://app.travis-ci.com/seancarmody/ngramr) 8 | 9 | 10 | 11 | 12 | The [Google Books Ngram Viewer][1] allows you to enter a list of phrases and 13 | then displays a graph showing how often the phrases have occurred in a large 14 | corpus of books (e.g., "British English", "English Fiction", "French") over 15 | time. The current corpus produced in 2023 contains around two trillion words 16 | for English alone. 17 | 18 | The underlying data is hidden in Web page, embedded in some Javascript. 19 | This package extracts the data and provides it in the form of an R dataframe. 20 | Early versions of code was adapted from a handy Python script available from 21 | [Culturomics][2], written by [Jean-Baptiste Michel][3]. The code has been 22 | comprehensively redeveloped since then. 23 | 24 | ## Installing 25 | 26 | This package requires R version 4.0.0 or higher. If you are using an older 27 | version of R you will be prompted to upgrade when you try to install the 28 | package, so you may as well upgrade now!
29 | 30 | The official release of ngramr is available on [CRAN][4]. To install from 31 | CRAN, use the following command: 32 | 33 | install.packages('ngramr') 34 | 35 | If you have any problems installing the package on macOS, try installing from 36 | source: 37 | 38 | install.packages("ngramr", type="source") 39 | 40 | If you have the [`devtools`][5] package installed, install the latest stable 41 | version this package directly from GitHub: 42 | 43 | library(devtools) 44 | install_github("seancarmody/ngramr") 45 | library(ngramr) 46 | 47 | and if you are feeling a little more adventurous, you can install the 48 | development version: 49 | 50 | install_github("seancarmody/ngramr", "develop") 51 | 52 | although it may not always work. 53 | 54 | Note though that many releases fix problems that arise when Google changes the 55 | format of the Ngram Viewer website so older versions generally no longer work. 56 | If you are seeing errors with the latest version then the package may need fixing 57 | after one of these Google changes. If so please report this on [GitHub][12]. 58 | 59 | If you are behind a proxy, `install_github` may not work for you. Instead of 60 | fiddling around with the `RCurl` proxy settings, you can download the latest 61 | [ZIP archive][6] and use `install_local` instead. 
62 | 63 | ## Examples 64 | 65 | Here is an example of how to use the `ngram` function: 66 | 67 | library(ggplot2) 68 | ng <- ngram(c("hacker", "programmer"), year_start = 1950) 69 | ggplot(ng, aes(x = Year, y = Frequency, colour = Phrase)) + 70 | geom_line() 71 | 72 | The result is a ggplot2 line graph of the following form: 73 | 74 | ![Ngram Chart](man/figures/hacker.png) 75 | 76 | The same result can be achieved even more simply by using the `ggram` 77 | plotting wrapper that supports many options, as in this example: 78 | 79 | ![Ngram chart, with options](man/figures/archy.png) 80 | 81 | ggram(c("monarchy", "democracy"), year_start = 1500, year_end = 2000, 82 | corpus = "en-GB-2012", ignore_case = TRUE, 83 | geom = "area", geom_options = list(position = "stack")) + 84 | labs(y = NULL) 85 | 86 | The colours used by Google Ngram are available through the `google_theme` 87 | option, as in this example posted by Ben Zimmer [at Language Log][7]: 88 | 89 | ![Ngram chart, with Google theme](http://i.imgur.com/qKHvQA4.png) 90 | 91 | ng <- c("((The United States is + The United States has) / The United States)", 92 | "((The United States are + The United States have) / The United States)") 93 | ggram(ng, year_start = 1800, google_theme = TRUE) + 94 | theme(legend.direction = "vertical") 95 | 96 | ## Getting help 97 | 98 | If you encounter a bug, please file an issue with a reproducible 99 | example on [GitHub][12]. 100 | 101 | ## Further Reading 102 | 103 | For more information, read [this Stubborn Mule post][8] and the 104 | [Google Ngram syntax][9] documentation. Language Log has a [good post][10] 105 | written just after the launch of the 2012 corpus. 106 | 107 | If you would rather work with R and SQL on the raw Google Ngram datasets, 108 | [see this post][11]. 
109 | 110 | ![Twitter Follow](https://img.shields.io/twitter/follow/stubbornmule?label=%40stubbornmule&style=social) 111 | 112 | [1]: http://books.google.com/ngrams "Google Ngram Viewer" 113 | [2]: https://bit.ly/4gQ6dtw "Culturomics: Get Ngrams" 114 | [3]: https://twitter.com/jb_michel "@jb_michel" 115 | [4]: http://cran.r-project.org/web/packages/ngramr/index.html "ngramr on CRAN" 116 | [5]: http://cran.r-project.org/web/packages/devtools/index.html "devtools" 117 | [6]: https://github.com/seancarmody/ngramr/archive/latest.zip "ngramr ZIP" 118 | [7]: http://languagelog.ldc.upenn.edu/nll/?p=4979 "US: singular or plural?" 119 | [8]: http://www.stubbornmule.net/2013/07/ngramr/ "Mule on ngramr" 120 | [9]: http://books.google.com/ngrams/info "Goole Ngram info" 121 | [10]: https://languagelog.ldc.upenn.edu/nll/?p=4258 "A new chapter for ngrams" 122 | [11]: http://rpsychologist.com/how-to-work-with-google-ngram-data-sets-in-r-using-mysql/ "Ngrams with R and mysql" 123 | [12]: https://github.com/seancarmody/ngramr/issues "ngramr issues on GitHub" 124 | -------------------------------------------------------------------------------- /cran-comments.md: -------------------------------------------------------------------------------- 1 | ## Test environments 2 | * local MacOS install, R 4.4.1 3 | * rhub v2 4 | * win-builder (devel and release) 5 | 6 | ## R CMD check results 7 | * There were no ERRORs or WARNINGs 8 | * Local and online build generated no NOTES. 
9 | 10 | ## CRAN requirements 11 | * None outstanding 12 | -------------------------------------------------------------------------------- /man/chunk.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/utilities.R 3 | \name{chunk} 4 | \alias{chunk} 5 | \title{Chunk a vector or list} 6 | \usage{ 7 | chunk(x, len = NULL, n = NULL) 8 | } 9 | \arguments{ 10 | \item{x}{vector of list} 11 | 12 | \item{len}{target length of chunks} 13 | 14 | \item{n}{number of chunks} 15 | } 16 | \description{ 17 | \code{chunk} takes a vector (or list) and returns a list of chunks 18 | which all have lengths (approximately) equal to a specified value. 19 | } 20 | \details{ 21 | If \code{n} is specified, \code{len} is ignored and \code{chunk} returns 22 | a list of length \code{n} of "chunks" of \code{x}. Otherwise 23 | \code{n} is calculated to break the vector into chunks which are 24 | each approximately of length \code{len}. If both \code{len} and 25 | \code{n} are unspecified, \code{chunk} simply returns \code{x}. 
26 | } 27 | \examples{ 28 | chunk(letters, 10) 29 | chunk(LETTERS, n = 3) 30 | 31 | } 32 | -------------------------------------------------------------------------------- /man/corpuses.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/data.R 3 | \docType{data} 4 | \name{corpuses} 5 | \alias{corpuses} 6 | \title{Google n-gram corpus information} 7 | \format{ 8 | 44 x 6 ngram data frame 9 | } 10 | \usage{ 11 | corpuses 12 | } 13 | \description{ 14 | Details of the various corpuses available through the Google n-gram tool 15 | } 16 | \keyword{datasets} 17 | -------------------------------------------------------------------------------- /man/figures/archy.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/seancarmody/ngramr/32f11bea7531db06afb7c8c9d40a80be15663227/man/figures/archy.png -------------------------------------------------------------------------------- /man/figures/hacker.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/seancarmody/ngramr/32f11bea7531db06afb7c8c9d40a80be15663227/man/figures/hacker.png -------------------------------------------------------------------------------- /man/ggram.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/ggram.R 3 | \name{ggram} 4 | \alias{ggram} 5 | \title{Plot n-gram frequencies} 6 | \usage{ 7 | ggram( 8 | phrases, 9 | ignore_case = FALSE, 10 | geom = "line", 11 | geom_options = list(), 12 | lab = NA, 13 | google_theme = FALSE, 14 | ... 15 | ) 16 | } 17 | \arguments{ 18 | \item{phrases}{vector of phrases. 
Alternatively, phrases can be an ngram 19 | object returned by \code{\link{ngram}} or \code{\link{ngrami}}.} 20 | 21 | \item{ignore_case}{logical, indicating whether the frequencies are case 22 | insensitive. 23 | Default is \code{FALSE}.} 24 | 25 | \item{geom}{the ggplot2 geom used to plot the data; defaults to "line"} 26 | 27 | \item{geom_options}{list of additional parameters passed to the ggplot2 geom.} 28 | 29 | \item{lab}{y-axis label. Defaults to "Frequency".} 30 | 31 | \item{google_theme}{use a Google Ngram-style plot theme.} 32 | 33 | \item{...}{additional parameters passed to \code{ngram}} 34 | } 35 | \description{ 36 | \code{ggram} downloads data from the Google Ngram Viewer website and 37 | plots it in \code{ggplot2} style. 38 | } 39 | \details{ 40 | Google generated two datasets drawn from digitised books in the Google 41 | books collection. One was generated in July 2009, the second in July 2012. 42 | Google will update these datasets as book scanning continues. 43 | } 44 | \examples{ 45 | \donttest{library(ggplot2) 46 | ggram(c("hacker", "programmer"), year_start = 1950) 47 | 48 | # Changing the geom. 49 | ggram(c("cancer", "fumer", "cigarette"), 50 | year_start = 1900, 51 | corpus = "fr-2012", 52 | smoothing = 0, 53 | geom = "step") 54 | 55 | # Passing more options. 56 | ggram(c("cancer", "smoking", "tobacco"), 57 | year_start = 1900, 58 | corpus = "en-fiction-2012", 59 | geom = "point", 60 | smoothing = 0, 61 | geom_options = list(alpha = .5)) + 62 | stat_smooth(method="loess", se = FALSE, formula = y ~ x) 63 | 64 | # Setting the layers manually. 65 | ggram(c("cancer", "smoking", "tobacco"), 66 | year_start = 1900, 67 | corpus = "en-fiction-2012", 68 | smoothing = 0, 69 | geom = NULL) + 70 | stat_smooth(method="loess", se=FALSE, span = 0.3, formula = y ~ x) 71 | 72 | # Setting the legend placement on a long query and using the Google theme. 73 | # Example taken from a post by Ben Zimmer at Language Log. 
74 | p <- c("((The United States is + The United States has) / The United States)", 75 | "((The United States are + The United States have) / The United States)") 76 | ggram(p, year_start = 1800, google_theme = TRUE) + 77 | theme(legend.direction="vertical") 78 | 79 | # Pass ngram data rather than phrases 80 | ggram(hacker) + facet_wrap(~ Corpus) 81 | } 82 | } 83 | -------------------------------------------------------------------------------- /man/hacker.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/data.R 3 | \docType{data} 4 | \name{hacker} 5 | \alias{hacker} 6 | \title{Sample n-gram data} 7 | \format{ 8 | a 236 x 4 ngram data frame 9 | } 10 | \usage{ 11 | hacker 12 | } 13 | \description{ 14 | Frequency data for the phrases "hacker", "programmer", from 1950 to 2008. 15 | } 16 | \keyword{datasets} 17 | -------------------------------------------------------------------------------- /man/ngram.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/ngram.R 3 | \name{ngram} 4 | \alias{ngram} 5 | \title{Get n-gram frequencies} 6 | \usage{ 7 | ngram( 8 | phrases, 9 | corpus = "en", 10 | year_start = 1800, 11 | year_end = 2022, 12 | smoothing = 3, 13 | case_ins = FALSE, 14 | aggregate = FALSE, 15 | count = FALSE, 16 | drop_parent = FALSE, 17 | drop_all = FALSE, 18 | type = FALSE 19 | ) 20 | } 21 | \arguments{ 22 | \item{phrases}{vector of phrases, with a maximum of 12 items} 23 | 24 | \item{corpus}{Google corpus to search (see Details for possible values)} 25 | 26 | \item{year_start}{start year, default is 1800. 
Data available back to 1500.} 27 | 28 | \item{year_end}{end year, default is 2008} 29 | 30 | \item{smoothing}{smoothing parameter, default is 3} 31 | 32 | \item{case_ins}{Logical indicating whether to force a case insensitive search. 33 | Default is \code{FALSE}.} 34 | 35 | \item{aggregate}{Sum up the frequencies for ngrams associated with wildcard 36 | or case insensitive searches. Default is \code{FALSE}.} 37 | 38 | \item{count}{Default is \code{FALSE}.} 39 | 40 | \item{drop_parent}{Drop the parent phrase associated with a wildcard 41 | or case-insensitive search. Default is \code{FALSE}.} 42 | 43 | \item{drop_all}{Delete the suffix "(All)" from aggregated case-insensitive 44 | searches. Default is \code{FALSE}.} 45 | 46 | \item{type}{Include the Google return type (e.g. NGRAM, NGRAM_COLLECTION, 47 | EXPANSION) from result set. Default is \code{FALSE}.} 48 | } 49 | \value{ 50 | \code{ngram} returns an object of class "\code{ngram}", 51 | which is a tidyverse \code{tibble} enriched with attributes reflecting 52 | some of the parameters used in the Ngram Viewer query. 53 | } 54 | \description{ 55 | \code{ngram} downloads data from the Google Ngram Viewer website and 56 | returns it in a tibble. 57 | } 58 | \details{ 59 | Google generated two datasets drawn from digitised books in the Google 60 | Books collection. One was generated in July 2009, the second in July 2012 61 | and the third in 2019. Google is expected to update these datasets as book 62 | scanning continues. 63 | 64 | This function provides the annual frequency of words or phrases, known 65 | as n-grams, in a sub-collection or "corpus" taken from the Google Books 66 | collection.The search across the corpus is case-sensitive. 
67 | 68 | If the function is unable to retrieve data from the Google Ngram Viewer 69 | site (either because of access issues or if the format of Google's site 70 | has changed) a NULL result is returned and messages are printed to the 71 | console but no errors or warnings are raised (this is to align with 72 | CRAN package policies). 73 | 74 | Below is a list of available corpora. Note that the data for the 2012 75 | corpuses only extends to 2009. 76 | \tabular{ll}{ 77 | \bold{Corpus} \tab \bold{Corpus Name}\cr 78 | en-US-2019\tab American English 2019\cr 79 | en-US-2012\tab American English 2012\cr 80 | en-US-2009\tab American English 2009\cr 81 | en-GB-2019\tab British English 2019\cr 82 | en-GB-2012\tab British English 2012\cr 83 | en-GB-2009\tab British English 2009\cr 84 | zh-Hans-2019\tab Chinese 2019\cr 85 | zh-Hans-2012\tab Chinese 2012\cr 86 | zh-Hans-2009\tab Chinese 2009\cr 87 | en-2019\tab English 2019\cr 88 | en-2012\tab English 2012\cr 89 | en-2009\tab English 2009\cr 90 | en-fiction-2019\tab English Fiction 2019\cr 91 | en-fiction-2012\tab English Fiction 2012\cr 92 | en-fiction-2009\tab English Fiction 2009\cr 93 | en-1M-2009\tab English One Million\cr 94 | fr-2019\tab French 2019\cr 95 | fr-2012\tab French 2012\cr 96 | fr-2009\tab French 2009\cr 97 | de-2019\tab German 2019\cr 98 | de-2012\tab German 2012\cr 99 | de-2009\tab German 2009\cr 100 | iw-2019\tab Hebrew 2019\cr 101 | iw-2012\tab Hebrew 2012\cr 102 | iw-2009\tab Hebrew 2009\cr 103 | es-2019\tab Spanish 2019\cr 104 | es-2012\tab Spanish 2012\cr 105 | es-2009\tab Spanish 2009\cr 106 | ru-2019\tab Russian 2019\cr 107 | ru-2012\tab Russian 2012\cr 108 | ru-2009\tab Russian 2009\cr 109 | it-2019\tab Italian 2019\cr 110 | it-2012\tab Italian 2012\cr 111 | } 112 | 113 | The Google Million is a sub-collection of Google Books. All are in 114 | English with dates ranging from 1500 to 2008. 
115 | No more than about 6,000 books were chosen from any one year, which 116 | means that all of the scanned books from early years are present, 117 | and books from later years are randomly sampled. The random samplings 118 | reflect the subject distributions for the year (so there are more 119 | computer books in 2000 than 1980). 120 | 121 | See \url{http://books.google.com/ngrams/info} for the full Ngram syntax. 122 | } 123 | \examples{ 124 | \donttest{ngram(c("mouse", "rat"), year_start = 1950) 125 | ngram(c("blue_ADJ", "red_ADJ")) 126 | ngram(c("_START_ President Roosevelt", "_START_ President Truman"), year_start = 1920) 127 | } 128 | } 129 | -------------------------------------------------------------------------------- /man/ngrami.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/ngrami.R 3 | \name{ngrami} 4 | \alias{ngrami} 5 | \title{Get n-gram frequencies (case insensitive version)} 6 | \usage{ 7 | ngrami(phrases, aggregate = TRUE, ...) 8 | } 9 | \arguments{ 10 | \item{phrases}{vector of phrases} 11 | 12 | \item{aggregate}{sum up each of the terms} 13 | 14 | \item{...}{remaining parameters passed to ngram} 15 | } 16 | \description{ 17 | This function is a simple wrapper of \code{ngram} for case insensitive searches. 
18 | } 19 | -------------------------------------------------------------------------------- /man/ngramr.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/ngramr-package.R 3 | \docType{package} 4 | \name{ngramr} 5 | \alias{ngramr} 6 | \alias{ngramr-package} 7 | \title{ngramr: Dig into the Google Ngram Viewer using R} 8 | \description{ 9 | The \href{http://books.google.com/ngrams}{Google Books Ngram Viewer} 10 | allows you to enter a list of phrases and then displays a graph showing 11 | how often the phrases have occurred in a corpus of books 12 | (e.g., "British English", "English Fiction", "French") over time. 13 | The underlying data is hidden in web page, embedded in some Javascript. 14 | 15 | This package extracts the data an provides it in the form of an R dataframe. 16 | 17 | The key function is \code{ngram} which, given a collection of 18 | phrases, returns a dataframe containing the frequencies by year. 19 | 20 | The code is based on the \code{getNgrams.py} Python script available on 21 | \href{https://web.archive.org/web/20221129120802/https://www.culturomics.org/}{Culturomics Code} 22 | written by Jean-Baptiste Michel. The Culturomics website doesn't 23 | exist anymore but can still be find 24 | \href{https://web.archive.org/web/20221129220150/https://www.culturomics.org/Resources/get-ngrams}{on archive.org} 25 | and is worth exploring. 26 | 27 | Note that compared to the 2009 versions, the 2012 and 2019 versions have 28 | larger numbers of books, improved OCR, improved library and publisher 29 | metadata. The 2012 and 2019 corpuses also don't form ngrams that cross 30 | sentence boundaries, and do form ngrams across page boundaries and 31 | support part of speech tagging, unlike the 2009 versions. 
32 | 33 | Like the Google Ngram Viewer website itself, this package is aimed at for 34 | quick inquiries into the usage of small sets of phrases. 35 | 36 | Please respect the terms of service of the Google Books Ngram Viewer while 37 | using this code. This code is meant to help viewers retrieve data behind 38 | a few queries, not bang at Google's servers with dozens of queries. 39 | The complete dataset can be 40 | \href{https://storage.googleapis.com/books/ngrams/books/datasetsv3.html}{downloaded here}. 41 | } 42 | \references{ 43 | Michel, Jean-Baptiste, et al. "Quantitative analysis of culture using 44 | millions of digitized books." \emph{Science} 331, No. 6014 (2011): 176--182. 45 | } 46 | \seealso{ 47 | Useful links: 48 | \itemize{ 49 | \item \url{https://github.com/seancarmody/ngramr} 50 | \item Report bugs at \url{https://github.com/seancarmody/ngramr/issues} 51 | } 52 | 53 | } 54 | \author{ 55 | \strong{Maintainer}: Sean Carmody \email{seancarmody@gmail.com} [copyright holder] 56 | 57 | } 58 | \keyword{internal} 59 | -------------------------------------------------------------------------------- /man/ngramw.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/ngramw.R 3 | \name{ngramw} 4 | \alias{ngramw} 5 | \title{Get n-gram frequencies ("wide" format)} 6 | \usage{ 7 | ngramw(phrases, ignore_case = FALSE, ...) 8 | } 9 | \arguments{ 10 | \item{phrases}{vector of phrases} 11 | 12 | \item{ignore_case}{ignore case of phrases (i.e. call \code{ngrami} 13 | rather than \code{ngram}). 
Default value is \code{FALSE}.} 14 | 15 | \item{...}{remaining parameters passed to \code{ngram}} 16 | } 17 | \description{ 18 | Get n-gram frequencies ("wide" format) 19 | } 20 | -------------------------------------------------------------------------------- /man/print.ngram.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/methods.R 3 | \name{print.ngram} 4 | \alias{print.ngram} 5 | \title{Print n-gram contents} 6 | \usage{ 7 | \method{print}{ngram}(x, rows = 6, ...) 8 | } 9 | \arguments{ 10 | \item{x}{ngram object as returned by \code{link{ngram}}} 11 | 12 | \item{rows}{number of rows to print. Default is 6.} 13 | 14 | \item{...}{additional parameters passed to default print method.} 15 | } 16 | \description{ 17 | Print n-gram contents 18 | } 19 | \examples{ 20 | \donttest{x <- ngram(c("hacker", "programmer"), year_start = 1950) 21 | print(x) 22 | } 23 | } 24 | -------------------------------------------------------------------------------- /man/theme_google.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/themes.R 3 | \name{theme_google} 4 | \alias{theme_google} 5 | \title{Google Ngram theme for ggplot2} 6 | \usage{ 7 | theme_google(...) 8 | } 9 | \arguments{ 10 | \item{...}{additional parameters to pass to \code{theme}} 11 | } 12 | \description{ 13 | Google Ngram theme for ggplot2 14 | } 15 | \details{ 16 | Use a Google Ngram-style plot theme. 
17 | } 18 | -------------------------------------------------------------------------------- /testme: -------------------------------------------------------------------------------- 1 | Rscript -e 'library(devtools); load_all(); test(reporter="minimal")' 2 | -------------------------------------------------------------------------------- /tests/results.txt: -------------------------------------------------------------------------------- 1 | 1 tackle_* "" NGRAM_COLLECTION 220 2 | 2 tackle_NOUN "" NGRAM 220 3 | 3 tackle_NOUN "tackle_*" EXPANSION 220 4 | 4 tackle_VERB "tackle_*" EXPANSION 220 5 | 6 | 1 duPont "Dupont (All)" EXPANSION 220 7 | 2 Dupont "Dupont (All)" EXPANSION 220 8 | 3 DuPont "Dupont (All)" EXPANSION 220 9 | 4 DUPONT "Dupont (All)" EXPANSION 220 10 | 5 Dupont (All) "" CASE_INSENSITIVE 220 11 | 6 Fitzgerald "Fitzgerald (All)" EXPANSION 220 12 | 7 FitzGerald "Fitzgerald (All)" EXPANSION 220 13 | 8 FITZGERALD "Fitzgerald (All)" EXPANSION 220 14 | 9 Fitzgerald (All) "" CASE_INSENSITIVE 220 15 | 16 | 1 read _DET_ book "" NGRAM 220 17 | 18 | 1 read * _DET_ book "" NGRAM_COLLECTION 220 19 | 2 read as _DET_ book "read * _DET_ book" EXPANSION 220 20 | 3 read from _DET_ book "read * _DET_ book" EXPANSION 220 21 | 4 read in _DET_ book "read * _DET_ book" EXPANSION 220 22 | 5 read like _DET_ book "read * _DET_ book" EXPANSION 220 23 | 6 read of _DET_ book "read * _DET_ book" EXPANSION 220 24 | 7 read over _DET_ book "read * _DET_ book" EXPANSION 220 25 | 8 read such _DET_ book "read * _DET_ book" EXPANSION 220 26 | 9 read through _DET_ book "read * _DET_ book" EXPANSION 220 27 | 10 read upon _DET_ book "read * _DET_ book" EXPANSION 220 28 | 11 read with _DET_ book "read * _DET_ book" EXPANSION 220 29 | -------------------------------------------------------------------------------- /tests/testthat.R: -------------------------------------------------------------------------------- 1 | library(testthat) 2 | library(ngramr) 3 | 4 | test_check("ngramr") 5 | 
-------------------------------------------------------------------------------- /tests/testthat/test-ngramr.R: -------------------------------------------------------------------------------- 1 | context("Package") 2 | test_that("package data", { 3 | expect_equal(dim(hacker), c(236, 4)) 4 | expect_equal(class(hacker)[1], "ngram") 5 | expect_equal(dim(corpuses), c(44, 7)) 6 | expect_equal(dim(corpus_totals), c(12945, 5)) 7 | expect_equal(unlist(corpus_totals[12945,], use.names = FALSE), 8 | c("es-2019", 2019, 1658430069, 10286019, 24720)) 9 | }) 10 | 11 | test_that("utility functions", { 12 | expect_equal(chunk(letters, len=4)[[4]], letters[12:15]) 13 | }) 14 | 15 | context("Google") 16 | test_that("google calls", { 17 | skip_if_offline() 18 | skip_if(is.null(ngram("dog")), "Google Ngram calls not succeeding.") 19 | expect_equal(dim( ngrami("dog", year_start = 1950, year_end = 2020)), c(71, 4)) 20 | expect_equal(dim(ngram(c("hacker", "programmer"), corpus = c("en-2012", "en-US-2012"), 21 | year_start = 1950, year_end = 2008)), dim(hacker)) 22 | expect_equal(dim(ngramw(hacker)), c(118, 4)) 23 | expect_equal(dim(ngram(c("military"), corpus = "en-2012", year_start = 1940, 24 | year_end = 2005, smoothing = 0)), c(66, 4)) 25 | }) 26 | 27 | --------------------------------------------------------------------------------