├── .github └── workflows │ └── rhub.yaml ├── .gitignore ├── .travis.yml ├── DESCRIPTION ├── LICENSE ├── LICENSE.md ├── NAMESPACE ├── NEWS.md ├── R ├── data.R ├── ggram.R ├── methods.R ├── ngram.R ├── ngrami.R ├── ngramr-package.R ├── ngramw.R ├── sysdata.rda ├── themes.R └── utilities.R ├── README.md ├── cran-comments.md ├── man ├── chunk.Rd ├── corpuses.Rd ├── figures │ ├── archy.png │ └── hacker.png ├── ggram.Rd ├── hacker.Rd ├── ngram.Rd ├── ngrami.Rd ├── ngramr.Rd ├── ngramw.Rd ├── print.ngram.Rd └── theme_google.Rd ├── testme └── tests ├── results.txt ├── testthat.R └── testthat └── test-ngramr.R /.github/workflows/rhub.yaml: -------------------------------------------------------------------------------- 1 | # R-hub's generic GitHub Actions workflow file. It's canonical location is at 2 | # https://github.com/r-hub/actions/blob/v1/workflows/rhub.yaml 3 | # You can update this file to a newer version using the rhub2 package: 4 | # 5 | # rhub::rhub_setup() 6 | # 7 | # It is unlikely that you need to modify this file manually. 8 | 9 | name: R-hub 10 | run-name: "${{ github.event.inputs.id }}: ${{ github.event.inputs.name || format('Manually run by {0}', github.triggering_actor) }}" 11 | 12 | on: 13 | workflow_dispatch: 14 | inputs: 15 | config: 16 | description: 'A comma separated list of R-hub platforms to use.' 17 | type: string 18 | default: 'linux,windows,macos' 19 | name: 20 | description: 'Run name. You can leave this empty now.' 21 | type: string 22 | id: 23 | description: 'Unique ID. You can leave this empty now.' 
24 | type: string 25 | 26 | jobs: 27 | 28 | setup: 29 | runs-on: ubuntu-latest 30 | outputs: 31 | containers: ${{ steps.rhub-setup.outputs.containers }} 32 | platforms: ${{ steps.rhub-setup.outputs.platforms }} 33 | 34 | steps: 35 | # NO NEED TO CHECKOUT HERE 36 | - uses: r-hub/actions/setup@v1 37 | with: 38 | config: ${{ github.event.inputs.config }} 39 | id: rhub-setup 40 | 41 | linux-containers: 42 | needs: setup 43 | if: ${{ needs.setup.outputs.containers != '[]' }} 44 | runs-on: ubuntu-latest 45 | name: ${{ matrix.config.label }} 46 | strategy: 47 | fail-fast: false 48 | matrix: 49 | config: ${{ fromJson(needs.setup.outputs.containers) }} 50 | container: 51 | image: ${{ matrix.config.container }} 52 | 53 | steps: 54 | - uses: r-hub/actions/checkout@v1 55 | - uses: r-hub/actions/platform-info@v1 56 | with: 57 | token: ${{ secrets.RHUB_TOKEN }} 58 | job-config: ${{ matrix.config.job-config }} 59 | - uses: r-hub/actions/setup-deps@v1 60 | with: 61 | token: ${{ secrets.RHUB_TOKEN }} 62 | job-config: ${{ matrix.config.job-config }} 63 | - uses: r-hub/actions/run-check@v1 64 | with: 65 | token: ${{ secrets.RHUB_TOKEN }} 66 | job-config: ${{ matrix.config.job-config }} 67 | 68 | other-platforms: 69 | needs: setup 70 | if: ${{ needs.setup.outputs.platforms != '[]' }} 71 | runs-on: ${{ matrix.config.os }} 72 | name: ${{ matrix.config.label }} 73 | strategy: 74 | fail-fast: false 75 | matrix: 76 | config: ${{ fromJson(needs.setup.outputs.platforms) }} 77 | 78 | steps: 79 | - uses: r-hub/actions/checkout@v1 80 | - uses: r-hub/actions/setup-r@v1 81 | with: 82 | job-config: ${{ matrix.config.job-config }} 83 | token: ${{ secrets.RHUB_TOKEN }} 84 | - uses: r-hub/actions/platform-info@v1 85 | with: 86 | token: ${{ secrets.RHUB_TOKEN }} 87 | job-config: ${{ matrix.config.job-config }} 88 | - uses: r-hub/actions/setup-deps@v1 89 | with: 90 | job-config: ${{ matrix.config.job-config }} 91 | token: ${{ secrets.RHUB_TOKEN }} 92 | - uses: r-hub/actions/run-check@v1 93 | with: 94 | 
job-config: ${{ matrix.config.job-config }} 95 | token: ${{ secrets.RHUB_TOKEN }} 96 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # OS X 2 | .DS_Store 3 | 4 | # History files 5 | .Rhistory 6 | 7 | # Example code in package build process 8 | *-Ex.R 9 | .Rproj.user 10 | .RData 11 | .Rprofile 12 | .Renviron 13 | *.Rproj 14 | google/* 15 | .Rbuildignore 16 | CRAN-SUBMISSION 17 | working/* 18 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | # R for travis: see documentation at https://docs.travis-ci.com/user/languages/r 2 | 3 | language: r 4 | 5 | r: 6 | - release 7 | - devel 8 | 9 | cache: packages 10 | 11 | r_packages: 12 | - covr 13 | 14 | after_success: 15 | - Rscript -e 'library(covr); codecov()' 16 | -------------------------------------------------------------------------------- /DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: ngramr 2 | Type: Package 3 | Title: Retrieve and Plot Google n-Gram Data 4 | Version: 1.10.0 5 | Date: 2025-01-10 6 | Authors@R: c( 7 | person("Sean", "Carmody", email = "seancarmody@gmail.com", role = c("aut", "cre", "cph")) 8 | ) 9 | Maintainer: Sean Carmody 10 | Description: Retrieve and plot word frequencies through time from the "Google 11 | Ngram Viewer" . 
12 | Depends: 13 | R (>= 4.0.0) 14 | Imports: 15 | httr, 16 | rlang, 17 | curl, 18 | dplyr (>= 1.0.3), 19 | cli, 20 | tibble, 21 | tidyr, 22 | rjson, 23 | stringr, 24 | ggplot2, 25 | scales, 26 | xml2, 27 | textutils 28 | URL: https://github.com/seancarmody/ngramr 29 | BugReports: https://github.com/seancarmody/ngramr/issues 30 | License: MIT + file LICENSE 31 | RoxygenNote: 7.3.2 32 | Roxygen: list(markdown = TRUE) 33 | Encoding: UTF-8 34 | Suggests: 35 | testthat 36 | Language: en-AU 37 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | YEAR: 2023 2 | COPYRIGHT HOLDER: Sean Carmody 3 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | # MIT License 2 | 3 | Copyright (c) 2022 Sean Carmody 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /NAMESPACE: -------------------------------------------------------------------------------- 1 | # Generated by roxygen2: do not edit by hand 2 | 3 | S3method("[",ngram) 4 | S3method(print,ngram) 5 | export(chunk) 6 | export(corpuses) 7 | export(ggram) 8 | export(hacker) 9 | export(ngram) 10 | export(ngrami) 11 | export(ngramw) 12 | export(theme_google) 13 | import(dplyr) 14 | import(ggplot2) 15 | import(tidyr) 16 | importFrom(rlang,.data) 17 | -------------------------------------------------------------------------------- /NEWS.md: -------------------------------------------------------------------------------- 1 | # ngramr 1.10.0 2 | 3 | * NOTE: this is a major release and removes some functionality 4 | * Update for new corpuses 5 | * Remove the add_count option (no data provided by Google) 6 | * Remove drop_corpus (Google no longer supports the :corpus operator for the current corpus) 7 | 8 | # ngramr 1.9.1-1.9.3 9 | 10 | * Fix package after more changes to the Google Ngram Viewer website 11 | 12 | # ngramr 1.9.0 13 | 14 | * Fix package after latest changes to the Google Ngram Viewer website 15 | 16 | # ngramr 1.8.3 17 | 18 | * Improved error handling 19 | 20 | # ngramr 1.8.2 21 | 22 | * Suppress testing of all examples that make internet calls (fail gracefully) 23 | 24 | # ngramr 1.8.1 25 | 26 | * Handle offline state 27 | * Skip testing if offline 28 | 29 | # ngramr 1.8.0 30 | 31 | * Incremented version to reflect that 1.7.7 was a major release 32 | * Rolled back use of |> for compatibility with earlier versions of R 33 | 34 | # ngramr 1.7.7 35 | 36 | * Update for changes in ngram viewer website 37 | 
* New corpus names (e.g. eng_2019 changed to en_2019) 38 | 39 | # ngramr 1.7.6 40 | 41 | * Drop use of lifecycle badges 42 | * Add markdown format NEWS file 43 | 44 | # ngramr 1.7.5 45 | 46 | * Tidied fromJSON call 47 | * Started to use lifecycle in documentation (ngrami) 48 | 49 | # ngramr 1.7.4 50 | 51 | * Imposed version dependency for dplyr to ensure relocate available 52 | 53 | # ngramr 1.7.3 54 | 55 | * Updated documentation to provide details of return values 56 | 57 | # ngramr 1.7.2 58 | 59 | * Change download code to use 'url' to ensure code works behind a proxy server 60 | * Addressed CRAN submission requirements 61 | 62 | # ngramr 1.7.1 63 | 64 | * Change year_start default to 1800 in documentation 65 | 66 | # ngramr 1.7.0 67 | 68 | * Comprehensive refactor of underlying code 69 | * More robust error/warning handling 70 | * Dropped the "tag" argument from ngram functions 71 | 72 | # ngramr 1.6.5 73 | 74 | * Fix case_sensitive attribute 75 | 76 | # ngramr 1.6.4 77 | 78 | * Fix error in corpus count dataset 79 | 80 | # ngramr 1.6.0 81 | 82 | * Update to address issue (#26) resulting from change in the format of Google Ngram Viewer webpage 83 | 84 | # ngramr 1.5.0 85 | 86 | * Incorporated pull [changes #22, @seancarmody](https://github.com/seancarmody/ngramr/pull/22) 87 | * Make wildcard searches expand to all terms 88 | * Error out on server answer "Please try again later." 1a655f3 89 | * Fix setting default corpus. 0b22dc4 90 | * scale functions: do not explicitly set name, allow overwrite. 3a21061 91 | * Allow passing through additional parameters to ngram_single. 
ac6b1cc 92 | * For wildcard searches, drop the cumulated (All) column 93 | * Added travis-ci testing 94 | 95 | # ngramr 1.4.5 96 | 97 | * Fixed problems with (some) advanced operators 98 | 99 | 100 | # ngramr 1.4.4 101 | 102 | * Removed debugging from ngrami 103 | 104 | # ngramr 1.4.3 105 | 106 | * Fixed the Pulser bug 107 | 108 | # ngramr 1.4.2 109 | 110 | * Fix accented character encoding problem on Windows 111 | 112 | # ngramr 1.4.1 113 | 114 | * Improve ssl handling (refer Hadley's comment here: http://www.statsravingmad.com/blog/statistics/a-tiny-rcurl-headache/) 115 | 116 | # ngramr 1.4.0 117 | 118 | Google has switched to SSL for the N-gram viewer and the format of the web-pages has 119 | changed. This means that earlier versions of the package are completely broken. This 120 | release fixes this major problem. 121 | 122 | # ngramr 1.3.2 123 | 124 | * Add README.md to .Rbuildignore to remove from CRAN 125 | 126 | # ngramr 1.3.1 127 | 128 | * Fix count for n-grams with n>1, including a "fudge" for 2012 corpuses 129 | 130 | # ngramr 1.3.0 131 | 132 | * Add option to display long-form corpus name 133 | * Warn about smoothing >0 for geoms other than "line" 134 | * Tidy documentation for print.ngram 135 | * ngram and ngrami return S3 class "ngram" 136 | * Format print for ngram objects 137 | * ggram can take either a list of phrases or an ngram object 138 | 139 | # ngramr 1.2.4 140 | 141 | * Add option to relabel y-axis 142 | * Add word counts option to ngram 143 | * Change ggplot2 and scales from Requires to Suggests 144 | 145 | # ngramr 1.2.3 146 | 147 | * Prevent use of complex operators in case insensitive searches 148 | * Warn about character substitution 149 | 150 | # ngramr 1.2.2 151 | 152 | * CRAN release version 153 | * More efficient handling of escaped Unicode (thanks Hadley http://stackoverflow.com/a/17787736/1543437) 154 | * Fix package checking problems associated with plyr 155 | 156 | # ngramr 1.2.1 157 | 158 | * Tidy Google theme 159 | 160 | # 
ngramr 1.2.0 161 | 162 | * First semi-official release. All future development moved to the 'develop' branch. 163 | * Allow case insensitive plotting with ggram 164 | * Avoid reshape/reshape2 conflicts (thanks to Francois Briatte) 165 | * Pass arbitrary geoms to `ggram` 166 | * New function `ngramw` to return results in "wide" format 167 | * Removed `wide` option from `ggram` and `ggrami` 168 | * Better handling of legends when `ignore_case = TRUE` 169 | * Error trapping long phrase lists 170 | * Google theme option 171 | 172 | # ngramr 1.1 173 | 174 | * Added plot wrapper ggram 175 | * Detect invalid corpus names 176 | 177 | # ngramr 1.0 178 | 179 | * Initial release of the ngramr package -------------------------------------------------------------------------------- /R/data.R: -------------------------------------------------------------------------------- 1 | #' Sample n-gram data 2 | #' 3 | #' Frequency data for the phrases "hacker", "programmer", from 1950 to 2008. 4 | #' 5 | #' @docType data 6 | #' @usage hacker 7 | #' @name hacker 8 | #' @format a 236 x 4 ngram data frame 9 | #' @keywords datasets 10 | #' @export 11 | #' 12 | NULL 13 | 14 | #' Google n-gram corpus information 15 | #' 16 | #' Details of the various corpuses available through the Google n-gram tool 17 | #' 18 | #' @docType data 19 | #' @usage corpuses 20 | #' @name corpuses 21 | #' @format 44 x 6 ngram data frame 22 | #' @keywords datasets 23 | #' @export 24 | #' 25 | NULL 26 | -------------------------------------------------------------------------------- /R/ggram.R: -------------------------------------------------------------------------------- 1 | #' Plot n-gram frequencies 2 | #' 3 | #' \code{ggram} downloads data from the Google Ngram Viewer website and 4 | #' plots it in \code{ggplot2} style. 5 | #' 6 | #' @param phrases vector of phrases. Alternatively, phrases can be an ngram 7 | #' object returned by \code{\link{ngram}} or \code{\link{ngrami}}. 
8 | #' @param ignore_case logical, indicating whether the frequencies are case 9 | #' insensitive. 10 | #' Default is \code{FALSE}. 11 | #' @param geom the ggplot2 geom used to plot the data; defaults to "line" 12 | #' @param geom_options list of additional parameters passed to the ggplot2 geom. 13 | #' @param lab y-axis label. Defaults to "Frequency". 14 | #' @param google_theme use a Google Ngram-style plot theme. 15 | #' @param ... additional parameters passed to \code{ngram} 16 | #' @details 17 | #' Google generated two datasets drawn from digitised books in the Google 18 | #' books collection. One was generated in July 2009, the second in July 2012. 19 | #' Google will update these datasets as book scanning continues. 20 | #' 21 | #' @examples 22 | #' \donttest{library(ggplot2) 23 | #' ggram(c("hacker", "programmer"), year_start = 1950) 24 | #' 25 | #' # Changing the geom. 26 | #' ggram(c("cancer", "fumer", "cigarette"), 27 | #' year_start = 1900, 28 | #' corpus = "fr-2012", 29 | #' smoothing = 0, 30 | #' geom = "step") 31 | #' 32 | #' # Passing more options. 33 | #' ggram(c("cancer", "smoking", "tobacco"), 34 | #' year_start = 1900, 35 | #' corpus = "en-fiction-2012", 36 | #' geom = "point", 37 | #' smoothing = 0, 38 | #' geom_options = list(alpha = .5)) + 39 | #' stat_smooth(method="loess", se = FALSE, formula = y ~ x) 40 | #' 41 | #' # Setting the layers manually. 42 | #' ggram(c("cancer", "smoking", "tobacco"), 43 | #' year_start = 1900, 44 | #' corpus = "en-fiction-2012", 45 | #' smoothing = 0, 46 | #' geom = NULL) + 47 | #' stat_smooth(method="loess", se=FALSE, span = 0.3, formula = y ~ x) 48 | #' 49 | #' # Setting the legend placement on a long query and using the Google theme. 50 | #' # Example taken from a post by Ben Zimmer at Language Log. 
51 | #' p <- c("((The United States is + The United States has) / The United States)", 52 | #' "((The United States are + The United States have) / The United States)") 53 | #' ggram(p, year_start = 1800, google_theme = TRUE) + 54 | #' theme(legend.direction="vertical") 55 | #' 56 | #' # Pass ngram data rather than phrases 57 | #' ggram(hacker) + facet_wrap(~ Corpus) 58 | #'} 59 | #' @export 60 | 61 | ggram <- function(phrases, ignore_case = FALSE, 62 | geom = "line", geom_options = list(), lab = NA, 63 | google_theme = FALSE, ...) { 64 | if ("ngram" %in% class(phrases)) { 65 | ng <- phrases 66 | } else { 67 | if (ignore_case) { 68 | ng <- ngrami(phrases, ...) 69 | } else { 70 | ng <- ngram(phrases, ...) 71 | } 72 | } 73 | if (is.null(ng)) { 74 | message("Unable to plot: no data returned") 75 | return(invisible(NULL)) 76 | } 77 | if (is.character(geom) && 78 | !(geom %in% c("area", "line")) && attr(ng, "smoothing") > 0) { 79 | warning("ngram data is smoothed. Consider setting smoothing = 0.") 80 | } 81 | if (!"Year" %in% names(ng)) stop("No ngram data returned") 82 | ng <- within(ng, Year <- as.Date(paste(Year, 1, 1, sep = "-"))) 83 | p <- ggplot(data = ng, 84 | aes_string(x = "Year", y = "Frequency", 85 | colour = "Phrase", fill = "Phrase", 86 | label = "Phrase")) 87 | if (!inherits(geom, "character")) geom <- NULL 88 | if (!is.null(geom)) p <- p + do.call(stat_identity, 89 | c(geom = geom, geom_options)) 90 | p <- p + labs(x = NULL) 91 | if (google_theme) { 92 | # Google Ngram palette. 
93 | p <- p + 94 | scale_colour_google() + 95 | scale_fill_google() + 96 | theme_google() + labs(y = NULL, colour = NULL) + 97 | scale_x_date(expand = c(0, 0)) + 98 | scale_y_continuous(expand = c(0, 0), labels = scales::percent) 99 | } else { 100 | p <- p + 101 | scale_colour_discrete("") + 102 | scale_fill_discrete("") + 103 | scale_y_continuous(labels = scales::percent) 104 | } 105 | if (!is.na(lab)) p <- p + labs(y = lab) 106 | return(p) 107 | } 108 | -------------------------------------------------------------------------------- /R/methods.R: -------------------------------------------------------------------------------- 1 | #' Print n-gram contents 2 | #' 3 | #' @param x ngram object as returned by \code{link{ngram}} 4 | #' @param rows number of rows to print. Default is 6. 5 | #' @param ... additional parameters passed to default print method. 6 | #' @export 7 | #' @method print ngram 8 | #' @examples 9 | #' \donttest{x <- ngram(c("hacker", "programmer"), year_start = 1950) 10 | #' print(x) 11 | #' } 12 | 13 | print.ngram <- function(x, rows=6, ...) { 14 | df <- x 15 | class(df) <- class(df)[-1] 16 | np.rows <- dim(df)[1] - rows 17 | 18 | if (all(c("Phrase", "Corpus", "Year") %in% names(x))) { 19 | cli::cat_line("# Ngram data table", col = "green") 20 | cli::cat_line("# Phrases:\t\t", paste(levels(x$Phrase), collapse = ", ")) 21 | cli::cat_line("# Case-sensitive:\t", attributes(x)$case_sensitive) 22 | cli::cat_line("# Corpuses:\t\t", paste(levels(x$Corpus), collapse = ", ")) 23 | cli::cat_line("# Smoothing:\t\t", attributes(x)$smoothing) 24 | cli::cat_line("# Years:\t\t", min(x$Year), "-", max(x$Year)) 25 | cat("\n") 26 | } 27 | 28 | print(utils::head(as.data.frame(df), rows)) 29 | if (np.rows > 0) { 30 | cli::cat_line(cli::cli_text(cli::col_grey("# ... with {np.rows} more row{?s}"))) 31 | } 32 | invisible(x) 33 | } 34 | 35 | #' @export 36 | `[.ngram` <- function(x, ...) { 37 | class(x) <- class(x)[-1] 38 | x <- x[...] 
39 | if (all(c("Phrase", "Corpus", "Year") %in% names(x))) class(x) <- c("ngram", class(x)) 40 | return(x) 41 | } 42 | -------------------------------------------------------------------------------- /R/ngram.R: -------------------------------------------------------------------------------- 1 | #' Get n-gram frequencies 2 | #' 3 | #' `ngram` downloads data from the Google Ngram Viewer website and 4 | #' returns it in a tibble. 5 | #' 6 | #' @param phrases vector of phrases, with a maximum of 12 items 7 | #' @param corpus Google corpus to search (see Details for possible values) 8 | #' @param year_start start year, default is 1800. Data available back to 1500. 9 | #' @param year_end end year, default is 2008 10 | #' @param smoothing smoothing parameter, default is 3 11 | #' @param case_ins Logical indicating whether to force a case insensitive search. 12 | #' Default is `FALSE`. 13 | #' @param aggregate Sum up the frequencies for ngrams associated with wildcard 14 | #' or case insensitive searches. Default is `FALSE`. 15 | #' @param count Default is `FALSE`. 16 | #' @param drop_parent Drop the parent phrase associated with a wildcard 17 | #' or case-insensitive search. Default is `FALSE`. 18 | #' @param drop_all Delete the suffix "(All)" from aggregated case-insensitive 19 | #' searches. Default is `FALSE`. 20 | #' @param type Include the Google return type (e.g. NGRAM, NGRAM_COLLECTION, 21 | #' EXPANSION) from result set. Default is `FALSE`. 22 | #' @return `ngram` returns an object of class "`ngram`", 23 | #' which is a tidyverse `tibble` enriched with attributes reflecting 24 | #' some of the parameters used in the Ngram Viewer query. 25 | #' @details 26 | #' Google generated two datasets drawn from digitised books in the Google 27 | #' Books collection. One was generated in July 2009, the second in July 2012 28 | #' and the third in 2019. Google is expected to update these datasets as book 29 | #' scanning continues. 
30 | #' 31 | #' This function provides the annual frequency of words or phrases, known 32 | #' as n-grams, in a sub-collection or "corpus" taken from the Google Books 33 | #' collection.The search across the corpus is case-sensitive. 34 | #' 35 | #' If the function is unable to retrieve data from the Google Ngram Viewer 36 | #' site (either because of access issues or if the format of Google's site 37 | #' has changed) a NULL result is returned and messages are printed to the 38 | #' console but no errors or warnings are raised (this is to align with 39 | #' CRAN package policies). 40 | #' 41 | #' Below is a list of available corpora. Note that the data for the 2012 42 | #' corpuses only extends to 2009. 43 | #' \tabular{ll}{ 44 | #' \bold{Corpus} \tab \bold{Corpus Name}\cr 45 | #' en-US-2019\tab American English 2019\cr 46 | #' en-US-2012\tab American English 2012\cr 47 | #' en-US-2009\tab American English 2009\cr 48 | #' en-GB-2019\tab British English 2019\cr 49 | #' en-GB-2012\tab British English 2012\cr 50 | #' en-GB-2009\tab British English 2009\cr 51 | #' zh-Hans-2019\tab Chinese 2019\cr 52 | #' zh-Hans-2012\tab Chinese 2012\cr 53 | #' zh-Hans-2009\tab Chinese 2009\cr 54 | #' en-2019\tab English 2019\cr 55 | #' en-2012\tab English 2012\cr 56 | #' en-2009\tab English 2009\cr 57 | #' en-fiction-2019\tab English Fiction 2019\cr 58 | #' en-fiction-2012\tab English Fiction 2012\cr 59 | #' en-fiction-2009\tab English Fiction 2009\cr 60 | #' en-1M-2009\tab English One Million\cr 61 | #' fr-2019\tab French 2019\cr 62 | #' fr-2012\tab French 2012\cr 63 | #' fr-2009\tab French 2009\cr 64 | #' de-2019\tab German 2019\cr 65 | #' de-2012\tab German 2012\cr 66 | #' de-2009\tab German 2009\cr 67 | #' iw-2019\tab Hebrew 2019\cr 68 | #' iw-2012\tab Hebrew 2012\cr 69 | #' iw-2009\tab Hebrew 2009\cr 70 | #' es-2019\tab Spanish 2019\cr 71 | #' es-2012\tab Spanish 2012\cr 72 | #' es-2009\tab Spanish 2009\cr 73 | #' ru-2019\tab Russian 2019\cr 74 | #' ru-2012\tab Russian 2012\cr 
75 | #' ru-2009\tab Russian 2009\cr
76 | #' it-2019\tab Italian 2019\cr
77 | #' it-2012\tab Italian 2012\cr
78 | #' }
79 | #'
80 | #' The Google Million is a sub-collection of Google Books. All are in
81 | #' English with dates ranging from 1500 to 2008.
82 | #' No more than about 6,000 books were chosen from any one year, which
83 | #' means that all of the scanned books from early years are present,
84 | #' and books from later years are randomly sampled. The random samplings
85 | #' reflect the subject distributions for the year (so there are more
86 | #' computer books in 2000 than 1980).
87 | #'
88 | #' See \url{http://books.google.com/ngrams/info} for the full Ngram syntax.
89 | #' @examples
90 | #' \donttest{ngram(c("mouse", "rat"), year_start = 1950)
91 | #' ngram(c("blue_ADJ", "red_ADJ"))
92 | #' ngram(c("_START_ President Roosevelt", "_START_ President Truman"), year_start = 1920)
93 | #' }
94 | #' @export
95 |
96 | ngram <- function(phrases, corpus = "en", year_start = 1800,
97 |                   year_end = 2022, smoothing = 3, case_ins=FALSE,
98 |                   aggregate = FALSE, count = FALSE,
99 |                   drop_parent = FALSE, drop_all = FALSE, type = FALSE) {
100 |   #if (!curl::has_internet()) {stop("Unable to access internet.")}
101 |   phrases <- ngram_check_phrases(phrases)
102 |   # Loop over corpuses
103 |   dfs <- lapply(corpus, function(corp) ngram_single(phrases, corpus = corp,
104 |                                                     year_start = year_start,
105 |                                                     year_end = year_end,
106 |                                                     smoothing = smoothing,
107 |                                                     case_ins = case_ins))
108 |   ng <- bind_rows(dfs)
109 |   if (length(ng) == 0) return(NULL)
110 |   class(ng) <- c("ngram", class(ng))
111 |   ng <- truncate_years(ng)
112 |   if (aggregate) {
113 |     ng <- filter(ng, .data$type != "EXPANSION")
114 |   } else {
115 |     ng <- filter(ng, .data$type %in% c("NGRAM", "EXPANSION"))
116 |   }
117 |   # Note: removed a stray print(ng) debugging statement here; results are
118 |   # displayed via the print.ngram S3 method, not as a side effect of retrieval.
119 |   if (drop_parent || all(ng$Parent == "")) ng$Parent <- NULL
120 |   if (drop_all) {
121 |     ng <- mutate(ng,
122 |                  Phrase = if_else(type == "CASE_INSENSITIVE",
122 |
stringr::str_replace(.data$Phrase, "\\s*\\(All\\)\\z", ""), 123 | .data$Phrase)) 124 | } 125 | #ng <- select(ng, -"clean") 126 | attr(ng, "smoothing") <- smoothing 127 | attr(ng, "case_sensitive") <- !case_ins 128 | ng$Corpus <- as.factor(ng$Corpus) 129 | ng$Phrase <- as.factor(ng$Phrase) 130 | if (type) ng$Type <- ng$type 131 | ng$type <- NULL 132 | return(ng) 133 | } 134 | 135 | ngram_single <- function(phrases, corpus, year_start, year_end, 136 | smoothing, case_ins) { 137 | if (!(corpus %in% corpuses$Shorthand)) {warning(paste(corpus, "not a valid corpus. Defaulting to en-2019."))} 138 | #corpus <- get_corpus_n(corpus) 139 | query <- as.list(environment()) 140 | if (case_ins) query["case_insensitive"] <- "true" 141 | query$phrases <- NULL 142 | query$case_ins <- NULL 143 | ng_url <- ngram_url(phrases, query) 144 | html <- ngram_fetch_xml(ng_url) 145 | if (is.null(html)){ 146 | ng <- NULL 147 | } else { 148 | ng <- ngram_fetch_data(html) 149 | warnings <- ngram_check_warnings(html) 150 | show_warnings(warnings) 151 | } 152 | return(ng) 153 | } 154 | 155 | ngram_check_phrases <- function(phrases){ 156 | stopifnot(is.character(phrases)) 157 | phrases <- phrases[phrases != ""] 158 | if (length(phrases) == 0) stop("No valid phrases provided.") 159 | if (!all(check_balanced(phrases))) stop("mis-matched parentheses") 160 | if (length(phrases) > 12) { 161 | phrases <- phrases[1:12] 162 | warning("Maximum number of phrases exceeded: only using first 12.") 163 | } 164 | return(phrases) 165 | } 166 | 167 | ngram_fetch_xml <- function(url) { 168 | # retrieve data from Google Ngram Viewer site 169 | # no errors or warnings generated on fail, only messages 170 | try_get <- function(x, ...) 
{ 171 | tryCatch( 172 | httr::GET(url = x, httr::timeout(3), ...), 173 | error = function(e) conditionMessage(e), 174 | warning = function(w) conditionMessage(w) 175 | ) 176 | } 177 | is_response <- function(x) { 178 | class(x) == "response" 179 | } 180 | 181 | # first check internet connection 182 | if (!curl::has_internet()) { 183 | message("No internet connection.") 184 | return(invisible(NULL)) 185 | } 186 | # then try for timeout problems 187 | resp <- try_get(url) 188 | if (!is_response(resp)) { 189 | message("Please check Google's Ngram Viewer site is up.") 190 | message(resp) 191 | return(invisible(NULL)) 192 | } 193 | # then stop if status > 400 194 | if (httr::http_error(resp)) { 195 | message("Please check Google's Ngram Viewer site is up.") 196 | httr::message_for_status(resp) 197 | return(invisible(NULL)) 198 | } 199 | return(xml2::read_html(resp)) 200 | } 201 | 202 | ngram_check_warnings <- function(html) { 203 | node <- xml2::xml_find_first(html, "//div[@id='warning-area']") 204 | warnings <- list() 205 | if (length(node) > 0) { 206 | for (n in xml2::xml_find_all(node, "div")) { 207 | type <- xml2::xml_text(xml2::xml_find_first(n, "mwc-icon")) 208 | msg <- stringr::str_trim(xml2::xml_text(xml2::xml_find_first(n, "span"))) 209 | msg <- stringr::str_replace_all(msg, "\\s+", " ") 210 | msg <- stringr::str_replace(msg, "No valid ngrams to plot!", "No valid ngrams retrieved!") 211 | warnings <- c(warnings, list(list(type = type, message = msg))) 212 | } 213 | } 214 | return(warnings) 215 | } 216 | 217 | ngram_fetch_data <- function(html) { 218 | data <- tryCatch( 219 | { 220 | if (is.null(html)) { 221 | NULL 222 | } else { 223 | corpus <- xml2::xml_find_first(html, "//select[@id='form-corpus']/option") 224 | corpus <- xml2::xml_attr(corpus, "value") 225 | if (grepl("^[0-9]+$", corpus, perl = TRUE)) { 226 | corpus <- get_corpus_text(as.numeric(corpus)) 227 | } 228 | script <- xml2::xml_find_all(html, "//div[@id='chart']/following::script")[1] 229 | json <- 
xml2::xml_text(script) 230 | json <- stringr::str_split(json, "\n")[[1]] 231 | json <- json[json != ''] 232 | json <- stringr::str_squish(json) 233 | years <- xml2::xml_find_all(html, "//div[@id='chart']/following::script")[2] 234 | years <- xml2::xml_text(years) 235 | years <- stringr::str_split(years, "\n")[[1]] 236 | years <- grep('drawD3Chart', years, value = TRUE) 237 | years <- as.integer(stringr::str_split(grep("drawD3Chart", years, value = TRUE), ",")[[1]][2:3]) 238 | data <- rjson::fromJSON(json) 239 | if (length(data) == 0) return(NULL) 240 | data <- lapply(data, 241 | function(x) tibble::add_column(tibble::as_tibble(x), 242 | Year = seq.int(years[1], years[2]))) 243 | data <- bind_rows(data) 244 | data <- mutate(data, ngram = textutils::HTMLdecode(data$ngram), Corpus = corpus) 245 | data <- relocate(data, "Year", "ngram", "timeseries", "Corpus") 246 | data <- rename(data, Phrase = "ngram", Frequency = "timeseries", Parent = "parent") 247 | data 248 | } 249 | }, 250 | error=function(cond) { 251 | message("Error parsing ngram data, please contact package maintainer.") 252 | message("Here's the original error message:") 253 | message(cond) 254 | message("\nError occurred in the following code:") 255 | message(conditionCall(cond)) 256 | return(NULL) 257 | }, 258 | warning=function(cond) { 259 | message("Warning generated when parsing ngram data.") 260 | message("Here's the original warning message:") 261 | message(cond) 262 | return(NULL) 263 | }, 264 | finally = {} 265 | ) 266 | return(data) 267 | } 268 | 269 | ngram_url <- function(phrases, query=character()) { 270 | url <- "https://books.google.com/ngrams/graph" 271 | n <- length(phrases) 272 | for (i in 1:n) { 273 | if (grepl("\\+|/", phrases[i])) phrases[i] <- paste0("(", phrases[i], ")") 274 | p <- phrases[i] 275 | if (!(Encoding(p) %in% c("unknown", "UTF-8"))) { 276 | phrases[i] <- iconv(p, Encoding(p), "UTF-8") 277 | } 278 | } 279 | phrases <- paste(curl::curl_escape(stringr::str_trim(phrases)), 280 
| collapse = "%2c") 281 | if (phrases == "") stop("No valid phrases provided.") 282 | url <- paste0(url, "?content=", phrases) 283 | if (length(query) > 0) url <- httr::modify_url(url, query = query) 284 | url <- gsub("%28", "(", url) 285 | url <- gsub("%29", ")", url) 286 | url <- gsub("%20", "+", url) 287 | return(url) 288 | } 289 | 290 | check_balanced <- function(x) { 291 | # Check parenthesis are appropriately balanced (i.e. every open is closed) 292 | sapply(x, function(str) { 293 | str <- gsub("[^\\(\\)]", "", str) 294 | str <- strsplit(str, "")[[1]] 295 | str <- ifelse(str == "(", 1, -1) 296 | all(cumsum(str) >= 0) && sum(str) == 0 297 | }) 298 | } 299 | 300 | show_warnings <- function(warnings){ 301 | if (length(warnings) > 0) { 302 | for (w in warnings) { 303 | warning(w$message, call. = FALSE) 304 | } 305 | } 306 | } 307 | 308 | get_corpus_n <- function(corpus, default = "en-2019"){ 309 | stopifnot(is.character(corpus)) 310 | n <- corpuses[corpus, "Number"] 311 | if (any(is.na(n)) && !is.na(default)) { 312 | if (is.character(default)) default <- get_corpus_n(default) 313 | stopifnot(default %in% corpuses$Number) 314 | invalid <- paste(corpus[is.na(n)], collapse = ", ") 315 | warning(paste0("Unknown corpus ", invalid, ". Using default corpus instead."), call. = FALSE) 316 | n[is.na(n)] <- default 317 | } 318 | return(n) 319 | } 320 | 321 | get_corpus_text <- function(n, default = NA){ 322 | stopifnot(is.numeric(n)) 323 | text <- row.names(corpuses)[match(n, corpuses$Number)] 324 | if (any(is.na(text)) && !is.na(default)) { 325 | if (is.numeric(default)) default <- get_corpus_text(default) 326 | stopifnot(default %in% row.names(corpuses)) 327 | invalid <- paste(n[is.na(text)], collapse = ", ") 328 | warning(paste0("Unknown corpus ", invalid, ". Using default corpus instead."), call. 
= FALSE) 329 | text[is.na(text)] <- default 330 | } 331 | return(text) 332 | } 333 | 334 | truncate_years <- function(ngram){ 335 | stopifnot(class(ngram)[1] == "ngram") 336 | ngram$Corpus <- as.character(ngram$Corpus) 337 | ngram <- left_join(ngram, select(corpuses, 338 | "Shorthand", 339 | "Last.Year"), 340 | by = c("Corpus" = "Shorthand")) 341 | ngram <- filter(ngram, .data$Year <= .data$Last.Year) 342 | ngram$Last.Year <- NULL 343 | return(ngram) 344 | } 345 | -------------------------------------------------------------------------------- /R/ngrami.R: -------------------------------------------------------------------------------- 1 | #' Get n-gram frequencies (case insensitive version) 2 | #' 3 | #' @param phrases vector of phrases 4 | #' @param aggregate sum up each of the terms 5 | #' @param ... remaining parameters passed to ngram 6 | #' @description 7 | #' This function is a simple wrapper of `ngram` for case insensitive searches. 8 | #' @export 9 | 10 | ngrami <- function(phrases, aggregate = TRUE, ...){ 11 | ngram(phrases, aggregate = aggregate, case_ins = TRUE, drop_all = TRUE, ...) 12 | } 13 | -------------------------------------------------------------------------------- /R/ngramr-package.R: -------------------------------------------------------------------------------- 1 | #' ngramr: Dig into the Google Ngram Viewer using R 2 | #' 3 | #' @description 4 | #' The \href{http://books.google.com/ngrams}{Google Books Ngram Viewer} 5 | #' allows you to enter a list of phrases and then displays a graph showing 6 | #' how often the phrases have occurred in a corpus of books 7 | #' (e.g., "British English", "English Fiction", "French") over time. 8 | #' The underlying data is hidden in web page, embedded in some Javascript. 9 | #' 10 | #' This package extracts the data and provides it in the form of an R dataframe.
11 | #' 12 | #' The key function is \code{ngram} which, given a collection of 13 | #' phrases, returns a dataframe containing the frequencies by year. 14 | #' 15 | #' The code is based on the \code{getNgrams.py} Python script available on 16 | #' \href{https://web.archive.org/web/20221129120802/https://www.culturomics.org/}{Culturomics Code} 17 | #' written by Jean-Baptiste Michel. The Culturomics website doesn't 18 | #' exist anymore but can still be found 19 | #' \href{https://web.archive.org/web/20221129220150/https://www.culturomics.org/Resources/get-ngrams}{on archive.org} 20 | #' and is worth exploring. 21 | #' 22 | #' Note that compared to the 2009 versions, the 2012 and 2019 versions have 23 | #' larger numbers of books, improved OCR, improved library and publisher 24 | #' metadata. The 2012 and 2019 corpuses also don't form ngrams that cross 25 | #' sentence boundaries, and do form ngrams across page boundaries and 26 | #' support part of speech tagging, unlike the 2009 versions. 27 | #' 28 | #' Like the Google Ngram Viewer website itself, this package is aimed at 29 | #' quick inquiries into the usage of small sets of phrases. 30 | #' 31 | #' Please respect the terms of service of the Google Books Ngram Viewer while 32 | #' using this code. This code is meant to help viewers retrieve data behind 33 | #' a few queries, not bang at Google's servers with dozens of queries. 34 | #' The complete dataset can be 35 | #' \href{https://storage.googleapis.com/books/ngrams/books/datasetsv3.html}{downloaded here}. 36 | #' 37 | #' @references 38 | #' Michel, Jean-Baptiste, et al. "Quantitative analysis of culture using 39 | #' millions of digitized books." \emph{Science} 331, No. 6014 (2011): 176--182.
40 | #' 41 | #' @keywords internal 42 | #' @import dplyr tidyr ggplot2 43 | #' @importFrom rlang .data 44 | #' @docType package 45 | #' @name ngramr 46 | #' @aliases ngramr ngramr-package 47 | "_PACKAGE" 48 | -------------------------------------------------------------------------------- /R/ngramw.R: -------------------------------------------------------------------------------- 1 | #' Get n-gram frequencies ("wide" format) 2 | #' 3 | #' @param phrases vector of phrases 4 | #' @param ignore_case ignore case of phrases (i.e. call \code{ngrami} 5 | #' rather than \code{ngram}). Default value is \code{FALSE}. 6 | #' @param ... remaining parameters passed to \code{ngram} 7 | #' @export 8 | 9 | ngramw <- function(phrases, ignore_case=FALSE, ...) { 10 | if ("ngram" %in% class(phrases)) { 11 | ng <- phrases 12 | } else { 13 | ng <- if (ignore_case) ngrami(phrases, ...) else ngram(phrases, ...) 14 | } 15 | if (is.null(ng)) return(NULL) 16 | ng <- pivot_wider(ng, names_from = "Phrase", values_from = "Frequency") 17 | return(ng) 18 | } 19 | -------------------------------------------------------------------------------- /R/sysdata.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/seancarmody/ngramr/32f11bea7531db06afb7c8c9d40a80be15663227/R/sysdata.rda -------------------------------------------------------------------------------- /R/themes.R: -------------------------------------------------------------------------------- 1 | #' Google Ngram theme for ggplot2 2 | #' 3 | #' @param ... additional parameters to pass to \code{theme} 4 | #' 5 | #' @details 6 | #' Use a Google Ngram-style plot theme. 7 | #' 8 | #' @export 9 | 10 | theme_google <- function(...) 
{ 11 | theme(panel.border = element_rect(colour = "grey", size = 0.2, fill = NA), 12 | panel.background = element_rect(fill = NA), 13 | axis.line = element_line(colour = "black", size = 0.3), 14 | panel.grid.major = element_line(colour = "grey", size = 0.2), 15 | panel.grid.minor = element_blank(), 16 | legend.position = "top", 17 | legend.direction = "horizontal", 18 | legend.box = "vertical", 19 | legend.key = element_rect(fill = NA), 20 | axis.text = element_text(colour = "black"), 21 | axis.ticks = element_blank(), ...) 22 | } 23 | 24 | scale_colour_google <- function(...) { 25 | palette <- c("#264EC0", "#D22310", "#FC8608", "#168713", "#850086", 26 | "#1086B9", "#D22B63", "#559D05", "#A71B23", "#21436F", 27 | "#852D86", "#219B86") 28 | scale_colour_manual(..., values = palette) 29 | } 30 | 31 | scale_fill_google <- function(...) { 32 | palette <- c("#264EC0", "#D22310", "#FC8608", "#168713", "#850086", 33 | "#1086B9", "#D22B63", "#559D05", "#A71B23", "#21436F", 34 | "#852D86", "#219B86") 35 | scale_fill_manual(..., values = palette) 36 | } 37 | -------------------------------------------------------------------------------- /R/utilities.R: -------------------------------------------------------------------------------- 1 | #' Chunk a vector or list 2 | #' 3 | #' \code{chunk} takes a vector (or list) and returns a list of chunks 4 | #' which all have lengths (approximately) equal to a specified value. 5 | #' 6 | #' @param x vector of list 7 | #' @param len target length of chunks 8 | #' @param n number of chunks 9 | #' 10 | #' @details 11 | #' If \code{n} is specified, \code{len} is ignored and \code{chunk} returns 12 | #' a list of length \code{n} of "chunks" of \code{x}. Otherwise 13 | #' \code{n} is calculated to break the vector into chunks which are 14 | #' each approximately of length \code{len}. If both \code{len} and 15 | #' \code{n} are unspecified, \code{chunk} simply returns \code{x}. 
16 | #' @examples 17 | #' chunk(letters, 10) 18 | #' chunk(LETTERS, n = 3) 19 | #' 20 | #' @export 21 | 22 | chunk <- function(x, len = NULL, n = NULL) { 23 | if (is.null(len) & is.null(n)) return(x) 24 | if (is.null(len)) len <- ceiling(length(x) / n) 25 | if (is.null(n)) n <- ceiling(length(x) / len) 26 | if (len >= length(x)) { 27 | return(x) 28 | } else { 29 | return(split(x, cut(seq_along(x), n, labels = FALSE))) 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # ngramr - R package to query the Google Ngram Viewer 2 | 3 | 4 | [![CRAN 5 | status](https://www.r-pkg.org/badges/version/ngramr)](https://cran.r-project.org/package=ngramr) 6 | [![DOI](https://zenodo.org/badge/11216907.svg)](https://zenodo.org/badge/latestdoi/11216907) 7 | [![Build Status](https://app.travis-ci.com/seancarmody/ngramr.svg?branch=master)](https://app.travis-ci.com/seancarmody/ngramr) 8 | 9 | 10 | 11 | 12 | The [Google Books Ngram Viewer][1] allows you to enter a list of phrases and 13 | then displays a graph showing how often the phrases have occurred in a large 14 | corpus of books (e.g., "British English", "English Fiction", "French") over 15 | time. The current corpus produced in 2023 contains around two trillion words 16 | for English alone. 17 | 18 | The underlying data is hidden in Web page, embedded in some Javascript. 19 | This package extracts the data and provides it in the form of an R dataframe. 20 | Early versions of code was adapted from a handy Python script available from 21 | [Culturomics][2], written by [Jean-Baptiste Michel][3]. The code has been 22 | comprehensively redeveloped since then. 23 | 24 | ## Installing 25 | 26 | This package requires R version 4.0.0 or higher. If you are using an older 27 | version of R you will be prompted to upgrade when you try to install the 28 | package, so you may as well upgrade now!
29 | 30 | The official release of ngramr is available on [CRAN][4]. To install from 31 | CRAN, use the following command: 32 | 33 | install.packages('ngramr') 34 | 35 | If you have any problems installing the package on macOS, try installing from 36 | source: 37 | 38 | install.packages("ngramr", type="source") 39 | 40 | If you have the [`devtools`][5] package installed, install the latest stable 41 | version this package directly from GitHub: 42 | 43 | library(devtools) 44 | install_github("seancarmody/ngramr") 45 | library(ngramr) 46 | 47 | and if you are feeling a little more adventurous, you can install the 48 | development version: 49 | 50 | install_github("seancarmody/ngramr", "develop") 51 | 52 | although it may not always work. 53 | 54 | Note though that many releases fix problems that arise when Google changes the 55 | format of the Ngram Viewer website so older versions generally no longer work. 56 | If you are seeing errors with the latest version then the package may need fixing 57 | after one of these Google changes. If so please report this on [GitHub][12]. 58 | 59 | If you are behind a proxy, `install_github` may not work for you. Instead of 60 | fiddling around with the `RCurl` proxy settings, you can download the latest 61 | [ZIP archive][6] and use `install_local` instead. 
62 | 63 | ## Examples 64 | 65 | Here is an example of how to use the `ngram` function: 66 | 67 | library(ggplot2) 68 | ng <- ngram(c("hacker", "programmer"), year_start = 1950) 69 | ggplot(ng, aes(x = Year, y = Frequency, colour = Phrase)) + 70 | geom_line() 71 | 72 | The result is a ggplot2 line graph of the following form: 73 | 74 | ![Ngram Chart](man/figures/hacker.png) 75 | 76 | The same result can be achieved even more simply by using the `ggram` 77 | plotting wrapper that supports many options, as in this example: 78 | 79 | ![Ngram chart, with options](man/figures/archy.png) 80 | 81 | ggram(c("monarchy", "democracy"), year_start = 1500, year_end = 2000, 82 | corpus = "en-GB-2012", ignore_case = TRUE, 83 | geom = "area", geom_options = list(position = "stack")) + 84 | labs(y = NULL) 85 | 86 | The colours used by Google Ngram are available through the `google_theme` 87 | option, as in this example posted by Ben Zimmer [at Language Log][7]: 88 | 89 | ![Ngram chart, with Google theme](http://i.imgur.com/qKHvQA4.png) 90 | 91 | ng <- c("((The United States is + The United States has) / The United States)", 92 | "((The United States are + The United States have) / The United States)") 93 | ggram(ng, year_start = 1800, google_theme = TRUE) + 94 | theme(legend.direction = "vertical") 95 | 96 | ## Getting help 97 | 98 | If you encounter a bug, please file an issue with a reproducible 99 | example on [GitHub][12]. 100 | 101 | ## Further Reading 102 | 103 | For more information, read [this Stubborn Mule post][8] and the 104 | [Google Ngram syntax][9] documentation. Language Log has a [good post][10] 105 | written just after the launch of the 2012 corpus. 106 | 107 | If you would rather work with R and SQL on the raw Google Ngram datasets, 108 | [see this post][11]. 
109 | 110 | ![Twitter Follow](https://img.shields.io/twitter/follow/stubbornmule?label=%40stubbornmule&style=social) 111 | 112 | [1]: http://books.google.com/ngrams "Google Ngram Viewer" 113 | [2]: https://bit.ly/4gQ6dtw "Culturomics: Get Ngrams" 114 | [3]: https://twitter.com/jb_michel "@jb_michel" 115 | [4]: http://cran.r-project.org/web/packages/ngramr/index.html "ngramr on CRAN" 116 | [5]: http://cran.r-project.org/web/packages/devtools/index.html "devtools" 117 | [6]: https://github.com/seancarmody/ngramr/archive/latest.zip "ngramr ZIP" 118 | [7]: http://languagelog.ldc.upenn.edu/nll/?p=4979 "US: singular or plural?" 119 | [8]: http://www.stubbornmule.net/2013/07/ngramr/ "Mule on ngramr" 120 | [9]: http://books.google.com/ngrams/info "Goole Ngram info" 121 | [10]: https://languagelog.ldc.upenn.edu/nll/?p=4258 "A new chapter for ngrams" 122 | [11]: http://rpsychologist.com/how-to-work-with-google-ngram-data-sets-in-r-using-mysql/ "Ngrams with R and mysql" 123 | [12]: https://github.com/seancarmody/ngramr/issues "ngramr issues on GitHub" 124 | -------------------------------------------------------------------------------- /cran-comments.md: -------------------------------------------------------------------------------- 1 | ## Test environments 2 | * local MacOS install, R 4.4.1 3 | * rhub v2 4 | * win-builder (devel and release) 5 | 6 | ## R CMD check results 7 | * There were no ERRORs or WARNINGs 8 | * Local and online build generated no NOTES. 
9 | 10 | ## CRAN requirements 11 | * None outstanding 12 | -------------------------------------------------------------------------------- /man/chunk.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/utilities.R 3 | \name{chunk} 4 | \alias{chunk} 5 | \title{Chunk a vector or list} 6 | \usage{ 7 | chunk(x, len = NULL, n = NULL) 8 | } 9 | \arguments{ 10 | \item{x}{vector of list} 11 | 12 | \item{len}{target length of chunks} 13 | 14 | \item{n}{number of chunks} 15 | } 16 | \description{ 17 | \code{chunk} takes a vector (or list) and returns a list of chunks 18 | which all have lengths (approximately) equal to a specified value. 19 | } 20 | \details{ 21 | If \code{n} is specified, \code{len} is ignored and \code{chunk} returns 22 | a list of length \code{n} of "chunks" of \code{x}. Otherwise 23 | \code{n} is calculated to break the vector into chunks which are 24 | each approximately of length \code{len}. If both \code{len} and 25 | \code{n} are unspecified, \code{chunk} simply returns \code{x}. 
26 | } 27 | \examples{ 28 | chunk(letters, 10) 29 | chunk(LETTERS, n = 3) 30 | 31 | } 32 | -------------------------------------------------------------------------------- /man/corpuses.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/data.R 3 | \docType{data} 4 | \name{corpuses} 5 | \alias{corpuses} 6 | \title{Google n-gram corpus information} 7 | \format{ 8 | 44 x 6 ngram data frame 9 | } 10 | \usage{ 11 | corpuses 12 | } 13 | \description{ 14 | Details of the various corpuses available through the Google n-gram tool 15 | } 16 | \keyword{datasets} 17 | -------------------------------------------------------------------------------- /man/figures/archy.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/seancarmody/ngramr/32f11bea7531db06afb7c8c9d40a80be15663227/man/figures/archy.png -------------------------------------------------------------------------------- /man/figures/hacker.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/seancarmody/ngramr/32f11bea7531db06afb7c8c9d40a80be15663227/man/figures/hacker.png -------------------------------------------------------------------------------- /man/ggram.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/ggram.R 3 | \name{ggram} 4 | \alias{ggram} 5 | \title{Plot n-gram frequencies} 6 | \usage{ 7 | ggram( 8 | phrases, 9 | ignore_case = FALSE, 10 | geom = "line", 11 | geom_options = list(), 12 | lab = NA, 13 | google_theme = FALSE, 14 | ... 15 | ) 16 | } 17 | \arguments{ 18 | \item{phrases}{vector of phrases. 
Alternatively, phrases can be an ngram 19 | object returned by \code{\link{ngram}} or \code{\link{ngrami}}.} 20 | 21 | \item{ignore_case}{logical, indicating whether the frequencies are case 22 | insensitive. 23 | Default is \code{FALSE}.} 24 | 25 | \item{geom}{the ggplot2 geom used to plot the data; defaults to "line"} 26 | 27 | \item{geom_options}{list of additional parameters passed to the ggplot2 geom.} 28 | 29 | \item{lab}{y-axis label. Defaults to "Frequency".} 30 | 31 | \item{google_theme}{use a Google Ngram-style plot theme.} 32 | 33 | \item{...}{additional parameters passed to \code{ngram}} 34 | } 35 | \description{ 36 | \code{ggram} downloads data from the Google Ngram Viewer website and 37 | plots it in \code{ggplot2} style. 38 | } 39 | \details{ 40 | Google generated two datasets drawn from digitised books in the Google 41 | books collection. One was generated in July 2009, the second in July 2012. 42 | Google will update these datasets as book scanning continues. 43 | } 44 | \examples{ 45 | \donttest{library(ggplot2) 46 | ggram(c("hacker", "programmer"), year_start = 1950) 47 | 48 | # Changing the geom. 49 | ggram(c("cancer", "fumer", "cigarette"), 50 | year_start = 1900, 51 | corpus = "fr-2012", 52 | smoothing = 0, 53 | geom = "step") 54 | 55 | # Passing more options. 56 | ggram(c("cancer", "smoking", "tobacco"), 57 | year_start = 1900, 58 | corpus = "en-fiction-2012", 59 | geom = "point", 60 | smoothing = 0, 61 | geom_options = list(alpha = .5)) + 62 | stat_smooth(method="loess", se = FALSE, formula = y ~ x) 63 | 64 | # Setting the layers manually. 65 | ggram(c("cancer", "smoking", "tobacco"), 66 | year_start = 1900, 67 | corpus = "en-fiction-2012", 68 | smoothing = 0, 69 | geom = NULL) + 70 | stat_smooth(method="loess", se=FALSE, span = 0.3, formula = y ~ x) 71 | 72 | # Setting the legend placement on a long query and using the Google theme. 73 | # Example taken from a post by Ben Zimmer at Language Log. 
74 | p <- c("((The United States is + The United States has) / The United States)", 75 | "((The United States are + The United States have) / The United States)") 76 | ggram(p, year_start = 1800, google_theme = TRUE) + 77 | theme(legend.direction="vertical") 78 | 79 | # Pass ngram data rather than phrases 80 | ggram(hacker) + facet_wrap(~ Corpus) 81 | } 82 | } 83 | -------------------------------------------------------------------------------- /man/hacker.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/data.R 3 | \docType{data} 4 | \name{hacker} 5 | \alias{hacker} 6 | \title{Sample n-gram data} 7 | \format{ 8 | a 236 x 4 ngram data frame 9 | } 10 | \usage{ 11 | hacker 12 | } 13 | \description{ 14 | Frequency data for the phrases "hacker", "programmer", from 1950 to 2008. 15 | } 16 | \keyword{datasets} 17 | -------------------------------------------------------------------------------- /man/ngram.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/ngram.R 3 | \name{ngram} 4 | \alias{ngram} 5 | \title{Get n-gram frequencies} 6 | \usage{ 7 | ngram( 8 | phrases, 9 | corpus = "en", 10 | year_start = 1800, 11 | year_end = 2022, 12 | smoothing = 3, 13 | case_ins = FALSE, 14 | aggregate = FALSE, 15 | count = FALSE, 16 | drop_parent = FALSE, 17 | drop_all = FALSE, 18 | type = FALSE 19 | ) 20 | } 21 | \arguments{ 22 | \item{phrases}{vector of phrases, with a maximum of 12 items} 23 | 24 | \item{corpus}{Google corpus to search (see Details for possible values)} 25 | 26 | \item{year_start}{start year, default is 1800. 
Data available back to 1500.} 27 | 28 | \item{year_end}{end year, default is 2008} 29 | 30 | \item{smoothing}{smoothing parameter, default is 3} 31 | 32 | \item{case_ins}{Logical indicating whether to force a case insensitive search. 33 | Default is \code{FALSE}.} 34 | 35 | \item{aggregate}{Sum up the frequencies for ngrams associated with wildcard 36 | or case insensitive searches. Default is \code{FALSE}.} 37 | 38 | \item{count}{Default is \code{FALSE}.} 39 | 40 | \item{drop_parent}{Drop the parent phrase associated with a wildcard 41 | or case-insensitive search. Default is \code{FALSE}.} 42 | 43 | \item{drop_all}{Delete the suffix "(All)" from aggregated case-insensitive 44 | searches. Default is \code{FALSE}.} 45 | 46 | \item{type}{Include the Google return type (e.g. NGRAM, NGRAM_COLLECTION, 47 | EXPANSION) from result set. Default is \code{FALSE}.} 48 | } 49 | \value{ 50 | \code{ngram} returns an object of class "\code{ngram}", 51 | which is a tidyverse \code{tibble} enriched with attributes reflecting 52 | some of the parameters used in the Ngram Viewer query. 53 | } 54 | \description{ 55 | \code{ngram} downloads data from the Google Ngram Viewer website and 56 | returns it in a tibble. 57 | } 58 | \details{ 59 | Google generated two datasets drawn from digitised books in the Google 60 | Books collection. One was generated in July 2009, the second in July 2012 61 | and the third in 2019. Google is expected to update these datasets as book 62 | scanning continues. 63 | 64 | This function provides the annual frequency of words or phrases, known 65 | as n-grams, in a sub-collection or "corpus" taken from the Google Books 66 | collection.The search across the corpus is case-sensitive. 
67 | 68 | If the function is unable to retrieve data from the Google Ngram Viewer 69 | site (either because of access issues or if the format of Google's site 70 | has changed) a NULL result is returned and messages are printed to the 71 | console but no errors or warnings are raised (this is to align with 72 | CRAN package policies). 73 | 74 | Below is a list of available corpora. Note that the data for the 2012 75 | corpuses only extends to 2009. 76 | \tabular{ll}{ 77 | \bold{Corpus} \tab \bold{Corpus Name}\cr 78 | en-US-2019\tab American English 2019\cr 79 | en-US-2012\tab American English 2012\cr 80 | en-US-2009\tab American English 2009\cr 81 | en-GB-2019\tab British English 2019\cr 82 | en-GB-2012\tab British English 2012\cr 83 | en-GB-2009\tab British English 2009\cr 84 | zh-Hans-2019\tab Chinese 2019\cr 85 | zh-Hans-2012\tab Chinese 2012\cr 86 | zh-Hans-2009\tab Chinese 2009\cr 87 | en-2019\tab English 2019\cr 88 | en-2012\tab English 2012\cr 89 | en-2009\tab English 2009\cr 90 | en-fiction-2019\tab English Fiction 2019\cr 91 | en-fiction-2012\tab English Fiction 2012\cr 92 | en-fiction-2009\tab English Fiction 2009\cr 93 | en-1M-2009\tab English One Million\cr 94 | fr-2019\tab French 2019\cr 95 | fr-2012\tab French 2012\cr 96 | fr-2009\tab French 2009\cr 97 | de-2019\tab German 2019\cr 98 | de-2012\tab German 2012\cr 99 | de-2009\tab German 2009\cr 100 | iw-2019\tab Hebrew 2019\cr 101 | iw-2012\tab Hebrew 2012\cr 102 | iw-2009\tab Hebrew 2009\cr 103 | es-2019\tab Spanish 2019\cr 104 | es-2012\tab Spanish 2012\cr 105 | es-2009\tab Spanish 2009\cr 106 | ru-2019\tab Russian 2019\cr 107 | ru-2012\tab Russian 2012\cr 108 | ru-2009\tab Russian 2009\cr 109 | it-2019\tab Italian 2019\cr 110 | it-2012\tab Italian 2012\cr 111 | } 112 | 113 | The Google Million is a sub-collection of Google Books. All are in 114 | English with dates ranging from 1500 to 2008. 
115 | No more than about 6,000 books were chosen from any one year, which 116 | means that all of the scanned books from early years are present, 117 | and books from later years are randomly sampled. The random samplings 118 | reflect the subject distributions for the year (so there are more 119 | computer books in 2000 than 1980). 120 | 121 | See \url{http://books.google.com/ngrams/info} for the full Ngram syntax. 122 | } 123 | \examples{ 124 | \donttest{ngram(c("mouse", "rat"), year_start = 1950) 125 | ngram(c("blue_ADJ", "red_ADJ")) 126 | ngram(c("_START_ President Roosevelt", "_START_ President Truman"), year_start = 1920) 127 | } 128 | } 129 | -------------------------------------------------------------------------------- /man/ngrami.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/ngrami.R 3 | \name{ngrami} 4 | \alias{ngrami} 5 | \title{Get n-gram frequencies (case insensitive version)} 6 | \usage{ 7 | ngrami(phrases, aggregate = TRUE, ...) 8 | } 9 | \arguments{ 10 | \item{phrases}{vector of phrases} 11 | 12 | \item{aggregate}{sum up each of the terms} 13 | 14 | \item{...}{remaining parameters passed to ngram} 15 | } 16 | \description{ 17 | This function is a simple wrapper of \code{ngram} for case insensitive searches. 
18 | } 19 | -------------------------------------------------------------------------------- /man/ngramr.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/ngramr-package.R 3 | \docType{package} 4 | \name{ngramr} 5 | \alias{ngramr} 6 | \alias{ngramr-package} 7 | \title{ngramr: Dig into the Google Ngram Viewer using R} 8 | \description{ 9 | The \href{http://books.google.com/ngrams}{Google Books Ngram Viewer} 10 | allows you to enter a list of phrases and then displays a graph showing 11 | how often the phrases have occurred in a corpus of books 12 | (e.g., "British English", "English Fiction", "French") over time. 13 | The underlying data is hidden in web page, embedded in some Javascript. 14 | 15 | This package extracts the data an provides it in the form of an R dataframe. 16 | 17 | The key function is \code{ngram} which, given a collection of 18 | phrases, returns a dataframe containing the frequencies by year. 19 | 20 | The code is based on the \code{getNgrams.py} Python script available on 21 | \href{https://web.archive.org/web/20221129120802/https://www.culturomics.org/}{Culturomics Code} 22 | written by Jean-Baptiste Michel. The Culturomics website doesn't 23 | exist anymore but can still be find 24 | \href{https://web.archive.org/web/20221129220150/https://www.culturomics.org/Resources/get-ngrams}{on archive.org} 25 | and is worth exploring. 26 | 27 | Note that compared to the 2009 versions, the 2012 and 2019 versions have 28 | larger numbers of books, improved OCR, improved library and publisher 29 | metadata. The 2012 and 2019 corpuses also don't form ngrams that cross 30 | sentence boundaries, and do form ngrams across page boundaries and 31 | support part of speech tagging, unlike the 2009 versions. 
32 | 33 | Like the Google Ngram Viewer website itself, this package is aimed at for 34 | quick inquiries into the usage of small sets of phrases. 35 | 36 | Please respect the terms of service of the Google Books Ngram Viewer while 37 | using this code. This code is meant to help viewers retrieve data behind 38 | a few queries, not bang at Google's servers with dozens of queries. 39 | The complete dataset can be 40 | \href{https://storage.googleapis.com/books/ngrams/books/datasetsv3.html}{downloaded here}. 41 | } 42 | \references{ 43 | Michel, Jean-Baptiste, et al. "Quantitative analysis of culture using 44 | millions of digitized books." \emph{Science} 331, No. 6014 (2011): 176--182. 45 | } 46 | \seealso{ 47 | Useful links: 48 | \itemize{ 49 | \item \url{https://github.com/seancarmody/ngramr} 50 | \item Report bugs at \url{https://github.com/seancarmody/ngramr/issues} 51 | } 52 | 53 | } 54 | \author{ 55 | \strong{Maintainer}: Sean Carmody \email{seancarmody@gmail.com} [copyright holder] 56 | 57 | } 58 | \keyword{internal} 59 | -------------------------------------------------------------------------------- /man/ngramw.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/ngramw.R 3 | \name{ngramw} 4 | \alias{ngramw} 5 | \title{Get n-gram frequencies ("wide" format)} 6 | \usage{ 7 | ngramw(phrases, ignore_case = FALSE, ...) 8 | } 9 | \arguments{ 10 | \item{phrases}{vector of phrases} 11 | 12 | \item{ignore_case}{ignore case of phrases (i.e. call \code{ngrami} 13 | rather than \code{ngram}). 
Default value is \code{FALSE}.} 14 | 15 | \item{...}{remaining parameters passed to \code{ngram}} 16 | } 17 | \description{ 18 | Get n-gram frequencies ("wide" format) 19 | } 20 | -------------------------------------------------------------------------------- /man/print.ngram.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/methods.R 3 | \name{print.ngram} 4 | \alias{print.ngram} 5 | \title{Print n-gram contents} 6 | \usage{ 7 | \method{print}{ngram}(x, rows = 6, ...) 8 | } 9 | \arguments{ 10 | \item{x}{ngram object as returned by \code{link{ngram}}} 11 | 12 | \item{rows}{number of rows to print. Default is 6.} 13 | 14 | \item{...}{additional parameters passed to default print method.} 15 | } 16 | \description{ 17 | Print n-gram contents 18 | } 19 | \examples{ 20 | \donttest{x <- ngram(c("hacker", "programmer"), year_start = 1950) 21 | print(x) 22 | } 23 | } 24 | -------------------------------------------------------------------------------- /man/theme_google.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/themes.R 3 | \name{theme_google} 4 | \alias{theme_google} 5 | \title{Google Ngram theme for ggplot2} 6 | \usage{ 7 | theme_google(...) 8 | } 9 | \arguments{ 10 | \item{...}{additional parameters to pass to \code{theme}} 11 | } 12 | \description{ 13 | Google Ngram theme for ggplot2 14 | } 15 | \details{ 16 | Use a Google Ngram-style plot theme. 
17 | } 18 | -------------------------------------------------------------------------------- /testme: -------------------------------------------------------------------------------- 1 | Rscript -e 'library(devtools); load_all(); test(reporter="minimal")' 2 | -------------------------------------------------------------------------------- /tests/results.txt: -------------------------------------------------------------------------------- 1 | 1 tackle_* "" NGRAM_COLLECTION 220 2 | 2 tackle_NOUN "" NGRAM 220 3 | 3 tackle_NOUN "tackle_*" EXPANSION 220 4 | 4 tackle_VERB "tackle_*" EXPANSION 220 5 | 6 | 1 duPont "Dupont (All)" EXPANSION 220 7 | 2 Dupont "Dupont (All)" EXPANSION 220 8 | 3 DuPont "Dupont (All)" EXPANSION 220 9 | 4 DUPONT "Dupont (All)" EXPANSION 220 10 | 5 Dupont (All) "" CASE_INSENSITIVE 220 11 | 6 Fitzgerald "Fitzgerald (All)" EXPANSION 220 12 | 7 FitzGerald "Fitzgerald (All)" EXPANSION 220 13 | 8 FITZGERALD "Fitzgerald (All)" EXPANSION 220 14 | 9 Fitzgerald (All) "" CASE_INSENSITIVE 220 15 | 16 | 1 read _DET_ book "" NGRAM 220 17 | 18 | 1 read * _DET_ book "" NGRAM_COLLECTION 220 19 | 2 read as _DET_ book "read * _DET_ book" EXPANSION 220 20 | 3 read from _DET_ book "read * _DET_ book" EXPANSION 220 21 | 4 read in _DET_ book "read * _DET_ book" EXPANSION 220 22 | 5 read like _DET_ book "read * _DET_ book" EXPANSION 220 23 | 6 read of _DET_ book "read * _DET_ book" EXPANSION 220 24 | 7 read over _DET_ book "read * _DET_ book" EXPANSION 220 25 | 8 read such _DET_ book "read * _DET_ book" EXPANSION 220 26 | 9 read through _DET_ book "read * _DET_ book" EXPANSION 220 27 | 10 read upon _DET_ book "read * _DET_ book" EXPANSION 220 28 | 11 read with _DET_ book "read * _DET_ book" EXPANSION 220 29 | -------------------------------------------------------------------------------- /tests/testthat.R: -------------------------------------------------------------------------------- 1 | library(testthat) 2 | library(ngramr) 3 | 4 | test_check("ngramr") 5 | 
-------------------------------------------------------------------------------- /tests/testthat/test-ngramr.R: -------------------------------------------------------------------------------- 1 | context("Package") 2 | test_that("package data", { 3 | expect_equal(dim(hacker), c(236, 4)) 4 | expect_equal(class(hacker)[1], "ngram") 5 | expect_equal(dim(corpuses), c(44, 7)) 6 | expect_equal(dim(corpus_totals), c(12945, 5)) 7 | expect_equal(unlist(corpus_totals[12945,], use.names = FALSE), 8 | c("es-2019", 2019, 1658430069, 10286019, 24720)) 9 | }) 10 | 11 | test_that("utility functions", { 12 | expect_equal(chunk(letters, len=4)[[4]], letters[12:15]) 13 | }) 14 | 15 | context("Google") 16 | test_that("google calls", { 17 | skip_if_offline() 18 | skip_if(is.null(ngram("dog")), "Google Ngram calls not succeeding.") 19 | expect_equal(dim( ngrami("dog", year_start = 1950, year_end = 2020)), c(71, 4)) 20 | expect_equal(dim(ngram(c("hacker", "programmer"), corpus = c("en-2012", "en-US-2012"), 21 | year_start = 1950, year_end = 2008)), dim(hacker)) 22 | expect_equal(dim(ngramw(hacker)), c(118, 4)) 23 | expect_equal(dim(ngram(c("military"), corpus = "en-2012", year_start = 1940, 24 | year_end = 2005, smoothing = 0)), c(66, 4)) 25 | }) 26 | 27 | --------------------------------------------------------------------------------