├── .Rbuildignore ├── .gitignore ├── .travis.yml ├── DESCRIPTION ├── NAMESPACE ├── NEWS ├── R ├── annotators.R ├── date_entity.R ├── entity-package.R ├── get_counts.R ├── location_entity.R ├── money_entity.R ├── named_entity.R ├── organization_entity.R ├── percent_entity.R ├── person_entity.R └── utils.R ├── README.Rmd ├── README.md ├── data ├── presidential_debates_2012.rda └── wiki.rda ├── inst ├── CITATION ├── build.R ├── extra_statdoc │ └── readme.R └── staticdocs │ └── index.R ├── man ├── annotators.Rd ├── date_entity.Rd ├── entity.Rd ├── location_entity.Rd ├── money_entity.Rd ├── named_entity.Rd ├── organization_entity.Rd ├── percent_entity.Rd ├── person_entity.Rd ├── plot.entity.Rd ├── presidential_debates_2012.Rd ├── print.entity.Rd └── wiki.Rd ├── tests ├── testthat.R └── testthat │ └── test-named_entity.R └── tools ├── entity_logo ├── r_entity.png ├── r_entity.pptx ├── r_entitya.png └── resize_icon.txt └── figure ├── fig.height-1.png ├── unnamed-chunk-11-1.png └── unnamed-chunk-12-1.png /.Rbuildignore: -------------------------------------------------------------------------------- 1 | ^.*\.Rproj$ 2 | ^\.Rproj\.user$ 3 | ^\.gitignore 4 | NEWS.md 5 | FAQ.md 6 | NEWS.html 7 | FAQ.html 8 | ^\.travis\.yml$ 9 | travis-tool.sh 10 | inst/web 11 | contributors.geojson 12 | inst/build.R 13 | ^.*\.Rprofile$ 14 | README.Rmd 15 | README.R 16 | travis.yml 17 | inst/staticdocs 18 | inst/extra_statdoc 19 | inst/maintenance.R 20 | tools/entity_logo/r_entitya.png 21 | tools/entity_logo/r_entity.pptx 22 | tools/entity_logo/resize_icon.txt 23 | Thumbs.db 24 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # History files 2 | .Rhistory 3 | 4 | # Example code in package build process 5 | *-Ex.R 6 | 7 | .Rprofile 8 | .Rproj.user 9 | entity.Rproj 10 | inst/maintenance.R 11 | Thumbs.db 12 | 
-------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: r 2 | 3 | sudo: false 4 | 5 | before_install: 6 | - sh -e /etc/init.d/xvfb start 7 | 8 | r_github_packages: 9 | - jimhester/covr 10 | 11 | notifications: 12 | email: 13 | on_success: change 14 | on_failure: change 15 | 16 | after_success: 17 | - Rscript -e 'covr::coveralls()' 18 | 19 | r_build_args: "--resave-data=best" 20 | r_check_args: "--as-cran" 21 | 22 | env: 23 | global: 24 | - DISPLAY=:99.0 25 | - BOOTSTRAP_LATEX=1 26 | 27 | 28 | -------------------------------------------------------------------------------- /DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: entity 2 | Title: Named Entity Recognition 3 | Version: 0.1.0 4 | Authors@R: c(person("Tyler", "Rinker", email = 5 | "tyler.rinker@gmail.com", role = c("aut", "cre"))) 6 | Maintainer: Tyler Rinker 7 | Description: A wrapper to simplify and extend 'NLP' and 'openNLP' named 8 | entity recognition. 
9 | Depends: R (>= 3.2.2) 10 | Imports: dplyr, ggplot2, NLP, openNLP, utils 11 | Suggests: testthat 12 | Date: 2017-04-10 13 | License: GPL-2 14 | LazyData: TRUE 15 | Roxygen: list(wrap = FALSE) 16 | Collate: 17 | 'annotators.R' 18 | 'named_entity.R' 19 | 'utils.R' 20 | 'date_entity.R' 21 | 'entity-package.R' 22 | 'get_counts.R' 23 | 'location_entity.R' 24 | 'money_entity.R' 25 | 'organization_entity.R' 26 | 'percent_entity.R' 27 | 'person_entity.R' 28 | RoxygenNote: 6.0.1 29 | -------------------------------------------------------------------------------- /NAMESPACE: -------------------------------------------------------------------------------- 1 | # Generated by roxygen2: do not edit by hand 2 | 3 | S3method(plot,entity) 4 | S3method(print,entity) 5 | export(date_annotator) 6 | export(date_entity) 7 | export(location_annotator) 8 | export(location_entity) 9 | export(money_annotator) 10 | export(money_entity) 11 | export(named_entity) 12 | export(organization_annotator) 13 | export(organization_entity) 14 | export(percent_annotator) 15 | export(percent_entity) 16 | export(person_annotator) 17 | export(person_entity) 18 | export(word_annotator) 19 | -------------------------------------------------------------------------------- /NEWS: -------------------------------------------------------------------------------- 1 | NEWS 2 | ==== 3 | 4 | Versioning 5 | ---------- 6 | 7 | Releases will be numbered with the following semantic versioning format: 8 | 9 | .. 10 | 11 | And constructed with the following guidelines: 12 | 13 | * Breaking backward compatibility bumps the major (and resets the minor 14 | and patch) 15 | * New additions without breaking backward compatibility bumps the minor 16 | (and resets the patch) 17 | * Bug fixes and misc changes bumps the patch 18 | 19 | 20 | entity 0.0.1 21 | ---------------------------------------------------------------- 22 | 23 | This package is... 
#' Annotators
#'
#' A wrapper for \code{\link[openNLP]{Maxent_Entity_Annotator}} and
#' \code{\link[openNLP]{Maxent_Word_Token_Annotator}}.
#'
#' Every constructor first verifies that the \pkg{openNLPmodels.en} models
#' package is installed (offering to install it in interactive sessions).
#' Entity annotators are returned with a \code{"type"} attribute naming the
#' kind of entity they detect (for example \code{"person"}).
#'
#' @return Returns an annotator for entities or words.
#' @seealso \code{\link[openNLP]{Maxent_Entity_Annotator}},
#' \code{\link[openNLP]{Maxent_Word_Token_Annotator}}
#' @rdname annotators
#' @export
word_annotator <- function(){
    check_models_package()
    openNLP::Maxent_Word_Token_Annotator()
}

## Build a Maxent entity annotator for `kind` and tag it with a "type"
## attribute equal to `kind`; `named_entity()` copies that attribute onto its
## result.
make_entity_annotator <- function(kind){
    check_models_package()
    annotator <- openNLP::Maxent_Entity_Annotator(kind = kind)
    attributes(annotator)[["type"]] <- kind
    annotator
}

#' @rdname annotators
#' @export
person_annotator <- function(){
    make_entity_annotator("person")
}

#' @rdname annotators
#' @export
location_annotator <- function(){
    make_entity_annotator("location")
}

#' @rdname annotators
#' @export
organization_annotator <- function(){
    make_entity_annotator("organization")
}

#' @rdname annotators
#' @export
date_annotator <- function(){
    make_entity_annotator("date")
}

#' @rdname annotators
#' @export
money_annotator <- function(){
    make_entity_annotator("money")
}

#' @rdname annotators
#' @export
percent_annotator <- function(){
    make_entity_annotator("percent")
}


## Ensure the `openNLPmodels.en` models package is available.
##
## Returns `TRUE` (invisibly) when the package is installed.  In an interactive
## session the user is asked whether it should be installed; any answer other
## than "Yes" (including cancelling the menu, which `utils::menu()` reports as
## 0) aborts.  In a non-interactive session no prompt is possible, so an
## informative error is raised instead of letting `utils::menu()` fail.
check_models_package <- function(){

    pkg <- "openNLPmodels.en"
    src <- "http://datacube.wu.ac.at/src/contrib/openNLPmodels.en_1.5-1.tar.gz"
    is_installed <- function() nzchar(system.file(package = pkg))

    if (is_installed()) return(invisible(TRUE))

    if (!interactive()) {
        stop(
            "`openNLPmodels.en` is not installed but is necessary in order to ",
            "use the `entity` package.\nInstall it with:\n",
            "    install.packages(\"", src, "\", repos = NULL, type = \"source\")",
            call. = FALSE
        )
    }

    message(paste0("Well it appears `openNLPmodels.en` is not installed.\n",
        "This package is necessary in order to use the `entity` package.\n\nWould you like me to try and fetch it?"))

    ans <- utils::menu(c("Yes", "No"))
    if (ans != 1L) {
        stop("Named entity extraction aborted. Please install `openNLPmodels.en`",
            call. = FALSE)
    }

    message("Attempting to install `openNLPmodels.en`.")
    utils::install.packages(src, repos = NULL, type = "source")

    if (!is_installed()) {
        stop("Failed to install `openNLPmodels.en`. Please install `openNLPmodels.en` manually.",
            call. = FALSE)
    }

    invisible(TRUE)
}
#' Named Date Recognition
#'
#' A wrapper for \pkg{NLP}/\pkg{openNLP}'s named date recognition annotation.
#' This is \code{\link{named_entity}} with \code{entity.annotator} defaulting
#' to \code{"date_annotator"}.
#'
#' @inheritParams named_entity
#' @return Returns a list of class \code{entity} with one element per element
#' of \code{text.var}: a character vector of the dates found in that element,
#' or \code{NULL} when none were found.
#' @keywords date
#' @export
#' @include utils.R named_entity.R
#' @family variable functions
#' @examples
#' \dontrun{
#' data(presidential_debates_2012)
#'
#' dates <- date_entity(presidential_debates_2012$dialogue)
#' unlist(dates)
#'
#' library(dplyr)
#' presidential_debates_2012$dates <- date_entity(presidential_debates_2012$dialogue)
#'
#' presidential_debates_2012 %>%
#'     {.[!sapply(.$dates, is.null), ]} %>%
#'     rowwise() %>%
#'     mutate(dates = paste(dates, collapse=", ")) %>%
#'     select(person, time, dates)
#'
#' library(tidyr)
#' presidential_debates_2012 %>%
#'     {.[!sapply(.$dates, is.null), ]} %>%
#'     unnest() %>%
#'     select(person, time, dates)
#' }
date_entity <- hijack(
    FUN = named_entity,
    entity.annotator = "date_annotator"
)


#' Named Entity Extraction
#'
#' A wrapper for \pkg{NLP} and \pkg{openNLP} to facilitate named entity extraction.
#' @docType package
#' @name entity
#' @aliases entity package-entity
NULL
#' 2012 U.S. Presidential Debates
#'
#' A dataset containing a cleaned version of all three presidential debates for
#' the 2012 election.
#'
#' @details
#' \itemize{
#'   \item person. The speaker
#'   \item tot. Turn of talk
#'   \item dialogue. The words spoken
#'   \item time. Variable indicating which of the three debates the dialogue is from
#' }
#'
#' @docType data
#' @keywords datasets
#' @name presidential_debates_2012
#' @usage data(presidential_debates_2012)
#' @format A data frame with 2912 rows and 4 variables
NULL


#' Bell Labs Wikipedia Article
#'
#' A dataset containing a character vector of an excerpt from Wikipedia about
#' Bell Labs with an extra final sentence to include percent and money when
#' extracting entities.
#'
#' @docType data
#' @keywords datasets
#' @name wiki
#' @usage data(wiki)
#' @format A character vector with 7 elements
#' @references \url{https://en.wikipedia.org/wiki/Bell_Labs}
NULL


## Tabulate the entities in `x` (a list of character vectors, `NULL` elements
## are ignored by `unlist()`).
##
## Returns a tibble with columns `entity` (a factor) and `frequency`.  Rows are
## sorted by decreasing frequency, or alphabetically by entity when
## `alphabetical = TRUE`.  The factor levels are the reverse of the row order
## so that, after `coord_flip()` in `plot.entity()`, the first row is drawn at
## the top.  `...` is accepted and ignored.
get_counts <- function(x, alphabetical = FALSE, ...){

    counts <- sort(table(unlist(x)), TRUE)

    ## `as.character()` keeps the `entity` column (and the column order) even
    ## when nothing was found and `names(counts)` is NULL;
    ## `stringsAsFactors = FALSE` makes the result independent of the R version.
    out <- data.frame(
        entity = as.character(names(counts)),
        frequency = as.vector(counts),
        stringsAsFactors = FALSE
    )

    if (isTRUE(alphabetical)) {
        out <- out[order(out[["entity"]]), ]
    }

    out[["entity"]] <- factor(out[["entity"]], levels = rev(out[["entity"]]))

    ## `dplyr::tbl_df()` is deprecated; `as_tibble()` is its replacement.
    dplyr::as_tibble(out)
}
#' Named Location Recognition
#'
#' A wrapper for \pkg{NLP}/\pkg{openNLP}'s named location recognition annotation.
#' This is \code{\link{named_entity}} with \code{entity.annotator} defaulting
#' to \code{"location_annotator"}.
#'
#' @inheritParams named_entity
#' @return Returns a list of class \code{entity} with one element per element
#' of \code{text.var}: a character vector of the locations found in that
#' element, or \code{NULL} when none were found.
#' @keywords location
#' @export
#' @include utils.R named_entity.R
#' @family variable functions
#' @examples
#' \dontrun{
#' data(presidential_debates_2012)
#'
#' locales <- location_entity(presidential_debates_2012$dialogue)
#' unlist(locales)
#'
#' library(dplyr)
#' presidential_debates_2012$locations <- location_entity(presidential_debates_2012$dialogue)
#'
#' presidential_debates_2012 %>%
#'     {.[!sapply(.$locations, is.null), ]} %>%
#'     rowwise() %>%
#'     mutate(locations = paste(locations, collapse=", ")) %>%
#'     select(person, time, locations)
#'
#' library(tidyr)
#' presidential_debates_2012 %>%
#'     {.[!sapply(.$locations, is.null), ]} %>%
#'     unnest() %>%
#'     select(person, time, locations)
#' }
location_entity <- hijack(
    FUN = named_entity,
    entity.annotator = "location_annotator"
)
#' Named Money Recognition
#'
#' A wrapper for \pkg{NLP}/\pkg{openNLP}'s named money recognition annotation.
#' This is \code{\link{named_entity}} with \code{entity.annotator} defaulting
#' to \code{"money_annotator"}.
#'
#' @inheritParams named_entity
#' @return Returns a list of class \code{entity} with one element per element
#' of \code{text.var}: a character vector of the monetary amounts found in that
#' element, or \code{NULL} when none were found.
#' @keywords money
#' @export
#' @include utils.R named_entity.R
#' @family variable functions
#' @examples
#' \dontrun{
#' data(presidential_debates_2012)
#'
#' monies <- money_entity(presidential_debates_2012$dialogue)
#' unlist(monies)
#'
#' library(dplyr)
#' presidential_debates_2012$monies <- money_entity(presidential_debates_2012$dialogue)
#'
#' presidential_debates_2012 %>%
#'     {.[!sapply(.$monies, is.null), ]} %>%
#'     rowwise() %>%
#'     mutate(monies = paste(monies, collapse=", ")) %>%
#'     select(person, time, monies)
#'
#' library(tidyr)
#' presidential_debates_2012 %>%
#'     {.[!sapply(.$monies, is.null), ]} %>%
#'     unnest() %>%
#'     select(person, time, monies)
#' }
money_entity <- hijack(
    FUN = named_entity,
    entity.annotator = "money_annotator"
)
#' Named Entity Recognition
#'
#' A wrapper for \pkg{NLP}/\pkg{openNLP}'s named entity recognition annotation
#' tools.
#'
#' @param text.var The text string variable.
#' @param entity.annotator A character string identifying an entity recognition
#' annotator (one of \code{"person_annotator"}, \code{"location_annotator"},
#' \code{"organization_annotator"}, \code{"date_annotator"},
#' \code{"money_annotator"} or \code{"percent_annotator"}).  See
#' \code{?annotators}.
#' @param word.annotator A word annotator.
#' @param element.chunks The number of elements to include in a chunk.  Chunks are
#' passed through an \code{\link[base]{lapply}} and size is kept within a tolerance
#' because of memory allocation in the tagging process with \pkg{Java}.  Values
#' are rounded down and kept between 1 and \code{length(text.var)}.
#' @return Returns a list of class \code{entity} with one element per element
#' of \code{text.var}: a character vector of the entities found in that element,
#' or \code{NULL} when none were found.  The kind of entity is stored in the
#' \code{"type"} attribute.
#' @keywords ner named entity
#' @export
#' @seealso \code{\link[openNLP]{Maxent_Entity_Annotator}}
#' @examples
#' \dontrun{
#' data(presidential_debates_2012)
#'
#' peoples <- named_entity(presidential_debates_2012$dialogue, 'person_annotator')
#' unlist(peoples)
#' plot(peoples)
#'
#' orgs <- named_entity(presidential_debates_2012$dialogue, 'organization_annotator')
#' unlist(orgs)
#'
#' dates <- named_entity(presidential_debates_2012$dialogue, 'date_annotator')
#' unlist(dates)
#'
#' library(dplyr)
#' presidential_debates_2012$organizations <- named_entity(
#'     presidential_debates_2012$dialogue,
#'     'organization_annotator'
#' )
#'
#' presidential_debates_2012 %>%
#'     {.[!sapply(.$organizations, is.null), ]} %>%
#'     rowwise() %>%
#'     mutate(organizations = paste(organizations, collapse=", ")) %>%
#'     select(person, time, organizations)
#'
#' library(tidyr)
#' presidential_debates_2012 %>%
#'     {.[!sapply(.$organizations, is.null), ]} %>%
#'     unnest() %>%
#'     select(person, time, organizations)
#' }
named_entity <- function(text.var, entity.annotator, word.annotator = word_annotator(),
    element.chunks = floor(2000 * (23.5/mean(sapply(text.var, nchar), na.rm = TRUE)))){

    ## Validate the annotator name before doing any (expensive) work
    known_annotators <- c("person_annotator", "location_annotator",
        "organization_annotator", "date_annotator", "money_annotator",
        "percent_annotator")
    if (!is.character(entity.annotator) || length(entity.annotator) != 1L ||
        !entity.annotator %in% known_annotators) {
        stop("`entity.annotator` does not appear to be an annotator. See `?annotators`.")
    }

    len <- length(text.var)

    ## Nothing to tag: return an empty entity object without constructing the
    ## annotators.  Each entity annotator's "type" attribute is its name
    ## without the `_annotator` suffix, so it can be derived directly.
    if (len == 0L) {
        return(structure(
            list(),
            class = c("entity", "list"),
            type = sub("_annotator$", "", entity.annotator)
        ))
    }

    ## locate empty or missing text elements
    nas <- sort(union(which(is.na(text.var)), grep("^\\s*$", text.var)))

    ## Get annotator
    entity.annotator <- switch(entity.annotator,
        person_annotator = person_annotator(),
        location_annotator = location_annotator(),
        organization_annotator = organization_annotator(),
        date_annotator = date_annotator(),
        money_annotator = money_annotator(),
        percent_annotator = percent_annotator()
    )

    ## replace empty text with a period
    if (length(nas) > 0){
        text.var[nas] <- "."
    }

    ## `element.chunks` is first evaluated here, i.e. after the empty/missing
    ## elements were replaced, exactly as the lazily evaluated default was
    ## before.  Guard against 0, fractional or oversized chunk sizes, which
    ## would otherwise break the index arithmetic below.
    if (!is.numeric(element.chunks) || length(element.chunks) != 1L ||
        is.na(element.chunks)) {
        stop("`element.chunks` must be a single, non-missing number.")
    }
    element.chunks <- min(len, max(1, floor(element.chunks)))

    ## Chunking the text into memory sized chunks:
    ## calculate the start/end indexes of the chunks
    starts <- seq(1, len, by = element.chunks)
    ends <- pmin(starts + element.chunks - 1, len)

    ## chunk the text
    text_list <- Map(function(s, e) {text.var[s:e]}, starts, ends)

    ## loop through the chunks and tag them; gc() after every chunk keeps the
    ## memory used by the tagging process in check
    tagged <- lapply(text_list, function(chunk){
        res <- entify(chunk, entity.annotator, word.annotator)
        gc()
        res
    })

    lens <- vapply(text_list, length, integer(1))

    ## Rebuild one list slot per original text element for every chunk
    out <- unlist(lapply(seq_along(tagged), function(i){

        slots <- vector(mode = "list", length = lens[i])
        ents <- tagged[[i]][["entities"]]
        if (is.null(ents)) return(slots)

        ## Index by the group names produced by `split()` so that the slot
        ## positions cannot drift from the group order.
        grouped <- split(ents, tagged[[i]][["locations"]])
        slots[as.integer(names(grouped))] <- unname(grouped)
        slots
    }), recursive = FALSE)

    class(out) <- c("entity", class(out))
    attributes(out)[["type"]] <- attributes(entity.annotator)[["type"]]
    out
}


## Tag one chunk of text.
##
## The elements of `text.var` are trimmed and pasted together into a single
## string; every element is registered as one "sentence" annotation so that
## entities can be traced back to the element they came from.  `WTA` is the
## word token annotator and `ANN` the entity annotator.
##
## Returns a list with `locations` (for every entity, the index of the element
## of `text.var` it was found in) and `entities` (the entity strings); both are
## NULL when no entity was found.  `...` is accepted and ignored.
entify <- function(text.var, ANN, WTA, ...) {

    text.var <- gsub("^\\s+|\\s+$", "", text.var)
    s <- NLP::as.String(paste(text.var, collapse=""))

    ## Manually calculate the starts and ends via nchar
    ends <- cumsum(nchar(text.var))
    starts <- c(1, utils::head(ends + 1, -1))

    a2 <- NLP::Annotation(seq_along(starts), rep("sentence", length(starts)), starts, ends)
    a2 <- NLP::annotate(s, WTA, a2)
    a3 <- NLP::annotate(s, ANN, a2)

    ## Keep the entity annotations only
    is_entity <- a3$type == "entity"
    if (!any(is_entity)) return(list(locations = NULL, entities = NULL))
    entity_spans <- a3[is_entity]

    sentence_starts <- as.data.frame(a3[a3$type == "sentence"])[, "start"]
    entity_starts <- as.data.frame(entity_spans)[, "start"]

    ## An entity belongs to the sentence with the largest start that is not
    ## beyond the entity's own start.
    locations <- vapply(entity_starts, function(x) {
        match(max(sentence_starts[sentence_starts <= x]), sentence_starts)
    }, integer(1))

    list(
        locations = locations,
        entities = s[entity_spans]
    )
}

#' Prints an entity Object
#'
#' Prints an entity object as a plain list, i.e. without its class and
#' \code{"type"} attribute.
#'
#' @param x An \code{entity} object.
#' @param \ldots ignored.
#' @return Returns \code{x} invisibly.
#' @method print entity
#' @export
print.entity <- function(x, ...){
    bare <- x
    attributes(bare) <- NULL
    print(bare)
    invisible(x)
}
#' Plots an entity Object
#'
#' Draws a horizontal bar chart of how often each entity occurs.
#'
#' @param x An \code{entity} object.
#' @param min Minimum frequency of included entities.
#' @param alphabetical logical.  Should rows be arranged alphabetically by entity
#' or by frequency.
#' @param \ldots ignored.
#' @method plot entity
#' @export
plot.entity <- function(x, min = 1, alphabetical = FALSE, ...){

    stopifnot(min > 0)

    ## Axis title: the entity type with its first letter capitalised
    axis_title <- attributes(x)[["type"]]
    substring(axis_title, 1, 1) <- toupper(substring(axis_title, 1, 1))

    ## Frequency table, restricted to entities seen at least `min` times
    counts <- get_counts(x, alphabetical = alphabetical)
    frequent <- counts[["frequency"]] >= min
    counts <- counts[frequent, ]

    ## Leave 1% head room beyond the longest bar
    count_limits <- c(0, 1.01 * max(counts[["frequency"]]))

    minimal_frame <- ggplot2::theme(
        panel.grid.major.y = ggplot2::element_blank(),
        legend.title = ggplot2::element_blank(),
        panel.border = ggplot2::element_blank(),
        axis.line = ggplot2::element_line(color="grey70")
    )

    ggplot2::ggplot(counts, ggplot2::aes_string(x='entity', weight='frequency')) +
        ggplot2::geom_bar() +
        ggplot2::coord_flip() +
        ggplot2::ylab("Count") +
        ggplot2::xlab(axis_title) +
        ggplot2::scale_y_continuous(expand = c(0, 0), limits = count_limits) +
        ggplot2::theme_bw() +
        minimal_frame
}
#' Named Organization Recognition
#'
#' A wrapper for \pkg{NLP}/\pkg{openNLP}'s named organization recognition annotation.
#' This is \code{\link{named_entity}} with \code{entity.annotator} defaulting
#' to \code{"organization_annotator"}.
#'
#' @inheritParams named_entity
#' @return Returns a list of class \code{entity} with one element per element
#' of \code{text.var}: a character vector of the organizations found in that
#' element, or \code{NULL} when none were found.
#' @keywords organization
#' @export
#' @include utils.R named_entity.R
#' @family variable functions
#' @examples
#' \dontrun{
#' data(presidential_debates_2012)
#'
#' orgs <- organization_entity(presidential_debates_2012$dialogue)
#' unlist(orgs)
#'
#' library(dplyr)
#' presidential_debates_2012$organizations <- organization_entity(presidential_debates_2012$dialogue)
#'
#' presidential_debates_2012 %>%
#'     {.[!sapply(.$organizations, is.null), ]} %>%
#'     rowwise() %>%
#'     mutate(organizations = paste(organizations, collapse=", ")) %>%
#'     select(person, time, organizations)
#'
#' library(tidyr)
#' presidential_debates_2012 %>%
#'     {.[!sapply(.$organizations, is.null), ]} %>%
#'     unnest() %>%
#'     select(person, time, organizations)
#' }
organization_entity <- hijack(
    FUN = named_entity,
    entity.annotator = "organization_annotator"
)
#' Named Percent Recognition
#'
#' A wrapper for \pkg{NLP}/\pkg{openNLP}'s named percent recognition annotation.
#' This is \code{\link{named_entity}} with \code{entity.annotator} defaulting
#' to \code{"percent_annotator"}.
#'
#' @inheritParams named_entity
#' @return Returns a list of class \code{entity} with one element per element
#' of \code{text.var}: a character vector of the percentages found in that
#' element, or \code{NULL} when none were found.
#' @keywords percent
#' @export
#' @include utils.R named_entity.R
#' @family variable functions
#' @examples
#' \dontrun{
#' data(presidential_debates_2012)
#'
#' percents <- percent_entity(presidential_debates_2012$dialogue)
#' unlist(percents)
#'
#' library(dplyr)
#' presidential_debates_2012$percents <- percent_entity(presidential_debates_2012$dialogue)
#'
#' presidential_debates_2012 %>%
#'     {.[!sapply(.$percents, is.null), ]} %>%
#'     rowwise() %>%
#'     mutate(percents = paste(percents, collapse=", ")) %>%
#'     select(person, time, percents)
#'
#' library(tidyr)
#' presidential_debates_2012 %>%
#'     {.[!sapply(.$percents, is.null), ]} %>%
#'     unnest() %>%
#'     select(person, time, percents)
#' }
percent_entity <- hijack(
    FUN = named_entity,
    entity.annotator = "percent_annotator"
)
#' Named Person Recognition
#'
#' A wrapper for \pkg{NLP}/\pkg{openNLP}'s named person recognition annotation.
#' This is \code{\link{named_entity}} with \code{entity.annotator} defaulting
#' to \code{"person_annotator"}.
#'
#' @inheritParams named_entity
#' @return Returns a list of class \code{entity} with one element per element
#' of \code{text.var}: a character vector of the persons found in that element,
#' or \code{NULL} when none were found.
#' @keywords person people
#' @export
#' @include utils.R named_entity.R
#' @family variable functions
#' @examples
#' \dontrun{
#' data(presidential_debates_2012)
#'
#' peoples <- person_entity(presidential_debates_2012$dialogue)
#' unlist(peoples)
#'
#' library(dplyr)
#' presidential_debates_2012$persons <- person_entity(presidential_debates_2012$dialogue)
#'
#' presidential_debates_2012 %>%
#'     {.[!sapply(.$persons, is.null), ]} %>%
#'     rowwise() %>%
#'     mutate(persons = paste(persons, collapse=", ")) %>%
#'     select(person, time, persons)
#'
#' library(tidyr)
#' presidential_debates_2012 %>%
#'     {.[!sapply(.$persons, is.null), ]} %>%
#'     unnest() %>%
#'     select(person, time, persons)
#' }
person_entity <- hijack(named_entity,
    entity.annotator = 'person_annotator'
)


## Hijack a function
## see: http://stackoverflow.com/a/25366322/1000343
##
## Returns a copy of `FUN` whose argument defaults are replaced by the named
## values passed through `...` (e.g. `hijack(f, b = 10)` makes `b` default to
## 10).  `FUN` itself is left untouched.  Every argument in `...` must be
## named; a value of `NULL` sets the default to `NULL` rather than dropping the
## argument.  When the same name is given more than once the last value wins.
hijack <- function(FUN, ...){

    new_defaults <- list(...)
    if (length(new_defaults) == 0L) return(FUN)

    arg_names <- names(new_defaults)
    if (is.null(arg_names) || any(is.na(arg_names)) || any(!nzchar(arg_names))) {
        stop("All arguments passed to `hijack` via `...` must be named.",
            call. = FALSE)
    }

    ## Work on a plain list and use `[<-` (not `[[<-`) so that NULL values are
    ## stored as defaults instead of deleting the formal.
    fmls <- as.list(formals(FUN))
    fmls[arg_names] <- new_defaults
    formals(FUN) <- fmls

    FUN
}

', ver, ver) 15 | ```` 16 | 17 | [![Project Status: Active - The project has reached a stable, usable state and is being actively developed.](http://www.repostatus.org/badges/0.1.0/active.svg)](http://www.repostatus.org/#active) 18 | [![Build Status](https://travis-ci.org/trinker/entity.svg?branch=master)](https://travis-ci.org/trinker/entity) 19 | [![Coverage Status](https://coveralls.io/repos/trinker/entity/badge.svg?branch=master)](https://coveralls.io/r/trinker/entity?branch=master) 20 | `r verbadge` 21 | 22 | ```{r, echo=FALSE, message=FALSE} 23 | library(knitr) 24 | knit_hooks$set(htmlcap = function(before, options, envir) { 25 | if(!before) { 26 | paste('

',options$htmlcap,"

",sep="") 27 | } 28 | }) 29 | knitr::opts_knit$set(self.contained = TRUE, cache = FALSE) 30 | knitr::opts_chunk$set(fig.path = "tools/figure/") 31 | ``` 32 | 33 | ![](tools/entity_logo/r_entity.png) 34 | 35 | 36 | **entity** is a wrapper to simplify and extend [**NLP**](https://cran.r-project.org/web/packages/NLP/index.html) and [**openNLP**](https://cran.r-project.org/web/packages/openNLP/index.html) named entity recognition. The package contains 6 entity extractors that take a text vector and return a list of vectors of named entities. The entity extractors include: 37 | 38 | 1. `person_entity` 39 | 2. `location_entity` 40 | 3. `organization_entity` 41 | 4. `date_entity` 42 | 5. `money_entity` 43 | 6. `percent_entity` 44 | 45 | # Installation 46 | 47 | To download the development version of **entity**: 48 | 49 | Download the [zip ball](https://github.com/trinker/entity/zipball/master) or [tar ball](https://github.com/trinker/entity/tarball/master), decompress and run `R CMD INSTALL` on it, or use the **pacman** package to install the development version: 50 | 51 | ```r 52 | if (!require("pacman")) install.packages("pacman") 53 | pacman::p_load_gh("trinker/entity") 54 | ``` 55 | 56 | # Contact 57 | 58 | You are welcome to: 59 | * submit suggestions and bug-reports at: 60 | * send a pull request on: 61 | * compose a friendly e-mail to: 62 | 63 | # Examples 64 | 65 | The following examples demonstrate some of the functionality of **entity**. 66 | 67 | ## Load the Package/Data 68 | 69 | ```{r, message=FALSE} 70 | library(entity) 71 | ``` 72 | 73 | I will demonstrate the 6 annotators on this [Wikipedia excerpt](https://en.wikipedia.org/wiki/Bell_Labs) about Bell Labs (plus one non Wikipedia line at the end). 
74 | 75 | ```{r} 76 | data(wiki) 77 | wiki 78 | ``` 79 | 80 | ## Entity Extractors 81 | 82 | ### Person Entities 83 | 84 | ```{r} 85 | person_entity(wiki) 86 | ``` 87 | 88 | ### Location Entities 89 | 90 | ```{r} 91 | location_entity(wiki) 92 | ``` 93 | 94 | ### Organization Entities 95 | 96 | ```{r} 97 | organization_entity(wiki) 98 | ``` 99 | 100 | ### Date Entities 101 | 102 | ```{r} 103 | date_entity(wiki) 104 | ``` 105 | 106 | ### Money Entities 107 | 108 | ```{r} 109 | money_entity(wiki) 110 | ``` 111 | 112 | ### Percent Entities 113 | 114 | ```{r} 115 | percent_entity(wiki) 116 | ``` 117 | 118 | 119 | ## Plotting 120 | 121 | ```{r, fig.height = 7} 122 | organizations <- organization_entity(presidential_debates_2012$dialogue) 123 | plot(organizations) 124 | ``` 125 | 126 | You can include only entities above a minimum frequency (`min = n`) as shown below: 127 | 128 | ```{r, fig.height} 129 | plot(organizations, min = 2) 130 | ``` 131 | 132 | The user may wish to view the entities alphabetically rather than by frequency. 
Use `alphabetical = TRUE` to accomplish this: 133 | 134 | ```{r, fig.height = 7} 135 | plot(organizations, alphabetical = TRUE) 136 | ``` 137 | 138 | 139 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | entity [![Follow](https://img.shields.io/twitter/follow/tylerrinker.svg?style=social)](https://twitter.com/intent/follow?screen_name=tylerrinker) 2 | ============ 3 | 4 | 5 | [![Project Status: Active - The project has reached a stable, usable 6 | state and is being actively 7 | developed.](http://www.repostatus.org/badges/0.1.0/active.svg)](http://www.repostatus.org/#active) 8 | [![Build 9 | Status](https://travis-ci.org/trinker/entity.svg?branch=master)](https://travis-ci.org/trinker/entity) 10 | [![Coverage 11 | Status](https://coveralls.io/repos/trinker/entity/badge.svg?branch=master)](https://coveralls.io/r/trinker/entity?branch=master) 12 | Version 13 |

14 | 15 | ![](tools/entity_logo/r_entity.png) 16 | 17 | **entity** is wrapper to simplify and extend 18 | [**NLP**](https://cran.r-project.org/web/packages/NLP/index.html) and 19 | [**openNLP**](https://cran.r-project.org/web/packages/openNLP/index.html) 20 | named entity recognition. The package contains 6 entity extractors that 21 | take a text vector and return a list of vectors of named entities. The 22 | entity extractors include: 23 | 24 | 1. `person_entity` 25 | 2. `location_entity` 26 | 3. `organization_entity` 27 | 4. `date_entity` 28 | 5. `money_entity` 29 | 6. `percent_entity` 30 | 31 | 32 | Table of Contents 33 | ============ 34 | 35 | - [Installation](#installation) 36 | - [Contact](#contact) 37 | - [Examples](#examples) 38 | - [Load the Package/Data](#load-the-packagedata) 39 | - [Entity Extractors](#entity-extractors) 40 | - [Person Entities](#person-entities) 41 |        -   [Location Entities](#location-entities)   42 |        -   [Organization Entities](#organization-entities) 43 |        -   [Date Entities](#date-entities)     44 |        -  [Money Entities](#money-entities) 45 |        -   [Percent Entities](#percent-entities) 46 |    -      [Plotting](#plotting) 47 | 48 | Installation 49 | ============ 50 | 51 | 52 | To download the development version of **entity**: 53 | 54 | Download the [zip 55 | ball](https://github.com/trinker/entity/zipball/master) or [tar 56 | ball](https://github.com/trinker/entity/tarball/master), decompress and 57 | run `R CMD INSTALL` on it, or use the **pacman** package to install the 58 | development version: 59 | 60 | if (!require("pacman")) install.packages("pacman") 61 | pacman::p_load_gh("trinker/entity") 62 | 63 | Contact 64 | ======= 65 | 66 | You are welcome to: 67 | - submit suggestions and bug-reports at: 68 | - send a pull request on: 69 | - compose a friendly e-mail to: 70 | 71 | Examples 72 | ======== 73 | 74 | The following examples demonstrate some of the functionality of 75 | **termco**. 
76 | 77 | Load the Package/Data 78 | --------------------- 79 | 80 | library(entity) 81 | 82 | I will demonstrate the 6 annotators on this [Wikipedia 83 | excerpt](https://en.wikipedia.org/wiki/Bell_Labs) about Bell Labs (plus 84 | one non Wikipedia line at the end). 85 | 86 | data(wiki) 87 | wiki 88 | 89 | ## [1] "Bell Laboratories (also known as Bell Labs and formerly known as AT&T Bell Laboratories and Bell Telephone Laboratories) is a research and scientific development company that belongs to Alcatel-Lucent." 90 | ## [2] "Its headquarters are located in Murray Hill, New Jersey, in addition to other laboratories around the rest of the United States and in other countries." 91 | ## [3] "The historic laboratory originated in the late 19th century as the Volta Laboratory and Bureau created by Alexander Graham Bell." 92 | ## [4] "Bell Labs was also at one time a division of the American Telephone & Telegraph Company (AT&T Corporation), half-owned through its Western Electric manufacturing subsidiary." 93 | ## [5] "Researchers working at Bell Labs are credited with the development of radio astronomy, the transistor, the laser, the charge-coupled device (CCD), information theory, the UNIX operating system, the C programming language, S programming language and the C++ programming language." 94 | ## [6] "Eight Nobel Prizes have been awarded for work completed at Bell Laboratories." 95 | ## [7] "And an extra line not from Wikipedia worth 2 cents or .001% of 1 percent." 
96 | 97 | Entity Extractors 98 | ----------------- 99 | 100 | ### Person Entities 101 | 102 | person_entity(wiki) 103 | 104 | ## [[1]] 105 | ## NULL 106 | ## 107 | ## [[2]] 108 | ## NULL 109 | ## 110 | ## [[3]] 111 | ## [1] "Alexander Graham Bell" 112 | ## 113 | ## [[4]] 114 | ## NULL 115 | ## 116 | ## [[5]] 117 | ## NULL 118 | ## 119 | ## [[6]] 120 | ## NULL 121 | ## 122 | ## [[7]] 123 | ## NULL 124 | 125 | ### Location Entities 126 | 127 | location_entity(wiki) 128 | 129 | ## [[1]] 130 | ## NULL 131 | ## 132 | ## [[2]] 133 | ## [1] "Murray Hill" "New Jersey" "United States" 134 | ## 135 | ## [[3]] 136 | ## NULL 137 | ## 138 | ## [[4]] 139 | ## [1] "Telegraph" 140 | ## 141 | ## [[5]] 142 | ## NULL 143 | ## 144 | ## [[6]] 145 | ## NULL 146 | ## 147 | ## [[7]] 148 | ## NULL 149 | 150 | ### Organization Entities 151 | 152 | organization_entity(wiki) 153 | 154 | ## [[1]] 155 | ## [1] "Bell Laboratories" "Bell Labs" 156 | ## [3] "Bell Laboratories" "Bell Telephone Laboratories" 157 | ## 158 | ## [[2]] 159 | ## NULL 160 | ## 161 | ## [[3]] 162 | ## [1] "Volta Laboratory" "Alexander Graham Bell" 163 | ## 164 | ## [[4]] 165 | ## [1] "Bell Labs" 166 | ## [2] "American Telephone & Telegraph Company" 167 | ## [3] "AT&T Corporation" 168 | ## [4] "Western Electric" 169 | ## 170 | ## [[5]] 171 | ## [1] "Bell Labs" 172 | ## 173 | ## [[6]] 174 | ## [1] "Bell Laboratories" 175 | ## 176 | ## [[7]] 177 | ## NULL 178 | 179 | ### Date Entities 180 | 181 | date_entity(wiki) 182 | 183 | ## [[1]] 184 | ## NULL 185 | ## 186 | ## [[2]] 187 | ## NULL 188 | ## 189 | ## [[3]] 190 | ## [1] "late 19th century" 191 | ## 192 | ## [[4]] 193 | ## NULL 194 | ## 195 | ## [[5]] 196 | ## NULL 197 | ## 198 | ## [[6]] 199 | ## NULL 200 | ## 201 | ## [[7]] 202 | ## NULL 203 | 204 | ### Money Entities 205 | 206 | money_entity(wiki) 207 | 208 | ## [[1]] 209 | ## NULL 210 | ## 211 | ## [[2]] 212 | ## NULL 213 | ## 214 | ## [[3]] 215 | ## NULL 216 | ## 217 | ## [[4]] 218 | ## NULL 219 | ## 220 | ## [[5]] 221 
| ## NULL 222 | ## 223 | ## [[6]] 224 | ## NULL 225 | ## 226 | ## [[7]] 227 | ## [1] "2 cents" 228 | 229 | ### Percent Entities 230 | 231 | percent_entity(wiki) 232 | 233 | ## [[1]] 234 | ## NULL 235 | ## 236 | ## [[2]] 237 | ## NULL 238 | ## 239 | ## [[3]] 240 | ## NULL 241 | ## 242 | ## [[4]] 243 | ## NULL 244 | ## 245 | ## [[5]] 246 | ## NULL 247 | ## 248 | ## [[6]] 249 | ## NULL 250 | ## 251 | ## [[7]] 252 | ## [1] ".001%" "1 percent" 253 | 254 | Plotting 255 | -------- 256 | 257 | organizations <- organization_entity(presidential_debates_2012$dialogue) 258 | plot(organizations) 259 | 260 | ![](tools/figure/unnamed-chunk-11-1.png) 261 | 262 | You can include only entities above a minimum frequency (`min = n`) as 263 | shown below: 264 | 265 | plot(organizations, min = 2) 266 | 267 | ![](tools/figure/fig.height-1.png) 268 | 269 | The user may wish to view the entities alphabetically rather than by 270 | frequency. Use `alphabetical = TRUE` to accomplish this: 271 | 272 | plot(organizations, alphabetical = TRUE) 273 | 274 | ![](tools/figure/unnamed-chunk-12-1.png) 275 | -------------------------------------------------------------------------------- /data/presidential_debates_2012.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trinker/entity/5549d30f4f7daa91ecb99ea5dbc770260bfdbedd/data/presidential_debates_2012.rda -------------------------------------------------------------------------------- /data/wiki.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trinker/entity/5549d30f4f7daa91ecb99ea5dbc770260bfdbedd/data/wiki.rda -------------------------------------------------------------------------------- /inst/CITATION: -------------------------------------------------------------------------------- 1 | citHeader("To cite entity in publications, please use:") 2 | 3 | 4 | citEntry(entry = "manual", 5 | title = "{entity}: 
Named entity recognition", 6 | author = "Tyler W. Rinker", 7 | organization = "University at Buffalo/SUNY", 8 | address = "Buffalo, New York", 9 | note = "version 0.1.0", 10 | year = "2015", 11 | url = "http://github.com/trinker/entity", 12 | textVersion = paste("Rinker, T. W. (2015).", 13 | "entity: Named entity recognition", 14 | "version 0.1.0. University at Buffalo. Buffalo, New York.", 15 | "http://github.com/trinker/entity") 16 | ) 17 | -------------------------------------------------------------------------------- /inst/build.R: -------------------------------------------------------------------------------- 1 | root <- Sys.getenv("USERPROFILE") 2 | pack <- basename(getwd()) 3 | 4 | quick <- TRUE 5 | pdf <- TRUE 6 | 7 | unlink(paste0(pack, ".pdf"), recursive = TRUE, force = TRUE) 8 | devtools::document() 9 | devtools::install(quick = quick, build_vignettes = FALSE, dependencies = TRUE) 10 | 11 | if(pdf){ 12 | path <- find.package(pack) 13 | system(paste(shQuote(file.path(R.home("bin"), "R")), "CMD", "Rd2pdf", shQuote(path))) 14 | file.copy(paste0(pack, '.pdf'), file.path(root,"Desktop", paste0(pack, '.pdf'))) 15 | while (file.exists(paste0(pack, ".pdf"))) {unlink(paste0(pack, ".pdf"), recursive = TRUE, force = TRUE)} 16 | empts <- grep("^\\.Rd", dir(all.files = TRUE), value = TRUE) 17 | unlink(empts, recursive = TRUE, force = TRUE) 18 | } 19 | 20 | message("Done!") 21 | -------------------------------------------------------------------------------- /inst/extra_statdoc/readme.R: -------------------------------------------------------------------------------- 1 |


2 |

entity is a...

3 |

Download the development version of entity here 4 | -------------------------------------------------------------------------------- /inst/staticdocs/index.R: -------------------------------------------------------------------------------- 1 | library(staticdocs) 2 | 3 | sd_section("", 4 | "Function for...", 5 | c( 6 | "myfun" 7 | ) 8 | ) -------------------------------------------------------------------------------- /man/annotators.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/annotators.R 3 | \name{word_annotator} 4 | \alias{word_annotator} 5 | \alias{person_annotator} 6 | \alias{location_annotator} 7 | \alias{organization_annotator} 8 | \alias{date_annotator} 9 | \alias{money_annotator} 10 | \alias{percent_annotator} 11 | \title{Annotators} 12 | \usage{ 13 | word_annotator() 14 | 15 | person_annotator() 16 | 17 | location_annotator() 18 | 19 | organization_annotator() 20 | 21 | date_annotator() 22 | 23 | money_annotator() 24 | 25 | percent_annotator() 26 | } 27 | \value{ 28 | Returns an annotator for entities or words. 29 | } 30 | \description{ 31 | A wrapper for \code{\link[openNLP]{Maxent_Entity_Annotator}} and 32 | \code{\link[openNLP]{Maxent_Word_Token_Annotator}}. 
33 | } 34 | \seealso{ 35 | \code{\link[openNLP]{Maxent_Entity_Annotator}}, 36 | \code{\link[openNLP]{Maxent_Word_Token_Annotator}} 37 | } 38 | -------------------------------------------------------------------------------- /man/date_entity.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/date_entity.R 3 | \name{date_entity} 4 | \alias{date_entity} 5 | \title{Named Date Recognition} 6 | \usage{ 7 | date_entity(text.var, entity.annotator = "date_annotator", 8 | word.annotator = word_annotator(), element.chunks = floor(2000 * 9 | (23.5/mean(sapply(text.var, nchar), na.rm = TRUE)))) 10 | } 11 | \arguments{ 12 | \item{text.var}{The text string variable.} 13 | 14 | \item{entity.annotator}{A character vector identifying an entity recognition 15 | annotator (\code{c("person_annotator", "location_annotator", "date_annotator", 16 | "money_annotator", "percent_annotator")}. See \code{?annotators}.} 17 | 18 | \item{word.annotator}{A word annotator.} 19 | 20 | \item{element.chunks}{The number of elements to include in a chunk. Chunks are 21 | passed through an \code{\link[base]{lapply}} and size is kept within a tolerance 22 | because of memory allocation in the tagging process with \pkg{Java}.} 23 | } 24 | \value{ 25 | Returns a data.frame of named entities and frequencies. 26 | } 27 | \description{ 28 | A wrapper for \pkg{NLP},/\pkg{openNLP}'s named date recognition annotation. 
29 | } 30 | \examples{ 31 | \dontrun{ 32 | data(presidential_debates_2012) 33 | 34 | dates <- date_entity(presidential_debates_2012$dialogue) 35 | unlist(dates) 36 | 37 | library(dplyr) 38 | presidential_debates_2012$dates <- date_entity(presidential_debates_2012$dialogue) 39 | 40 | presidential_debates_2012 \%>\% 41 | {.[!sapply(.$dates, is.null), ]} \%>\% 42 | rowwise() \%>\% 43 | mutate(dates = paste(dates, collapse=", ")) \%>\% 44 | select(person, time, dates) 45 | 46 | library(tidyr) 47 | presidential_debates_2012 \%>\% 48 | {.[!sapply(.$dates, is.null), ]} \%>\% 49 | unnest() \%>\% 50 | select(person, time, dates) 51 | } 52 | } 53 | \seealso{ 54 | Other variable functions: \code{\link{location_entity}}, 55 | \code{\link{money_entity}}, 56 | \code{\link{organization_entity}}, 57 | \code{\link{percent_entity}}, \code{\link{person_entity}} 58 | } 59 | \keyword{date} 60 | -------------------------------------------------------------------------------- /man/entity.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/entity-package.R 3 | \docType{package} 4 | \name{entity} 5 | \alias{entity} 6 | \alias{package-entity} 7 | \alias{entity-package} 8 | \title{Named Entity Extraction} 9 | \description{ 10 | A wrapper for \pkg{NLP} and \pkg{openNLP} to facilitate named entity extraction. 
11 | } 12 | -------------------------------------------------------------------------------- /man/location_entity.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/location_entity.R 3 | \name{location_entity} 4 | \alias{location_entity} 5 | \title{Named Location Recognition} 6 | \usage{ 7 | location_entity(text.var, entity.annotator = "location_annotator", 8 | word.annotator = word_annotator(), element.chunks = floor(2000 * 9 | (23.5/mean(sapply(text.var, nchar), na.rm = TRUE)))) 10 | } 11 | \arguments{ 12 | \item{text.var}{The text string variable.} 13 | 14 | \item{entity.annotator}{A character vector identifying an entity recognition 15 | annotator (\code{c("person_annotator", "location_annotator", "date_annotator", 16 | "money_annotator", "percent_annotator")}. See \code{?annotators}.} 17 | 18 | \item{word.annotator}{A word annotator.} 19 | 20 | \item{element.chunks}{The number of elements to include in a chunk. Chunks are 21 | passed through an \code{\link[base]{lapply}} and size is kept within a tolerance 22 | because of memory allocation in the tagging process with \pkg{Java}.} 23 | } 24 | \value{ 25 | Returns a data.frame of named entities and frequencies. 26 | } 27 | \description{ 28 | A wrapper for \pkg{NLP},/\pkg{openNLP}'s named location recognition annotation. 
29 | } 30 | \examples{ 31 | \dontrun{ 32 | data(presidential_debates_2012) 33 | 34 | locales <- location_entity(presidential_debates_2012$dialogue) 35 | unlist(locales) 36 | 37 | library(dplyr) 38 | presidential_debates_2012$locations <- location_entity(presidential_debates_2012$dialogue) 39 | 40 | presidential_debates_2012 \%>\% 41 | {.[!sapply(.$locations, is.null), ]} \%>\% 42 | rowwise() \%>\% 43 | mutate(locations = paste(locations, collapse=", ")) \%>\% 44 | select(person, time, locations) 45 | 46 | library(tidyr) 47 | presidential_debates_2012 \%>\% 48 | {.[!sapply(.$locations, is.null), ]} \%>\% 49 | unnest() \%>\% 50 | select(person, time, locations) 51 | } 52 | } 53 | \seealso{ 54 | Other variable functions: \code{\link{date_entity}}, 55 | \code{\link{money_entity}}, 56 | \code{\link{organization_entity}}, 57 | \code{\link{percent_entity}}, \code{\link{person_entity}} 58 | } 59 | \keyword{location} 60 | -------------------------------------------------------------------------------- /man/money_entity.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/money_entity.R 3 | \name{money_entity} 4 | \alias{money_entity} 5 | \title{Named Money Recognition} 6 | \usage{ 7 | money_entity(text.var, entity.annotator = "money_annotator", 8 | word.annotator = word_annotator(), element.chunks = floor(2000 * 9 | (23.5/mean(sapply(text.var, nchar), na.rm = TRUE)))) 10 | } 11 | \arguments{ 12 | \item{text.var}{The text string variable.} 13 | 14 | \item{entity.annotator}{A character vector identifying an entity recognition 15 | annotator (\code{c("person_annotator", "location_annotator", "date_annotator", 16 | "money_annotator", "percent_annotator")}. See \code{?annotators}.} 17 | 18 | \item{word.annotator}{A word annotator.} 19 | 20 | \item{element.chunks}{The number of elements to include in a chunk. 
Chunks are 21 | passed through an \code{\link[base]{lapply}} and size is kept within a tolerance 22 | because of memory allocation in the tagging process with \pkg{Java}.} 23 | } 24 | \value{ 25 | Returns a data.frame of named entities and frequencies. 26 | } 27 | \description{ 28 | A wrapper for \pkg{NLP},/\pkg{openNLP}'s named money recognition annotation. 29 | } 30 | \examples{ 31 | \dontrun{ 32 | data(presidential_debates_2012) 33 | 34 | monies <- money_entity(presidential_debates_2012$dialogue) 35 | unlist(monies) 36 | 37 | library(dplyr) 38 | presidential_debates_2012$monies <- money_entity(presidential_debates_2012$dialogue) 39 | 40 | presidential_debates_2012 \%>\% 41 | {.[!sapply(.$monies, is.null), ]} \%>\% 42 | rowwise() \%>\% 43 | mutate(monies = paste(monies, collapse=", ")) \%>\% 44 | select(person, time, monies) 45 | 46 | library(tidyr) 47 | presidential_debates_2012 \%>\% 48 | {.[!sapply(.$monies, is.null), ]} \%>\% 49 | unnest() \%>\% 50 | select(person, time, monies) 51 | } 52 | } 53 | \seealso{ 54 | Other variable functions: \code{\link{date_entity}}, 55 | \code{\link{location_entity}}, 56 | \code{\link{organization_entity}}, 57 | \code{\link{percent_entity}}, \code{\link{person_entity}} 58 | } 59 | \keyword{money} 60 | -------------------------------------------------------------------------------- /man/named_entity.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/named_entity.R 3 | \name{named_entity} 4 | \alias{named_entity} 5 | \title{Named Entity Recognition} 6 | \usage{ 7 | named_entity(text.var, entity.annotator, word.annotator = word_annotator(), 8 | element.chunks = floor(2000 * (23.5/mean(sapply(text.var, nchar), na.rm = 9 | TRUE)))) 10 | } 11 | \arguments{ 12 | \item{text.var}{The text string variable.} 13 | 14 | \item{entity.annotator}{A character vector identifying an entity recognition 15 | annotator 
(\code{c("person_annotator", "location_annotator", "date_annotator", 16 | "money_annotator", "percent_annotator")}. See \code{?annotators}.} 17 | 18 | \item{word.annotator}{A word annotator.} 19 | 20 | \item{element.chunks}{The number of elements to include in a chunk. Chunks are 21 | passed through an \code{\link[base]{lapply}} and size is kept within a tolerance 22 | because of memory allocation in the tagging process with \pkg{Java}.} 23 | } 24 | \value{ 25 | Returns a data.frame of named entities and frequencies. 26 | } 27 | \description{ 28 | A wrapper for \pkg{NLP},/\pkg{openNLP}'s named entity recognition annotation 29 | tools. 30 | } 31 | \examples{ 32 | \dontrun{ 33 | data(presidential_debates_2012) 34 | 35 | peoples <- named_entity(presidential_debates_2012$dialogue, 'person_annotator') 36 | unlist(peoples) 37 | plot(peoples) 38 | 39 | orgs <-named_entity(presidential_debates_2012$dialogue, 'organization_annotator') 40 | unlist(orgs) 41 | 42 | dates <-named_entity(presidential_debates_2012$dialogue, 'date_annotator') 43 | unlist(dates) 44 | 45 | library(dplyr) 46 | presidential_debates_2012$organizations <- named_entity( 47 | presidential_debates_2012$dialogue, 48 | 'organization_annotator' 49 | ) 50 | 51 | presidential_debates_2012 \%>\% 52 | {.[!sapply(.$organizations, is.null), ]} \%>\% 53 | rowwise() \%>\% 54 | mutate(organizations = paste(organizations, collapse=", ")) \%>\% 55 | select(person, time, organizations) 56 | 57 | library(tidyr) 58 | presidential_debates_2012 \%>\% 59 | {.[!sapply(.$organizations, is.null), ]} \%>\% 60 | unnest() \%>\% 61 | select(person, time, organizations) 62 | } 63 | } 64 | \seealso{ 65 | \code{\link[openNLP]{Maxent_Entity_Annotator}} 66 | } 67 | \keyword{entity} 68 | \keyword{named} 69 | \keyword{ner} 70 | -------------------------------------------------------------------------------- /man/organization_entity.Rd: -------------------------------------------------------------------------------- 1 | % Generated by 
roxygen2: do not edit by hand 2 | % Please edit documentation in R/organization_entity.R 3 | \name{organization_entity} 4 | \alias{organization_entity} 5 | \title{Named Organization Recognition} 6 | \usage{ 7 | organization_entity(text.var, entity.annotator = "organization_annotator", 8 | word.annotator = word_annotator(), element.chunks = floor(2000 * 9 | (23.5/mean(sapply(text.var, nchar), na.rm = TRUE)))) 10 | } 11 | \arguments{ 12 | \item{text.var}{The text string variable.} 13 | 14 | \item{entity.annotator}{A character vector identifying an entity recognition 15 | annotator (\code{c("person_annotator", "location_annotator", "date_annotator", 16 | "money_annotator", "percent_annotator")}. See \code{?annotators}.} 17 | 18 | \item{word.annotator}{A word annotator.} 19 | 20 | \item{element.chunks}{The number of elements to include in a chunk. Chunks are 21 | passed through an \code{\link[base]{lapply}} and size is kept within a tolerance 22 | because of memory allocation in the tagging process with \pkg{Java}.} 23 | } 24 | \value{ 25 | Returns a data.frame of named entities and frequencies. 26 | } 27 | \description{ 28 | A wrapper for \pkg{NLP},/\pkg{openNLP}'s named organization recognition annotation. 
29 | } 30 | \examples{ 31 | \dontrun{ 32 | data(presidential_debates_2012) 33 | 34 | orgs <- organization_entity(presidential_debates_2012$dialogue) 35 | unlist(orgs) 36 | 37 | library(dplyr) 38 | presidential_debates_2012$organizations <- organization_entity(presidential_debates_2012$dialogue) 39 | 40 | presidential_debates_2012 \%>\% 41 | {.[!sapply(.$organizations, is.null), ]} \%>\% 42 | rowwise() \%>\% 43 | mutate(organizations = paste(organizations, collapse=", ")) \%>\% 44 | select(person, time, organizations) 45 | 46 | library(tidyr) 47 | presidential_debates_2012 \%>\% 48 | {.[!sapply(.$organizations, is.null), ]} \%>\% 49 | unnest() \%>\% 50 | select(person, time, organizations) 51 | } 52 | } 53 | \seealso{ 54 | Other variable functions: \code{\link{date_entity}}, 55 | \code{\link{location_entity}}, 56 | \code{\link{money_entity}}, \code{\link{percent_entity}}, 57 | \code{\link{person_entity}} 58 | } 59 | \keyword{organization} 60 | -------------------------------------------------------------------------------- /man/percent_entity.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/percent_entity.R 3 | \name{percent_entity} 4 | \alias{percent_entity} 5 | \title{Named Percent Recognition} 6 | \usage{ 7 | percent_entity(text.var, entity.annotator = "percent_annotator", 8 | word.annotator = word_annotator(), element.chunks = floor(2000 * 9 | (23.5/mean(sapply(text.var, nchar), na.rm = TRUE)))) 10 | } 11 | \arguments{ 12 | \item{text.var}{The text string variable.} 13 | 14 | \item{entity.annotator}{A character vector identifying an entity recognition 15 | annotator (\code{c("person_annotator", "location_annotator", "date_annotator", 16 | "money_annotator", "percent_annotator")}. See \code{?annotators}.} 17 | 18 | \item{word.annotator}{A word annotator.} 19 | 20 | \item{element.chunks}{The number of elements to include in a chunk. 
Chunks are 21 | passed through an \code{\link[base]{lapply}} and size is kept within a tolerance 22 | because of memory allocation in the tagging process with \pkg{Java}.} 23 | } 24 | \value{ 25 | Returns a data.frame of named entities and frequencies. 26 | } 27 | \description{ 28 | A wrapper for \pkg{NLP},/\pkg{openNLP}'s named percent recognition annotation. 29 | } 30 | \examples{ 31 | \dontrun{ 32 | data(presidential_debates_2012) 33 | 34 | percents <- percent_entity(presidential_debates_2012$dialogue) 35 | unlist(percents) 36 | 37 | library(dplyr) 38 | presidential_debates_2012$percents <- percent_entity(presidential_debates_2012$dialogue) 39 | 40 | presidential_debates_2012 \%>\% 41 | {.[!sapply(.$percents, is.null), ]} \%>\% 42 | rowwise() \%>\% 43 | mutate(percents = paste(percents, collapse=", ")) \%>\% 44 | select(person, time, percents) 45 | 46 | library(tidyr) 47 | presidential_debates_2012 \%>\% 48 | {.[!sapply(.$percents, is.null), ]} \%>\% 49 | unnest() \%>\% 50 | select(person, time, percents) 51 | } 52 | } 53 | \seealso{ 54 | Other variable functions: \code{\link{date_entity}}, 55 | \code{\link{location_entity}}, 56 | \code{\link{money_entity}}, 57 | \code{\link{organization_entity}}, 58 | \code{\link{person_entity}} 59 | } 60 | \keyword{percent} 61 | -------------------------------------------------------------------------------- /man/person_entity.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/person_entity.R 3 | \name{person_entity} 4 | \alias{person_entity} 5 | \title{Named Person Recognition} 6 | \usage{ 7 | person_entity(text.var, entity.annotator = "person_annotator", 8 | word.annotator = word_annotator(), element.chunks = floor(2000 * 9 | (23.5/mean(sapply(text.var, nchar), na.rm = TRUE)))) 10 | } 11 | \arguments{ 12 | \item{text.var}{The text string variable.} 13 | 14 | \item{entity.annotator}{A character vector 
identifying an entity recognition 15 | annotator (\code{c("person_annotator", "location_annotator", "date_annotator", 16 | "money_annotator", "percent_annotator")}. See \code{?annotators}.} 17 | 18 | \item{word.annotator}{A word annotator.} 19 | 20 | \item{element.chunks}{The number of elements to include in a chunk. Chunks are 21 | passed through an \code{\link[base]{lapply}} and size is kept within a tolerance 22 | because of memory allocation in the tagging process with \pkg{Java}.} 23 | } 24 | \value{ 25 | Returns a data.frame of named entities and frequencies. 26 | } 27 | \description{ 28 | A wrapper for \pkg{NLP},/\pkg{openNLP}'s named person recognition annotation. 29 | } 30 | \examples{ 31 | \dontrun{ 32 | data(presidential_debates_2012) 33 | 34 | peoples <- person_entity(presidential_debates_2012$dialogue) 35 | unlist(peoples) 36 | 37 | library(dplyr) 38 | presidential_debates_2012$persons <- person_entity(presidential_debates_2012$dialogue) 39 | 40 | presidential_debates_2012 \%>\% 41 | {.[!sapply(.$persons, is.null), ]} \%>\% 42 | rowwise() \%>\% 43 | mutate(persons = paste(persons, collapse=", ")) \%>\% 44 | select(person, time, persons) 45 | 46 | library(tidyr) 47 | presidential_debates_2012 \%>\% 48 | {.[!sapply(.$persons, is.null), ]} \%>\% 49 | unnest() \%>\% 50 | select(person, time, persons) 51 | } 52 | } 53 | \seealso{ 54 | Other variable functions: \code{\link{date_entity}}, 55 | \code{\link{location_entity}}, 56 | \code{\link{money_entity}}, 57 | \code{\link{organization_entity}}, 58 | \code{\link{percent_entity}} 59 | } 60 | \keyword{people} 61 | \keyword{person} 62 | -------------------------------------------------------------------------------- /man/plot.entity.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/named_entity.R 3 | \name{plot.entity} 4 | \alias{plot.entity} 5 | \title{Plots an entity Object} 6 | \usage{ 
7 | \method{plot}{entity}(x, min = 1, alphabetical = FALSE, ...) 8 | } 9 | \arguments{ 10 | \item{x}{An \code{entity} object.} 11 | 12 | \item{min}{Minimum frequency of included entities.} 13 | 14 | \item{alphabetical}{logical. Should rows be arranged alphabetically by entity 15 | or by frequency.} 16 | 17 | \item{\ldots}{ignored.} 18 | } 19 | \description{ 20 | Plots an entity object 21 | } 22 | -------------------------------------------------------------------------------- /man/presidential_debates_2012.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/entity-package.R 3 | \docType{data} 4 | \name{presidential_debates_2012} 5 | \alias{presidential_debates_2012} 6 | \title{2012 U.S. Presidential Debates} 7 | \format{A data frame with 2912 rows and 4 variables} 8 | \usage{ 9 | data(presidential_debates_2012) 10 | } 11 | \description{ 12 | A dataset containing a cleaned version of all three presidential debates for 13 | the 2012 election. 14 | } 15 | \details{ 16 | \itemize{ 17 | \item person. The speaker 18 | \item tot. Turn of talk 19 | \item dialogue. The words spoken 20 | \item time. Variable indicating which of the three debates the dialogue is from 21 | } 22 | } 23 | \keyword{datasets} 24 | -------------------------------------------------------------------------------- /man/print.entity.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/named_entity.R 3 | \name{print.entity} 4 | \alias{print.entity} 5 | \title{Prints an entity Object} 6 | \usage{ 7 | \method{print}{entity}(x, ...) 
8 | } 9 | \arguments{ 10 | \item{x}{An \code{entity} object.} 11 | 12 | \item{\ldots}{ignored.} 13 | } 14 | \description{ 15 | Prints an entity object 16 | } 17 | -------------------------------------------------------------------------------- /man/wiki.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/entity-package.R 3 | \docType{data} 4 | \name{wiki} 5 | \alias{wiki} 6 | \title{Bell Labs Wikipedia Article} 7 | \format{A character vector with 7 elements} 8 | \usage{ 9 | data(wiki) 10 | } 11 | \description{ 12 | A dataset containing a character vector of an excerpt from Wikipedia about 13 | Bell Labs with an extra final sentence to include percent and money when 14 | extracting entities. 15 | } 16 | \references{ 17 | \url{https://en.wikipedia.org/wiki/Bell_Labs} 18 | } 19 | \keyword{datasets} 20 | -------------------------------------------------------------------------------- /tests/testthat.R: -------------------------------------------------------------------------------- 1 | library("testthat") 2 | library("entity") 3 | 4 | test_check("entity") -------------------------------------------------------------------------------- /tests/testthat/test-named_entity.R: -------------------------------------------------------------------------------- 1 | context("Checking named_entity") 2 | 3 | test_that("named_entity ...",{ 4 | 5 | 6 | }) 7 | 8 | -------------------------------------------------------------------------------- /tools/entity_logo/r_entity.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trinker/entity/5549d30f4f7daa91ecb99ea5dbc770260bfdbedd/tools/entity_logo/r_entity.png -------------------------------------------------------------------------------- /tools/entity_logo/r_entity.pptx: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/trinker/entity/5549d30f4f7daa91ecb99ea5dbc770260bfdbedd/tools/entity_logo/r_entity.pptx -------------------------------------------------------------------------------- /tools/entity_logo/r_entitya.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trinker/entity/5549d30f4f7daa91ecb99ea5dbc770260bfdbedd/tools/entity_logo/r_entitya.png -------------------------------------------------------------------------------- /tools/entity_logo/resize_icon.txt: -------------------------------------------------------------------------------- 1 | cd C:\Users\Tyler\GitHub\entity\tools\entity_logo 2 | ffmpeg -i r_entitya.png -vf scale=250:-1 r_entity.png -------------------------------------------------------------------------------- /tools/figure/fig.height-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trinker/entity/5549d30f4f7daa91ecb99ea5dbc770260bfdbedd/tools/figure/fig.height-1.png -------------------------------------------------------------------------------- /tools/figure/unnamed-chunk-11-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trinker/entity/5549d30f4f7daa91ecb99ea5dbc770260bfdbedd/tools/figure/unnamed-chunk-11-1.png -------------------------------------------------------------------------------- /tools/figure/unnamed-chunk-12-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trinker/entity/5549d30f4f7daa91ecb99ea5dbc770260bfdbedd/tools/figure/unnamed-chunk-12-1.png --------------------------------------------------------------------------------