├── .Rbuildignore
├── .gitignore
├── .travis.yml
├── DESCRIPTION
├── NAMESPACE
├── NEWS
├── R
├── annotators.R
├── date_entity.R
├── entity-package.R
├── get_counts.R
├── location_entity.R
├── money_entity.R
├── named_entity.R
├── organization_entity.R
├── percent_entity.R
├── person_entity.R
└── utils.R
├── README.Rmd
├── README.md
├── data
├── presidential_debates_2012.rda
└── wiki.rda
├── inst
├── CITATION
├── build.R
├── extra_statdoc
│ └── readme.R
└── staticdocs
│ └── index.R
├── man
├── annotators.Rd
├── date_entity.Rd
├── entity.Rd
├── location_entity.Rd
├── money_entity.Rd
├── named_entity.Rd
├── organization_entity.Rd
├── percent_entity.Rd
├── person_entity.Rd
├── plot.entity.Rd
├── presidential_debates_2012.Rd
├── print.entity.Rd
└── wiki.Rd
├── tests
├── testthat.R
└── testthat
│ └── test-named_entity.R
└── tools
├── entity_logo
├── r_entity.png
├── r_entity.pptx
├── r_entitya.png
└── resize_icon.txt
└── figure
├── fig.height-1.png
├── unnamed-chunk-11-1.png
└── unnamed-chunk-12-1.png
/.Rbuildignore:
--------------------------------------------------------------------------------
1 | ^.*\.Rproj$
2 | ^\.Rproj\.user$
3 | ^\.gitignore
4 | NEWS.md
5 | FAQ.md
6 | NEWS.html
7 | FAQ.html
8 | ^\.travis\.yml$
9 | travis-tool.sh
10 | inst/web
11 | contributors.geojson
12 | inst/build.R
13 | ^.*\.Rprofile$
14 | README.Rmd
15 | README.R
16 | travis.yml
17 | inst/staticdocs
18 | inst/extra_statdoc
19 | inst/maintenance.R
20 | tools/entity_logo/r_entitya.png
21 | tools/entity_logo/r_entity.pptx
22 | tools/entity_logo/resize_icon.txt
23 | Thumbs.db
24 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # History files
2 | .Rhistory
3 |
4 | # Example code in package build process
5 | *-Ex.R
6 |
7 | .Rprofile
8 | .Rproj.user
9 | entity.Rproj
10 | inst/maintenance.R
11 | Thumbs.db
12 |
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | language: r
2 |
3 | sudo: false
4 |
5 | before_install:
6 | - sh -e /etc/init.d/xvfb start
7 |
8 | r_github_packages:
9 | - jimhester/covr
10 |
11 | notifications:
12 | email:
13 | on_success: change
14 | on_failure: change
15 |
16 | after_success:
17 | - Rscript -e 'covr::coveralls()'
18 |
19 | r_build_args: "--resave-data=best"
20 | r_check_args: "--as-cran"
21 |
22 | env:
23 | global:
24 | - DISPLAY=:99.0
25 | - BOOTSTRAP_LATEX=1
26 |
27 |
28 |
--------------------------------------------------------------------------------
/DESCRIPTION:
--------------------------------------------------------------------------------
1 | Package: entity
2 | Title: Named Entity Recognition
3 | Version: 0.1.0
4 | Authors@R: c(person("Tyler", "Rinker", email =
5 | "tyler.rinker@gmail.com", role = c("aut", "cre")))
6 | Maintainer: Tyler Rinker <tyler.rinker@gmail.com>
7 | Description: A wrapper to simplify and extend 'NLP' and 'openNLP' named
8 | entity recognition.
9 | Depends: R (>= 3.2.2)
10 | Imports: dplyr, ggplot2, NLP, openNLP, utils
11 | Suggests: testthat
12 | Date: 2017-04-10
13 | License: GPL-2
14 | LazyData: TRUE
15 | Roxygen: list(wrap = FALSE)
16 | Collate:
17 | 'annotators.R'
18 | 'named_entity.R'
19 | 'utils.R'
20 | 'date_entity.R'
21 | 'entity-package.R'
22 | 'get_counts.R'
23 | 'location_entity.R'
24 | 'money_entity.R'
25 | 'organization_entity.R'
26 | 'percent_entity.R'
27 | 'person_entity.R'
28 | RoxygenNote: 6.0.1
29 |
--------------------------------------------------------------------------------
/NAMESPACE:
--------------------------------------------------------------------------------
1 | # Generated by roxygen2: do not edit by hand
2 |
3 | S3method(plot,entity)
4 | S3method(print,entity)
5 | export(date_annotator)
6 | export(date_entity)
7 | export(location_annotator)
8 | export(location_entity)
9 | export(money_annotator)
10 | export(money_entity)
11 | export(named_entity)
12 | export(organization_annotator)
13 | export(organization_entity)
14 | export(percent_annotator)
15 | export(percent_entity)
16 | export(person_annotator)
17 | export(person_entity)
18 | export(word_annotator)
19 |
--------------------------------------------------------------------------------
/NEWS:
--------------------------------------------------------------------------------
1 | NEWS
2 | ====
3 |
4 | Versioning
5 | ----------
6 |
7 | Releases will be numbered with the following semantic versioning format:
8 |
9 | <major>.<minor>.<patch>
10 |
11 | And constructed with the following guidelines:
12 |
13 | * Breaking backward compatibility bumps the major (and resets the minor
14 | and patch)
15 | * New additions without breaking backward compatibility bumps the minor
16 | (and resets the patch)
17 | * Bug fixes and misc changes bumps the patch
18 |
19 |
20 | entity 0.0.1
21 | ----------------------------------------------------------------
22 |
23 | This package is...
24 |
--------------------------------------------------------------------------------
/R/annotators.R:
--------------------------------------------------------------------------------
1 | #' Annotators
2 | #'
3 | #' A wrapper for \code{\link[openNLP]{Maxent_Entity_Annotator}} and
4 | #' \code{\link[openNLP]{Maxent_Word_Token_Annotator}}.
5 | #'
6 | #' @return Returns an annotator for entities or words.
7 | #' @seealso \code{\link[openNLP]{Maxent_Entity_Annotator}},
8 | #' \code{\link[openNLP]{Maxent_Word_Token_Annotator}}
9 | #' @rdname annotators
10 | #' @export
11 | word_annotator <- function() {
12 |     check_models_package()  # stops unless `openNLPmodels.en` is (or gets) installed
13 |     openNLP::Maxent_Word_Token_Annotator()
14 | }
15 |
16 | #' @rdname annotators
17 | #' @export
18 | person_annotator <- function() {
19 |     check_models_package()
20 |     annotator <- openNLP::Maxent_Entity_Annotator(kind = "person")
21 |     attr(annotator, "type") <- "person"  # entity kind, read back by named_entity()
22 |     annotator
23 | }
24 |
25 | #' @rdname annotators
26 | #' @export
27 | location_annotator <- function() {
28 |     check_models_package()
29 |     annotator <- openNLP::Maxent_Entity_Annotator(kind = "location")
30 |     attr(annotator, "type") <- "location"  # entity kind, read back by named_entity()
31 |     annotator
32 | }
33 |
34 | #' @rdname annotators
35 | #' @export
36 | organization_annotator <- function() {
37 |     check_models_package()
38 |     annotator <- openNLP::Maxent_Entity_Annotator(kind = "organization")
39 |     attr(annotator, "type") <- "organization"  # entity kind, read back by named_entity()
40 |     annotator
41 | }
42 |
43 | #' @rdname annotators
44 | #' @export
45 | date_annotator <- function() {
46 |     check_models_package()
47 |     annotator <- openNLP::Maxent_Entity_Annotator(kind = "date")
48 |     attr(annotator, "type") <- "date"  # entity kind, read back by named_entity()
49 |     annotator
50 | }
51 |
52 | #' @rdname annotators
53 | #' @export
54 | money_annotator <- function() {
55 |     check_models_package()
56 |     annotator <- openNLP::Maxent_Entity_Annotator(kind = "money")
57 |     attr(annotator, "type") <- "money"  # entity kind, read back by named_entity()
58 |     annotator
59 | }
60 |
61 | #' @rdname annotators
62 | #' @export
63 | percent_annotator <- function() {
64 |     check_models_package()
65 |     annotator <- openNLP::Maxent_Entity_Annotator(kind = "percent")
66 |     attr(annotator, "type") <- "percent"  # entity kind, read back by named_entity()
67 |     annotator
68 | }
69 |
70 |
71 | ## Ensure `openNLPmodels.en` is installed, interactively offering to fetch it.
72 | ## Returns TRUE (invisibly) when the package is available; otherwise stops.
73 | check_models_package <- function(){
74 |     is_installed <- function() "openNLPmodels.en" %in% list.files(.libPaths())
75 |     if (is_installed()) return(invisible(TRUE))
76 |     message(paste0("Well it appears `openNLPmodels.en` is not installed.\n",
77 |         "This package is necessary in order to use the `entity` package.\n\nWould you like me to try and fetch it?"))
78 |     ## `menu()` returns 1 for "Yes", 2 for "No" and 0 when the user cancels;
79 |     ## only an explicit "Yes" may trigger an install.
80 |     ans <- utils::menu(c("Yes", "No"))
81 |     if (ans != 1L) {
82 |         stop("Named entity extraction aborted. Please install `openNLPmodels.en`")
83 |     }
84 |     message("Attempting to install `openNLPmodels.en`.")
85 |     utils::install.packages(
86 |         "http://datacube.wu.ac.at/src/contrib/openNLPmodels.en_1.5-1.tar.gz",
87 |         repos = NULL,
88 |         type = "source"
89 |     )
90 |     if (!is_installed()) {
91 |         stop("Failed to install `openNLPmodels.en`. Please install `openNLPmodels.en` manually.")
92 |     }
93 |     invisible(TRUE)
94 | }
95 |
--------------------------------------------------------------------------------
/R/date_entity.R:
--------------------------------------------------------------------------------
1 | #' Named Date Recognition
2 | #'
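3 | #' A wrapper for \pkg{NLP}/\pkg{openNLP}'s named date recognition annotation.
4 | #'
5 | #' @inheritParams named_entity
6 | #' @return Returns a list of class \code{entity} with one element per element of \code{text.var}: a character vector of the entities found in that element, or \code{NULL} when none were found.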
7 | #' @keywords date
8 | #' @export
9 | #' @include utils.R named_entity.R
10 | #' @family variable functions
11 | #' @examples
12 | #' \dontrun{
13 | #' data(presidential_debates_2012)
14 | #'
15 | #' dates <- date_entity(presidential_debates_2012$dialogue)
16 | #' unlist(dates)
17 | #'
18 | #' library(dplyr)
19 | #' presidential_debates_2012$dates <- date_entity(presidential_debates_2012$dialogue)
20 | #'
21 | #' presidential_debates_2012 %>%
22 | #' {.[!sapply(.$dates, is.null), ]} %>%
23 | #' rowwise() %>%
24 | #' mutate(dates = paste(dates, collapse=", ")) %>%
25 | #' select(person, time, dates)
26 | #'
27 | #' library(tidyr)
28 | #' presidential_debates_2012 %>%
29 | #' {.[!sapply(.$dates, is.null), ]} %>%
30 | #' unnest() %>%
31 | #' select(person, time, dates)
32 | #' }
33 | date_entity <- hijack(
34 |     FUN = named_entity, entity.annotator = 'date_annotator'
35 | )
36 |
37 |
--------------------------------------------------------------------------------
/R/entity-package.R:
--------------------------------------------------------------------------------
1 | #' Named Entity Extraction
2 | #'
3 | #' A wrapper for \pkg{NLP} and \pkg{openNLP} to facilitate named entity extraction.
4 | #' @docType package
5 | #' @name entity
6 | #' @aliases entity package-entity
7 | NULL
8 |
9 |
10 | #' 2012 U.S. Presidential Debates
11 | #'
12 | #' A dataset containing a cleaned version of all three presidential debates for
13 | #' the 2012 election.
14 | #'
15 | #' @details
16 | #' \itemize{
17 | #' \item person. The speaker
18 | #' \item tot. Turn of talk
19 | #' \item dialogue. The words spoken
20 | #' \item time. Variable indicating which of the three debates the dialogue is from
21 | #' }
22 | #'
23 | #' @docType data
24 | #' @keywords datasets
25 | #' @name presidential_debates_2012
26 | #' @usage data(presidential_debates_2012)
27 | #' @format A data frame with 2912 rows and 4 variables
28 | NULL
29 |
30 |
31 | #' Bell Labs Wikipedia Article
32 | #'
33 | #' A dataset containing a character vector of an excerpt from Wikipedia about
34 | #' Bell Labs with an extra final sentence to include percent and money when
35 | #' extracting entities.
36 | #'
37 | #' @docType data
38 | #' @keywords datasets
39 | #' @name wiki
40 | #' @usage data(wiki)
41 | #' @format A character vector with 7 elements
42 | #' @references \url{https://en.wikipedia.org/wiki/Bell_Labs}
43 | NULL
44 |
--------------------------------------------------------------------------------
/R/get_counts.R:
--------------------------------------------------------------------------------
1 | ## Tabulate how often each entity occurs across all elements of `x`.
2 | ## Returns a tibble with `entity` (factor) and `frequency` columns, ordered by
3 | ## decreasing frequency, or by entity name when `alphabetical = TRUE`.
4 | get_counts <- function(x, alphabetical = FALSE, ...){
5 |     counts <- sort(table(unlist(x)), decreasing = TRUE)
6 |     x <- data.frame(
7 |         entity = names(counts),
8 |         frequency = as.vector(counts),
9 |         stringsAsFactors = FALSE
10 |     )
11 |     if (isTRUE(alphabetical)) {
12 |         x <- x[order(x[["entity"]]), ]
13 |     }
14 |     ## levels are the row order reversed -- presumably so the first row ends up on top once plot.entity() flips the axes
15 |     x[["entity"]] <- factor(x[["entity"]], levels = rev(x[["entity"]]))
16 |     dplyr::as_tibble(x)  # `dplyr::tbl_df()` is deprecated
17 | }
18 |
19 |
20 |
--------------------------------------------------------------------------------
/R/location_entity.R:
--------------------------------------------------------------------------------
1 | #' Named Location Recognition
2 | #'
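3 | #' A wrapper for \pkg{NLP}/\pkg{openNLP}'s named location recognition annotation.
4 | #'
5 | #' @inheritParams named_entity
6 | #' @return Returns a list of class \code{entity} with one element per element of \code{text.var}: a character vector of the entities found in that element, or \code{NULL} when none were found.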
7 | #' @keywords location
8 | #' @export
9 | #' @include utils.R named_entity.R
10 | #' @family variable functions
11 | #' @examples
12 | #' \dontrun{
13 | #' data(presidential_debates_2012)
14 | #'
15 | #' locales <- location_entity(presidential_debates_2012$dialogue)
16 | #' unlist(locales)
17 | #'
18 | #' library(dplyr)
19 | #' presidential_debates_2012$locations <- location_entity(presidential_debates_2012$dialogue)
20 | #'
21 | #' presidential_debates_2012 %>%
22 | #' {.[!sapply(.$locations, is.null), ]} %>%
23 | #' rowwise() %>%
24 | #' mutate(locations = paste(locations, collapse=", ")) %>%
25 | #' select(person, time, locations)
26 | #'
27 | #' library(tidyr)
28 | #' presidential_debates_2012 %>%
29 | #' {.[!sapply(.$locations, is.null), ]} %>%
30 | #' unnest() %>%
31 | #' select(person, time, locations)
32 | #' }
33 | location_entity <- hijack(
34 |     FUN = named_entity, entity.annotator = 'location_annotator'
35 | )
36 |
37 |
--------------------------------------------------------------------------------
/R/money_entity.R:
--------------------------------------------------------------------------------
1 | #' Named Money Recognition
2 | #'
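3 | #' A wrapper for \pkg{NLP}/\pkg{openNLP}'s named money recognition annotation.
4 | #'
5 | #' @inheritParams named_entity
6 | #' @return Returns a list of class \code{entity} with one element per element of \code{text.var}: a character vector of the entities found in that element, or \code{NULL} when none were found.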
7 | #' @keywords money
8 | #' @export
9 | #' @include utils.R named_entity.R
10 | #' @family variable functions
11 | #' @examples
12 | #' \dontrun{
13 | #' data(presidential_debates_2012)
14 | #'
15 | #' monies <- money_entity(presidential_debates_2012$dialogue)
16 | #' unlist(monies)
17 | #'
18 | #' library(dplyr)
19 | #' presidential_debates_2012$monies <- money_entity(presidential_debates_2012$dialogue)
20 | #'
21 | #' presidential_debates_2012 %>%
22 | #' {.[!sapply(.$monies, is.null), ]} %>%
23 | #' rowwise() %>%
24 | #' mutate(monies = paste(monies, collapse=", ")) %>%
25 | #' select(person, time, monies)
26 | #'
27 | #' library(tidyr)
28 | #' presidential_debates_2012 %>%
29 | #' {.[!sapply(.$monies, is.null), ]} %>%
30 | #' unnest() %>%
31 | #' select(person, time, monies)
32 | #' }
33 | money_entity <- hijack(
34 |     FUN = named_entity, entity.annotator = 'money_annotator'
35 | )
36 |
37 |
--------------------------------------------------------------------------------
/R/named_entity.R:
--------------------------------------------------------------------------------
1 | #' Named Entity Recognition
2 | #'
3 | #' A wrapper for \pkg{NLP}/\pkg{openNLP}'s named entity recognition annotation
4 | #' tools.
5 | #'
6 | #' @param text.var The text string variable.
7 | #' @param entity.annotator A character vector identifying an entity recognition
8 | #' annotator (\code{c("person_annotator", "location_annotator", "organization_annotator",
9 | #' "date_annotator", "money_annotator", "percent_annotator")}). See \code{?annotators}.
10 | #' @param word.annotator A word annotator.
11 | #' @param element.chunks The number of elements to include in a chunk. Chunks are
12 | #' passed through an \code{\link[base]{lapply}} and size is kept within a tolerance
13 | #' because of memory allocation in the tagging process with \pkg{Java}.
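14 | #' @return Returns a list of class \code{entity} with one element per element of \code{text.var}: a character vector of the entities found in that element, or \code{NULL} when none were found.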
15 | #' @keywords ner named entity
16 | #' @export
17 | #' @seealso \code{\link[openNLP]{Maxent_Entity_Annotator}}
18 | #' @examples
19 | #' \dontrun{
20 | #' data(presidential_debates_2012)
21 | #'
22 | #' peoples <- named_entity(presidential_debates_2012$dialogue, 'person_annotator')
23 | #' unlist(peoples)
24 | #' plot(peoples)
25 | #'
26 | #' orgs <-named_entity(presidential_debates_2012$dialogue, 'organization_annotator')
27 | #' unlist(orgs)
28 | #'
29 | #' dates <-named_entity(presidential_debates_2012$dialogue, 'date_annotator')
30 | #' unlist(dates)
31 | #'
32 | #' library(dplyr)
33 | #' presidential_debates_2012$organizations <- named_entity(
34 | #' presidential_debates_2012$dialogue,
35 | #' 'organization_annotator'
36 | #' )
37 | #'
38 | #' presidential_debates_2012 %>%
39 | #' {.[!sapply(.$organizations, is.null), ]} %>%
40 | #' rowwise() %>%
41 | #' mutate(organizations = paste(organizations, collapse=", ")) %>%
42 | #' select(person, time, organizations)
43 | #'
44 | #' library(tidyr)
45 | #' presidential_debates_2012 %>%
46 | #' {.[!sapply(.$organizations, is.null), ]} %>%
47 | #' unnest() %>%
48 | #' select(person, time, organizations)
49 | #' }
50 | named_entity <- function(text.var, entity.annotator, word.annotator = word_annotator(),
51 |     element.chunks = floor(2000 * (23.5/mean(sapply(text.var, nchar), na.rm = TRUE)))){
52 |
53 |     len <- length(text.var)
54 |
55 |     ## locate empty or missing text elements
56 |     nas <- sort(union(which(is.na(text.var)), grep("^\\s*$", text.var)))
57 |
58 |     ## Get annotator: map the annotator's name to a freshly built annotator
59 |     entity.annotator <- switch(entity.annotator,
60 |         person_annotator = person_annotator(),
61 |         location_annotator = location_annotator(),
62 |         organization_annotator = organization_annotator(),
63 |         date_annotator = date_annotator(),
64 |         money_annotator = money_annotator(),
65 |         percent_annotator = percent_annotator(),
66 |         stop("`entity.annotator` does not appear to be an annotator. See `?annotators`.")
67 |     )
68 |
69 |     ## replace empty text with a period
70 |     if(length(nas) > 0){
71 |         text.var[nas] <- "."
72 |     }
73 |
74 |     ## guard: long elements can floor the default chunk size to 0 (division by zero below)
75 |     element.chunks <- max(1, element.chunks)
76 |
77 |     ## Chunking the text into memory sized chunks:
78 |     ## calculate the start/end indexes of the chunks
79 |     ends <- c(utils::tail(seq(0, by = element.chunks,
80 |         length.out = ceiling(len/element.chunks)), -1), len)
81 |     starts <- c(1, utils::head(ends + 1 , -1))
82 |
83 |     ## chunk the text
84 |     text_list <- Map(function(s, e) {text.var[s:e]}, starts, ends)
85 |
86 |     ## loop through the chunks and tag them
87 |     out <- lapply(text_list, function(x){
88 |         x <- entify(x, entity.annotator, word.annotator)
89 |         gc()
90 |         x
91 |     })
92 |
93 |     lens <- lengths(text_list)
94 |     ## rebuild one list slot per input element; elements without entities stay NULL
95 |     out <- unlist(lapply(seq_along(out), function(i){
96 |         vectout <- vector(mode = "list", length = lens[i])
97 |         if (is.null(out[[i]][["entities"]])) return(vectout)
98 |         if (length(out[[i]][["entities"]]) == 1){
99 |             splits <- out[[i]][["entities"]]
100 |         } else {
101 |             splits <- split(out[[i]][["entities"]], out[[i]][["locations"]])
102 |         }
103 |         ## split() returns its groups in ascending key order, so index the same way
104 |         vectout[sort(unique(out[[i]][["locations"]]))] <- splits
105 |         vectout
106 |     }), recursive = FALSE)
107 |     class(out) <- c("entity", class(out))
108 |     attributes(out)[["type"]] <- attributes(entity.annotator)[["type"]]
109 |     out
110 | }
110 |
111 |
112 | entify <- function(text.var, ANN, WTA, ...) {
113 | ## Tag one chunk of text; `ANN` is the entity annotator, `WTA` the word annotator.
114 | text.var <- gsub("^\\s+|\\s+$", "", text.var)
115 | s <- NLP::as.String(paste(text.var, collapse=""))
116 | ## The trimmed elements are glued together without a separator, so element i
117 | ## spans characters starts[i]..ends[i] of `s`; compute those spans via nchar
118 | lens <- sapply(text.var, nchar)
119 | ends <- cumsum(lens)
120 | starts <- c(1, utils::head(ends + 1, -1))
121 | ## Register every element as a "sentence", then add word and entity annotations
122 | a2 <- NLP::Annotation(seq_along(starts), rep("sentence", length(starts)), starts, ends)
123 | a2 <- NLP::annotate(s, WTA, a2)
124 | a3 <- NLP::annotate(s, ANN, a2)
125 |
126 | ## Keep only the entity annotations; return early when none were found.
127 | ents <- a3$type == "entity"
128 | if (all(!ents)) return(list(locations = NULL, entities = NULL))
129 | a3wb <- a3w <- a3[ents]
130 | ## Start/end offsets of the sentence annotations
131 | a3s <- a3[a3$type == "sentence"]
132 | starts <- as.data.frame(a3s)[, "start"]
133 | ends <- as.data.frame(a3s)[, "end"]
134 | ## Widen each entity span to its enclosing sentence (nearest start at/before, nearest end at/after)
135 | a3w$start <- sapply(as.data.frame(a3w)[, "start"], function(x) {
136 | max(starts[starts <= x])
137 | })
138 | a3w$end <- sapply(as.data.frame(a3w)[, "end"], function(x) {
139 | min(ends[ends >= x])
140 | })
141 | ## locations: index of the sentence holding each entity; entities: the text of `s` under each entity span
142 | list(
143 | locations = match(a3w$start, starts),
144 | entities = s[a3wb]
145 | )
146 | }
147 |
148 | #' Prints an entity Object
149 | #'
150 | #' Prints an entity object
151 | #'
152 | #' @param x An \code{entity} object.
153 | #' @param \ldots ignored.
154 | #' @method print entity
155 | #' @export
156 | print.entity <- function(x, ...) {
157 |     ## drop every attribute (class, type) so the object prints as a bare list
158 |     attributes(x) <- NULL
159 |     print(x)
160 | }
161 |
162 | #' Plots an entity Object
163 | #'
164 | #' Plots an entity object
165 | #'
166 | #' @param x An \code{entity} object.
167 | #' @param min Minimum frequency of included entities.
168 | #' @param alphabetical logical. Should rows be arranged alphabetically by entity
169 | #' or by frequency.
170 | #' @param \ldots ignored.
171 | #' @method plot entity
172 | #' @export
173 | plot.entity <- function(x, min = 1, alphabetical = FALSE, ...) {
174 |     stopifnot(min > 0)
175 |     ## axis title: the annotator type with its first letter upper-cased
176 |     entname <- attributes(x)[["type"]]
177 |     substring(entname, 1, 1) <- toupper(substring(entname, 1, 1))
178 |
179 |     ## entity frequencies, keeping only those occurring at least `min` times
180 |     counts <- get_counts(x, alphabetical = alphabetical)
181 |     counts <- counts[counts[["frequency"]] >= min, ]
182 |     upper <- 1.01 * max(counts[["frequency"]])
183 |
184 |     bare_theme <- ggplot2::theme(
185 |         panel.grid.major.y = ggplot2::element_blank(),
186 |         legend.title = ggplot2::element_blank(),
187 |         panel.border = ggplot2::element_blank(),
188 |         axis.line = ggplot2::element_line(color="grey70")
189 |     )
190 |     ggplot2::ggplot(counts, ggplot2::aes_string(x='entity', weight='frequency')) +
191 |         ggplot2::geom_bar() +
192 |         ggplot2::coord_flip() +
193 |         ggplot2::ylab("Count") +
194 |         ggplot2::xlab(entname) +
195 |         ggplot2::scale_y_continuous(expand = c(0, 0), limits = c(0, upper)) +
196 |         ggplot2::theme_bw() + bare_theme
197 | }
198 |
199 |
200 |
201 |
--------------------------------------------------------------------------------
/R/organization_entity.R:
--------------------------------------------------------------------------------
1 | #' Named Organization Recognition
2 | #'
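3 | #' A wrapper for \pkg{NLP}/\pkg{openNLP}'s named organization recognition annotation.
4 | #'
5 | #' @inheritParams named_entity
6 | #' @return Returns a list of class \code{entity} with one element per element of \code{text.var}: a character vector of the entities found in that element, or \code{NULL} when none were found.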
7 | #' @keywords organization
8 | #' @export
9 | #' @include utils.R named_entity.R
10 | #' @family variable functions
11 | #' @examples
12 | #' \dontrun{
13 | #' data(presidential_debates_2012)
14 | #'
15 | #' orgs <- organization_entity(presidential_debates_2012$dialogue)
16 | #' unlist(orgs)
17 | #'
18 | #' library(dplyr)
19 | #' presidential_debates_2012$organizations <- organization_entity(presidential_debates_2012$dialogue)
20 | #'
21 | #' presidential_debates_2012 %>%
22 | #' {.[!sapply(.$organizations, is.null), ]} %>%
23 | #' rowwise() %>%
24 | #' mutate(organizations = paste(organizations, collapse=", ")) %>%
25 | #' select(person, time, organizations)
26 | #'
27 | #' library(tidyr)
28 | #' presidential_debates_2012 %>%
29 | #' {.[!sapply(.$organizations, is.null), ]} %>%
30 | #' unnest() %>%
31 | #' select(person, time, organizations)
32 | #' }
33 | organization_entity <- hijack(
34 |     FUN = named_entity, entity.annotator = 'organization_annotator'
35 | )
36 |
37 |
--------------------------------------------------------------------------------
/R/percent_entity.R:
--------------------------------------------------------------------------------
1 | #' Named Percent Recognition
2 | #'
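3 | #' A wrapper for \pkg{NLP}/\pkg{openNLP}'s named percent recognition annotation.
4 | #'
5 | #' @inheritParams named_entity
6 | #' @return Returns a list of class \code{entity} with one element per element of \code{text.var}: a character vector of the entities found in that element, or \code{NULL} when none were found.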
7 | #' @keywords percent
8 | #' @export
9 | #' @include utils.R named_entity.R
10 | #' @family variable functions
11 | #' @examples
12 | #' \dontrun{
13 | #' data(presidential_debates_2012)
14 | #'
15 | #' percents <- percent_entity(presidential_debates_2012$dialogue)
16 | #' unlist(percents)
17 | #'
18 | #' library(dplyr)
19 | #' presidential_debates_2012$percents <- percent_entity(presidential_debates_2012$dialogue)
20 | #'
21 | #' presidential_debates_2012 %>%
22 | #' {.[!sapply(.$percents, is.null), ]} %>%
23 | #' rowwise() %>%
24 | #' mutate(percents = paste(percents, collapse=", ")) %>%
25 | #' select(person, time, percents)
26 | #'
27 | #' library(tidyr)
28 | #' presidential_debates_2012 %>%
29 | #' {.[!sapply(.$percents, is.null), ]} %>%
30 | #' unnest() %>%
31 | #' select(person, time, percents)
32 | #' }
33 | percent_entity <- hijack(
34 |     FUN = named_entity, entity.annotator = 'percent_annotator'
35 | )
36 |
37 |
--------------------------------------------------------------------------------
/R/person_entity.R:
--------------------------------------------------------------------------------
1 | #' Named Person Recognition
2 | #'
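3 | #' A wrapper for \pkg{NLP}/\pkg{openNLP}'s named person recognition annotation.
4 | #'
5 | #' @inheritParams named_entity
6 | #' @return Returns a list of class \code{entity} with one element per element of \code{text.var}: a character vector of the entities found in that element, or \code{NULL} when none were found.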
7 | #' @keywords person people
8 | #' @export
9 | #' @include utils.R named_entity.R
10 | #' @family variable functions
11 | #' @examples
12 | #' \dontrun{
13 | #' data(presidential_debates_2012)
14 | #'
15 | #' peoples <- person_entity(presidential_debates_2012$dialogue)
16 | #' unlist(peoples)
17 | #'
18 | #' library(dplyr)
19 | #' presidential_debates_2012$persons <- person_entity(presidential_debates_2012$dialogue)
20 | #'
21 | #' presidential_debates_2012 %>%
22 | #' {.[!sapply(.$persons, is.null), ]} %>%
23 | #' rowwise() %>%
24 | #' mutate(persons = paste(persons, collapse=", ")) %>%
25 | #' select(person, time, persons)
26 | #'
27 | #' library(tidyr)
28 | #' presidential_debates_2012 %>%
29 | #' {.[!sapply(.$persons, is.null), ]} %>%
30 | #' unnest() %>%
31 | #' select(person, time, persons)
32 | #' }
33 | person_entity <- hijack(
34 |     FUN = named_entity, entity.annotator = 'person_annotator'
35 | )
36 |
37 |
38 |
--------------------------------------------------------------------------------
/R/utils.R:
--------------------------------------------------------------------------------
1 | ## Hijack a function: return a copy of `FUN` whose default argument values
2 | ## are overridden by `...` (see: http://stackoverflow.com/a/25366322/1000343)
3 | hijack <- function(FUN, ...){
4 |
5 |     hijacked <- FUN
6 |     overrides <- list(...)
7 |
8 |     ## rewrite one formal at a time, keyed by the name of each override
9 |     for (i in seq_along(overrides)) {
10 |         formals(hijacked)[[names(overrides)[i]]] <- overrides[[i]]
11 |     }
12 |     hijacked
13 | }
14 |
--------------------------------------------------------------------------------
/README.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "entity"
3 | date: "`r format(Sys.time(), '%d %B, %Y')`"
4 | output:
5 | md_document:
6 | toc: true
7 | ---
8 |
9 | ```{r, echo=FALSE}
10 | desc <- suppressWarnings(readLines("DESCRIPTION"))
11 | regex <- "(^Version:\\s+)(\\d+\\.\\d+\\.\\d+)"
12 | loc <- grep(regex, desc)
13 | ver <- gsub(regex, "\\2", desc[loc])
14 | verbadge <- sprintf('
', ver, ver)
15 | ````
16 |
17 | [](http://www.repostatus.org/#active)
18 | [](https://travis-ci.org/trinker/entity)
19 | [](https://coveralls.io/r/trinker/entity?branch=master)
20 | `r verbadge`
21 |
22 | ```{r, echo=FALSE, message=FALSE}
23 | library(knitr)
24 | knit_hooks$set(htmlcap = function(before, options, envir) {
25 | if(!before) {
26 | paste('',options$htmlcap,"
",sep="")
27 | }
28 | })
29 | knitr::opts_knit$set(self.contained = TRUE, cache = FALSE)
30 | knitr::opts_chunk$set(fig.path = "tools/figure/")
31 | ```
32 |
33 | 
34 |
35 |
36 | **entity** is a wrapper to simplify and extend [**NLP**](https://cran.r-project.org/web/packages/NLP/index.html) and [**openNLP**](https://cran.r-project.org/web/packages/openNLP/index.html) named entity recognition. The package contains 6 entity extractors that take a text vector and return a list of vectors of named entities. The entity extractors include:
37 |
38 | 1. `person_entity`
39 | 2. `location_entity`
40 | 3. `organization_entity`
41 | 4. `date_entity`
42 | 5. `money_entity`
43 | 6. `percent_entity`
44 |
45 | # Installation
46 |
47 | To download the development version of **entity**:
48 |
49 | Download the [zip ball](https://github.com/trinker/entity/zipball/master) or [tar ball](https://github.com/trinker/entity/tarball/master), decompress and run `R CMD INSTALL` on it, or use the **pacman** package to install the development version:
50 |
51 | ```r
52 | if (!require("pacman")) install.packages("pacman")
53 | pacman::p_load_gh("trinker/entity")
54 | ```
55 |
56 | # Contact
57 |
58 | You are welcome to:
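59 | * submit suggestions and bug-reports at: <https://github.com/trinker/entity/issues>
60 | * send a pull request on: <https://github.com/trinker/entity/>
61 | * compose a friendly e-mail to: <tyler.rinker@gmail.com>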
62 |
63 | # Examples
64 |
65 | The following examples demonstrate some of the functionality of **entity**.
66 |
67 | ## Load the Package/Data
68 |
69 | ```{r, message=FALSE}
70 | library(entity)
71 | ```
72 |
73 | I will demonstrate the 6 annotators on this [Wikipedia excerpt](https://en.wikipedia.org/wiki/Bell_Labs) about Bell Labs (plus one non Wikipedia line at the end).
74 |
75 | ```{r}
76 | data(wiki)
77 | wiki
78 | ```
79 |
80 | ## Entity Extractors
81 |
82 | ### Person Entities
83 |
84 | ```{r}
85 | person_entity(wiki)
86 | ```
87 |
88 | ### Location Entities
89 |
90 | ```{r}
91 | location_entity(wiki)
92 | ```
93 |
94 | ### Organization Entities
95 |
96 | ```{r}
97 | organization_entity(wiki)
98 | ```
99 |
100 | ### Date Entities
101 |
102 | ```{r}
103 | date_entity(wiki)
104 | ```
105 |
106 | ### Money Entities
107 |
108 | ```{r}
109 | money_entity(wiki)
110 | ```
111 |
112 | ### Percent Entities
113 |
114 | ```{r}
115 | percent_entity(wiki)
116 | ```
117 |
118 |
119 | ## Plotting
120 |
121 | ```{r, fig.height = 7}
122 | organizations <- organization_entity(presidential_debates_2012$dialogue)
123 | plot(organizations)
124 | ```
125 |
126 | You can include only entities above a minimum frequency (`min = n`) as shown below:
127 |
128 | ```{r, fig.height}
129 | plot(organizations, min = 2)
130 | ```
131 |
132 | The user may wish to view the entities alphabetically rather than by frequency. Use `alphabetical = TRUE` to accomplish this:
133 |
134 | ```{r, fig.height = 7}
135 | plot(organizations, alphabetical = TRUE)
136 | ```
137 |
138 |
139 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | entity [](https://twitter.com/intent/follow?screen_name=tylerrinker)
2 | ============
3 |
4 |
5 | [](http://www.repostatus.org/#active)
8 | [](https://travis-ci.org/trinker/entity)
10 | [](https://coveralls.io/r/trinker/entity?branch=master)
12 |
13 |
14 |
15 | 
16 |
17 | **entity** is a wrapper to simplify and extend
18 | [**NLP**](https://cran.r-project.org/web/packages/NLP/index.html) and
19 | [**openNLP**](https://cran.r-project.org/web/packages/openNLP/index.html)
20 | named entity recognition. The package contains 6 entity extractors that
21 | take a text vector and return a list of vectors of named entities. The
22 | entity extractors include:
23 |
24 | 1. `person_entity`
25 | 2. `location_entity`
26 | 3. `organization_entity`
27 | 4. `date_entity`
28 | 5. `money_entity`
29 | 6. `percent_entity`
30 |
31 |
32 | Table of Contents
33 | ============
34 |
35 | - [Installation](#installation)
36 | - [Contact](#contact)
37 | - [Examples](#examples)
38 | - [Load the Package/Data](#load-the-packagedata)
39 | - [Entity Extractors](#entity-extractors)
40 | - [Person Entities](#person-entities)
41 | - [Location Entities](#location-entities)
42 | - [Organization Entities](#organization-entities)
43 | - [Date Entities](#date-entities)
44 | - [Money Entities](#money-entities)
45 | - [Percent Entities](#percent-entities)
46 | - [Plotting](#plotting)
47 |
48 | Installation
49 | ============
50 |
51 |
52 | To download the development version of **entity**:
53 |
54 | Download the [zip
55 | ball](https://github.com/trinker/entity/zipball/master) or [tar
56 | ball](https://github.com/trinker/entity/tarball/master), decompress and
57 | run `R CMD INSTALL` on it, or use the **pacman** package to install the
58 | development version:
59 |
60 | if (!require("pacman")) install.packages("pacman")
61 | pacman::p_load_gh("trinker/entity")
62 |
63 | Contact
64 | =======
65 |
66 | You are welcome to:
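67 | - submit suggestions and bug-reports at: <https://github.com/trinker/entity/issues>
68 | - send a pull request on: <https://github.com/trinker/entity/>
69 | - compose a friendly e-mail to: <tyler.rinker@gmail.com>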
70 |
71 | Examples
72 | ========
73 |
74 | The following examples demonstrate some of the functionality of
75 | **entity**.
76 |
77 | Load the Package/Data
78 | ---------------------
79 |
80 | library(entity)
81 |
82 | I will demonstrate the 6 annotators on this [Wikipedia
83 | excerpt](https://en.wikipedia.org/wiki/Bell_Labs) about Bell Labs (plus
84 | one non-Wikipedia line at the end).
85 |
86 | data(wiki)
87 | wiki
88 |
89 | ## [1] "Bell Laboratories (also known as Bell Labs and formerly known as AT&T Bell Laboratories and Bell Telephone Laboratories) is a research and scientific development company that belongs to Alcatel-Lucent."
90 | ## [2] "Its headquarters are located in Murray Hill, New Jersey, in addition to other laboratories around the rest of the United States and in other countries."
91 | ## [3] "The historic laboratory originated in the late 19th century as the Volta Laboratory and Bureau created by Alexander Graham Bell."
92 | ## [4] "Bell Labs was also at one time a division of the American Telephone & Telegraph Company (AT&T Corporation), half-owned through its Western Electric manufacturing subsidiary."
93 | ## [5] "Researchers working at Bell Labs are credited with the development of radio astronomy, the transistor, the laser, the charge-coupled device (CCD), information theory, the UNIX operating system, the C programming language, S programming language and the C++ programming language."
94 | ## [6] "Eight Nobel Prizes have been awarded for work completed at Bell Laboratories."
95 | ## [7] "And an extra line not from Wikipedia worth 2 cents or .001% of 1 percent."
96 |
97 | Entity Extractors
98 | -----------------
99 |
100 | ### Person Entities
101 |
102 | person_entity(wiki)
103 |
104 | ## [[1]]
105 | ## NULL
106 | ##
107 | ## [[2]]
108 | ## NULL
109 | ##
110 | ## [[3]]
111 | ## [1] "Alexander Graham Bell"
112 | ##
113 | ## [[4]]
114 | ## NULL
115 | ##
116 | ## [[5]]
117 | ## NULL
118 | ##
119 | ## [[6]]
120 | ## NULL
121 | ##
122 | ## [[7]]
123 | ## NULL
124 |
125 | ### Location Entities
126 |
127 | location_entity(wiki)
128 |
129 | ## [[1]]
130 | ## NULL
131 | ##
132 | ## [[2]]
133 | ## [1] "Murray Hill" "New Jersey" "United States"
134 | ##
135 | ## [[3]]
136 | ## NULL
137 | ##
138 | ## [[4]]
139 | ## [1] "Telegraph"
140 | ##
141 | ## [[5]]
142 | ## NULL
143 | ##
144 | ## [[6]]
145 | ## NULL
146 | ##
147 | ## [[7]]
148 | ## NULL
149 |
150 | ### Organization Entities
151 |
152 | organization_entity(wiki)
153 |
154 | ## [[1]]
155 | ## [1] "Bell Laboratories" "Bell Labs"
156 | ## [3] "Bell Laboratories" "Bell Telephone Laboratories"
157 | ##
158 | ## [[2]]
159 | ## NULL
160 | ##
161 | ## [[3]]
162 | ## [1] "Volta Laboratory" "Alexander Graham Bell"
163 | ##
164 | ## [[4]]
165 | ## [1] "Bell Labs"
166 | ## [2] "American Telephone & Telegraph Company"
167 | ## [3] "AT&T Corporation"
168 | ## [4] "Western Electric"
169 | ##
170 | ## [[5]]
171 | ## [1] "Bell Labs"
172 | ##
173 | ## [[6]]
174 | ## [1] "Bell Laboratories"
175 | ##
176 | ## [[7]]
177 | ## NULL
178 |
179 | ### Date Entities
180 |
181 | date_entity(wiki)
182 |
183 | ## [[1]]
184 | ## NULL
185 | ##
186 | ## [[2]]
187 | ## NULL
188 | ##
189 | ## [[3]]
190 | ## [1] "late 19th century"
191 | ##
192 | ## [[4]]
193 | ## NULL
194 | ##
195 | ## [[5]]
196 | ## NULL
197 | ##
198 | ## [[6]]
199 | ## NULL
200 | ##
201 | ## [[7]]
202 | ## NULL
203 |
204 | ### Money Entities
205 |
206 | money_entity(wiki)
207 |
208 | ## [[1]]
209 | ## NULL
210 | ##
211 | ## [[2]]
212 | ## NULL
213 | ##
214 | ## [[3]]
215 | ## NULL
216 | ##
217 | ## [[4]]
218 | ## NULL
219 | ##
220 | ## [[5]]
221 | ## NULL
222 | ##
223 | ## [[6]]
224 | ## NULL
225 | ##
226 | ## [[7]]
227 | ## [1] "2 cents"
228 |
229 | ### Percent Entities
230 |
231 | percent_entity(wiki)
232 |
233 | ## [[1]]
234 | ## NULL
235 | ##
236 | ## [[2]]
237 | ## NULL
238 | ##
239 | ## [[3]]
240 | ## NULL
241 | ##
242 | ## [[4]]
243 | ## NULL
244 | ##
245 | ## [[5]]
246 | ## NULL
247 | ##
248 | ## [[6]]
249 | ## NULL
250 | ##
251 | ## [[7]]
252 | ## [1] ".001%" "1 percent"
253 |
254 | Plotting
255 | --------
256 |
257 | organizations <- organization_entity(presidential_debates_2012$dialogue)
258 | plot(organizations)
259 |
260 | 
261 |
262 | You can include only entities above a minimum frequency (`min = n`) as
263 | shown below:
264 |
265 | plot(organizations, min = 2)
266 |
267 | 
268 |
269 | The user may wish to view the entities alphabetically rather than by
270 | frequency. Use `alphabetical = TRUE` to accomplish this:
271 |
272 | plot(organizations, alphabetical = TRUE)
273 |
274 | 
275 |
--------------------------------------------------------------------------------
/data/presidential_debates_2012.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/trinker/entity/5549d30f4f7daa91ecb99ea5dbc770260bfdbedd/data/presidential_debates_2012.rda
--------------------------------------------------------------------------------
/data/wiki.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/trinker/entity/5549d30f4f7daa91ecb99ea5dbc770260bfdbedd/data/wiki.rda
--------------------------------------------------------------------------------
/inst/CITATION:
--------------------------------------------------------------------------------
1 | citHeader("To cite entity in publications, please use:")  # header text for the package citation
2 |
3 |
4 | citEntry(entry = "manual",  # one citation entry of type "manual"
5 | title = "{entity}: Named entity recognition",
6 | author = "Tyler W. Rinker",
7 | organization = "University at Buffalo/SUNY",
8 | address = "Buffalo, New York",
9 | note = "version 0.1.0",  # version is hard-coded here and again in textVersion below
10 | year = "2015",
11 | url = "http://github.com/trinker/entity",
12 | textVersion = paste("Rinker, T. W. (2015).",  # plain-text form of the citation; paste() joins the pieces with spaces
13 | "entity: Named entity recognition",
14 | "version 0.1.0. University at Buffalo. Buffalo, New York.",
15 | "http://github.com/trinker/entity")
16 | )
17 |
--------------------------------------------------------------------------------
/inst/build.R:
--------------------------------------------------------------------------------
1 | root <- Sys.getenv("USERPROFILE")  # developer build script; NOTE(review): USERPROFILE is normally only set on Windows, root is "" when unset
2 | pack <- basename(getwd())  # package name is taken from the working directory's name
3 |
4 | quick <- TRUE  # passed to devtools::install(quick = ...)
5 | pdf <- TRUE  # when TRUE, also build the PDF manual below
6 |
7 | unlink(paste0(pack, ".pdf"), recursive = TRUE, force = TRUE)  # remove any existing <pack>.pdf from the working directory
8 | devtools::document()  # regenerate the package documentation
9 | devtools::install(quick = quick, build_vignettes = FALSE, dependencies = TRUE)  # install the package from the working directory
10 |
11 | if(pdf){
12 | path <- find.package(pack)  # location of the package just installed
13 | system(paste(shQuote(file.path(R.home("bin"), "R")), "CMD", "Rd2pdf", shQuote(path)))  # run R CMD Rd2pdf on that path
14 | file.copy(paste0(pack, '.pdf'), file.path(root,"Desktop", paste0(pack, '.pdf')))  # copy the PDF to <root>/Desktop; the result of file.copy is not checked
15 | while (file.exists(paste0(pack, ".pdf"))) {unlink(paste0(pack, ".pdf"), recursive = TRUE, force = TRUE)}  # retry deletion until the PDF is gone; NOTE(review): never terminates if unlink keeps failing
16 | empts <- grep("^\\.Rd", dir(all.files = TRUE), value = TRUE)  # entries in the working directory whose names start with ".Rd" (presumably Rd2pdf leftovers; confirm)
17 | unlink(empts, recursive = TRUE, force = TRUE)  # delete those entries
18 | }
19 |
20 | message("Done!")
21 |
--------------------------------------------------------------------------------
/inst/extra_statdoc/readme.R:
--------------------------------------------------------------------------------
1 | 
2 |
entity is a...
3 | Download the development version of entity here
4 |
--------------------------------------------------------------------------------
/inst/staticdocs/index.R:
--------------------------------------------------------------------------------
1 | library(staticdocs)  # staticdocs index definition for the package website
2 |
3 | sd_section("",  # section with an empty title
4 | "Function for...",  # NOTE(review): description looks like unfinished template text
5 | c(
6 | "myfun"  # NOTE(review): looks like a template placeholder; no myfun topic appears under man/
7 | )
8 | )
--------------------------------------------------------------------------------
/man/annotators.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/annotators.R
3 | \name{word_annotator}
4 | \alias{word_annotator}
5 | \alias{person_annotator}
6 | \alias{location_annotator}
7 | \alias{organization_annotator}
8 | \alias{date_annotator}
9 | \alias{money_annotator}
10 | \alias{percent_annotator}
11 | \title{Annotators}
12 | \usage{
13 | word_annotator()
14 |
15 | person_annotator()
16 |
17 | location_annotator()
18 |
19 | organization_annotator()
20 |
21 | date_annotator()
22 |
23 | money_annotator()
24 |
25 | percent_annotator()
26 | }
27 | \value{
28 | Returns an annotator for entities or words.
29 | }
30 | \description{
31 | A wrapper for \code{\link[openNLP]{Maxent_Entity_Annotator}} and
32 | \code{\link[openNLP]{Maxent_Word_Token_Annotator}}.
33 | }
34 | \seealso{
35 | \code{\link[openNLP]{Maxent_Entity_Annotator}},
36 | \code{\link[openNLP]{Maxent_Word_Token_Annotator}}
37 | }
38 |
--------------------------------------------------------------------------------
/man/date_entity.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/date_entity.R
3 | \name{date_entity}
4 | \alias{date_entity}
5 | \title{Named Date Recognition}
6 | \usage{
7 | date_entity(text.var, entity.annotator = "date_annotator",
8 | word.annotator = word_annotator(), element.chunks = floor(2000 *
9 | (23.5/mean(sapply(text.var, nchar), na.rm = TRUE))))
10 | }
11 | \arguments{
12 | \item{text.var}{The text string variable.}
13 |
14 | \item{entity.annotator}{A character vector identifying an entity recognition
15 | annotator (\code{c("person_annotator", "location_annotator", "organization_annotator",
16 | "date_annotator", "money_annotator", "percent_annotator")}). See \code{?annotators}.}
17 |
18 | \item{word.annotator}{A word annotator.}
19 |
20 | \item{element.chunks}{The number of elements to include in a chunk. Chunks are
21 | passed through an \code{\link[base]{lapply}} and size is kept within a tolerance
22 | because of memory allocation in the tagging process with \pkg{Java}.}
23 | }
24 | \value{
25 | Returns a list of character vectors of named entities, one element per element of \code{text.var} (\code{NULL} where none are found).
26 | }
27 | \description{
28 | A wrapper for \pkg{NLP}/\pkg{openNLP}'s named date recognition annotation.
29 | }
30 | \examples{
31 | \dontrun{
32 | data(presidential_debates_2012)
33 |
34 | dates <- date_entity(presidential_debates_2012$dialogue)
35 | unlist(dates)
36 |
37 | library(dplyr)
38 | presidential_debates_2012$dates <- date_entity(presidential_debates_2012$dialogue)
39 |
40 | presidential_debates_2012 \%>\%
41 | {.[!sapply(.$dates, is.null), ]} \%>\%
42 | rowwise() \%>\%
43 | mutate(dates = paste(dates, collapse=", ")) \%>\%
44 | select(person, time, dates)
45 |
46 | library(tidyr)
47 | presidential_debates_2012 \%>\%
48 | {.[!sapply(.$dates, is.null), ]} \%>\%
49 | unnest() \%>\%
50 | select(person, time, dates)
51 | }
52 | }
53 | \seealso{
54 | Other variable functions: \code{\link{location_entity}},
55 | \code{\link{money_entity}},
56 | \code{\link{organization_entity}},
57 | \code{\link{percent_entity}}, \code{\link{person_entity}}
58 | }
59 | \keyword{date}
60 |
--------------------------------------------------------------------------------
/man/entity.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/entity-package.R
3 | \docType{package}
4 | \name{entity}
5 | \alias{entity}
6 | \alias{package-entity}
7 | \alias{entity-package}
8 | \title{Named Entity Extraction}
9 | \description{
10 | A wrapper for \pkg{NLP} and \pkg{openNLP} to facilitate named entity extraction.
11 | }
12 |
--------------------------------------------------------------------------------
/man/location_entity.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/location_entity.R
3 | \name{location_entity}
4 | \alias{location_entity}
5 | \title{Named Location Recognition}
6 | \usage{
7 | location_entity(text.var, entity.annotator = "location_annotator",
8 | word.annotator = word_annotator(), element.chunks = floor(2000 *
9 | (23.5/mean(sapply(text.var, nchar), na.rm = TRUE))))
10 | }
11 | \arguments{
12 | \item{text.var}{The text string variable.}
13 |
14 | \item{entity.annotator}{A character vector identifying an entity recognition
15 | annotator (\code{c("person_annotator", "location_annotator", "organization_annotator",
16 | "date_annotator", "money_annotator", "percent_annotator")}). See \code{?annotators}.}
17 |
18 | \item{word.annotator}{A word annotator.}
19 |
20 | \item{element.chunks}{The number of elements to include in a chunk. Chunks are
21 | passed through an \code{\link[base]{lapply}} and size is kept within a tolerance
22 | because of memory allocation in the tagging process with \pkg{Java}.}
23 | }
24 | \value{
25 | Returns a list of character vectors of named entities, one element per element of \code{text.var} (\code{NULL} where none are found).
26 | }
27 | \description{
28 | A wrapper for \pkg{NLP}/\pkg{openNLP}'s named location recognition annotation.
29 | }
30 | \examples{
31 | \dontrun{
32 | data(presidential_debates_2012)
33 |
34 | locales <- location_entity(presidential_debates_2012$dialogue)
35 | unlist(locales)
36 |
37 | library(dplyr)
38 | presidential_debates_2012$locations <- location_entity(presidential_debates_2012$dialogue)
39 |
40 | presidential_debates_2012 \%>\%
41 | {.[!sapply(.$locations, is.null), ]} \%>\%
42 | rowwise() \%>\%
43 | mutate(locations = paste(locations, collapse=", ")) \%>\%
44 | select(person, time, locations)
45 |
46 | library(tidyr)
47 | presidential_debates_2012 \%>\%
48 | {.[!sapply(.$locations, is.null), ]} \%>\%
49 | unnest() \%>\%
50 | select(person, time, locations)
51 | }
52 | }
53 | \seealso{
54 | Other variable functions: \code{\link{date_entity}},
55 | \code{\link{money_entity}},
56 | \code{\link{organization_entity}},
57 | \code{\link{percent_entity}}, \code{\link{person_entity}}
58 | }
59 | \keyword{location}
60 |
--------------------------------------------------------------------------------
/man/money_entity.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/money_entity.R
3 | \name{money_entity}
4 | \alias{money_entity}
5 | \title{Named Money Recognition}
6 | \usage{
7 | money_entity(text.var, entity.annotator = "money_annotator",
8 | word.annotator = word_annotator(), element.chunks = floor(2000 *
9 | (23.5/mean(sapply(text.var, nchar), na.rm = TRUE))))
10 | }
11 | \arguments{
12 | \item{text.var}{The text string variable.}
13 |
14 | \item{entity.annotator}{A character vector identifying an entity recognition
15 | annotator (\code{c("person_annotator", "location_annotator", "organization_annotator",
16 | "date_annotator", "money_annotator", "percent_annotator")}). See \code{?annotators}.}
17 |
18 | \item{word.annotator}{A word annotator.}
19 |
20 | \item{element.chunks}{The number of elements to include in a chunk. Chunks are
21 | passed through an \code{\link[base]{lapply}} and size is kept within a tolerance
22 | because of memory allocation in the tagging process with \pkg{Java}.}
23 | }
24 | \value{
25 | Returns a list of character vectors of named entities, one element per element of \code{text.var} (\code{NULL} where none are found).
26 | }
27 | \description{
28 | A wrapper for \pkg{NLP}/\pkg{openNLP}'s named money recognition annotation.
29 | }
30 | \examples{
31 | \dontrun{
32 | data(presidential_debates_2012)
33 |
34 | monies <- money_entity(presidential_debates_2012$dialogue)
35 | unlist(monies)
36 |
37 | library(dplyr)
38 | presidential_debates_2012$monies <- money_entity(presidential_debates_2012$dialogue)
39 |
40 | presidential_debates_2012 \%>\%
41 | {.[!sapply(.$monies, is.null), ]} \%>\%
42 | rowwise() \%>\%
43 | mutate(monies = paste(monies, collapse=", ")) \%>\%
44 | select(person, time, monies)
45 |
46 | library(tidyr)
47 | presidential_debates_2012 \%>\%
48 | {.[!sapply(.$monies, is.null), ]} \%>\%
49 | unnest() \%>\%
50 | select(person, time, monies)
51 | }
52 | }
53 | \seealso{
54 | Other variable functions: \code{\link{date_entity}},
55 | \code{\link{location_entity}},
56 | \code{\link{organization_entity}},
57 | \code{\link{percent_entity}}, \code{\link{person_entity}}
58 | }
59 | \keyword{money}
60 |
--------------------------------------------------------------------------------
/man/named_entity.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/named_entity.R
3 | \name{named_entity}
4 | \alias{named_entity}
5 | \title{Named Entity Recognition}
6 | \usage{
7 | named_entity(text.var, entity.annotator, word.annotator = word_annotator(),
8 | element.chunks = floor(2000 * (23.5/mean(sapply(text.var, nchar), na.rm =
9 | TRUE))))
10 | }
11 | \arguments{
12 | \item{text.var}{The text string variable.}
13 |
14 | \item{entity.annotator}{A character vector identifying an entity recognition
15 | annotator (\code{c("person_annotator", "location_annotator", "organization_annotator",
16 | "date_annotator", "money_annotator", "percent_annotator")}). See \code{?annotators}.}
17 |
18 | \item{word.annotator}{A word annotator.}
19 |
20 | \item{element.chunks}{The number of elements to include in a chunk. Chunks are
21 | passed through an \code{\link[base]{lapply}} and size is kept within a tolerance
22 | because of memory allocation in the tagging process with \pkg{Java}.}
23 | }
24 | \value{
25 | Returns a list of character vectors of named entities, one element per element of \code{text.var} (\code{NULL} where none are found).
26 | }
27 | \description{
28 | A wrapper for \pkg{NLP}/\pkg{openNLP}'s named entity recognition annotation
29 | tools.
30 | }
31 | \examples{
32 | \dontrun{
33 | data(presidential_debates_2012)
34 |
35 | peoples <- named_entity(presidential_debates_2012$dialogue, 'person_annotator')
36 | unlist(peoples)
37 | plot(peoples)
38 |
39 | orgs <-named_entity(presidential_debates_2012$dialogue, 'organization_annotator')
40 | unlist(orgs)
41 |
42 | dates <-named_entity(presidential_debates_2012$dialogue, 'date_annotator')
43 | unlist(dates)
44 |
45 | library(dplyr)
46 | presidential_debates_2012$organizations <- named_entity(
47 | presidential_debates_2012$dialogue,
48 | 'organization_annotator'
49 | )
50 |
51 | presidential_debates_2012 \%>\%
52 | {.[!sapply(.$organizations, is.null), ]} \%>\%
53 | rowwise() \%>\%
54 | mutate(organizations = paste(organizations, collapse=", ")) \%>\%
55 | select(person, time, organizations)
56 |
57 | library(tidyr)
58 | presidential_debates_2012 \%>\%
59 | {.[!sapply(.$organizations, is.null), ]} \%>\%
60 | unnest() \%>\%
61 | select(person, time, organizations)
62 | }
63 | }
64 | \seealso{
65 | \code{\link[openNLP]{Maxent_Entity_Annotator}}
66 | }
67 | \keyword{entity}
68 | \keyword{named}
69 | \keyword{ner}
70 |
--------------------------------------------------------------------------------
/man/organization_entity.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/organization_entity.R
3 | \name{organization_entity}
4 | \alias{organization_entity}
5 | \title{Named Organization Recognition}
6 | \usage{
7 | organization_entity(text.var, entity.annotator = "organization_annotator",
8 | word.annotator = word_annotator(), element.chunks = floor(2000 *
9 | (23.5/mean(sapply(text.var, nchar), na.rm = TRUE))))
10 | }
11 | \arguments{
12 | \item{text.var}{The text string variable.}
13 |
14 | \item{entity.annotator}{A character vector identifying an entity recognition
15 | annotator (\code{c("person_annotator", "location_annotator", "organization_annotator",
16 | "date_annotator", "money_annotator", "percent_annotator")}). See \code{?annotators}.}
17 |
18 | \item{word.annotator}{A word annotator.}
19 |
20 | \item{element.chunks}{The number of elements to include in a chunk. Chunks are
21 | passed through an \code{\link[base]{lapply}} and size is kept within a tolerance
22 | because of memory allocation in the tagging process with \pkg{Java}.}
23 | }
24 | \value{
25 | Returns a list of character vectors of named entities, one element per element of \code{text.var} (\code{NULL} where none are found).
26 | }
27 | \description{
28 | A wrapper for \pkg{NLP}/\pkg{openNLP}'s named organization recognition annotation.
29 | }
30 | \examples{
31 | \dontrun{
32 | data(presidential_debates_2012)
33 |
34 | orgs <- organization_entity(presidential_debates_2012$dialogue)
35 | unlist(orgs)
36 |
37 | library(dplyr)
38 | presidential_debates_2012$organizations <- organization_entity(presidential_debates_2012$dialogue)
39 |
40 | presidential_debates_2012 \%>\%
41 | {.[!sapply(.$organizations, is.null), ]} \%>\%
42 | rowwise() \%>\%
43 | mutate(organizations = paste(organizations, collapse=", ")) \%>\%
44 | select(person, time, organizations)
45 |
46 | library(tidyr)
47 | presidential_debates_2012 \%>\%
48 | {.[!sapply(.$organizations, is.null), ]} \%>\%
49 | unnest() \%>\%
50 | select(person, time, organizations)
51 | }
52 | }
53 | \seealso{
54 | Other variable functions: \code{\link{date_entity}},
55 | \code{\link{location_entity}},
56 | \code{\link{money_entity}}, \code{\link{percent_entity}},
57 | \code{\link{person_entity}}
58 | }
59 | \keyword{organization}
60 |
--------------------------------------------------------------------------------
/man/percent_entity.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/percent_entity.R
3 | \name{percent_entity}
4 | \alias{percent_entity}
5 | \title{Named Percent Recognition}
6 | \usage{
7 | percent_entity(text.var, entity.annotator = "percent_annotator",
8 | word.annotator = word_annotator(), element.chunks = floor(2000 *
9 | (23.5/mean(sapply(text.var, nchar), na.rm = TRUE))))
10 | }
11 | \arguments{
12 | \item{text.var}{The text string variable.}
13 |
14 | \item{entity.annotator}{A character vector identifying an entity recognition
15 | annotator (\code{c("person_annotator", "location_annotator", "organization_annotator",
16 | "date_annotator", "money_annotator", "percent_annotator")}). See \code{?annotators}.}
17 |
18 | \item{word.annotator}{A word annotator.}
19 |
20 | \item{element.chunks}{The number of elements to include in a chunk. Chunks are
21 | passed through an \code{\link[base]{lapply}} and size is kept within a tolerance
22 | because of memory allocation in the tagging process with \pkg{Java}.}
23 | }
24 | \value{
25 | Returns a list of character vectors of named entities, one element per element of \code{text.var} (\code{NULL} where none are found).
26 | }
27 | \description{
28 | A wrapper for \pkg{NLP}/\pkg{openNLP}'s named percent recognition annotation.
29 | }
30 | \examples{
31 | \dontrun{
32 | data(presidential_debates_2012)
33 |
34 | percents <- percent_entity(presidential_debates_2012$dialogue)
35 | unlist(percents)
36 |
37 | library(dplyr)
38 | presidential_debates_2012$percents <- percent_entity(presidential_debates_2012$dialogue)
39 |
40 | presidential_debates_2012 \%>\%
41 | {.[!sapply(.$percents, is.null), ]} \%>\%
42 | rowwise() \%>\%
43 | mutate(percents = paste(percents, collapse=", ")) \%>\%
44 | select(person, time, percents)
45 |
46 | library(tidyr)
47 | presidential_debates_2012 \%>\%
48 | {.[!sapply(.$percents, is.null), ]} \%>\%
49 | unnest() \%>\%
50 | select(person, time, percents)
51 | }
52 | }
53 | \seealso{
54 | Other variable functions: \code{\link{date_entity}},
55 | \code{\link{location_entity}},
56 | \code{\link{money_entity}},
57 | \code{\link{organization_entity}},
58 | \code{\link{person_entity}}
59 | }
60 | \keyword{percent}
61 |
--------------------------------------------------------------------------------
/man/person_entity.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/person_entity.R
3 | \name{person_entity}
4 | \alias{person_entity}
5 | \title{Named Person Recognition}
6 | \usage{
7 | person_entity(text.var, entity.annotator = "person_annotator",
8 | word.annotator = word_annotator(), element.chunks = floor(2000 *
9 | (23.5/mean(sapply(text.var, nchar), na.rm = TRUE))))
10 | }
11 | \arguments{
12 | \item{text.var}{The text string variable.}
13 |
14 | \item{entity.annotator}{A character vector identifying an entity recognition
15 | annotator (\code{c("person_annotator", "location_annotator", "organization_annotator",
16 | "date_annotator", "money_annotator", "percent_annotator")}). See \code{?annotators}.}
17 |
18 | \item{word.annotator}{A word annotator.}
19 |
20 | \item{element.chunks}{The number of elements to include in a chunk. Chunks are
21 | passed through an \code{\link[base]{lapply}} and size is kept within a tolerance
22 | because of memory allocation in the tagging process with \pkg{Java}.}
23 | }
24 | \value{
25 | Returns a list of character vectors of named entities, one element per element of \code{text.var} (\code{NULL} where none are found).
26 | }
27 | \description{
28 | A wrapper for \pkg{NLP}/\pkg{openNLP}'s named person recognition annotation.
29 | }
30 | \examples{
31 | \dontrun{
32 | data(presidential_debates_2012)
33 |
34 | peoples <- person_entity(presidential_debates_2012$dialogue)
35 | unlist(peoples)
36 |
37 | library(dplyr)
38 | presidential_debates_2012$persons <- person_entity(presidential_debates_2012$dialogue)
39 |
40 | presidential_debates_2012 \%>\%
41 | {.[!sapply(.$persons, is.null), ]} \%>\%
42 | rowwise() \%>\%
43 | mutate(persons = paste(persons, collapse=", ")) \%>\%
44 | select(person, time, persons)
45 |
46 | library(tidyr)
47 | presidential_debates_2012 \%>\%
48 | {.[!sapply(.$persons, is.null), ]} \%>\%
49 | unnest() \%>\%
50 | select(person, time, persons)
51 | }
52 | }
53 | \seealso{
54 | Other variable functions: \code{\link{date_entity}},
55 | \code{\link{location_entity}},
56 | \code{\link{money_entity}},
57 | \code{\link{organization_entity}},
58 | \code{\link{percent_entity}}
59 | }
60 | \keyword{people}
61 | \keyword{person}
62 |
--------------------------------------------------------------------------------
/man/plot.entity.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/named_entity.R
3 | \name{plot.entity}
4 | \alias{plot.entity}
5 | \title{Plots an entity Object}
6 | \usage{
7 | \method{plot}{entity}(x, min = 1, alphabetical = FALSE, ...)
8 | }
9 | \arguments{
10 | \item{x}{An \code{entity} object.}
11 |
12 | \item{min}{Minimum frequency of included entities.}
13 |
14 | \item{alphabetical}{logical. If \code{TRUE} rows are arranged alphabetically by
15 | entity; otherwise they are arranged by frequency.}
16 |
17 | \item{\ldots}{ignored.}
18 | }
19 | \description{
20 | Plots an entity object
21 | }
22 |
--------------------------------------------------------------------------------
/man/presidential_debates_2012.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/entity-package.R
3 | \docType{data}
4 | \name{presidential_debates_2012}
5 | \alias{presidential_debates_2012}
6 | \title{2012 U.S. Presidential Debates}
7 | \format{A data frame with 2912 rows and 4 variables}
8 | \usage{
9 | data(presidential_debates_2012)
10 | }
11 | \description{
12 | A dataset containing a cleaned version of all three presidential debates for
13 | the 2012 election.
14 | }
15 | \details{
16 | \itemize{
17 | \item person. The speaker
18 | \item tot. Turn of talk
19 | \item dialogue. The words spoken
20 | \item time. Variable indicating which of the three debates the dialogue is from
21 | }
22 | }
23 | \keyword{datasets}
24 |
--------------------------------------------------------------------------------
/man/print.entity.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/named_entity.R
3 | \name{print.entity}
4 | \alias{print.entity}
5 | \title{Prints an entity Object}
6 | \usage{
7 | \method{print}{entity}(x, ...)
8 | }
9 | \arguments{
10 | \item{x}{An \code{entity} object.}
11 |
12 | \item{\ldots}{ignored.}
13 | }
14 | \description{
15 | Prints an entity object
16 | }
17 |
--------------------------------------------------------------------------------
/man/wiki.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/entity-package.R
3 | \docType{data}
4 | \name{wiki}
5 | \alias{wiki}
6 | \title{Bell Labs Wikipedia Article}
7 | \format{A character vector with 7 elements}
8 | \usage{
9 | data(wiki)
10 | }
11 | \description{
12 | A dataset containing a character vector of an excerpt from Wikipedia about
13 | Bell Labs with an extra final sentence to include percent and money when
14 | extracting entities.
15 | }
16 | \references{
17 | \url{https://en.wikipedia.org/wiki/Bell_Labs}
18 | }
19 | \keyword{datasets}
20 |
--------------------------------------------------------------------------------
/tests/testthat.R:
--------------------------------------------------------------------------------
1 | library("testthat")  # test framework
2 | library("entity")  # package under test
3 |
4 | test_check("entity")  # run the testthat tests for the entity package
--------------------------------------------------------------------------------
/tests/testthat/test-named_entity.R:
--------------------------------------------------------------------------------
1 | context("Checking named_entity")  # label for this group of tests
2 |
3 | test_that("named_entity ...",{  # NOTE(review): placeholder test; the body contains no expectations yet
4 |
5 |
6 | })
7 |
8 |
--------------------------------------------------------------------------------
/tools/entity_logo/r_entity.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/trinker/entity/5549d30f4f7daa91ecb99ea5dbc770260bfdbedd/tools/entity_logo/r_entity.png
--------------------------------------------------------------------------------
/tools/entity_logo/r_entity.pptx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/trinker/entity/5549d30f4f7daa91ecb99ea5dbc770260bfdbedd/tools/entity_logo/r_entity.pptx
--------------------------------------------------------------------------------
/tools/entity_logo/r_entitya.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/trinker/entity/5549d30f4f7daa91ecb99ea5dbc770260bfdbedd/tools/entity_logo/r_entitya.png
--------------------------------------------------------------------------------
/tools/entity_logo/resize_icon.txt:
--------------------------------------------------------------------------------
1 | cd C:\Users\Tyler\GitHub\entity\tools\entity_logo
2 | ffmpeg -i r_entitya.png -vf scale=250:-1 r_entity.png
--------------------------------------------------------------------------------
/tools/figure/fig.height-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/trinker/entity/5549d30f4f7daa91ecb99ea5dbc770260bfdbedd/tools/figure/fig.height-1.png
--------------------------------------------------------------------------------
/tools/figure/unnamed-chunk-11-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/trinker/entity/5549d30f4f7daa91ecb99ea5dbc770260bfdbedd/tools/figure/unnamed-chunk-11-1.png
--------------------------------------------------------------------------------
/tools/figure/unnamed-chunk-12-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/trinker/entity/5549d30f4f7daa91ecb99ea5dbc770260bfdbedd/tools/figure/unnamed-chunk-12-1.png
--------------------------------------------------------------------------------