├── .Rbuildignore ├── .gitignore ├── .travis.yml ├── CODE_OF_CONDUCT.md ├── DESCRIPTION ├── NAMESPACE ├── NEWS ├── NEWS.md ├── R ├── add_comma_space.R ├── add_missing_endmark.R ├── check_text.R ├── check_text_logicals.R ├── drop_element.R ├── drop_row.R ├── fgsub.R ├── fix_mdyyyy.R ├── glue-reexports.R ├── has_endmark.R ├── like.R ├── make_plural.R ├── match_tokens.R ├── mgsub.R ├── replace_contraction.R ├── replace_date.R ├── replace_email.R ├── replace_emoji.R ├── replace_emoticon.R ├── replace_grade.R ├── replace_hash.R ├── replace_html.R ├── replace_incomplete.R ├── replace_internet_slang.R ├── replace_kerning.R ├── replace_misspelling.R ├── replace_money.R ├── replace_names.R ├── replace_non_ascii.R ├── replace_number.R ├── replace_ordinal.R ├── replace_rating.R ├── replace_symbol.R ├── replace_tag.R ├── replace_time.R ├── replace_to.R ├── replace_tokens.R ├── replace_url.R ├── replace_white.R ├── replace_word_elongation.R ├── strip.R ├── sub_holder.R ├── swap.R ├── sysdata.rda ├── textclean-package.R └── utils.R ├── README.Rmd ├── README.md ├── data └── DATA.rda ├── inst ├── CITATION ├── articles │ ├── Clark2011.pdf │ ├── Jurafsky2016.pdf │ └── Sproat2001.pdf ├── build.R ├── docs │ ├── emoji_sample.txt │ └── r_tweets.txt ├── extra_statdoc │ └── readme.R ├── maintenance.R ├── scraping_scripts │ ├── google_ngram_to_canonical.R │ └── scrape_leet.R └── staticdocs │ └── index.R ├── man ├── DATA.Rd ├── add_comma_space.Rd ├── add_missing_endmark.Rd ├── check_text.Rd ├── drop_element.Rd ├── drop_row.Rd ├── fgsub.Rd ├── fix_mdyyyy.Rd ├── has_endmark.Rd ├── like.Rd ├── make_plural.Rd ├── match_tokens.Rd ├── mgsub.Rd ├── print.check_text.Rd ├── print.sub_holder.Rd ├── print.which_are_locs.Rd ├── reexports.Rd ├── replace_contraction.Rd ├── replace_date.Rd ├── replace_email.Rd ├── replace_emoji.Rd ├── replace_emoticon.Rd ├── replace_grade.Rd ├── replace_hash.Rd ├── replace_html.Rd ├── replace_incomplete.Rd ├── replace_internet_slang.Rd ├── replace_kern.Rd ├── 
replace_misspelling.Rd ├── replace_money.Rd ├── replace_names.Rd ├── replace_non_ascii.Rd ├── replace_number.Rd ├── replace_ordinal.Rd ├── replace_rating.Rd ├── replace_symbol.Rd ├── replace_tag.Rd ├── replace_time.Rd ├── replace_to.Rd ├── replace_tokens.Rd ├── replace_url.Rd ├── replace_white.Rd ├── replace_word_elongation.Rd ├── strip.Rd ├── sub_holder.Rd ├── swap.Rd ├── textclean.Rd └── which_are.Rd ├── tests ├── testthat.R └── testthat │ ├── test-replace_emoticon.R │ ├── test-replace_grade.R │ ├── test-replace_rating.R │ └── test-strip.R └── tools └── textclean_logo ├── r_textclean.png ├── r_textclean.pptx ├── r_textcleana.png └── resize_icon.txt /.Rbuildignore: -------------------------------------------------------------------------------- 1 | ^.*\.Rproj$ 2 | ^\.Rproj\.user$ 3 | ^\.gitignore 4 | NEWS$ 5 | FAQ.md 6 | NEWS.html 7 | FAQ.html 8 | ^\.travis\.yml$ 9 | travis-tool.sh 10 | inst/web 11 | contributors.geojson 12 | inst/build.R 13 | ^.*\.Rprofile$ 14 | README.Rmd 15 | README.R 16 | travis.yml 17 | inst/maintenance.R 18 | tools/textclean_logo/r_textcleana.png 19 | tools/textclean_logo/r_textclean.pptx 20 | tools/textclean_logo/resize_icon.txt 21 | inst/staticdocs 22 | inst/extra_statdoc 23 | Thumbs.db 24 | inst/scraping_scripts 25 | inst/articles 26 | ^CODE_OF_CONDUCT\.md$ 27 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # History files 2 | .Rhistory 3 | 4 | # Example code in package build process 5 | *-Ex.R 6 | 7 | .Rprofile 8 | .Rproj.user 9 | textmod.Rproj 10 | Thumbs.db 11 | 12 | *.Rproj -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: r 2 | 3 | sudo: false 4 | 5 | before_install: 6 | - sh -e /etc/init.d/xvfb start 7 | 8 | r_github_packages: 9 | - jimhester/covr 10 | - 
trinker/textshape 11 | - trinker/lexicon 12 | - jeroenooms/hunspell 13 | 14 | notifications: 15 | email: 16 | on_success: change 17 | on_failure: change 18 | 19 | after_success: 20 | - Rscript -e 'covr::coveralls()' 21 | 22 | r_build_args: "--resave-data=best" 23 | r_check_args: "--as-cran" 24 | 25 | env: 26 | global: 27 | - DISPLAY=:99.0 28 | - BOOTSTRAP_LATEX=1 29 | - NOT_CRAN=true 30 | - secure: "nhzZdgVEOmRO/pCpkb6vBgTbLU2igXmb5gbX+QWaV5YzDT5pqMnT+AtE5/+GMH7QxfFE1SKeA/r2w8XomNMpDvhIIpedwHpGywRGK3rtav2u108oQ73m2k2D3AQZ/YTAx7xPgVwCMveUqZ3xDyGkt220J3Hwfkpe341B2xQH0JQ=" 31 | 32 | 33 | 34 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Contributor Code of Conduct 2 | 3 | As contributors and maintainers of this project, we pledge to respect all people who 4 | contribute through reporting issues, posting feature requests, updating documentation, 5 | submitting pull requests or patches, and other activities. 6 | 7 | We are committed to making participation in this project a harassment-free experience for 8 | everyone, regardless of level of experience, gender, gender identity and expression, 9 | sexual orientation, disability, personal appearance, body size, race, ethnicity, age, or religion. 10 | 11 | Examples of unacceptable behavior by participants include the use of sexual language or 12 | imagery, derogatory comments or personal attacks, trolling, public or private harassment, 13 | insults, or other unprofessional conduct. 14 | 15 | Project maintainers have the right and responsibility to remove, edit, or reject comments, 16 | commits, code, wiki edits, issues, and other contributions that are not aligned to this 17 | Code of Conduct. Project maintainers who do not follow the Code of Conduct may be removed 18 | from the project team. 
19 | 20 | Instances of abusive, harassing, or otherwise unacceptable behavior may be reported by 21 | opening an issue or contacting one or more of the project maintainers. 22 | 23 | This Code of Conduct is adapted from the Contributor Covenant 24 | (http://contributor-covenant.org), version 1.0.0, available at 25 | http://contributor-covenant.org/version/1/0/0/ 26 | -------------------------------------------------------------------------------- /DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: textclean 2 | Title: Text Cleaning Tools 3 | Version: 0.9.7 4 | Authors@R: c( 5 | person("Tyler", "Rinker", email = "tyler.rinker@gmail.com", role = c("aut", "cre")), 6 | person("ctwheels", "StackOverflow", role = "ctb"), 7 | person("Surin", "Space", role = "ctb") 8 | ) 9 | Maintainer: Tyler Rinker 10 | Description: Tools to clean and process text. Tools are geared at checking for substrings that 11 | are not optimal for analysis and replacing or removing them (normalizing) with more 12 | analysis friendly substrings (see Sproat, Black, Chen, Kumar, Ostendorf, & Richards 13 | (2001) ) or extracting them into new variables. For 14 | example, emoticons are often used in text but not always easily handled by analysis 15 | algorithms. The replace_emoticon() function replaces emoticons with word 16 | equivalents. 
17 | Depends: R (>= 3.4.0) 18 | Imports: data.table, english(>= 1.0-2), glue (>= 1.3.0), lexicon (>= 1.0.0), mgsub (>= 1.5.0), qdapRegex, 19 | stringi, textshape(>= 1.0.1), utils 20 | Suggests: hunspell, testthat 21 | License: GPL-2 22 | LazyData: TRUE 23 | RoxygenNote: 7.1.2 24 | Encoding: UTF-8 25 | URL: https://github.com/trinker/textclean 26 | BugReports: https://github.com/trinker/textclean/issues 27 | Collate: 28 | 'add_comma_space.R' 29 | 'add_missing_endmark.R' 30 | 'utils.R' 31 | 'replace_html.R' 32 | 'check_text_logicals.R' 33 | 'check_text.R' 34 | 'drop_element.R' 35 | 'drop_row.R' 36 | 'fgsub.R' 37 | 'fix_mdyyyy.R' 38 | 'glue-reexports.R' 39 | 'has_endmark.R' 40 | 'like.R' 41 | 'make_plural.R' 42 | 'match_tokens.R' 43 | 'mgsub.R' 44 | 'replace_contraction.R' 45 | 'replace_date.R' 46 | 'replace_email.R' 47 | 'replace_emoji.R' 48 | 'replace_emoticon.R' 49 | 'replace_grade.R' 50 | 'replace_hash.R' 51 | 'replace_incomplete.R' 52 | 'replace_internet_slang.R' 53 | 'replace_kerning.R' 54 | 'replace_misspelling.R' 55 | 'replace_money.R' 56 | 'replace_names.R' 57 | 'replace_non_ascii.R' 58 | 'replace_number.R' 59 | 'replace_ordinal.R' 60 | 'replace_rating.R' 61 | 'replace_symbol.R' 62 | 'replace_tag.R' 63 | 'replace_time.R' 64 | 'replace_to.R' 65 | 'replace_tokens.R' 66 | 'replace_url.R' 67 | 'replace_white.R' 68 | 'replace_word_elongation.R' 69 | 'strip.R' 70 | 'sub_holder.R' 71 | 'swap.R' 72 | 'textclean-package.R' 73 | -------------------------------------------------------------------------------- /NAMESPACE: -------------------------------------------------------------------------------- 1 | # Generated by roxygen2: do not edit by hand 2 | 3 | S3method(fix_mdyyyy,date) 4 | S3method(fix_mdyyyy,default) 5 | S3method(print,check_text) 6 | S3method(print,sub_holder) 7 | S3method(print,which_are_locs) 8 | S3method(strip,character) 9 | S3method(strip,default) 10 | S3method(strip,factor) 11 | S3method(strip,list) 12 | export("%LIKE%") 13 | export("%SLIKE%") 14 | 
export("%like%") 15 | export("%slike%") 16 | export(add_comma_space) 17 | export(add_missing_endmark) 18 | export(as_ordinal) 19 | export(available_checks) 20 | export(check_text) 21 | export(drop_NA) 22 | export(drop_element) 23 | export(drop_element_fixed) 24 | export(drop_element_regex) 25 | export(drop_empty_row) 26 | export(drop_row) 27 | export(fgsub) 28 | export(fix_mdyyyy) 29 | export(glue) 30 | export(glue_collapse) 31 | export(has_endmark) 32 | export(is_it) 33 | export(keep_element) 34 | export(keep_element_fixed) 35 | export(keep_element_regex) 36 | export(keep_row) 37 | export(make_plural) 38 | export(match_tokens) 39 | export(mgsub) 40 | export(mgsub_fixed) 41 | export(mgsub_regex) 42 | export(mgsub_regex_safe) 43 | export(replace_contraction) 44 | export(replace_curly_quote) 45 | export(replace_date) 46 | export(replace_email) 47 | export(replace_emoji) 48 | export(replace_emoji_identifier) 49 | export(replace_emoticon) 50 | export(replace_from) 51 | export(replace_grade) 52 | export(replace_hash) 53 | export(replace_html) 54 | export(replace_incomplete) 55 | export(replace_internet_slang) 56 | export(replace_kern) 57 | export(replace_misspelling) 58 | export(replace_money) 59 | export(replace_names) 60 | export(replace_non_ascii) 61 | export(replace_non_ascii2) 62 | export(replace_number) 63 | export(replace_ordinal) 64 | export(replace_rating) 65 | export(replace_symbol) 66 | export(replace_tag) 67 | export(replace_time) 68 | export(replace_to) 69 | export(replace_tokens) 70 | export(replace_url) 71 | export(replace_white) 72 | export(replace_word_elongation) 73 | export(strip) 74 | export(sub_holder) 75 | export(swap) 76 | export(which_are) 77 | importFrom(data.table,":=") 78 | importFrom(glue,glue) 79 | importFrom(glue,glue_collapse) 80 | importFrom(qdapRegex,grab) 81 | -------------------------------------------------------------------------------- /NEWS: -------------------------------------------------------------------------------- 1 | NEWS 2 
| ==== 3 | 4 | Versioning 5 | ---------- 6 | 7 | Releases will be numbered with the following semantic versioning format: 8 | 9 | .. 10 | 11 | And constructed with the following guidelines: 12 | 13 | * Breaking backward compatibility bumps the major (and resets the minor 14 | and patch) 15 | * New additions without breaking backward compatibility bumps the minor 16 | (and resets the patch) 17 | * Bug fixes and misc changes bumps the patch 18 | 19 | 20 | 21 | textclean 0.9.4 - 22 | ---------------------------------------------------------------- 23 | 24 | BUG FIXES 25 | 26 | * `replace_emoticon` replaced emoticon-like substrings within actual words. 27 | Spotted thanks to Carolyn Challoner; see issue #46. 28 | 29 | * `replace_number` failed if the number pattern contained two leading decimals 30 | or hyphens. Spotted thanks to Stefano De Sabbata; see issue #60. 31 | 32 | * `replace_word_elongation` failed for repeating of the same character but of 33 | different case (e.g., `replace_word_elongation("Ooo")` resulted in `NA`. This 34 | has been corrected. Additionally, the `elongation.search.pattern` defined as 35 | `"(?i)(?:^|\\b)\\w*([a-z])(?:\\1{2,})\\w*($|\\b)"` has been moved exterally, to 36 | a parameter, allowing the user to alter this pattern if desired. Spotted 37 | thanks to Stefano De Sabbata; see issue #59. 38 | 39 | NEW FEATURES 40 | 41 | * `replace_misspelling` added as a way to replace misspelled words with their 42 | most likely replacement using **hunspell** in the backend. Suggested by Surin 43 | Space; see issue #39. 44 | 45 | * `as_ordinal` added as a convenience wrapper for `english::ordinal` that 46 | takes integers and converts them to ordinal form. 47 | 48 | * `%like%` added as an binary operator similar to SQL's LIKE. 49 | 50 | MINOR FEATURES 51 | 52 | * `fix_mdyyyy` added to correct dates in the form of m/d/yyyy to yyyy-mm-dd. 
53 | 54 | IMPROVEMENTS 55 | 56 | * `replace_html` pics up the ability to replace "«" & "»" with ASCII 57 | equivalents "<<" & ">>". Suggested by Ilya Shutov; see issue #48. 58 | 59 | * All internal calls to `grepl()` now have `perl = TRUE` added as this is 60 | generally a speed up. Suggested by Kyle Haynes (see #51). 61 | 62 | CHANGES 63 | 64 | * `filter_element()` and `filter_row()` have been deprecated for a few years. 65 | They have now been removed. 66 | 67 | 68 | textclean 0.9.3 69 | ---------------------------------------------------------------- 70 | 71 | Version update to comply with changes in the **glue** package's API. 72 | 73 | 74 | 75 | textclean 0.8.0 - 0.9.2 76 | ---------------------------------------------------------------- 77 | 78 | BUG FIXES 79 | 80 | * `fgsub` had a bug in which the the original `pattern` in `fgsub` matches the 81 | location in the string but when the replacement occurs this was done on the 82 | entire string rather than the location of the first `pattern` match. This 83 | means the extracted string was used as a search and might be found in places 84 | other than the original location (e.g., a leading boundary in '^T' replaced 85 | with '__' may have led to '__he __itle' rather than '__he Title' as expected 86 | in the string 'The Title'). See #35 for details. The fix will add some time 87 | to the computation but is safer. 88 | 89 | NEW FEATURES 90 | 91 | * `replace_to`/`replace_from` added to remove from/to begin/end of string to/from 92 | a character(s). 93 | 94 | * The following replacement functions were added to provide remediation for 95 | problems found in `check_text`: `replace_email`, `replace_hash`, 96 | `replace_tag`, & `replace_url`. 97 | 98 | MINOR FEATURES 99 | 100 | * `check_text` picks up a `checks` and `n` argument. The former allows the user 101 | to specify which checks to conduct. The latter allows the user to truncate the 102 | output to n number of elements with a closing `...[truncated]...`. 
This makes 103 | the function more useful and the code easier to maintain. 104 | 105 | IMPROVEMENTS 106 | 107 | * `replace_non_ascii` did not replace all non-ASCII characters. This has been 108 | fixed by an explicit replacement of '[^ -~]+' which are all non-ASCII characters. 109 | See issue #34 for details. 110 | 111 | 112 | 113 | textclean 0.7.3 114 | ---------------------------------------------------------------- 115 | 116 | Maintenance release to bring package up to date with the lexicon package API changes. 117 | 118 | 119 | textclean 0.7.0 - 0.7.2 120 | ---------------------------------------------------------------- 121 | 122 | NEW FEATURES 123 | 124 | * `match_tokens` added to find all the tokens that match a regex(es) within a 125 | given text vector. This useful when combined with the `replace_tokens` 126 | function. 127 | 128 | * Fixed versions of `drop_element`/`keep_element` added to allow for dropping 129 | elements specified by a known vector rather than a regex. 130 | 131 | * The `collapse` and `glue` functions from the **glue** package are reexported 132 | for easy string manipulation. 133 | 134 | * `replace_date` added for normalizing dates. 135 | 136 | * `replace_time` added for normalizing time stamps. 137 | 138 | * `replace_money` added for normalizing money references. 139 | 140 | * `mgsub` picks up a `safe` argument using the **mgsub** package as the backend. 141 | In addition `mgsub_regex_safe` added to make the usage explicit. The safe mode 142 | comes at the cost of speed. 143 | 144 | IMPROVEMENTS 145 | 146 | * `replace_names` drops the replacement of 147 | `c('An', 'To', 'Oh', 'So', 'Do', 'He', 'Ha', 'In', 'Pa', 'Un')` which are 148 | likely words and not names. 149 | 150 | * `replace_html` picks ups some additional symbol replacements including: 151 | `c("™", "“", "”", "‘", "’", "•", "·", 152 | "⋅", "–", "—", "≠", "½", "¼", "¾", 153 | "°", "←", "→", "…")`. 
154 | 155 | 156 | 157 | textclean 0.6.0 - 0.6.3 158 | ---------------------------------------------------------------- 159 | 160 | NEW FEATURES 161 | 162 | * `replace_kern` added to replace a form of informal emphasis in which the 163 | writer takes words >2 letters long, capitalizes the entire word, and places 164 | spaces in between each letter. This was contributed by Stack Overflow's 165 | @ctwheels: https://stackoverflow.com/a/47438305/1000343. 166 | 167 | * `replace_internet_slang` added to replace Internet acronyms and abbreviations 168 | with machine friendly word equivalents. 169 | 170 | * `replace_word_elongation` added to replace word elongations (a.k.a. "word 171 | lengthening") with the most likely normalized word form. See 172 | http://www.aclweb.org/anthology/D11-105 for details. 173 | 174 | * `fgsub` added for the ability to match, extract, operate a function over the 175 | extracted strings, & replace the original matches with the extracted strings. 176 | This performs similar functionality to `gsubfn::gsubfn` but is less powerful. 177 | For more powerful needs see the **gsubfn** package. 178 | 179 | 180 | 181 | textclean 0.4.0 - 0.5.1 182 | ---------------------------------------------------------------- 183 | 184 | BUG FIXES 185 | 186 | * `replace_grade` did not use `fixed = TRUE` for its call to `mgsub`. This could 187 | result in the plus signs being interpreted as meta-characters. This has been 188 | corrected. 189 | 190 | NEW FEATURES 191 | 192 | * `replace_names` added to remove/replace common first and last names from text 193 | data. 194 | 195 | * `make_plural` added to make a vector of singular noun forms plural. 196 | 197 | * `replace_emoji` and `replace_emoji_identifier` added for replacing emojis with 198 | text or an identifier token for use in the **sentimentr** package. 
199 | 200 | MINOR FEATURES 201 | 202 | * `mgsub_regex` and `mgsub_fixed` to provide wrappers for `mgsub` that makes 203 | their use apparent without setting the `fixed` command. 204 | 205 | * `replace_curly_quote` added to replace curly quotes with straight versions. 206 | 207 | IMPROVEMENTS 208 | 209 | * `replace_non_ascii` now uses `stringi::stri_trans_general` to coerce more 210 | non-ASCII characters to ASCII format. 211 | 212 | * `check_text` now checks for HTML characters/tags. Thanks to @Peter Gensler 213 | for suggesting this (see issue #15). 214 | 215 | CHANGES 216 | 217 | * `filter_` functions deprecated in favor of `drop_`/`keep_` versions of filter 218 | functions. This was change was to address the opposite meaning that **dplyr**'s 219 | `filter` has, which retains rows matching a pattern be default. 220 | 221 | 222 | 223 | textclean 0.3.1 224 | ---------------------------------------------------------------- 225 | 226 | BUG FIXES 227 | 228 | * `replace_tokens` added to complement `mgsub` for times when the user wants to 229 | replace fixed tokens with a single value or remove them entirely. This yields 230 | an optimized solution that is much faster than `mgsub`. 231 | 232 | CHANGES 233 | 234 | * `mgusb` no longer uses `trim = TRUE` by default. 235 | 236 | textclean 0.2.1 - 0.3.0 237 | ---------------------------------------------------------------- 238 | 239 | BUG FIXES 240 | 241 | * `check_text` reported to use `replace_incomplete` rather than 242 | `add_missing_endmark` when endmark is missing. 243 | 244 | NEW FEATURES 245 | 246 | * The `replace_emoticon`, `replace_grade` and `replace_rating` functions have 247 | been moved from the **sentimentr** package to **textclean** as these are 248 | cleaning functions. This makes the functions more modular and generalizable 249 | to all types of text cleaning. These functions are still imported and 250 | exported by **sentimentr**. 
251 | 252 | * `replace_html` added to remove html tags and repalce symbols with appropriate 253 | ASCII symbols. 254 | 255 | * `add_missing_endmarks` added to detect missing endmarks and replace with the 256 | desired symbol. 257 | 258 | IMPROVEMENTS 259 | 260 | * `replace_number` now uses the *english* package making it faster and more 261 | maintainable. In addition, the function now handles decimal places as well. 262 | 263 | 264 | 265 | textclean 0.1.0 - 0.2.0 266 | ---------------------------------------------------------------- 267 | 268 | BUG FIXES 269 | 270 | * `check_text` reported `NA` as non-ASCII. This has been fixed. 271 | 272 | NEW FEATURES 273 | 274 | * `check_text` added to report on potential problems in a text vector. 275 | 276 | * `replace_ordinal` added to replace ordinal numbers (e.g., 1st) with word 277 | representation (e.g., first). 278 | 279 | * `swap` added to swap two patterns simultaneously. 280 | 281 | * `filter_element` added to exclude matching elements from a vector. 282 | 283 | 284 | 285 | textclean 0.0.1 286 | ---------------------------------------------------------------- 287 | 288 | This package is a collection of tools to clean and process text. Many of these tools have been taken from the **qdap** package and revamped to be more intuitive, better named, and faster. 289 | -------------------------------------------------------------------------------- /R/add_comma_space.R: -------------------------------------------------------------------------------- 1 | #' Ensure Space After Comma 2 | #' 3 | #' Adds a space after a comma as \code{strip} and many other functions may consider a 4 | #' comma separated string as one word (i.e., \code{"one,two,three"} becomes 5 | #' \code{"onetwothree"} rather than \code{"one two three"}). 6 | #' 7 | #' @param x The text variable. 8 | #' @return Returns a vector of strings with commas that have a space after them. 
9 | #' @keywords comma space 10 | #' @export 11 | #' @examples 12 | #' \dontrun{ 13 | #' x <- c("the, dog,went", "I,like,it", "where are you", NA, "why", ",", ",f") 14 | #' add_comma_space(x) 15 | #' } 16 | add_comma_space <- function(x) { 17 | gsub("(,)([^ ])", "\\1 \\2", x) 18 | } 19 | 20 | -------------------------------------------------------------------------------- /R/add_missing_endmark.R: -------------------------------------------------------------------------------- 1 | #' Add Missing Endmarks 2 | #' 3 | #' Detect missing endmarks and replace with the desired symbol. 4 | #' 5 | #' @param x The text variable. 6 | #' @param replacement Character string equal in length to pattern or of length 7 | #' one which are a replacement for matched pattern. 8 | #' @param endmarks The potential ending punctuation marks. 9 | #' @param \dots Additional arguments passed to 10 | #' \code{\link[textclean]{has_endmark}}. 11 | #' @return Returns a vector with missing endmarks added. 12 | #' @export 13 | #' @examples 14 | #' x <- c( 15 | #' "This in a", 16 | #' "I am funny!", 17 | #' "An ending of sorts%", 18 | #' "What do you want?" 19 | #' ) 20 | #' 21 | #' add_missing_endmark(x) 22 | add_missing_endmark <- function(x, replacement = "|", 23 | endmarks = c("?", ".", "!"), ...){ 24 | 25 | locs <- which(!has_endmark(x, ...)) 26 | x[locs] <- paste0(x[locs], replacement) 27 | x 28 | 29 | } 30 | 31 | -------------------------------------------------------------------------------- /R/drop_element.R: -------------------------------------------------------------------------------- 1 | #' Filter Elements in a Vetor 2 | #' 3 | #' \code{drop_element} - Filter to drop the matching elements of a vector. 4 | #' 5 | #' @param x A character vector. 6 | #' @param pattern A regex pattern to match for exclusion. 7 | #' @param regex logical. 
If setting this to \code{TRUE} please use 8 | #' \code{drop_element_regex} or \code{keep_element_regex} directly as this will 9 | #' provide better control and optimization. 10 | #' @param \ldots Other arguments passed to \code{\link[base]{grep}} if 11 | #' \code{regex}. If \code{fixed}, then elements to drop/keep. 12 | #' @return Returns a vector with matching elements removed. 13 | #' @rdname drop_element 14 | #' @export 15 | #' @examples 16 | #' x <- c('dog', 'cat', 'bat', 'dingo', 'dragon', 'dino') 17 | #' drop_element(x, '^d.+?g') 18 | #' keep_element(x, '^d.+?g') 19 | #' drop_element(x, 'at$') 20 | #' drop_element(x, '^d') 21 | #' drop_element(x, '\\b(dog|cat)\\b') 22 | #' 23 | #' drop_element_fixed(x, 'bat', 'cat') 24 | #' drops <- c('bat', 'cat') 25 | #' drop_element_fixed(x, drops) 26 | drop_element <- function(x, pattern, regex = TRUE, ...){ 27 | 28 | if (isTRUE(regex)) { 29 | drop_element_regex(x, pattern, ...) 30 | } else { 31 | message('Please use `drop_element_fixed` for better control.') 32 | drop_element_fixed(x, ...) 33 | } 34 | } 35 | 36 | #' @rdname drop_element 37 | #' @export 38 | drop_element_regex <- function(x, pattern, ...){ 39 | 40 | grep(pattern, x, value = TRUE, invert = TRUE, perl = TRUE, ...) 41 | } 42 | 43 | #' @rdname drop_element 44 | #' @export 45 | drop_element_fixed <- function(x, ...){ 46 | 47 | x[!x %in% unlist(list(...))] 48 | } 49 | 50 | #' Filter Elements in a Vetor 51 | #' 52 | #' \code{keep_element} - Filter to keep the matching elements of a vector. 53 | #' 54 | #' @rdname drop_element 55 | #' @export 56 | keep_element <- function(x, pattern, regex = TRUE, ...){ 57 | 58 | if (isTRUE(regex)) { 59 | keep_element_regex(x, pattern, ...) 60 | } else { 61 | message('Please use `keep_element_fixed` for better control.') 62 | keep_element_fixed(x, ...) 
63 | } 64 | } 65 | 66 | #' @rdname drop_element 67 | #' @export 68 | keep_element_fixed <- function(x, ...){ 69 | 70 | x[x %in% unlist(list(...))] 71 | } 72 | 73 | 74 | #' @rdname drop_element 75 | #' @export 76 | keep_element_regex <- function(x, pattern, ...){ 77 | 78 | grep(pattern, x, value = TRUE, perl = TRUE, ...) 79 | } 80 | 81 | 82 | 83 | -------------------------------------------------------------------------------- /R/drop_row.R: -------------------------------------------------------------------------------- 1 | #' Filter Rows That Contain Markers 2 | #' 3 | #' \code{drop_row} - Remove rows from a data set that contain a given 4 | #' marker/term. 5 | #' 6 | #' @param dataframe A dataframe object. 7 | #' @param column Column name to search for markers/terms. 8 | #' @param terms The regex terms/markers of the rows that are to be removed from 9 | #' the dataframe. 10 | #' @param \ldots Other arguments passed to \code{\link[base]{grepl}}. 11 | #' @return \code{drop_row} - returns a dataframe with the termed/markered rows 12 | #' removed. 13 | #' @rdname drop_row 14 | #' @export 15 | #' @examples 16 | #' \dontrun{ 17 | #' ## drop_row EXAMPLE: 18 | #' drop_row(DATA, "person", c("sam", "greg")) 19 | #' keep_row(DATA, "person", c("sam", "greg")) 20 | #' drop_row(DATA, 1, c("sam", "greg")) 21 | #' drop_row(DATA, "state", c("Comp")) 22 | #' drop_row(DATA, "state", c("I ")) 23 | #' drop_row(DATA, "state", c("you"), ignore.case=TRUE) 24 | #' 25 | #' ## drop_empty_row EXAMPLE: 26 | #' (dat <- rbind.data.frame(DATA[, c(1, 4)], matrix(rep(" ", 4), 27 | #' ncol =2, dimnames=list(12:13, colnames(DATA)[c(1, 4)])))) 28 | #' drop_empty_row(dat) 29 | #' 30 | #' ## drop_NA EXAMPLE: 31 | #' DATA[1:3, "state"] <- NA 32 | #' drop_NA(DATA) 33 | #' } 34 | drop_row <- function(dataframe, column, terms, ...) 
{ 35 | 36 | terms <- paste(terms, collapse="|") 37 | if (length(dataframe[[column]]) == 0) { 38 | stop( 39 | "No columns in the data appear to match supplied `column`", 40 | call. = FALSE 41 | ) 42 | } 43 | dataframe <- dataframe[!grepl(terms, dataframe[[column]], perl=TRUE, ...), ] 44 | rownames(dataframe) <- NULL 45 | 46 | dataframe 47 | } 48 | 49 | #' Filter Rows That Contain Markers 50 | #' 51 | #' \code{keep_row} - Keep rows from a data set that contain a given marker/term. 52 | #' @rdname drop_row 53 | #' @export 54 | keep_row <- function(dataframe, column, terms, ...) { 55 | 56 | terms <- paste(terms, collapse="|") 57 | if (length(dataframe[[column]]) == 0) { 58 | stop( 59 | "No columns in the data appear to match supplied `column`", 60 | call. = FALSE 61 | ) 62 | } 63 | dataframe <- dataframe[grepl(terms, dataframe[[column]], perl=TRUE, ...), ] 64 | rownames(dataframe) <- NULL 65 | 66 | dataframe 67 | } 68 | 69 | 70 | #' Remove Empty Rows in a Data Frame 71 | #' 72 | #' \code{drop_empty_row} - Removes the empty rows of a data set that are common in 73 | #' reading in data. 74 | #' 75 | #' @return \code{drop_empty_row} - returns a dataframe with empty rows removed. 76 | #' @rdname drop_row 77 | #' @export 78 | drop_empty_row <- function(dataframe) { 79 | x <- apply(dataframe, 1, function(x) { 80 | paste(stats::na.omit(x), collapse = "") 81 | }) 82 | return(dataframe[!grepl("^\\s*$", x, perl = TRUE), ,drop = FALSE] ) 83 | } 84 | 85 | 86 | #' Remove Empty Rows in a Data Frame 87 | #' 88 | #' \code{drop_NA} - Removes the \code{NA} rows of a data set. 89 | #' 90 | #' @return \code{drop_NA} - returns a dataframe with \code{NA} rows removed. 
91 | #' @rdname drop_row 92 | #' @export 93 | drop_NA <- function(dataframe, column = TRUE, ...){ 94 | 95 | column <- detect_text_column(dataframe, column) 96 | 97 | dataframe[!is.na(dataframe[[column]]), ,drop = FALSE] 98 | 99 | } 100 | 101 | 102 | 103 | 104 | -------------------------------------------------------------------------------- /R/fgsub.R: -------------------------------------------------------------------------------- 1 | #' Replace a Regex with an Functional Operation on the Regex Match 2 | #' 3 | #' This is a stripped down version of \code{gsubfn} from the \pkg{gsubfn} 4 | #' package. It finds a regex match, and then uses a function to operate on 5 | #' these matches and uses them to replace the original matches. Note that 6 | #' the \pkg{stringi} packages is used for matching and extracting the regex 7 | #' matches. For more powerful or flexible needs please see the \pkg{gsubfn} 8 | #' package. 9 | #' 10 | #' @param x A character vector. 11 | #' @param pattern Character string to be matched in the given character vector. 12 | #' @param fun A function to operate on the extracted matches. 13 | #' @param \ldots ignored. 14 | #' @return Returns a vector with the pattern replaced. 15 | #' @export 16 | #' @importFrom data.table := 17 | #' @seealso \code{\link[gsubfn]{gsubfn}} 18 | #' @examples 19 | #' ## In this example the regex looks for words that contain a lower case letter 20 | #' ## followed by the same letter at least 2 more times. It then extracts these 21 | #' ## words, splits them appart into letters, reverses the string, pastes them 22 | #' ## back together, wraps them with double angle braces, and then puts them back 23 | #' ## at the original locations. 
#' fgsub(
#'     x = c(NA, 'df dft sdf', 'sd fdggg sd dfhhh d', 'ddd'),
#'     pattern = "\\b\\w*([a-z])(\\1{2,})\\w*\\b",
#'     fun = function(x) {
#'         paste0('<<', paste(rev(strsplit(x, '')[[1]]), collapse =''), '>>')
#'     }
#' )
#'
#' ## In this example we extract numbers, strip out non-digits, coerce them to
#' ## numeric, cut them in half, round up to the closest integer, add the commas
#' ## back, and replace back into the original locations.
#' fgsub(
#'     x = c(NA, 'I want 32 grapes', 'he wants 4 ice creams',
#'         'they want 1,234,567 dollars'
#'     ),
#'     pattern = "[\\d,]+",
#'     fun = function(x) {
#'         prettyNum(
#'             ceiling(as.numeric(gsub('[^0-9]', '', x))/2),
#'             big.mark = ','
#'         )
#'     }
#' )
#'
#' ## In this example we extract leading zeros, convert to an equal number of
#' ## spaces.
#' fgsub(
#'     x = c(NA, "00:04", "00:08", "00:01", "06:14", "00:02", "00:04"),
#'     pattern = '^0+',
#'     fun = function(x) {gsub('0', ' ', x)}
#' )
fgsub <- function(x, pattern, fun, ...){

    ## Declare NSE variables used in data.table expressions (R CMD check).
    hit_id <- pattern_id <- pat <- NULL

    ## Flag elements containing at least one match; NA elements are treated
    ## as non-matching and pass through untouched.
    locs <- stringi::stri_detect_regex(x, pattern)
    locs[is.na(locs)] <- FALSE
    txt <- x[locs]

    ## Extract every match from each matching element (list of chr vectors).
    hits <- stringi::stri_extract_all_regex(txt, pattern)

    ## Make unique replacement substrings
    h <- lengths(hits)      # number of matches per element
    y <- sum(h)             # total number of matches overall
    if (y == 0) return(x)   # nothing matched: return input unchanged

    ## Make a replacement key: one placeholder per *unique* match text,
    ## with the user function applied once per unique match.
    pats <- unique(unlist(hits))
    reps <- paste0("textcleanholder", seq_along(pats), "textcleanholder")
    freps <- unlist(lapply(pats, fun))

    pat_key <- data.table::data.table(pat = reps, replacement = freps)

    ## One row per (element, match) pair.  FIX: map each individual match to
    ## the placeholder of its *match text* via match().  The previous
    ## `pat = reps` used the unique-placeholder vector directly, which has
    ## length length(pats) rather than y; with duplicated matches this either
    ## errored (non-divisible recycling) or paired placeholders with the
    ## wrong hit positions.
    hit_key <- data.table::data.table(
        hit_id = rep(seq_along(h), h),
        pat = reps[match(unlist(hits), pats)],
        pattern_id = unlist(lapply(h, seq_len))
    )

    data.table::setkey(pat_key, pat)
    data.table::setkey(hit_key, pat)

    hit_key <- hit_key[pat_key][,
        hit_id := as.integer(hit_id)][,
        pattern_id := as.integer(pattern_id)]

    ## Restore document order: by element, then by match position.
    data.table::setorderv(hit_key, cols = c('hit_id', 'pattern_id'))

    ## Loop through and replace the first pattern in each element with a unique
    ## replacement substring (sub() consumes matches left-to-right).
    for (i in seq_len(y)) {

        hkr <- hit_key[i,]

        txt[hkr[, 'hit_id'][[1]]] <- sub(
            pattern,
            hkr[, 'pat'][[1]],
            txt[hkr[, 'hit_id'][[1]]],
            perl = TRUE
        )

    }

    ## Because the unique replacement substrings are so unlikely to have a
    ## collision, we can use fixed = TRUE and be very quick here
    txt <- mgsub(txt, hit_key[['pat']], hit_key[['replacement']], fixed = TRUE, ...)

    x[locs] <- txt
    x

}

## defunct version 2018-06-06
# fgsub <- function(x, pattern, fun, ...){
#
#     hit_id <- pattern_id <- pat <- NULL
#
#     locs <- stringi::stri_detect_regex(x, pattern)
#     locs[is.na(locs)] <- FALSE
#     txt <- x[locs]
#
#     hits <- stringi::stri_extract_all_regex(txt, pattern)
#
#
#     pats <- unique(unlist(hits))
#     reps <- paste0("textcleanholder", seq_along(pats), "textcleanholder")
#     freps <- unlist(lapply(pats, fun))
#
#     pat_key <- data.table::data.table(pat = pats, replacement = freps)
#
#     hit_key <- textshape::tidy_list(
#         set_names(
#             lapply(hits, function(x) set_names(x, seq_along(x))),
#             seq_along(hits)
#         ),
#         'hit_id', 'pat', 'pattern_id'
#     )
#
#
#     data.table::setkey(pat_key, pat)
#     data.table::setkey(hit_key, pat)
#
#     hit_key <- hit_key[pat_key][,
#         hit_id := as.integer(hit_id)][,
#         pattern_id := as.integer(pattern_id)]
#
#     data.table::setorderv(hit_key, cols = c('hit_id', 'pattern_id'))
#
#     for (i in seq_len(nrow(hit_key))) {
#         hkr <- hit_key[i,]
#         hkr[, 'pattern_id'][[1]]
#         txt[hkr[, 'hit_id'][[1]]] <- sub(
#             hkr[, 'pat'][[1]],
#             hkr[, 'replacement'][[1]],
#             txt[hkr[, 'hit_id'][[1]]],
#             perl = TRUE
#         )
#     }
#
#     x[locs] <- txt
#     x
#
# }

## old version removed 2018-06-01
# fgsub <- function(x, pattern, fun, ...){
#
#     locs <- stringi::stri_detect_regex(x, pattern)
#     locs[is.na(locs)] <- FALSE
#     txt <- x[locs]
#
#     hits <- stringi::stri_extract_all_regex(txt, pattern)
#     pats <- unique(unlist(hits))
#     reps <- paste0('textcleanholder', seq_along(pats), 'textcleanholder')
#     freps <- unlist(lapply(pats, fun))
#
#     txt <- mgsub(txt, pats, reps)
#
#     x[locs] <- mgsub(txt, reps, freps)
#     x
#
# }
-------------------------------------------------------------------------------- /R/fix_mdyyyy.R: --------------------------------------------------------------------------------
#' Coerce Character m/d/yyyy to Date
#'
#' Uses regular expressions to sub out a single day or month with a leading zero
#' and then coerces to a date object.
#'
#' @param x A character date in the form of m/d/yyyy where m and d can be single
#' integers like 1 for January.
#' @param \ldots ignored.
#' @return Returns a Date vector.
#' @export
#' @rdname fix_mdyyyy
#' @examples
#' fix_mdyyyy(c('4/23/2017', '12/1/2016', '3/3/2013', '12/12/2012', '2013-01-01'))
#' \dontrun{
#' library(dplyr)
#' data_frame(
#'     x = 1:4,
#'     y = LETTERS[1:4],
#'     start_date = c('4/23/2017', '12/1/2016', '3/3/2013', '12/12/2012'),
#'     end_date = c('5/23/2017', '12/9/2016', '3/3/2016', '2/01/2012')
#' ) %>%
#'     mutate_at(vars(ends_with('_date')), fix_mdyyyy)
#' }
fix_mdyyyy <- function(x, ...){
    UseMethod('fix_mdyyyy')
}


#' @export
#' @method fix_mdyyyy Date
fix_mdyyyy.Date <- function(x, ...){
    ## Already a Date: return as-is.  FIX: the S3 class of date objects is
    ## "Date" (capital D; see class(Sys.Date())), so the previous
    ## `fix_mdyyyy.date` method could never be dispatched and Date input
    ## fell through to the default character branch.
    x
}

#' @export
#' @method fix_mdyyyy default
fix_mdyyyy.default <- function(x, ...){
    ## Normalize to yyyy-mm-dd text first, then coerce to Date.
    as.Date(fix_mdyyyy_character(x), format = '%Y-%m-%d')
}

## Internal helper: zero-pad single-digit month/day and rearrange mm/dd/yyyy
## into yyyy-mm-dd.  Strings already in yyyy-mm-dd form match none of the
## patterns and pass through unchanged.
fix_mdyyyy_character <- function(x, ...){
    gsub(
        '(^\\d{2})(?:/)(\\d{2})(?:/)(\\d{4})',   # mm/dd/yyyy -> yyyy-mm-dd
        '\\3-\\1-\\2',
        gsub(
            '(/)(\\d{1}/)',     # /d/ -> /0d/ (zero-pad single-digit day)
            '\\10\\2',          # \1 then literal 0 then \2 (only 2 groups)
            gsub(
                '(^\\d{1}/)',   # leading d/ -> 0d/ (zero-pad month)
                '0\\1',
                x
            )
        )
    )
}
-------------------------------------------------------------------------------- /R/glue-reexports.R: --------------------------------------------------------------------------------
#' @importFrom glue glue
#' @export
glue::glue

#' @importFrom glue glue_collapse
#' @export
glue::glue_collapse
-------------------------------------------------------------------------------- /R/has_endmark.R: --------------------------------------------------------------------------------
#' Test for Incomplete Sentences
#'
#' A logical test of missing sentence ending punctuation.
#'
#' @param x A character vector.
#' @param endmarks The potential ending punctuation marks,
#' @param \dots ignored.
#' @return Returns a logical vector.
#' @keywords incomplete
#' @export
#' @examples
#' x <- c(
#'     "I like it.",
#'     "Et tu?",
#'     "Not so much",
#'     "Oh, I understand.",
#'     "At 3 p.m., we go",
#'     NA
#' )
#' has_endmark(x)
has_endmark <- function(x, endmarks = c('?', '.', '!'), ...){
    ## Build a character class from the supplied end marks; trailing
    ## whitespace after the mark is allowed.
    pattern <- sprintf('[%s]\\s*$', paste(endmarks, collapse = ""))
    ends_with_mark <- grepl(pattern, x, perl = TRUE, ...)
    ## NA elements are reported as FALSE rather than NA.
    !is.na(x) & ends_with_mark
}






-------------------------------------------------------------------------------- /R/like.R: --------------------------------------------------------------------------------
#' SQL Style LIKE
#'
#' Use like as a SQL-esque opertator for pattern matching. \code{\%like\%} is
#' case insensitive while \code{\%slike\%} is case sensitive. This is most useful
#' in a \code{dplyr::filter}.
#'
#' @param var A variable/column.
#' @param pattern A search pattern.
#' @export
#' @rdname like
#' @examples
#' state.name[state.name %like% 'or']
#' state.name[state.name %LIKE% 'or']
#' state.name[state.name %slike% 'or'] ## No Oregon
`%like%` <- function(var, pattern){
    ## Case-insensitive regex match over the variable.
    stringi::stri_detect_regex(var, pattern, case_insensitive = TRUE)
}

#' @rdname like
#' @export
`%LIKE%` <- `%like%`

#' @rdname like
#' @export
`%slike%` <- function(var, pattern){
    ## Case-sensitive variant.
    stringi::stri_detect_regex(var, pattern, case_insensitive = FALSE)
}

#' @rdname like
#' @export
`%SLIKE%` <- `%slike%`
-------------------------------------------------------------------------------- /R/make_plural.R: --------------------------------------------------------------------------------
#' Make Plural (or Verb to Singular) Versions of Words
#'
#' Add -s, -es, or -ies to words.
#'
#' @param x A vector of words to make plural.
#' @param keep.original logical.
#' If \code{TRUE} the original words are kept in
#' the return vector.
#' @param irregular A \code{data.frame} of singular and plural conversions for
#' irregular nouns. The first column should be singular and the second plural
#' form of the irregular noun.
#' @return Returns a vector of plural words.
#' @keywords plural
#' @export
#' @examples
#' x <- c('fox', 'sky', 'dog', 'church', 'fish', 'miss', 'match', 'deer', 'block')
#' make_plural(x)
make_plural <- function (x, keep.original = FALSE, 
    irregular = lexicon::pos_df_irregular_nouns) {

    stopifnot(is.data.frame(irregular))

    ## Case-insensitive lookup of irregular nouns (e.g., person -> people).
    hits <- match(tolower(x), tolower(irregular[[1]]))

    ## Sibilant endings (s, sh, x, z, ch) pluralize with -es, otherwise -s.
    ends <- "(sh?|x|z|ch)$"
    pluralify <- ifelse(grepl(ends, x, perl = TRUE), "es", "s")

    ## FIX: only consonant + y becomes -ies (sky -> skies); vowel + y keeps
    ## plain -s (boy -> boys).  The previous unconditional "ys$" rule wrongly
    ## produced e.g. "boies".
    out <- gsub("([^aeiou])ys$", "\\1ies", paste0(x, pluralify))

    ## Irregular forms override the rule-based forms.
    out[which(!is.na(hits))] <- irregular[[2]][hits[which(!is.na(hits))]]

    c(if (keep.original) {
        x
    }, out)

}
-------------------------------------------------------------------------------- /R/match_tokens.R: --------------------------------------------------------------------------------
#' Find Tokens that Match a Regex
#'
#' Given a text, find all the tokens that match a regex(es). This function is
#' particularly useful with \code{\link[textclean]{replace_tokens}}.
#'
#' @param x A character vector.
#' @param pattern Character string(s) to be matched in the given character vector.
#' @param ignore.case logical. If \code{TRUE} the case of the tokens/patterns
#' will be ignored.
#' @param \ldots ignored.
#' @return Returns a vector of tokens from a text matching a specific regex
#' pattern.
#' @export
#' @seealso \code{\link[textclean]{replace_tokens}}
#' @examples
#' with(DATA, match_tokens(state, c('^li', 'ou')))
#'
#' with(DATA, match_tokens(state, c('^Th', '^I'), ignore.case = TRUE))
#' with(DATA, match_tokens(state, c('^Th', '^I'), ignore.case = FALSE))
match_tokens <- function(x, pattern, ignore.case = TRUE, ...){

    if (!is.atomic(x)) stop('`x` should be a character vector')
    ## Unique tokens across all elements (lower-cased when case is ignored).
    y <- rm_na(unique(unlist(textshape::split_token(x, lower = ignore.case))))
    if (isTRUE(ignore.case)) pattern <- tolower(pattern)

    ## Keep tokens matching any of the supplied patterns (alternation).
    y[grepl(paste(paste0('(', pattern, ')'), collapse = '|'), y, perl = TRUE)]

}


-------------------------------------------------------------------------------- /R/mgsub.R: --------------------------------------------------------------------------------
#' Multiple \code{\link[base]{gsub}}
#'
#' \code{mgsub} - A wrapper for \code{\link[base]{gsub}} that takes a vector
#' of search terms and a vector or single value of replacements.
#'
#' @param x A character vector.
#' @param pattern Character string to be matched in the given character vector.
#' @param replacement Character string equal in length to pattern or of length
#' one which are a replacement for matched pattern.
#' @param leadspace logical.  If \code{TRUE} inserts a leading space in the
#' replacements.
#' @param trailspace logical.  If \code{TRUE} inserts a trailing space in the
#' replacements.
#' @param fixed logical.  If \code{TRUE}, pattern is a string to be matched as
#' is.  Overrides all conflicting arguments.
#' @param trim logical.  If \code{TRUE} leading and trailing white spaces are
#' removed and multiple white spaces are reduced to a single white space.
#' @param order.pattern logical.  If \code{TRUE} and \code{fixed = TRUE}, the
#' \code{pattern} string is sorted by number of characters to prevent substrings
#' replacing meta strings (e.g., \code{pattern = c("the", "then")} resorts to
#' search for "then" first).
#' @param safe logical.  If \code{TRUE} then the \pkg{mgsub} package is used as
#' the backend and performs safe substitutions.  The trade-off is that this mode
#' will slow the replacements down considerably.
#' @param \dots Additional arguments passed to \code{\link[base]{gsub}}.  In
#' \code{mgsub_regex_safe} this is other arguments passed to
#' \code{\link[mgsub]{mgsub}}.
#' @return \code{mgsub} - Returns a vector with the pattern replaced.
#' @seealso \code{\link[textclean]{replace_tokens}}
#' \code{\link[base]{gsub}}
#' @export
#' @rdname mgsub
#' @examples
#' mgsub(DATA$state, c("it's", "I'm"), c("it is", "I am"))
#' mgsub(DATA$state, "[[:punct:]]", "PUNC", fixed = FALSE)
#' \dontrun{
#' library(textclean)
#' hunthou <- replace_number(seq_len(1e5))
#'
#' textclean::mgsub(
#'     "'twenty thousand three hundred five' into 20305",
#'     hunthou,
#'     seq_len(1e5)
#' )
#' ## "'20305' into 20305"
#'
#' ## Larger example from: https://stackoverflow.com/q/18332463/1000343
#' ## A slower approach
#' fivehunthou <- replace_number(seq_len(5e5))
#'
#' testvect <- c("fifty seven", "four hundred fifty seven",
#'     "six thousand four hundred fifty seven",
#'     "forty six thousand four hundred fifty seven",
#'     "forty six thousand four hundred fifty seven",
#'     "three hundred forty six thousand four hundred fifty seven"
#' )
#'
#' textclean::mgsub(testvect, fivehunthou, seq_len(5e5))
#'
#' ## Safe substitution: Uses the mgsub package as the backend
#' dubious_string <- "Dopazamine is a fake chemical"
#' pattern <- c("dopazamin","do.*ne")
#' replacement <- c("freakout","metazamine")
#'
#' mgsub(dubious_string, pattern, replacement, ignore.case = TRUE, fixed = FALSE)
#' mgsub(dubious_string, pattern, replacement, safe = TRUE, fixed = FALSE)
#' }
mgsub <- function (x, pattern, replacement, leadspace = FALSE, 
    trailspace = FALSE, fixed = TRUE, trim = FALSE, order.pattern = fixed, 
    safe = FALSE, ...) {

    ## `ignore.case` is silently dropped by gsub(fixed = TRUE); warn the user.
    ## (scalar condition: use `&&`, not vectorized `&`)
    if (!is.null(list(...)$ignore.case) && fixed) {
        warning(
            paste0('`ignore.case = TRUE` can\'t be used with `fixed = TRUE`.\n',
                'Do you want to set `fixed = FALSE`?'
            ), 
            call. = FALSE
        )
    }

    ## Delegate to the safe backend (mgsub package) when requested.
    if (safe) {
        return(mgsub_regex_safe(x = x, pattern = pattern, 
            replacement = replacement, ...))
    }

    if (leadspace || trailspace) {
        replacement <- spaste(
            replacement, 
            trailing = trailspace, 
            leading = leadspace
        )
    }

    ## Longest-first ordering so that e.g. "then" is replaced before "the".
    if (fixed && order.pattern) {
        ord <- rev(order(nchar(pattern)))
        pattern <- pattern[ord]
        if (length(replacement) != 1) replacement <- replacement[ord]
    }

    ## Recycle a single replacement across all patterns.
    if (length(replacement) == 1) {
        replacement <- rep(replacement, length(pattern))
    }

    ## gsub errors on an empty pattern; drop such pairs with a warning.
    if (any(!nzchar(pattern))) {
        good_apples <- which(nzchar(pattern))
        pattern <- pattern[good_apples]
        replacement <- replacement[good_apples]
        warning(paste0(
            'Empty pattern found (i.e., `pattern = ""`).\n',
            'This pattern and replacement have been removed.'
        ), call. = FALSE)
    }

    ## Sequential substitution: later patterns see earlier replacements.
    for (i in seq_along(pattern)){
        x <- gsub(pattern[i], replacement[i], x, fixed = fixed, ...)
    }

    if (trim) {
        x <- gsub("\\s+", " ", gsub("^\\s+|\\s+$", "", x, perl=TRUE), perl=TRUE)
    }

    x
}

#' Multiple \code{\link[base]{gsub}}
#'
#' \code{mgsub_fixed} - An alias for \code{mgsub}.
#'
#' @export
#' @rdname mgsub
mgsub_fixed <- mgsub

#' Multiple \code{\link[base]{gsub}}
#'
#' \code{mgsub_regex} - An wrapper for \code{mgsub} with \code{fixed = FALSE}.
#'
#' @export
#' @rdname mgsub
mgsub_regex <- function(x, pattern, replacement, leadspace = FALSE, 
    trailspace = FALSE, fixed = FALSE, trim = FALSE, order.pattern = fixed, 
    ...) {

    mgsub(x = x, pattern = pattern, replacement = replacement, 
        leadspace = leadspace, trailspace = trailspace, fixed = fixed, 
        trim = trim, order.pattern = order.pattern, ...
    )

}

#' Multiple \code{\link[base]{gsub}}
#'
#' \code{mgsub_regex_safe} - An wrapper for \code{\link[mgsub]{mgsub}}.
#'
#' @export
#' @rdname mgsub
mgsub_regex_safe <- function(x, pattern, replacement, ...){
    mgsub::mgsub(string = x, pattern = pattern, replacement = replacement, ...)
}

## Internal: optionally wrap terms (or each element of a list of terms) with
## leading/trailing single spaces.
spaste <- function (terms, trailing = TRUE, leading = TRUE) {
    if (leading) {
        s1 <- " "
    } else {
        s1 <- ""
    }
    if (trailing) {
        s2 <- " "
    } else {
        s2 <- ""
    }
    pas <- function(x) paste0(s1, x, s2)
    if (is.list(terms)) {
        z <- lapply(terms, pas)
    } else {
        z <- pas(terms)
    }
    return(z)
}


-------------------------------------------------------------------------------- /R/replace_contraction.R: --------------------------------------------------------------------------------
#' Replace Contractions
#'
#' This function replaces contractions with long form.
#'
#' @param x The text variable.
#' @param contraction.key A two column hash of contractions (column 1) and
#' expanded form replacements (column 2). Default is to use
#' \code{\link[lexicon]{key_contractions}} data set.
#' @param ignore.case logical. Should case be ignored?
#' @param \dots ignored.
#' @return Returns a vector with contractions replaced.
#' @keywords contraction
#' @export
#' @examples
#' \dontrun{
#' x <- c("Mr. Jones isn't going.",
#'     "Check it out what's going on.",
#'     "He's here but didn't go.",
#'     "the robot at t.s. wasn't nice",
#'     "he'd like it if i'd go away")
#'
#' replace_contraction(x)
#' }
replace_contraction <- function(x, contraction.key = lexicon::key_contractions, 
    ignore.case = TRUE, ...) {

    ## FIX: honor the user-supplied `ignore.case` argument; it was previously
    ## hard-coded to TRUE in the mgsub() call, so `ignore.case = FALSE` was
    ## silently ignored.
    mgsub(x, contraction.key[[1]], contraction.key[[2]], 
        fixed = FALSE, ignore.case = ignore.case)

}

-------------------------------------------------------------------------------- /R/replace_date.R: --------------------------------------------------------------------------------
#' Replace Dates With Words
#'
#' Replaces dates with word equivalents.
#'
#' @param x The text variable.
#' @param pattern Character date regex string to be matched in the given
#' character vector.
#' @param replacement A function to operate on the extracted matches or a
#' character string which is a replacement for the matched pattern.
#' @param \ldots ignored.
#' @return Returns a vector with the pattern replaced.
#' @export
#' @examples
#' x <- c(
#'     NA, '11-16-1980 and 11/16/1980',
#'     "and 2017-02-08 but then there's 2/8/2017 too"
#' )
#'
#' replace_date(x)
#' replace_date(x, replacement = '<<DATE>>')
replace_date <- function(x, 
    pattern = NULL, 
    replacement = NULL, ...){

    ## NULL defaults resolve to the package-level pattern/function below.
    if (is.null(pattern)) pattern <- replace_date_pattern
    if (is.null(replacement)) replacement <- replace_date_fun

    ## A function replacement is applied per-match via fgsub(); a string
    ## replacement is a plain regex substitution.
    if (is.function(replacement)) {
        f_gsub <- fgsub
    } else {
        f_gsub <- stringi::stri_replace_all_regex
    }

    f_gsub(x, pattern, replacement)

}

## Matches mm/dd/yyyy (or mm-dd-yyyy) as well as yyyy/mm/dd (or yyyy-mm-dd);
## month and day may be one or two digits.
replace_date_pattern <- paste0(
    '([01]?[0-9])[/-]([0-2]?[0-9]|3[01])[/-]\\d{4}|\\d{4}[/-]',
    '([01]?[0-9])[/-]([0-2]?[0-9]|3[01])'
)

## Convert a single matched date string to words, e.g. "11/16/1980" ->
## "November sixteenth, one thousand nine hundred eighty".
replace_date_fun <- function(x){

    ## Move a trailing 4-digit year to the front (m/d/yyyy -> yyyy/m/d);
    ## year-first input does not match the regex and is left as-is.  Either
    ## way the parts split out as (year, month, day).
    parts <- strsplit(
        gsub('(^.+)([/-])(\\d{4})', '\\3\\2\\1', x, perl = TRUE), 
        '[/-]'
    )[[1]]

    ## Year as words (sibling helper), month by name, day as an ordinal.
    y <- replace_number(parts[1])
    m <- month.name[as.integer(parts[2])]
    d <- english::ordinal(as.integer(parts[3]))
    paste0(m, ' ', d, ', ', y)

}
-------------------------------------------------------------------------------- /R/replace_email.R: --------------------------------------------------------------------------------
#' Replace Email Addresses
#'
#' Replaces email addresses.
#'
#' @param x The text variable.
#' @param pattern Character time regex string to be matched in the given
#' character vector.
#' @param replacement A function to operate on the extracted matches or a
#' character string which is a replacement for the matched pattern.
#' @param \ldots ignored.
#' @return Returns a vector with email addresses replaced.
#' @export
#' @importFrom qdapRegex grab
#' @examples
#' x <- c(
#'     "fred is fred@@foo.com and joe is joe@@example.com - but @@this is a",
#'     "twitter handle for twit@@here.com or foo+bar@@google.com/fred@@foo.fnord",
#'     "hello world",
#'     NA
#' )
#'
#' replace_email(x)
#' replace_email(x, replacement = '<<EMAIL>>')
#' replace_email(x, replacement = '$1')
#'
#' ## Replacement with a function
#' replace_email(x,
#'     replacement = function(x){
#'         sprintf('<a href="mailto:%s" target="_blank">%s</a>', x, x)
#'     }
#' )
#'
#'
#' replace_email(x,
#'     replacement = function(x){
#'         gsub('@@.+$', ' {{at domain}}', x)
#'     }
#' )
replace_email <- function(x, pattern = qdapRegex::grab('rm_email'), 
    replacement = '', ...){

    ## A function replacement is applied per match via fgsub(); a character
    ## replacement is a plain regex substitution.
    replacer <- if (is.function(replacement)) {
        fgsub
    } else {
        stringi::stri_replace_all_regex
    }

    replacer(x, pattern, replacement)

}
-------------------------------------------------------------------------------- /R/replace_emoji.R: --------------------------------------------------------------------------------
#' Replace Emojis With Words/Identifier
#'
#' Replaces emojis with word equivalents or a token identifier for use in the
#' \pkg{sentimentr} package. Not that this function will coerce the text to
#' ASCII using
#' \code{Encoding(x) <- "latin1"; iconv(x, "latin1", "ASCII", "byte")}.
#' The function \code{replace_emoji} replaces emojis with text representations
#' while \code{replace_emoji_identifier} replaces with a unique identifier that
#' corresponds to \code{lexicon::hash_sentiment_emoji} for use in the
#' \pkg{sentimentr} package.
#'
#' @param x The text variable.
#' @param emoji_dt A \pkg{data.table} of emojis (ASCII byte representations)
#' and corresponding word/identifier meanings.
#' @param \ldots Other arguments passed to \code{.mgsub} (see
#' \code{textclean:::.mgsub} for details).
#' @return Returns a vector of strings with emojis replaced with word
#' equivalents.
#' @keywords emoji
#' @export
#' @rdname replace_emoji
#' @examples
#' fls <- system.file("docs/emoji_sample.txt", package = "textclean")
#' x <- readLines(fls)[1]
#' replace_emoji(x)
#' replace_emoji_identifier(x)
replace_emoji <- function(x, emoji_dt = lexicon::hash_emojis, ...){

    ## Convert to byte representation, swap each emoji byte sequence for its
    ## space-padded word equivalent, then collapse runs of whitespace.
    swapped <- .mgsub(
        emoji_dt[["x"]], 
        paste0(" ", emoji_dt[["y"]], " "), 
        to_byte(x), 
        ...
    )
    gsub("\\s+", " ", swapped)

}

lexicon_available_data <- lexicon::available_data

#' @export
#' @rdname replace_emoji
replace_emoji_identifier <- function(x, 
    emoji_dt = lexicon::hash_emojis_identifier, ...){

    ## Same pipeline as replace_emoji() but substitutes the sentimentr
    ## identifier tokens instead of word descriptions.
    swapped <- .mgsub(
        emoji_dt[["x"]], 
        paste0(" ", emoji_dt[["y"]], " "), 
        to_byte(x), 
        ...
    )
    gsub("\\s+", " ", swapped)

}


-------------------------------------------------------------------------------- /R/replace_emoticon.R: --------------------------------------------------------------------------------
#' Replace Emoticons With Words
#'
#' Replaces emoticons with word equivalents.
#'
#' @param x The text variable.
#' @param emoticon_dt A \pkg{data.table} of emoticons (graphical representations)
#' and corresponding word meanings.
#' @param \ldots Other arguments passed to \code{.mgsub} (see
#' \code{textclean:::.mgsub} for details).
#' @return Returns a vector of strings with emoticons replaced with word
#' equivalents.
#' @keywords emoticon
#' @export
#' @examples
#' x <- c(
#'     paste(
#'         "text from:",
#'         "http://www.webopedia.com/quick_ref/textmessageabbreviations_02.asp"
#'     ),
#'     "...
#' understanding what different characters used in smiley faces mean:",
#'     "The close bracket represents a sideways smile )",
#'     "Add in the colon and you have sideways eyes :",
#'     "Put them together to make a smiley face :)",
#'     "Use the dash - to add a nose :-)",
#'     paste(
#'         "Change the colon to a semi-colon ;",
#'         "and you have a winking face ;) with a nose ;-)"
#'     ),
#'     paste(
#'         "Put a zero 0 (halo) on top and now you have a winking,",
#'         "smiling angel 0;) with a nose 0;-)"
#'     ),
#'     "Use the letter 8 in place of the colon for sunglasses 8-)",
#'     "Use the open bracket ( to turn the smile into a frown :-(",
#'     "I have experience with using the xp emoticon"
#' )
#'
#' replace_emoticon(x)
replace_emoticon <- function(x, emoticon_dt = lexicon::hash_emoticons, ...){

    ## Wrap each literal emoticon (\Q...\E quotes regex metacharacters) in
    ## word boundaries and replace with a space-padded word equivalent.
    patterns <- paste0('\\b\\Q', emoticon_dt[['x']], '\\E\\b')
    words <- paste0(" ", emoticon_dt[['y']], " ")

    ## Collapse any doubled spaces introduced by the padding, then trim.
    swapped <- mgsub_regex(x, patterns, words)
    trimws(gsub("\\s+", " ", swapped))

}
-------------------------------------------------------------------------------- /R/replace_grade.R: --------------------------------------------------------------------------------
#' Replace Grades With Words
#'
#' Replaces grades with word equivalents.
#'
#' @param x The text variable.
#' @param grade_dt A \pkg{data.table} of grades and corresponding word meanings.
#' @param \ldots ignored.
#' @return Returns a vector of strings with grades replaced with word
#' equivalents.
#' @keywords grade
#' @export
#' @examples
#' (text <- replace_grade(c(
#'     "I give an A+",
#'     "He deserves an F",
#'     "It's C+ work",
#'     "A poor example deserves a C!"
#' )))
replace_grade <- function (x, grade_dt = lexicon::key_grade, ...) {

    ## Prefix each grade key with a space so only space-preceded grades match
    ## (avoids replacing e.g. the article "A"); leadspace restores the space
    ## in the replacement.
    keys <- paste0(' ', grade_dt[["x"]])

    mgsub(
        x, 
        keys, 
        grade_dt[["y"]], 
        fixed = TRUE, 
        leadspace = TRUE
    )

}

-------------------------------------------------------------------------------- /R/replace_hash.R: --------------------------------------------------------------------------------
#' Replace Hashes
#'
#' Replaces Twitter style hash tags (e.g., '#rstats').
#'
#' @param x The text variable.
#' @param pattern Character time regex string to be matched in the given
#' character vector.
#' @param replacement A function to operate on the extracted matches or a
#' character string which is a replacement for the matched pattern.
#' @param \ldots ignored.
#' @return Returns a vector with hashes replaced.
#' @export
#' @importFrom qdapRegex grab
#' @examples
#' x <- c("@@hadley I like #rstats for #ggplot2 work.",
#'     "Difference between #magrittr and #pipeR, both implement pipeline operators for #rstats:
#'     http://renkun.me/r/2014/07/26/difference-between-magrittr-and-pipeR.html @@timelyportfolio",
#'     "Slides from great talk: @@ramnath_vaidya: Interactive slides from Interactive Visualization
#'     presentation #user2014.
#' http://ramnathv.github.io/user2014-rcharts/#1"
#' )
#'
#' replace_hash(x)
#' replace_hash(x, replacement = '<<HASH>>')
#' replace_hash(x, replacement = '$3')
#'
#' ## Replacement with a function
#' replace_hash(x,
#'     replacement = function(x){
#'         paste0('{{', gsub('^#', 'TOPIC: ', x), '}}')
#'     }
#' )
replace_hash <- function(x, pattern = qdapRegex::grab('rm_hash'), 
    replacement = '', ...){

    ## A function replacement is applied per match via fgsub(); a character
    ## replacement is a plain regex substitution.
    if (is.function(replacement)) {
        f_gsub <- fgsub
    } else {
        f_gsub <- stringi::stri_replace_all_regex
    }

    f_gsub(x, pattern, replacement)

}
-------------------------------------------------------------------------------- /R/replace_html.R: --------------------------------------------------------------------------------
#' Replace HTML Markup
#'
#' Replaces HTML markup. The angle braces are removed and the HTML symbol
#' markup is replaced with equivalent symbols.
#'
#' @details Replacements for symbols are as follows:
#'
#' \tabular{lr}{
#' \bold{html} \tab \bold{symbol} \cr
#' &copy; \tab (c) \cr
#' &reg; \tab (r) \cr
#' &trade; \tab tm \cr
#' &ldquo; \tab " \cr
#' &rdquo; \tab " \cr
#' &lsquo; \tab ' \cr
#' &rsquo; \tab ' \cr
#' &bull; \tab - \cr
#' &middot; \tab - \cr
#' &sdot; \tab [] \cr
#' &ndash; \tab - \cr
#' &mdash; \tab - \cr
#' &cent; \tab cents \cr
#' &pound; \tab pounds \cr
#' &euro; \tab euro \cr
#' &ne; \tab != \cr
#' &frac12; \tab half \cr
#' &frac14; \tab quarter \cr
#' &frac34; \tab three fourths \cr
#' &deg; \tab degrees \cr
#' &larr; \tab <- \cr
#' &rarr; \tab -> \cr
#' &hellip; \tab ... \cr
#' &nbsp; \tab \cr
#' &lt; \tab < \cr
#' &gt; \tab > \cr
#' &laquo; \tab << \cr
#' &raquo; \tab >> \cr
#' &amp; \tab & \cr
#' &quot; \tab " \cr
#' &apos; \tab ' \cr
#' &yen; \tab yen \cr
#' }
#'
#' @param x The text variable.
#' @param symbol logical.
#' If \code{TRUE} the symbols are retained with appropriate
#' replacements. If \code{FALSE} they are removed.
#' @param \ldots Ignored.
#' @return Returns a vector with HTML markup replaced.
#' @keywords html
#' @export
#' @examples
#' x <- c(
#'     "<bold>Random</bold> text with symbols: &nbsp; &lt; &gt; &amp; &quot; &apos;",
#'     "<p>More text</p> &cent; &pound; &yen; &euro; &copy; &reg; &laquo; &raquo;"
#' )
#'
#' replace_html(x)
#' replace_html(x, FALSE)
#' replace_white(replace_html(x, FALSE))
replace_html <- function(x, symbol = TRUE, ...){
    ## Choose the substitution set: readable symbols, or plain removal.
    if (isTRUE(symbol)) {
        reps <- html_symbols[['symbol']]
    } else {
        reps <- " "
    }
    ## Strip angle-bracket tags first, then map named character entities.
    mgsub(gsub('<[^>]+>', ' ', x), html_symbols[['html']], reps)
}


## Lookup table mapping HTML named character references to plain-text symbols.
## FIX: the entity names were corrupted to their decoded glyphs in this copy
## (including an invalid `"""` literal where `"&quot;"` belonged); restored
## to the named references the function is documented to replace.
html_symbols <- data.frame(
    html = c(
        "&copy;", "&reg;", "&trade;", "&ldquo;",
        "&rdquo;", "&lsquo;", "&rsquo;", "&bull;", "&middot;", "&sdot;",
        "&ndash;", "&mdash;", "&cent;", "&pound;", "&euro;", "&ne;",
        "&frac12;", "&frac14;", "&frac34;", "&deg;", "&larr;", "&rarr;",
        "&hellip;", "&nbsp;", "&lt;", "&gt;", "&amp;", "&quot;", "&apos;",
        "&yen;", "&laquo;", "&raquo;"
    ),
    symbol = c("(c)", "(r)", "tm", "\"", "\"", "'",
        "'", "-", "-", "[]", "-", "-", "cents", "pounds", "euro", "!=",
        "half", "quarter", "three fourths", "degrees", "<-", "->", "...",
        " ", "<", ">", "&", '"', "'", "yen", "<<", ">>"
    ),
    stringsAsFactors = FALSE
)

## clipr::write_clip(textclean::glue("#' {html} \\tab {symb} \\cr\n", html = html_table[[1]], symb = html_table[[2]]))

-------------------------------------------------------------------------------- /R/replace_incomplete.R: --------------------------------------------------------------------------------
#' Denote Incomplete End Marks With "|"
#'
#' Replaces incomplete sentence end marks (.., ..., .?, ..?, en & em dash etc.)
#' with \code{"|"}.
#'
#' @param x The text variable.
#' @param replacement A string to replace incomplete punctuation marks with.
#' @param \dots ignored.
#' @return Returns a text variable (character string) with incomplete sentence
#' marks (.., ..., .?, ..?, en & em dash etc.) replaced with "|".
#' @keywords incomplete-sentence
#' @export
#' @examples
#' x <- c("the...", "I.?", "you.", "threw..", "we?")
#' replace_incomplete(x)
#' replace_incomplete(x, '...')
replace_incomplete <- function(x, replacement = "|", ...)
{
    ## Replace a trailing incomplete end-mark cluster (pattern `pat` below,
    ## anchored to end of string with optional trailing whitespace).
    gsub(sprintf('%s\\s*$', pat), replacement, x)
}

## Alternation of incomplete end-mark clusters: runs mixing '.', '?', '!'
## such as "..", "...", ".?", "..?", including a single internal space.
pat <- paste0("\\?*\\?[.]+|[.?!]*\\? [.][.?!]+|[.?!]*\\. [.?!]+|",
    "[.?!]+\\. [.?!]*|[.?!]+\\.[.?!]*|[.?!]*\\.[.?!]+")


-------------------------------------------------------------------------------- /R/replace_internet_slang.R: --------------------------------------------------------------------------------
#' Replace Internet Slang
#'
#' Replaces Internet slang.
#'
#' @param x The text variable.
#' @param slang A vector of slang strings to replace.
#' @param replacement A vector of string to replace slang with.
#' @param ignore.case logical. If \code{TRUE} the case of \code{slang} will be
#' ignored (replacement regardless of case).
#' @param \dots Other arguments passed to \code{\link[textclean]{replace_tokens}}.
#' @return Returns a vector with names replaced.
#' @export
#' @examples
#' x <- c(
#'     "Marc the n00b needs to RTFM otherwise ymmv.",
#'     "TGIF and a big w00t! The weekend is GR8!",
#'     "Will will do it",
#'     'w8...this PITA needs me to say LMGTFY...lmao.',
#'     NA
#' )
#'
#' replace_internet_slang(x)
#' replace_internet_slang(x, ignore.case = FALSE)
#' replace_internet_slang(x, replacement = '<<SLANG>>')
#' replace_internet_slang(
#'     x,
#'     replacement = paste0('{{ ', lexicon::hash_internet_slang[[2]], ' }}')
#' )
replace_internet_slang <- function(x, 
    slang = paste0('\\b', lexicon::hash_internet_slang[[1]], '\\b'), 
    replacement = lexicon::hash_internet_slang[[2]], ignore.case = TRUE, ...) {

    ## Regex (word-bounded) multi-substitution; one slang term per pattern.
    mgsub(x, slang, replacement, fixed = FALSE, ignore.case = ignore.case, ...)

}

## Cached copy of the slang hash (forces the lexicon data to be available).
im_his <- lexicon::hash_internet_slang


-------------------------------------------------------------------------------- /R/replace_kerning.R: --------------------------------------------------------------------------------
#' Replace Kerned (Spaced) with No Space Version
#'
#' In typography kerning is the adjustment of spacing. Often, in informal
#' writing, adding manual spaces (a form of kerning) coupled with all capital
#' letters is used for emphasis. This tool looks for 3 or more consecutive
#' capital letters with spaces in between and removes the spaces. Essentially,
#' the capitalized, kerned version is replaced with the word equivalent.
#'
#' @param x The text variable.
#' @param \ldots ignored.
#' @return Returns a vector with kern spaces removed.
#' @references \url{https://stackoverflow.com/a/47438305/1000343}
#' @author StackOverflow user @@ctwheels
#' @export
#' @examples
#' x <- c(
#'     "Welcome to A I: the best W O R L D!",
#'     "Hi I R is the B O M B for sure: we A G R E E indeed.",
#'     "A sort C A T indeed!",
#'     NA
#' )
#'
#' replace_kern(x)
replace_kern <- function(x, ...){
    ## a possible second approach from:
    ## https://stackoverflow.com/a/47438305/1000343
    ## paste0(
    ##     '(?:(?<=\\P{L})(?=(?:\\p{Lu}\\h+){2}\\p{Lu})|',
    ##     '\\G(?!\\A))\\p{Lu}\\K\\h+(?=\\p{Lu}(?!\\p{L}))'
    ## )
    ## PCRE: anchor at a run of >= 3 spaced capitals (lookahead), then use
    ## \G to walk the run and \K...\h+ to delete each inter-capital space.
    gsub(
        paste0(
            "(?:(?=\\b(?:\\p{Lu}\\h+){2}\\p{Lu})|",
            "\\G(?!\\A))\\p{Lu}\\K\\h+(?=\\p{Lu})"
        ),
        "",
        x,
        perl=TRUE
    )
}
-------------------------------------------------------------------------------- /R/replace_misspelling.R: --------------------------------------------------------------------------------
#' Replace Misspelled Words
#'
#' Replace misspelled words with their most likely replacement. This function
This function 4 | #' uses \pkg{hunspell} in the backend. \pkg{hunspell} must be installed in 5 | #' order to use this feature. 6 | #' 7 | #' @param x A character vector. 8 | #' @param \ldots ignored.. 9 | #' @return Returns a vector of strings with misspellings replaced. 10 | #' @note The function splits the string apart into tokens for speed 11 | #' optimization. After the replacement occurs the strings are pasted back 12 | #' together. The strings are not guaranteed to retain exact spacing of the 13 | #' original. 14 | #' @export 15 | #' @author Surin Space and Tyler Rinker . 16 | #' @examples 17 | #' \dontrun{ 18 | #' bad_string <- c("I cant spelll rigtt noow.", '', NA, 19 | #' 'Thiss is aslo mispelled?', 'this is 6$ and 38 cents in back2back!') 20 | #' replace_misspelling(bad_string) 21 | #' } 22 | replace_misspelling <- function(x, ...){ 23 | 24 | lower <- text <- replacement <- is_cap <- final <- element_id <- token_id <- NULL 25 | 26 | check_install('hunspell') 27 | 28 | if (!(is.character(x) | is.factor(x))) stop('`x` must be a character vector') 29 | is_na <- is.na(x) 30 | dat <- data.frame(text = as.character(x), stringsAsFactors = FALSE) 31 | 32 | token_df <- textshape::split_token(dat, lower = FALSE)[, 33 | lower := tolower(text)] 34 | 35 | tokens <- grep('[a-z]', rm_na(unique(token_df[['lower']])), value = TRUE) 36 | hits <- !hunspell::hunspell_check(tokens) 37 | 38 | misspelled <- tokens[hits] 39 | 40 | map <- data.table::data.table( 41 | lower = misspelled, 42 | replacement = unlist(lapply(hunspell::hunspell_suggest(misspelled), `[`, 1)) 43 | ) 44 | 45 | fixed_df <- map[token_df, on = "lower"] 46 | 47 | fixed_df_a <- fixed_df[!is.na(replacement),][, 48 | is_cap := substring(text, 1, 1) %in% LETTERS][, 49 | final := ifelse(is_cap, upper_first_letter(replacement), replacement)][] 50 | 51 | fixed_df_b <- fixed_df[is.na(replacement),][, final := text][] 52 | 53 | bound <- rbind(fixed_df_a, fixed_df_b, fill = TRUE) 54 | 55 | out <- 
data.table::setorder(bound, element_id, token_id)[, 56 | list(`final` = paste(final, collapse = ' ')), by = 'element_id'][, 57 | `final` := gsub("(\\s+)([.!?,;:])", "\\2", final, perl = TRUE)][['final']] 58 | out[is_na] <- NA 59 | out 60 | } 61 | 62 | 63 | upper_first_letter <- function(x){ 64 | substring(x, 1, 1) <- toupper(substring(x, 1, 1)) 65 | x 66 | } 67 | -------------------------------------------------------------------------------- /R/replace_money.R: -------------------------------------------------------------------------------- 1 | #' Replace Money With Words 2 | #' 3 | #' Replaces money with word equivalents. 4 | #' 5 | #' @param x The text variable. 6 | #' @param pattern Character money regex string to be matched in the given 7 | #' character vector. 8 | #' @param replacement A function to operate on the extracted matches or a 9 | #' character string which is a replacement for the matched pattern. 10 | #' @param \ldots ignored. 11 | #' @return Returns a vector with the pattern replaced. 
12 | #' @export 13 | #' @examples 14 | #' x <- c( 15 | #'     NA, 16 | #'     '$3.16 into "three dollars, sixteen cents"', 17 | #'     "-$20,333.18 too", 'fff' 18 | #' ) 19 | #' 20 | #' replace_money(x) 21 | #' replace_money(x, replacement = '<>') 22 | replace_money <- function(x, pattern = '(-?)([$])([0-9,]+)(\\.\\d{2})?', 23 |     replacement = NULL, ...){ 24 | 25 |     #if (is.null(pattern)) pattern <- replace_money_pattern 26 |     if (is.null(replacement)) replacement <- replace_money_fun 27 | 28 |     if (is.function(replacement)) { 29 |         f_gsub <- fgsub 30 |     } else { 31 |         f_gsub <- stringi::stri_replace_all_regex 32 |     } 33 | 34 |     f_gsub(x, pattern, replacement) 35 | 36 | } 37 | 38 | replace_money_fun <- function(x, decimal = ' and '){ 39 | 40 |     sign <- ifelse(grepl('^-', x, perl = TRUE), 'negative ', '') 41 |     if (grepl('\\.', x, perl = TRUE)) { 42 |         number <- replace_number( 43 |             gsub( 44 |                 '\\.', 45 |                 paste0(' dollars', decimal), 46 |                 gsub('(-?)([$])', '', x) 47 |             ) 48 |         ) 49 |         paste0(sign, number, ' cents') 50 |     } else { 51 |         number <- replace_number(gsub('(-?)([$])', '', x)) 52 |         ## FIX: append ' dollars' so whole amounts verbalize the currency
         ## (e.g., "$20" -> "twenty dollars"), matching the decimal branch above.
         paste0(sign, number, ' dollars') 53 |     } 54 | 55 | } 56 | -------------------------------------------------------------------------------- /R/replace_names.R: -------------------------------------------------------------------------------- 1 | #' Replace First/Last Names 2 | #' 3 | #' Replaces first/last names. 4 | #' 5 | #' @param x The text variable. 6 | #' @param names A vector of names to replace. This may be made more custom 7 | #'   through a vector provided from a named entity extractor. 8 | #' @param replacement A string to replace names with. 9 | #' @param \dots Other arguments passed to 10 | #' \code{\link[textclean]{replace_tokens}}. 11 | #' @return Returns a vector with names replaced.
12 | #' @export 13 | #' @examples 14 | #' x <- c( 15 | #'     "Mary Smith is not here", 16 | #'     "Karen is not a nice person", 17 | #'     "Will will do it", 18 | #'     NA 19 | #' ) 20 | #' 21 | #' replace_names(x) 22 | #' replace_names(x, replacement = '<>') 23 | replace_names <- function(x, 24 |     names = textclean::drop_element( 25 |         gsub( 26 |             "(^.)(.*)", "\\U\\1\\L\\2", 27 |             c(lexicon::freq_last_names[[1]], 28 |                 lexicon::common_names 29 |             ), perl = TRUE), 30 |         ## FIX: restore missing alternation pipe ("H[ea]Pa" matched only the
        ## nonsense tokens "HePa"/"HaPa"); intent is to drop the common words
        ## He/Ha/Pa (plus An/In/Un/To/So/Do/Oh) from the name list.
        "^([AIU]n|[TSD]o|H[ea]|Pa|Oh)$" 31 |     ), 32 |     replacement = "", ...) { 33 | 34 |     replace_tokens(x, names, replacement, ...) 35 | } 36 | 37 | im_ad <- lexicon::available_data 38 | im_cmn <- lexicon::common_names -------------------------------------------------------------------------------- /R/replace_non_ascii.R: -------------------------------------------------------------------------------- 1 | #' Replace Common Non-ASCII Characters 2 | #' 3 | #' \code{replace_non_ascii} - Replaces common non-ASCII characters. 4 | #' 5 | #' @param x The text variable. 6 | #' @param replacement Character string equal in length to pattern or of length 7 | #'   one which are a replacement for matched pattern. 8 | #' @param remove.nonconverted logical.  If \code{TRUE} unmapped encodings are 9 | #' deleted from the string. 10 | #' @param \dots ignored. 11 | #' @return Returns a text variable (character string) with non-ASCII characters 12 | #'   replaced.
13 | #' @keywords ascii 14 | #' @rdname replace_non_ascii 15 | #' @export 16 | #' @examples 17 | #' x <- c( 18 | #'     "Hello World", "6 Ekstr\xf8m", "J\xf6reskog", "bi\xdfchen Z\xfcrcher", 19 | #'     'This is a \xA9 but not a \xAE', '6 \xF7 2 = 3', 20 | #'     'fractions \xBC, \xBD, \xBE', 'cows go \xB5', '30\xA2' 21 | #' ) 22 | #' Encoding(x) <- "latin1" 23 | #' x 24 | #' 25 | #' replace_non_ascii(x) 26 | #' replace_non_ascii(x, remove.nonconverted = FALSE) 27 | #' 28 | #' z <- '\x95He said, \x93Gross, I am going to!\x94' 29 | #' Encoding(z) <- "latin1" 30 | #' z 31 | #' 32 | #' replace_curly_quote(z) 33 | #' replace_non_ascii(z) 34 | replace_non_ascii <- function (x, replacement = '', 35 |     remove.nonconverted = TRUE, ...) { 36 | 37 |     x <- replace_curly_quote(x) 38 |     x <- stringi::stri_trans_general(x, "latin-ascii") 39 |     x <- iconv(as.character(x), "", "ASCII", "byte") 40 |     Encoding(x) <- "latin1" 41 |     x <- mgsub(x, ser, reps) 42 | 43 |     if (isTRUE(remove.nonconverted)) { 44 |         x <- qdapRegex::rm_angle(x, replacement = replacement) 45 |         x <- stringi::stri_replace_all_regex(x, '[^ -~]+', 46 |             replacement = replacement) 47 |     } 48 | 49 |     x 50 | 51 | } 52 | 53 | 54 | #' Replace Common Non-ASCII Characters 55 | #' 56 | #' \code{replace_non_ascii2} - Replaces all non-ASCII (defined as \code{'[^ -~]+'}). 57 | #' This provides a subset of functionality found in \code{replace_non_ascii} that 58 | #' is faster and likely less accurate. 59 | #' 60 | #' @rdname replace_non_ascii 61 | #' @export 62 | replace_non_ascii2 <- function (x, replacement = '', ...) { 63 | 64 |     stringi::stri_replace_all_regex(x, '[^ -~]+', replacement = replacement) 65 | 66 | } 67 | 68 | # replace_non_ascii <- function(x, remove.nonconverted = TRUE, ...) 
{ 69 | # x <- replace_curly_quote(x) 70 | # x <- stringi::stri_trans_general(x, "latin-ascii") 71 | # x <- iconv(as.character(x), "", "ASCII", "byte") 72 | # Encoding(x) <-"latin1" 73 | # x <- mgsub(x, ser, reps) 74 | # if (isTRUE(remove.nonconverted)) x <- qdapRegex::rm_angle(x) 75 | # x 76 | # } 77 | 78 | #' Replace Common Non-ASCII Characters 79 | #' 80 | #' \code{replace_curly_quote} - Replaces curly single and double quotes. This 81 | #' provides a subset of functionality found in \code{replace_non_ascii} specific 82 | #' to quotes. 83 | #' 84 | #' @rdname replace_non_ascii 85 | #' @export 86 | replace_curly_quote <- function(x, ...){ 87 | replaces <- c('\x91', '\x92', '\x93', '\x94') 88 | Encoding(replaces) <- "latin1" 89 | for (i in 1:4) { 90 | x <- gsub(replaces[i], c("'", "'", "\"", "\"")[i], x, fixed = TRUE) 91 | } 92 | x 93 | } 94 | 95 | ser <- c("<80><9c>", "<80><9d>", "<80><98>", "<80><99>", 96 | "<80><9b>", "<87>", "<80>", "<80><93>", 97 | "<80><94>", "", "", "", '', '', 98 | '', '', '', '', '', '' 99 | ) 100 | 101 | reps <- c('"', '"', "'", "'", "'", "'", '...', '-', '-', "a", "e", "1/2", 102 | ' copyright ', ' registered trademark ', "/", '1/2', '1/4', '3/4', ' mu ', 103 | ' cent ' 104 | ) 105 | 106 | -------------------------------------------------------------------------------- /R/replace_number.R: -------------------------------------------------------------------------------- 1 | #' Replace Numbers With Text Representation 2 | #' 3 | #' \code{replace_number} - Replaces numeric represented numbers with words 4 | #' (e.g., 1001 becomes one thousand one). 5 | #' 6 | #' @param x The text variable. 7 | #' @param num.paste logical. If \code{FALSE} the elements of larger numbers are 8 | #' separated with spaces. If \code{TRUE} the elements will be joined without 9 | #' spaces. 10 | #' @param remove logical. If \code{TRUE} numbers are removed from the text. 
11 | #' @param \ldots Other arguments passed to \code{\link[english]{as.english}} 12 | #' @return Returns a vector with numbers replaced. 13 | #' @references Fox, J. (2005). Programmer's niche: How do you spell that number? 14 | #' R News. Vol. 5(1), pp. 51-55. 15 | #' @note The user may want to use \code{\link[textclean]{replace_ordinal}} 16 | #' first to remove ordinal number notation. For example 17 | #' \code{\link[textclean]{replace_number}} would turn "21st" into 18 | #' "twenty onest", whereas \code{\link[textclean]{replace_ordinal}} would 19 | #' generate "twenty first". 20 | #' @keywords number-to-word 21 | #' @rdname replace_number 22 | #' @export 23 | #' @examples 24 | #' x <- c( 25 | #' NA, 26 | #' 'then .456 good', 27 | #' 'none', 28 | #' "I like 346,457 ice cream cones.", 29 | #' "I like 123456789 cashes.", 30 | #' "They are 99 percent good and 45678.2345667" 31 | #' ) 32 | #' replace_number(x) 33 | #' replace_number(x, num.paste = TRUE) 34 | #' replace_number(x, remove=TRUE) 35 | #' \dontrun{ 36 | #' library(textclean) 37 | #' hunthou <- replace_number(seq_len(1e5)) 38 | #' 39 | #' textclean::mgsub( 40 | #' "'twenty thousand three hundred five' into 20305", 41 | #' hunthou, 42 | #' seq_len(1e5) 43 | #' ) 44 | #' ## "'20305' into 20305" 45 | #' 46 | #' ## Larger example from: https://stackoverflow.com/q/18332463/1000343 47 | #' ## A slower approach 48 | #' fivehunthou <- replace_number(seq_len(5e5)) 49 | #' 50 | #' testvect <- c("fifty seven", "four hundred fifty seven", 51 | #' "six thousand four hundred fifty seven", 52 | #' "forty six thousand four hundred fifty seven", 53 | #' "forty six thousand four hundred fifty seven", 54 | #' "three hundred forty six thousand four hundred fifty seven" 55 | #' ) 56 | #' 57 | #' textclean::mgsub(testvect, fivehunthou, seq_len(5e5)) 58 | #' 59 | #' as_ordinal(1:10) 60 | #' textclean::mgsub('I want to be 1 in line', 1:10, as_ordinal(1:10)) 61 | #' } 62 | replace_number <- function(x, num.paste = FALSE, remove = 
FALSE, ...) { 63 | 64 | if (is.numeric(x)){ 65 | x <- drop_sci_note(x) ## ensures scientific notation is not used 66 | } else { 67 | x <- as.character(x) 68 | } 69 | 70 | if (remove) return(stringi::stri_replace_all_regex(x, num_regex, "")) 71 | 72 | ## extract the numbers 73 | to_replace <- stringi::stri_extract_all_regex(x, num_regex) 74 | 75 | 76 | # browser() 77 | ## locations of the number strings 78 | locs <- which(!sapply2(to_replace, function(x) length(x) == 1 && is.na(x))) 79 | 80 | ## find locations of decimals 81 | decimal_locs <- lapply(to_replace[locs], stringi::stri_detect_regex, "\\.") 82 | 83 | ## get the numbers/texts tht correspond to number strings 84 | replaces <- to_replace[locs] 85 | 86 | ## lengths of the replacements lists so that it can be 87 | ## unlisted and then relisted later 88 | lens <- lengths(replaces) 89 | # browser() 90 | ## Data frame of the number text. 91 | ## This will be disected and put back together 92 | num_df <- data.frame( 93 | num = gsub(",", "", unlist(replaces)), 94 | stringsAsFactors = FALSE 95 | ) 96 | 97 | num_df[['decimal']] <- unlist( 98 | stringi::stri_extract_all_regex(num_df[[1]], "\\.\\d+") 99 | ) 100 | 101 | num_df[['integer']] <- floor(as.numeric(num_df[[1]])) 102 | num_df[['den']] <- num_df[['den1']] <- 10 ^ (nchar(num_df[['decimal']])- 1) 103 | 104 | num_df[['den']][!is.na(num_df[['den']])] <- paste0( 105 | eng(num_df[['den']][!is.na(num_df[['den']])], ...), 'ths' 106 | ) 107 | 108 | num_df[['numerator']] <- eng( 109 | num_df[['den1']] * as.numeric(num_df[['decimal']]), ... 110 | ) 111 | 112 | num_df[['den']][is.na(num_df[['den']])] <- "" 113 | num_df[['int']] <- eng(num_df[['integer']], ...) 
114 | 115 |     is_decimal <- grepl("\\.", num_df[[1]], perl = TRUE) 116 |     not_integer_decimal <- !grepl('\\d\\.', num_df[[1]], perl = TRUE) 117 | 118 |     num_df[['int']][is_decimal & not_integer_decimal] <- ifelse(grepl('^minus', num_df[['int']][is_decimal & not_integer_decimal]), 'minus', "") 119 | 120 |     num_df[['numerator']][!not_integer_decimal] <- paste( 121 |         'and', num_df[['numerator']][!not_integer_decimal] 122 |     ) 123 | 124 |     ## the replacements to swap in 125 |     replaces2 <- trimws(paste( 126 |         num_df[['int']], num_df[['numerator']], num_df[['den']] 127 |     )) 128 |     if (num.paste) replaces2 <- gsub("\\s+", "", replaces2) 129 | 130 |     ## Reconvert to the original list shape that matches replaces 131 |     replaces2 <- textshape::split_index(replaces2, textshape::starts(lens)) 132 | 133 |     ## for loop to do the gsubbing 134 |     for (i in seq_along(locs)) { 135 |         x[locs[i]] <- mgsub(x[locs[i]], replaces[[i]], replaces2[[i]]) 136 |     } 137 |     x 138 | } 139 | 140 | ## FIX: first alternative used an unescaped dot ("-?.?\\d+") which let ANY
## character precede the digits (e.g. " x456 " was extracted and then fed a
## non-number into as.numeric/english::as.english); escape it so only an
## optional literal decimal point (as in ".456") is matched.
num_regex <- paste0( 141 |     "(?<=^| )-?\\.?\\d+(?:\\d+)?(?= |\\.?$)|", 142 |     "(?<=^| )-?\\d+(?:\\.\\d+)?(?= |\\.?$)|", 143 |     "\\d+(?:,\\d{3})+(\\.\\d+)*" 144 | ) 145 | 146 | eng <- function(x, ...) as.character(english::as.english(x, ...)) 147 | 148 | 149 | #' Replace Numbers With Text Representation 150 | #' 151 | #' \code{as_ordinal} - A convenience wrapper for \code{english::ordinal} that 152 | #' takes integers and converts them to ordinal form. 153 | #' 154 | #' @rdname replace_number 155 | #' @export 156 | as_ordinal <- function(x, ...){ 157 |     english::ordinal(x) 158 | } 159 | 160 | 161 | -------------------------------------------------------------------------------- /R/replace_ordinal.R: -------------------------------------------------------------------------------- 1 | #' Replace Mixed Ordinal Numbers With Text Representation 2 | #' 3 | #' Replaces mixed text/numeric represented ordinal numbers with words (e.g., 4 | #' "1st" becomes "first"). 5 | #' 6 | #' @param x The text variable.
7 | #' @param num.paste logical. If \code{TRUE} the elements of larger numbers are 8 | #'   joined without spaces. If \code{FALSE} the elements are separated with 9 | #'   spaces. 10 | #' @param remove logical.  If \code{TRUE} ordinal numbers are removed from the text. 11 | #' @param \ldots ignored. 12 | #' @keywords ordinal-to-word 13 | #' @note Currently only implemented for ordinal values 1 through 100 14 | #' @export 15 | #' @examples 16 | #' x <- c( 17 | #'     "I like the 1st one not the 22nd one.", 18 | #'     "For the 100th time stop!" 19 | #' ) 20 | #' replace_ordinal(x) 21 | #' replace_ordinal(x, TRUE) 22 | #' replace_ordinal(x, remove = TRUE) 23 | #' replace_number(replace_ordinal("I like the 1st 1 not the 22nd 1.")) 24 | replace_ordinal <- function(x, num.paste = FALSE, remove = FALSE, ...) { 25 | 26 |     symb <- c("1st", "2nd", "3rd", paste0(4:19, "th"), 27 |         paste0(20:100, c("th", "st", "nd", "rd", rep("th", 6)))) 28 | 29 |     if (remove) { 30 |         ordinal <- "" 31 |     } else { 32 |         base_ord <- ordinal <- c("first", "second", "third", "fourth", 33 |             "fifth", "sixth", "seventh", "eighth", "ninth") 34 |         prefix <- c("twent", "thirt", "fort", "fift", "sixt", 35 |             "sevent", "eight", "ninet") 36 |         ordinal <- c(base_ord, "tenth", "eleventh", "twelfth", 37 |             "thirteenth", "fourteenth", "fifteenth", "sixteenth", 38 |             "seventeenth", "eighteenth", "nineteenth", 39 |             paste0(rep(prefix, each=10), c("ieth", paste("y", base_ord))), 40 |             "hundredth") 41 |     } 42 |     if (num.paste & !remove) ordinal <- gsub("\\s+", "", ordinal) 43 |     trimws(mgsub(x, paste0("\\b", symb, "\\b"), spaste(ordinal), fixed=FALSE)) 44 | } 45 | -------------------------------------------------------------------------------- /R/replace_rating.R: -------------------------------------------------------------------------------- 1 | #' Replace Ratings With Words 2 | #' 3 | #' Replaces ratings with word equivalents. 4 | #' 5 | #' @param x The text variable.
6 | #' @param rating_dt A \pkg{data.table} of ratings and corresponding word meanings. 7 | #' @param \ldots ignored. 8 | #' @return Returns a vector of strings with ratings replaced with word 9 | #' equivalents. 10 | #' @keywords rating 11 | #' @export 12 | #' @examples 13 | #' x <- c("This place receives 5 stars for their APPETIZERS!!!", 14 | #' "Four stars for the food & the guy in the blue shirt for his great vibe!", 15 | #' "10 out of 10 for both the movie and trilogy.", 16 | #' "* Both the Hot & Sour & the Egg Flower Soups were absolutely 5 Stars!", 17 | #' "For service, I give them no stars.", "This place deserves no stars.", 18 | #' "10 out of 10 stars.", 19 | #' "My rating: just 3 out of 10.", 20 | #' "If there were zero stars I would give it zero stars.", 21 | #' "Rating: 1 out of 10.", 22 | #' "I gave it 5 stars because of the sound quality.", 23 | #' "If it were possible to give them 0/10, they'd have it." 24 | #' ) 25 | #' 26 | #' replace_rating(x) 27 | replace_rating <- function (x, rating_dt = lexicon::key_rating, ...) { 28 | gsub("\\s+", " ", .mgsub(rating_dt[["x"]], paste0(" ", 29 | rating_dt[["y"]], " "), x, ...)) 30 | } 31 | 32 | 33 | IMPORT <- lexicon::available_data -------------------------------------------------------------------------------- /R/replace_symbol.R: -------------------------------------------------------------------------------- 1 | #' Replace Symbols With Word Equivalents 2 | #' 3 | #' This function replaces symbols with word equivalents (e.g., \code{@@} becomes 4 | #' \code{"at"}. 5 | #' 6 | #' @param x A character vector. 7 | #' @param dollar logical. If \code{TRUE} replaces dollar sign ($) with 8 | #' \code{"dollar"}. 9 | #' @param percent logical. If \code{TRUE} replaces percent sign (\%) with 10 | #' \code{"percent"}. 11 | #' @param pound logical. If \code{TRUE} replaces pound sign (#) with 12 | #' \code{"number"}. 13 | #' @param at logical. If \code{TRUE} replaces at sign (@@) with \code{"at"}. 14 | #' @param and logical. 
If \code{TRUE} replaces and sign (&) with \code{"and"}. 15 | #' @param with logical. If \code{TRUE} replaces with sign (w/) with 16 | #'   \code{"with"}. 17 | #' @param \ldots ignored. 18 | #' @return Returns a character vector with symbols replaced. 19 | #' @keywords symbol-replace 20 | #' @export 21 | #' @examples 22 | #' x <- c("I am @@ Jon's & Jim's w/ Marry",  23 | #'     "I owe $41 for food",  24 | #'     "two is 10% of a #" 25 | #' ) 26 | #' replace_symbol(x) 27 | replace_symbol <- function(x, dollar = TRUE, percent = TRUE, 28 |     pound = TRUE, at = TRUE, and = TRUE, with = TRUE, ...) { 29 | 30 |     ## FIX: gate vector must align with symbs = c("%", "$", "#", "&", "@",
    ## "w/o", "w/"); previous ordering made `dollar` toggle "%" and `at`
    ## toggle "&", contradicting the documented parameters.
    y <- c(percent, dollar, pound, and, at, with, with) 31 | 32 |     ## FIX: collapse the doubled whitespace introduced by the space-padded
    ## replacements ("\\s+", matching sibling replace_rating); the previous
    ## "\\+" pattern wrongly deleted literal plus signs from the text.  Also
    ## dropped the trailing comma after `fixed = TRUE`, which passed an empty
    ## argument that R rejects at call time.
    gsub("\\s+", " ", mgsub( 33 |         x,  34 |         pattern = symbs[y],  35 |         replacement = replaces[y],  36 |         fixed = TRUE 37 |     )) 38 | } 39 | 40 | symbs <- c("%", "$", "#", "&", "@", "w/o", "w/") 41 | replaces <- paste0(" ", c("percent", "dollar", "number", "and", "at",  42 |     "without", "with"), " ") 43 | 44 | -------------------------------------------------------------------------------- /R/replace_tag.R: -------------------------------------------------------------------------------- 1 | #' Replace Handle Tags 2 | #' 3 | #' Replaces Twitter style handle tags (e.g., '@@trinker'). 4 | #' 5 | #' @param x The text variable. 6 | #' @param pattern Character time regex string to be matched in the given 7 | #'   character vector. 8 | #' @param replacement A function to operate on the extracted matches or a 9 | #'   character string which is a replacement for the matched pattern. 10 | #' @param \ldots ignored. 11 | #' @return Returns a vector with tags replaced.
12 | #' @export 13 | #' @importFrom qdapRegex grab 14 | #' @examples 15 | #' x <- c("@@hadley I like #rstats for #ggplot2 work.", 16 | #' "Difference between #magrittr and #pipeR, both implement pipeline operators for #rstats: 17 | #' http://renkun.me/r/2014/07/26/difference-between-magrittr-and-pipeR.html @@timelyportfolio", 18 | #' "Slides from great talk: @@ramnath_vaidya: Interactive slides from Interactive Visualization 19 | #' presentation #user2014. http://ramnathv.github.io/user2014-rcharts/#1" 20 | #' ) 21 | #' 22 | #' replace_tag(x) 23 | #' replace_tag(x, replacement = '<>') 24 | #' replace_tag(x, replacement = '$3') 25 | #' 26 | #' ## Replacement with a function 27 | #' replace_tag(x, 28 | #' replacement = function(x){ 29 | #' gsub('@@', ' <> ', x) 30 | #' } 31 | #' ) 32 | replace_tag <- function(x, pattern = qdapRegex::grab('rm_tag'), 33 | replacement = '', ...){ 34 | 35 | if (is.function(replacement)) { 36 | f_gsub <- fgsub 37 | } else { 38 | f_gsub <- stringi::stri_replace_all_regex 39 | } 40 | 41 | f_gsub(x, pattern, replacement) 42 | 43 | } 44 | -------------------------------------------------------------------------------- /R/replace_time.R: -------------------------------------------------------------------------------- 1 | #' Replace Time Stamps With Words 2 | #' 3 | #' Replaces time stamps with word equivalents. 4 | #' 5 | #' @param x The text variable. 6 | #' @param pattern Character time regex string to be matched in the given 7 | #' character vector. 8 | #' @param replacement A function to operate on the extracted matches or a 9 | #' character string which is a replacement for the matched pattern. 10 | #' @param \ldots ignored. 11 | #' @return Returns a vector with the pattern replaced. 12 | #' @export 13 | #' @examples 14 | #' x <- c( 15 | #' NA, '12:47 to "twelve forty-seven" and also 8:35:02', 16 | #' 'what about 14:24.5', 'And then 99:99:99?' 
17 | #' ) 18 | #' 19 | #' ## Textual: Word version 20 | #' replace_time(x) 21 | #' 22 | #' ## Normalization: <