├── .Rbuildignore
├── .gitignore
├── .travis.yml
├── CODE_OF_CONDUCT.md
├── DESCRIPTION
├── NAMESPACE
├── NEWS
├── NEWS.md
├── R
    ├── add_comma_space.R
    ├── add_missing_endmark.R
    ├── check_text.R
    ├── check_text_logicals.R
    ├── drop_element.R
    ├── drop_row.R
    ├── fgsub.R
    ├── fix_mdyyyy.R
    ├── glue-reexports.R
    ├── has_endmark.R
    ├── like.R
    ├── make_plural.R
    ├── match_tokens.R
    ├── mgsub.R
    ├── replace_contraction.R
    ├── replace_date.R
    ├── replace_email.R
    ├── replace_emoji.R
    ├── replace_emoticon.R
    ├── replace_grade.R
    ├── replace_hash.R
    ├── replace_html.R
    ├── replace_incomplete.R
    ├── replace_internet_slang.R
    ├── replace_kerning.R
    ├── replace_misspelling.R
    ├── replace_money.R
    ├── replace_names.R
    ├── replace_non_ascii.R
    ├── replace_number.R
    ├── replace_ordinal.R
    ├── replace_rating.R
    ├── replace_symbol.R
    ├── replace_tag.R
    ├── replace_time.R
    ├── replace_to.R
    ├── replace_tokens.R
    ├── replace_url.R
    ├── replace_white.R
    ├── replace_word_elongation.R
    ├── strip.R
    ├── sub_holder.R
    ├── swap.R
    ├── sysdata.rda
    ├── textclean-package.R
    └── utils.R
├── README.Rmd
├── README.md
├── data
    └── DATA.rda
├── inst
    ├── CITATION
    ├── articles
    │   ├── Clark2011.pdf
    │   ├── Jurafsky2016.pdf
    │   └── Sproat2001.pdf
    ├── build.R
    ├── docs
    │   ├── emoji_sample.txt
    │   └── r_tweets.txt
    ├── extra_statdoc
    │   └── readme.R
    ├── maintenance.R
    ├── scraping_scripts
    │   ├── google_ngram_to_canonical.R
    │   └── scrape_leet.R
    └── staticdocs
    │   └── index.R
├── man
    ├── DATA.Rd
    ├── add_comma_space.Rd
    ├── add_missing_endmark.Rd
    ├── check_text.Rd
    ├── drop_element.Rd
    ├── drop_row.Rd
    ├── fgsub.Rd
    ├── fix_mdyyyy.Rd
    ├── has_endmark.Rd
    ├── like.Rd
    ├── make_plural.Rd
    ├── match_tokens.Rd
    ├── mgsub.Rd
    ├── print.check_text.Rd
    ├── print.sub_holder.Rd
    ├── print.which_are_locs.Rd
    ├── reexports.Rd
    ├── replace_contraction.Rd
    ├── replace_date.Rd
    ├── replace_email.Rd
    ├── replace_emoji.Rd
    ├── replace_emoticon.Rd
    ├── replace_grade.Rd
    ├── replace_hash.Rd
    ├── replace_html.Rd
    ├── replace_incomplete.Rd
    ├── replace_internet_slang.Rd
    ├── replace_kern.Rd
    ├── replace_misspelling.Rd
    ├── replace_money.Rd
    ├── replace_names.Rd
    ├── replace_non_ascii.Rd
    ├── replace_number.Rd
    ├── replace_ordinal.Rd
    ├── replace_rating.Rd
    ├── replace_symbol.Rd
    ├── replace_tag.Rd
    ├── replace_time.Rd
    ├── replace_to.Rd
    ├── replace_tokens.Rd
    ├── replace_url.Rd
    ├── replace_white.Rd
    ├── replace_word_elongation.Rd
    ├── strip.Rd
    ├── sub_holder.Rd
    ├── swap.Rd
    ├── textclean.Rd
    └── which_are.Rd
├── tests
    ├── testthat.R
    └── testthat
    │   ├── test-replace_emoticon.R
    │   ├── test-replace_grade.R
    │   ├── test-replace_rating.R
    │   └── test-strip.R
└── tools
    └── textclean_logo
        ├── r_textclean.png
        ├── r_textclean.pptx
        ├── r_textcleana.png
        └── resize_icon.txt


/.Rbuildignore:
--------------------------------------------------------------------------------
 1 | ^.*\.Rproj$
 2 | ^\.Rproj\.user$
 3 | ^\.gitignore
 4 | NEWS$
 5 | FAQ.md
 6 | NEWS.html
 7 | FAQ.html
 8 | ^\.travis\.yml$
 9 | travis-tool.sh
10 | inst/web
11 | contributors.geojson
12 | inst/build.R
13 | ^.*\.Rprofile$
14 | README.Rmd
15 | README.R
16 | travis.yml
17 | inst/maintenance.R
18 | tools/textclean_logo/r_textcleana.png
19 | tools/textclean_logo/r_textclean.pptx
20 | tools/textclean_logo/resize_icon.txt
21 | inst/staticdocs
22 | inst/extra_statdoc
23 | Thumbs.db
24 | inst/scraping_scripts
25 | inst/articles
26 | ^CODE_OF_CONDUCT\.md$
27 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | # History files
 2 | .Rhistory
 3 | 
 4 | # Example code in package build process
 5 | *-Ex.R
 6 | 
 7 | .Rprofile
 8 | .Rproj.user
 9 | textmod.Rproj
10 | Thumbs.db
11 | 
12 | *.Rproj


--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
 1 | language: r
 2 | 
 3 | sudo: false
 4 | 
 5 | before_install:
 6 |   - sh -e /etc/init.d/xvfb start
 7 | 
 8 | r_github_packages:
 9 |   - jimhester/covr
10 |   - trinker/textshape
11 |   - trinker/lexicon
12 |   - jeroenooms/hunspell
13 | 
14 | notifications:
15 |   email:
16 |     on_success:      change
17 |     on_failure:      change
18 | 
19 | after_success:
20 |   - Rscript -e 'covr::coveralls()'
21 | 
22 | r_build_args: "--resave-data=best"
23 | r_check_args: "--as-cran"
24 | 
25 | env:
26 |    global:
27 |      - DISPLAY=:99.0
28 |      - BOOTSTRAP_LATEX=1
29 |      - NOT_CRAN=true
30 |      - secure: "nhzZdgVEOmRO/pCpkb6vBgTbLU2igXmb5gbX+QWaV5YzDT5pqMnT+AtE5/+GMH7QxfFE1SKeA/r2w8XomNMpDvhIIpedwHpGywRGK3rtav2u108oQ73m2k2D3AQZ/YTAx7xPgVwCMveUqZ3xDyGkt220J3Hwfkpe341B2xQH0JQ="
31 |  
32 | 
33 | 
34 | 


--------------------------------------------------------------------------------
/CODE_OF_CONDUCT.md:
--------------------------------------------------------------------------------
 1 | # Contributor Code of Conduct
 2 | 
 3 | As contributors and maintainers of this project, we pledge to respect all people who 
 4 | contribute through reporting issues, posting feature requests, updating documentation,
 5 | submitting pull requests or patches, and other activities.
 6 | 
 7 | We are committed to making participation in this project a harassment-free experience for
 8 | everyone, regardless of level of experience, gender, gender identity and expression,
 9 | sexual orientation, disability, personal appearance, body size, race, ethnicity, age, or religion.
10 | 
11 | Examples of unacceptable behavior by participants include the use of sexual language or
12 | imagery, derogatory comments or personal attacks, trolling, public or private harassment,
13 | insults, or other unprofessional conduct.
14 | 
15 | Project maintainers have the right and responsibility to remove, edit, or reject comments,
16 | commits, code, wiki edits, issues, and other contributions that are not aligned to this 
17 | Code of Conduct. Project maintainers who do not follow the Code of Conduct may be removed 
18 | from the project team.
19 | 
20 | Instances of abusive, harassing, or otherwise unacceptable behavior may be reported by 
21 | opening an issue or contacting one or more of the project maintainers.
22 | 
23 | This Code of Conduct is adapted from the Contributor Covenant 
24 | (http://contributor-covenant.org), version 1.0.0, available at 
25 | http://contributor-covenant.org/version/1/0/0/
26 | 


--------------------------------------------------------------------------------
/DESCRIPTION:
--------------------------------------------------------------------------------
 1 | Package: textclean
 2 | Title: Text Cleaning Tools
 3 | Version: 0.9.7
 4 | Authors@R: c( 
 5 |         person("Tyler", "Rinker", email = "tyler.rinker@gmail.com", role = c("aut", "cre")), 
 6 |         person("ctwheels", "StackOverflow", role = "ctb"), 
 7 |         person("Surin", "Space", role = "ctb")  
 8 |     )
 9 | Maintainer: Tyler Rinker <tyler.rinker@gmail.com>
10 | Description: Tools to clean and process text.  Tools are geared at checking for substrings that
11 |           are not optimal for analysis and replacing or removing them (normalizing) with more
12 |           analysis friendly substrings (see Sproat, Black, Chen, Kumar, Ostendorf, & Richards
13 |           (2001) <doi:10.1006/csla.2001.0169>) or extracting them into new variables. For
14 |           example, emoticons are often used in text but not always easily handled by analysis
15 |           algorithms.  The replace_emoticon() function replaces emoticons with word
16 |           equivalents.
17 | Depends: R (>= 3.4.0)
18 | Imports: data.table, english(>= 1.0-2), glue (>= 1.3.0), lexicon (>= 1.0.0), mgsub (>= 1.5.0), qdapRegex,
19 |           stringi, textshape(>= 1.0.1), utils
20 | Suggests: hunspell, testthat
21 | License: GPL-2
22 | LazyData: TRUE
23 | RoxygenNote: 7.1.2
24 | Encoding: UTF-8
25 | URL: https://github.com/trinker/textclean
26 | BugReports: https://github.com/trinker/textclean/issues
27 | Collate: 
28 |     'add_comma_space.R'
29 |     'add_missing_endmark.R'
30 |     'utils.R'
31 |     'replace_html.R'
32 |     'check_text_logicals.R'
33 |     'check_text.R'
34 |     'drop_element.R'
35 |     'drop_row.R'
36 |     'fgsub.R'
37 |     'fix_mdyyyy.R'
38 |     'glue-reexports.R'
39 |     'has_endmark.R'
40 |     'like.R'
41 |     'make_plural.R'
42 |     'match_tokens.R'
43 |     'mgsub.R'
44 |     'replace_contraction.R'
45 |     'replace_date.R'
46 |     'replace_email.R'
47 |     'replace_emoji.R'
48 |     'replace_emoticon.R'
49 |     'replace_grade.R'
50 |     'replace_hash.R'
51 |     'replace_incomplete.R'
52 |     'replace_internet_slang.R'
53 |     'replace_kerning.R'
54 |     'replace_misspelling.R'
55 |     'replace_money.R'
56 |     'replace_names.R'
57 |     'replace_non_ascii.R'
58 |     'replace_number.R'
59 |     'replace_ordinal.R'
60 |     'replace_rating.R'
61 |     'replace_symbol.R'
62 |     'replace_tag.R'
63 |     'replace_time.R'
64 |     'replace_to.R'
65 |     'replace_tokens.R'
66 |     'replace_url.R'
67 |     'replace_white.R'
68 |     'replace_word_elongation.R'
69 |     'strip.R'
70 |     'sub_holder.R'
71 |     'swap.R'
72 |     'textclean-package.R'
73 | 


--------------------------------------------------------------------------------
/NAMESPACE:
--------------------------------------------------------------------------------
 1 | # Generated by roxygen2: do not edit by hand
 2 | 
 3 | S3method(fix_mdyyyy,date)
 4 | S3method(fix_mdyyyy,default)
 5 | S3method(print,check_text)
 6 | S3method(print,sub_holder)
 7 | S3method(print,which_are_locs)
 8 | S3method(strip,character)
 9 | S3method(strip,default)
10 | S3method(strip,factor)
11 | S3method(strip,list)
12 | export("%LIKE%")
13 | export("%SLIKE%")
14 | export("%like%")
15 | export("%slike%")
16 | export(add_comma_space)
17 | export(add_missing_endmark)
18 | export(as_ordinal)
19 | export(available_checks)
20 | export(check_text)
21 | export(drop_NA)
22 | export(drop_element)
23 | export(drop_element_fixed)
24 | export(drop_element_regex)
25 | export(drop_empty_row)
26 | export(drop_row)
27 | export(fgsub)
28 | export(fix_mdyyyy)
29 | export(glue)
30 | export(glue_collapse)
31 | export(has_endmark)
32 | export(is_it)
33 | export(keep_element)
34 | export(keep_element_fixed)
35 | export(keep_element_regex)
36 | export(keep_row)
37 | export(make_plural)
38 | export(match_tokens)
39 | export(mgsub)
40 | export(mgsub_fixed)
41 | export(mgsub_regex)
42 | export(mgsub_regex_safe)
43 | export(replace_contraction)
44 | export(replace_curly_quote)
45 | export(replace_date)
46 | export(replace_email)
47 | export(replace_emoji)
48 | export(replace_emoji_identifier)
49 | export(replace_emoticon)
50 | export(replace_from)
51 | export(replace_grade)
52 | export(replace_hash)
53 | export(replace_html)
54 | export(replace_incomplete)
55 | export(replace_internet_slang)
56 | export(replace_kern)
57 | export(replace_misspelling)
58 | export(replace_money)
59 | export(replace_names)
60 | export(replace_non_ascii)
61 | export(replace_non_ascii2)
62 | export(replace_number)
63 | export(replace_ordinal)
64 | export(replace_rating)
65 | export(replace_symbol)
66 | export(replace_tag)
67 | export(replace_time)
68 | export(replace_to)
69 | export(replace_tokens)
70 | export(replace_url)
71 | export(replace_white)
72 | export(replace_word_elongation)
73 | export(strip)
74 | export(sub_holder)
75 | export(swap)
76 | export(which_are)
77 | importFrom(data.table,":=")
78 | importFrom(glue,glue)
79 | importFrom(glue,glue_collapse)
80 | importFrom(qdapRegex,grab)
81 | 


--------------------------------------------------------------------------------
/NEWS:
--------------------------------------------------------------------------------
  1 | NEWS 
  2 | ====
  3 | 
  4 | Versioning
  5 | ----------
  6 | 
  7 | Releases will be numbered with the following semantic versioning format:
  8 | 
  9 | <major>.<minor>.<patch>
 10 | 
 11 | And constructed with the following guidelines:
 12 | 
 13 | * Breaking backward compatibility bumps the major (and resets the minor 
 14 |   and patch)
 15 | * New additions without breaking backward compatibility bumps the minor 
 16 |   (and resets the patch)
 17 | * Bug fixes and misc changes bumps the patch
 18 | 
 19 | 
 20 | 
 21 | textclean 0.9.4 -  
 22 | ----------------------------------------------------------------
 23 | 
 24 | BUG FIXES
 25 | 
 26 | * `replace_emoticon` replaced emoticon-like substrings within actual words.  
 27 |   Spotted thanks to Carolyn Challoner; see issue #46.
 28 | 
 29 | * `replace_number` failed if the number pattern contained two leading decimals 
 30 |   or hyphens.  Spotted thanks to Stefano De Sabbata; see issue #60.
 31 |   
 32 | * `replace_word_elongation` failed for repeating of the same character but of
 33 |   different case (e.g., `replace_word_elongation("Ooo")` resulted in `NA`.  This
 34 |   has been corrected.  Additionally, the `elongation.search.pattern` defined as 
 35 |   `"(?i)(?:^|\\b)\\w*([a-z])(?:\\1{2,})\\w*($|\\b)"` has been moved exterally, to
 36 |   a parameter, allowing the user to alter this pattern if desired.  Spotted 
 37 |   thanks to Stefano De Sabbata; see issue #59.
 38 | 
 39 | NEW FEATURES
 40 | 
 41 | * `replace_misspelling` added as a way to replace misspelled words with their 
 42 |   most likely replacement using **hunspell** in the backend.  Suggested by Surin
 43 |   Space; see issue #39.
 44 |   
 45 | * `as_ordinal` added as a convenience wrapper for `english::ordinal` that 
 46 |   takes integers and converts them to ordinal form.
 47 |   
 48 | * `%like%` added as an binary operator similar to SQL's LIKE.
 49 | 
 50 | MINOR FEATURES
 51 | 
 52 | * `fix_mdyyyy` added to correct dates in the form of m/d/yyyy to yyyy-mm-dd.
 53 | 
 54 | IMPROVEMENTS
 55 | 
 56 | * `replace_html` pics up the ability to replace "&laquo;" & "&raquo;" with ASCII
 57 |   equivalents "<<" & ">>".  Suggested by Ilya Shutov; see issue #48.
 58 | 
 59 | * All internal calls to `grepl()` now have `perl = TRUE` added as this is 
 60 |   generally a speed up. Suggested by Kyle Haynes (see #51).
 61 |   
 62 | CHANGES
 63 | 
 64 | * `filter_element()` and `filter_row()` have been deprecated for a few years.  
 65 |   They have now been removed.
 66 |   
 67 | 
 68 | textclean 0.9.3 
 69 | ----------------------------------------------------------------
 70 | 
 71 | Version update to comply with changes in the **glue** package's API.
 72 | 
 73 | 
 74 | 
 75 | textclean 0.8.0 - 0.9.2
 76 | ----------------------------------------------------------------
 77 | 
 78 | BUG FIXES
 79 | 
 80 | * `fgsub` had a bug in which the the original `pattern` in `fgsub` matches the 
 81 |   location in the string but when the replacement occurs this was done on the 
 82 |   entire string rather than the location of the first `pattern` match.  This
 83 |   means the extracted string was used as a search and might be found in places
 84 |   other than the original location (e.g., a leading boundary in '^T' replaced
 85 |   with '__' may have led to '__he __itle' rather than '__he Title' as expected
 86 |   in the string 'The Title').  See #35 for details.  The fix will add some time 
 87 |   to the computation but is safer.
 88 | 
 89 | NEW FEATURES
 90 | 
 91 | *  `replace_to`/`replace_from` added to remove from/to begin/end of string to/from 
 92 |   a character(s).
 93 |   
 94 | * The following replacement functions were added to provide remediation for 
 95 |   problems found in `check_text`: `replace_email`, `replace_hash`, 
 96 |   `replace_tag`, & `replace_url`.
 97 | 
 98 | MINOR FEATURES
 99 | 
100 | * `check_text` picks up a `checks` and `n` argument.  The former allows the user
101 |   to specify which checks to conduct.  The latter allows the user to truncate the
102 |   output to n number of elements with a closing `...[truncated]...`.  This makes
103 |   the function more useful and the code easier to maintain.
104 | 
105 | IMPROVEMENTS
106 | 
107 | * `replace_non_ascii` did not replace all non-ASCII characters.  This has been
108 |   fixed by an explicit replacement of '[^ -~]+' which are all non-ASCII characters.
109 |   See issue #34 for details.
110 | 
111 | 
112 | 
113 | textclean 0.7.3
114 | ----------------------------------------------------------------
115 | 
116 | Maintenance release to bring package up to date with the lexicon package API changes.
117 | 
118 | 
119 | textclean 0.7.0 - 0.7.2
120 | ----------------------------------------------------------------
121 | 
122 | NEW FEATURES
123 | 
124 | * `match_tokens` added to find all the tokens that match a regex(es) within a
125 |   given text vector.  This useful when combined with the `replace_tokens` 
126 |   function.
127 |   
128 | * Fixed versions of `drop_element`/`keep_element` added to allow for dropping
129 |   elements specified by a known vector rather than a regex.
130 | 
131 | * The `collapse` and `glue` functions from the **glue** package are reexported
132 |   for easy string manipulation.
133 |   
134 | * `replace_date` added for normalizing dates.
135 | 
136 | * `replace_time` added for normalizing time stamps.
137 | 
138 | * `replace_money` added for normalizing money references.
139 | 
140 | * `mgsub` picks up a `safe` argument using the **mgsub** package as the backend.
141 |   In addition `mgsub_regex_safe` added to make the usage explicit.  The safe mode
142 |   comes at the cost of speed.
143 | 
144 | IMPROVEMENTS
145 | 
146 | * `replace_names` drops the replacement of 
147 |     `c('An', 'To', 'Oh', 'So', 'Do', 'He', 'Ha', 'In', 'Pa', 'Un')` which are 
148 |     likely words and not names.
149 |     
150 | * `replace_html` picks ups some additional symbol replacements including:
151 |   `c("&trade;", "&ldquo;", "&rdquo;", "&lsquo;", "&rsquo;", "&bull;", "&middot;", 
152 |   "&sdot;", "&ndash;", "&mdash;", "&ne;", "&frac12;", "&frac14;", "&frac34;", 
153 |   "&deg;", "&larr;", "&rarr;", "&hellip;")`.
154 | 
155 | 
156 | 
157 | textclean 0.6.0 - 0.6.3
158 | ----------------------------------------------------------------
159 | 
160 | NEW FEATURES
161 | 
162 | * `replace_kern` added to replace a form of informal emphasis in which the
163 |   writer takes words >2 letters long, capitalizes the entire word, and places
164 |   spaces in between each letter.  This was contributed by Stack Overflow's
165 |   @ctwheels: https://stackoverflow.com/a/47438305/1000343.
166 | 
167 | * `replace_internet_slang` added to replace Internet acronyms and abbreviations
168 |   with machine friendly word equivalents.
169 |   
170 | * `replace_word_elongation` added to replace word elongations (a.k.a. "word 
171 |   lengthening") with the most likely normalized word form.  See 
172 |   http://www.aclweb.org/anthology/D11-105 for details.
173 |   
174 | * `fgsub` added for the ability to match, extract, operate a function over the
175 |   extracted strings, & replace the original matches with the extracted strings.
176 |   This performs similar functionality to `gsubfn::gsubfn` but is less powerful.
177 |   For more powerful needs see the **gsubfn** package.
178 | 
179 | 
180 | 
181 | textclean 0.4.0 - 0.5.1
182 | ----------------------------------------------------------------
183 | 
184 | BUG FIXES
185 | 
186 | * `replace_grade` did not use `fixed = TRUE` for its call to `mgsub`.  This could
187 |   result in the plus signs being interpreted as meta-characters.  This has been 
188 |   corrected.
189 | 
190 | NEW FEATURES
191 | 
192 | * `replace_names` added to remove/replace common first and last names from text 
193 |   data.
194 |   
195 | * `make_plural` added to make a vector of singular noun forms plural.
196 | 
197 | * `replace_emoji` and `replace_emoji_identifier` added for replacing emojis with
198 |   text or an identifier token for use in the **sentimentr** package.
199 | 
200 | MINOR FEATURES
201 | 
202 | * `mgsub_regex` and `mgsub_fixed` to provide wrappers for `mgsub` that makes
203 |   their use apparent without setting the `fixed` command.
204 |   
205 | * `replace_curly_quote` added to replace curly quotes with straight versions.
206 | 
207 | IMPROVEMENTS
208 | 
209 | * `replace_non_ascii` now uses `stringi::stri_trans_general` to coerce more 
210 |   non-ASCII characters to ASCII format.
211 |   
212 | * `check_text` now checks for HTML characters/tags.  Thanks to @Peter Gensler
213 |   for suggesting this (see issue #15). 
214 | 
215 | CHANGES
216 | 
217 | * `filter_` functions deprecated in favor of `drop_`/`keep_` versions of filter
218 |   functions.  This was change was to address the opposite meaning that **dplyr**'s 
219 |   `filter` has, which retains rows matching a pattern be default.
220 | 
221 | 
222 | 
223 | textclean 0.3.1
224 | ----------------------------------------------------------------
225 | 
226 | BUG FIXES
227 | 
228 | * `replace_tokens` added to complement `mgsub` for times when the user wants to 
229 |   replace fixed tokens with a single value or remove them entirely.  This yields 
230 |   an optimized solution that is much faster than `mgsub`.
231 | 
232 | CHANGES
233 | 
234 | * `mgusb` no longer uses `trim = TRUE` by default.
235 | 
236 | textclean 0.2.1 - 0.3.0
237 | ----------------------------------------------------------------
238 | 
239 | BUG FIXES
240 | 
241 | * `check_text` reported to use `replace_incomplete` rather than 
242 |   `add_missing_endmark` when endmark is missing.
243 |   
244 | NEW FEATURES
245 | 
246 | * The `replace_emoticon`, `replace_grade` and `replace_rating` functions have 
247 |   been moved from the **sentimentr** package to **textclean** as these are 
248 |   cleaning functions.  This makes the functions more modular and generalizable 
249 |   to all types of text cleaning.  These functions are still imported and 
250 |   exported by **sentimentr**.
251 |   
252 | * `replace_html` added to remove html tags and repalce symbols with appropriate
253 |   ASCII symbols.
254 |   
255 | * `add_missing_endmarks` added to detect missing endmarks and replace with the 
256 |   desired symbol.
257 | 
258 | IMPROVEMENTS
259 | 
260 | * `replace_number` now uses the *english* package making it faster and more 
261 |   maintainable.  In addition, the function now handles decimal places as well.
262 | 
263 | 
264 | 
265 | textclean 0.1.0 - 0.2.0
266 | ----------------------------------------------------------------
267 | 
268 | BUG FIXES
269 | 
270 | * `check_text` reported `NA` as non-ASCII.  This has been fixed.
271 | 
272 | NEW FEATURES
273 | 
274 | * `check_text` added to report on potential problems in a text vector.
275 | 
276 | * `replace_ordinal` added to replace ordinal numbers (e.g., 1st) with word 
277 |   representation (e.g., first).
278 |   
279 | * `swap` added to swap two patterns simultaneously.
280 | 
281 | * `filter_element` added to exclude matching elements from a vector.
282 | 
283 | 
284 | 
285 | textclean 0.0.1 
286 | ----------------------------------------------------------------
287 | 
288 | This package is a collection of tools to clean and process text.  Many of these tools have been taken from the **qdap** package and revamped to be more intuitive, better named, and faster.
289 | 


--------------------------------------------------------------------------------
/R/add_comma_space.R:
--------------------------------------------------------------------------------
 1 | #' Ensure Space After Comma
 2 | #' 
 3 | #' Adds a space after a comma as \code{strip} and many other functions may consider a 
 4 | #' comma separated string as one word (i.e., \code{"one,two,three"} becomes 
 5 | #' \code{"onetwothree"}  rather than \code{"one two three"}).
 6 | #' 
 7 | #' @param x The text variable.
 8 | #' @return Returns a vector of strings with commas that have a space after them.
 9 | #' @keywords comma space
10 | #' @export
11 | #' @examples
12 | #' \dontrun{
13 | #' x <- c("the,  dog,went", "I,like,it", "where are you", NA, "why", ",", ",f")
14 | #' add_comma_space(x)
15 | #' }
16 | add_comma_space <- function(x) {
17 |     gsub("(,)([^ ])", "\\1 \\2", x)
18 | }
19 | 
20 | 


--------------------------------------------------------------------------------
/R/add_missing_endmark.R:
--------------------------------------------------------------------------------
 1 | #' Add Missing Endmarks
 2 | #' 
 3 | #' Detect missing endmarks and replace with the desired symbol.
 4 | #' 
 5 | #' @param x  The text variable.
 6 | #' @param replacement Character string equal in length to pattern or of length 
 7 | #' one which are  a replacement for matched pattern.
 8 | #' @param endmarks The potential ending punctuation marks.
 9 | #' @param \dots Additional arguments passed to 
10 | #' \code{\link[textclean]{has_endmark}}.
11 | #' @return Returns a vector with missing endmarks added.
12 | #' @export
13 | #' @examples 
14 | #' x <- c(
15 | #'     "This in a", 
16 | #'     "I am funny!", 
17 | #'     "An ending of sorts%", 
18 | #'     "What do you want?"
19 | #' )
20 | #' 
21 | #' add_missing_endmark(x)
22 | add_missing_endmark <- function(x, replacement = "|", 
23 |     endmarks = c("?", ".", "!"), ...){
24 | 
25 |     locs <- which(!has_endmark(x, ...))
26 |     x[locs] <- paste0(x[locs], replacement)
27 |     x
28 | 
29 | }
30 | 
31 | 


--------------------------------------------------------------------------------
/R/drop_element.R:
--------------------------------------------------------------------------------
 1 | #' Filter Elements in a Vetor
 2 | #' 
 3 | #' \code{drop_element} - Filter to drop the matching elements of a vector.
 4 | #' 
 5 | #' @param x A character vector.
 6 | #' @param pattern A regex pattern to match for exclusion.
 7 | #' @param regex logical.  If setting this to \code{TRUE} please use 
 8 | #' \code{drop_element_regex} or \code{keep_element_regex} directly as this will
 9 | #' provide better control and optimization.
10 | #' @param \ldots Other arguments passed to \code{\link[base]{grep}} if 
11 | #' \code{regex}.  If \code{fixed}, then elements to drop/keep.
12 | #' @return Returns a vector with matching elements removed.
13 | #' @rdname drop_element
14 | #' @export
15 | #' @examples 
16 | #' x <- c('dog', 'cat', 'bat', 'dingo', 'dragon', 'dino')
17 | #' drop_element(x, '^d.+?g')
18 | #' keep_element(x, '^d.+?g')
19 | #' drop_element(x, 'at$')
20 | #' drop_element(x, '^d')
21 | #' drop_element(x, '\\b(dog|cat)\\b')
22 | #' 
23 | #' drop_element_fixed(x, 'bat', 'cat')
24 | #' drops <- c('bat', 'cat')
25 | #' drop_element_fixed(x, drops)
26 | drop_element <- function(x, pattern, regex = TRUE, ...){
27 | 
28 |     if (isTRUE(regex)) {
29 |         drop_element_regex(x, pattern, ...)
30 |     } else {
31 |         message('Please use `drop_element_fixed` for better control.')
32 |         drop_element_fixed(x, ...) 
33 |     }
34 | }
35 | 
36 | #' @rdname drop_element
37 | #' @export
38 | drop_element_regex  <- function(x, pattern, ...){
39 | 
40 |     grep(pattern, x, value =  TRUE, invert = TRUE, perl = TRUE, ...)
41 | }
42 | 
43 | #' @rdname drop_element
44 | #' @export
45 | drop_element_fixed  <- function(x, ...){
46 | 
47 |     x[!x %in% unlist(list(...))]
48 | }
49 | 
50 | #' Filter Elements in a Vetor
51 | #' 
52 | #' \code{keep_element} - Filter to keep the matching elements of a vector.
53 | #' 
54 | #' @rdname drop_element
55 | #' @export
56 | keep_element  <- function(x, pattern, regex = TRUE, ...){
57 | 
58 |     if (isTRUE(regex)) {
59 |         keep_element_regex(x, pattern, ...)
60 |     } else {
61 |         message('Please use `keep_element_fixed` for better control.')        
62 |         keep_element_fixed(x, ...)    
63 |     }
64 | }
65 | 
66 | #' @rdname drop_element
67 | #' @export
68 | keep_element_fixed  <- function(x, ...){
69 | 
70 |     x[x %in% unlist(list(...))]
71 | }
72 | 
73 | 
74 | #' @rdname drop_element
75 | #' @export
76 | keep_element_regex <- function(x, pattern, ...){
77 | 
78 |     grep(pattern, x, value =  TRUE, perl = TRUE, ...)
79 | }
80 | 
81 | 
82 | 
83 | 


--------------------------------------------------------------------------------
/R/drop_row.R:
--------------------------------------------------------------------------------
  1 | #' Filter Rows That Contain Markers
  2 | #' 
  3 | #' \code{drop_row} - Remove rows from a data set that contain a given 
  4 | #' marker/term.
  5 | #' 
  6 | #' @param dataframe A dataframe object.
  7 | #' @param column Column name to search for markers/terms.
  8 | #' @param terms The regex terms/markers of the rows that are to be removed from 
  9 | #' the dataframe.  
 10 | #' @param \ldots Other arguments passed to \code{\link[base]{grepl}}.
 11 | #' @return \code{drop_row} - returns a dataframe with the termed/markered rows 
 12 | #' removed.
 13 | #' @rdname drop_row
 14 | #' @export
 15 | #' @examples
 16 | #' \dontrun{
 17 | #' ## drop_row EXAMPLE:
 18 | #' drop_row(DATA, "person", c("sam", "greg"))
 19 | #' keep_row(DATA, "person", c("sam", "greg"))
 20 | #' drop_row(DATA, 1, c("sam", "greg"))
 21 | #' drop_row(DATA, "state", c("Comp"))
 22 | #' drop_row(DATA, "state", c("I "))
 23 | #' drop_row(DATA, "state", c("you"), ignore.case=TRUE)
 24 | #' 
 25 | #' ## drop_empty_row EXAMPLE:
 26 | #' (dat <- rbind.data.frame(DATA[, c(1, 4)], matrix(rep(" ", 4), 
 27 | #'    ncol =2, dimnames=list(12:13, colnames(DATA)[c(1, 4)]))))
 28 | #' drop_empty_row(dat)
 29 | #' 
 30 | #' ## drop_NA EXAMPLE:
 31 | #' DATA[1:3, "state"] <- NA
 32 | #' drop_NA(DATA)
 33 | #' }
 34 | drop_row <- function(dataframe, column, terms, ...) {
 35 |     
 36 |     terms <- paste(terms, collapse="|")
 37 |     if (length(dataframe[[column]]) == 0) {
 38 |         stop(
 39 |             "No columns in the data appear to match supplied `column`", 
 40 |             call. = FALSE
 41 |         )   
 42 |     }
 43 |     dataframe <- dataframe[!grepl(terms, dataframe[[column]], perl=TRUE, ...), ]
 44 |     rownames(dataframe) <- NULL
 45 |     
 46 |     dataframe
 47 | }
 48 | 
 49 | #' Filter Rows That Contain Markers
 50 | #' 
 51 | #' \code{keep_row} - Keep rows from a data set that contain a given marker/term.
 52 | #' @rdname drop_row
 53 | #' @export
 54 | keep_row <- function(dataframe, column, terms, ...) {
 55 |     
 56 |     terms <- paste(terms, collapse="|")
 57 |     if (length(dataframe[[column]]) == 0) {
 58 |         stop(
 59 |             "No columns in the data appear to match supplied `column`", 
 60 |             call. = FALSE
 61 |             )    
 62 |     }
 63 |     dataframe <- dataframe[grepl(terms, dataframe[[column]], perl=TRUE, ...), ]
 64 |     rownames(dataframe) <- NULL
 65 |     
 66 |     dataframe
 67 | }
 68 | 
 69 | 
 70 | #' Remove Empty Rows in a Data Frame
 71 | #' 
 72 | #' \code{drop_empty_row} - Removes the empty rows of a data set that are common in 
 73 | #' reading in data.
 74 | #' 
 75 | #' @return \code{drop_empty_row} - returns a dataframe with empty rows removed.
 76 | #' @rdname drop_row
 77 | #' @export
 78 | drop_empty_row <- function(dataframe) {
 79 |     x <- apply(dataframe, 1, function(x) {
 80 |         paste(stats::na.omit(x), collapse = "")
 81 |     })
 82 |     return(dataframe[!grepl("^\\s*$", x, perl = TRUE),  ,drop = FALSE] )
 83 | }
 84 | 
 85 | 
 86 | #' Remove Empty Rows in a Data Frame
 87 | #' 
 88 | #' \code{drop_NA} - Removes the \code{NA} rows of a data set.
 89 | #' 
 90 | #' @return \code{drop_NA} - returns a dataframe with \code{NA} rows removed.
 91 | #' @rdname drop_row
 92 | #' @export
 93 | drop_NA <- function(dataframe, column = TRUE, ...){
 94 |     
 95 |     column <- detect_text_column(dataframe, column)
 96 |     
 97 |     dataframe[!is.na(dataframe[[column]]), ,drop = FALSE]
 98 |     
 99 | }
100 | 
101 | 
102 | 
103 | 
104 | 


--------------------------------------------------------------------------------
/R/fgsub.R:
--------------------------------------------------------------------------------
  1 | #' Replace a Regex with an Functional Operation on the Regex Match
  2 | #' 
  3 | #' This is a stripped down version of \code{gsubfn} from the \pkg{gsubfn} 
  4 | #' package.  It finds a regex match, and then uses a function to operate on
  5 | #' these matches and uses them to replace the original matches.  Note that
  6 | #' the \pkg{stringi} packages is used for matching and extracting the regex 
  7 | #' matches.  For more powerful or flexible needs please see the \pkg{gsubfn} 
  8 | #' package.
  9 | #' 
 10 | #' @param x A character vector.
 11 | #' @param pattern Character string to be matched in the given character vector. 
 12 | #' @param fun A function to operate on the extracted matches.
 13 | #' @param \ldots ignored.
 14 | #' @return Returns a vector with the pattern replaced.
 15 | #' @export
 16 | #' @importFrom data.table :=
 17 | #' @seealso \code{\link[gsubfn]{gsubfn}}
 18 | #' @examples 
 19 | #' ## In this example the regex looks for words that contain a lower case letter 
 20 | #' ## followed by the same letter at least 2 more times.  It then extracts these
 21 | #' ## words, splits them appart into letters, reverses the string, pastes them
 22 | #' ## back together, wraps them with double angle braces, and then puts them back 
 23 | #' ## at the original locations.
 24 | #' fgsub(
 25 | #'     x = c(NA, 'df dft sdf', 'sd fdggg sd dfhhh d', 'ddd'),
 26 | #'     pattern = "\\b\\w*([a-z])(\\1{2,})\\w*\\b",
 27 | #'     fun = function(x) {
 28 | #'         paste0('<<', paste(rev(strsplit(x, '')[[1]]), collapse =''), '>>')
 29 | #'     }    
 30 | #' )
 31 | #' 
 32 | #' ## In this example we extract numbers, strip out non-digits, coerce them to 
 33 | #' ## numeric, cut them in half, round up to the closest integer, add the commas 
 34 | #' ## back, and replace back into the original locations.
 35 | #' fgsub(
 36 | #'     x = c(NA, 'I want 32 grapes', 'he wants 4 ice creams', 
 37 | #'         'they want 1,234,567 dollars'
 38 | #'     ),
 39 | #'     pattern = "[\\d,]+",
 40 | #'     fun = function(x) {
 41 | #'         prettyNum(
 42 | #'             ceiling(as.numeric(gsub('[^0-9]', '', x))/2), 
 43 | #'             big.mark = ','
 44 | #'         )
 45 | #'     }    
 46 | #' )
 47 | #' 
 48 | #' ## In this example we extract leading zeros, convert to an equal number of 
 49 | #' ## spaces. 
 50 | #' fgsub(
 51 | #'     x = c(NA, "00:04", "00:08", "00:01", "06:14", "00:02", "00:04"),
 52 | #'     pattern = '^0+',
 53 | #'     fun = function(x) {gsub('0', ' ', x)}
 54 | #' )
 55 | fgsub <- function(x, pattern, fun, ...){
 56 | 
 57 |     hit_id <- pattern_id <- pat <- NULL
 58 |     
 59 |     locs <- stringi::stri_detect_regex(x, pattern)
 60 |     locs[is.na(locs)] <- FALSE
 61 |     txt <- x[locs]
 62 | 
 63 |     hits <- stringi::stri_extract_all_regex(txt, pattern)
 64 |     
 65 |     ## Make unique replacement substrings
 66 |     h <- lengths(hits)
 67 |     y <- sum(h)
 68 |     if (y == 0) return(x)
 69 |     counter <- ceiling(y/26)
 70 | 
 71 |     ## Make a replacement key
 72 |     pats <- unique(unlist(hits))
 73 |     reps <- paste0("textcleanholder", seq_along(pats), "textcleanholder")
 74 |     freps <- unlist(lapply(pats, fun))
 75 | 
 76 |     pat_key <- data.table::data.table(pat = reps, replacement = freps)
 77 |     
 78 |     hit_key <- data.table::data.table(
 79 |         hit_id = rep(seq_len(length(h)), h),
 80 |         pat = reps,
 81 |         pattern_id = unlist(lapply(h, seq_len))
 82 |     )
 83 |     
 84 |     data.table::setkey(pat_key, pat)
 85 |     data.table::setkey(hit_key, pat)
 86 |     
 87 |     hit_key <- hit_key[pat_key][, 
 88 |         hit_id := as.integer(hit_id)][, 
 89 |         pattern_id := as.integer(pattern_id)]
 90 |     
 91 |     data.table::setorderv(hit_key, cols = c('hit_id', 'pattern_id'))
 92 | 
 93 |     ## Loop through and replace the first pattern in each element with a unique 
 94 |     ## replacement substring
 95 |     for (i in seq_len(y)) {
 96 |         
 97 |         hkr <- hit_key[i,]
 98 |         
 99 |         txt[hkr[, 'hit_id'][[1]]] <- sub(
100 |             pattern, 
101 |             hkr[, 'pat'][[1]], 
102 |             txt[hkr[, 'hit_id'][[1]]], 
103 |             perl = TRUE
104 |         )
105 |         
106 |     }
107 | 
108 |     ## Because the unique repalcment substrings are so unlikely to have a 
109 |     ## collision, we can use fixed = TRUE and be very quick here
110 |     txt <- mgsub(txt, hit_key[['pat']],  hit_key[['replacement']], fixed = TRUE, ...)
111 |     
112 |     x[locs] <- txt
113 |     x
114 | 
115 | }
116 | 
117 | ## defunct version 2018-06-06
118 | # fgsub <- function(x, pattern, fun, ...){
119 | # 
120 | #     hit_id <- pattern_id <- pat <- NULL
121 | #     
122 | #     locs <- stringi::stri_detect_regex(x, pattern)
123 | #     locs[is.na(locs)] <- FALSE
124 | #     txt <- x[locs]
125 | # 
126 | #     hits <- stringi::stri_extract_all_regex(txt, pattern)
127 | #     
128 | # 
129 | #     pats <- unique(unlist(hits))
130 | #     reps <- paste0("textcleanholder", seq_along(pats), "textcleanholder")
131 | #     freps <- unlist(lapply(pats, fun))
132 | # 
133 | #     pat_key <- data.table::data.table(pat = pats, replacement = freps)
134 | #     
135 | #     hit_key <- textshape::tidy_list(
136 | #         set_names(
137 | #             lapply(hits, function(x) set_names(x, seq_along(x))), 
138 | #             seq_along(hits)
139 | #         ),
140 | #         'hit_id', 'pat', 'pattern_id'
141 | #     )
142 | #     
143 | # 
144 | #     data.table::setkey(pat_key, pat)
145 | #     data.table::setkey(hit_key, pat)
146 | #     
147 | #     hit_key <- hit_key[pat_key][, 
148 | #         hit_id := as.integer(hit_id)][, 
149 | #         pattern_id := as.integer(pattern_id)]
150 | #     
151 | #     data.table::setorderv(hit_key, cols = c('hit_id', 'pattern_id'))
152 | # 
153 | #     for (i in seq_len(nrow(hit_key))) {
154 | #         hkr <- hit_key[i,]     
155 | #         hkr[, 'pattern_id'][[1]]
156 | #         txt[hkr[, 'hit_id'][[1]]] <- sub(
157 | #             hkr[, 'pat'][[1]], 
158 | #             hkr[, 'replacement'][[1]], 
159 | #             txt[hkr[, 'hit_id'][[1]]], 
160 | #             perl = TRUE
161 | #         )
162 | #     }
163 | #         
164 | #     x[locs] <- txt
165 | #     x
166 | # 
167 | # }
168 | 
169 | ## old version removed 2018-06-01
170 | # fgsub <- function(x, pattern, fun, ...){
171 | #     
172 | #     locs <- stringi::stri_detect_regex(x, pattern)
173 | #     locs[is.na(locs)] <- FALSE
174 | #     txt <- x[locs]
175 | #     
176 | #     hits <- stringi::stri_extract_all_regex(txt, pattern)
177 | #     pats <- unique(unlist(hits))
178 | #     reps <- paste0('textcleanholder', seq_along(pats), 'textcleanholder')
179 | #     freps <- unlist(lapply(pats, fun))
180 | #         
181 | #     txt <- mgsub(txt, pats, reps)
182 | #     
183 | #     x[locs] <- mgsub(txt, reps, freps)
184 | #     x
185 | # 
186 | # }
187 | 


--------------------------------------------------------------------------------
/R/fix_mdyyyy.R:
--------------------------------------------------------------------------------
 1 | #' Coerce Character m/d/yyyy to Date
 2 | #' 
 3 | #' Uses regular expressions to sub out a single day or month with a leading zero
 4 | #' and then coerces to a date object.
 5 | #' 
 6 | #' @param x A character date in the form of m/d/yyyy where m and d can be single 
 7 | #' integers like 1 for January.
 8 | #' @param \ldots ignored.
 9 | #' @return Returns a data vector
10 | #' @export
11 | #' @rdname fix_mdyyyy
12 | #' @examples 
13 | #' fix_mdyyyy(c('4/23/2017', '12/1/2016', '3/3/2013', '12/12/2012', '2013-01-01'))
14 | #' \dontrun{
15 | #' library(dplyr)
16 | #' data_frame(
17 | #'     x = 1:4,
18 | #'     y = LETTERS[1:4],
19 | #'     start_date = c('4/23/2017', '12/1/2016', '3/3/2013', '12/12/2012'),
20 | #'     end_date = c('5/23/2017', '12/9/2016', '3/3/2016', '2/01/2012')
21 | #' ) %>%
22 | #' mutate_at(vars(ends_with('_date')), fix_mdyyyy)
23 | #' }
24 | fix_mdyyyy <- function(x, ...){
25 |     UseMethod('fix_mdyyyy')
26 | } 
27 | 
28 | 
29 | #' @export
30 | #' @method fix_mdyyyy date
31 | fix_mdyyyy.date <- function(x, ...){
32 |     x
33 | } 
34 | 
35 | #' @export
36 | #' @method fix_mdyyyy default
37 | fix_mdyyyy.default <- function(x, ...){
38 |     as.Date(fix_mdyyyy_character(x), format = '%Y-%m-%d')
39 | } 
40 | 
41 | fix_mdyyyy_character <- function(x, ...){
42 |     gsub(
43 |         '(^\\d{2})(?:/)(\\d{2})(?:/)(\\d{4})', 
44 |         '\\3-\\1-\\2', 
45 |         gsub(
46 |             '(/)(\\d{1}/)',
47 |             '\\10\\2',     
48 |             gsub(
49 |                 '(^\\d{1}/)',
50 |                 '0\\1', 
51 |                 x
52 |             )
53 |         )
54 |     )
55 | } 
56 | 


--------------------------------------------------------------------------------
/R/glue-reexports.R:
--------------------------------------------------------------------------------
1 | #' @importFrom glue glue
2 | #' @export
3 | glue::glue
4 | 
5 | #' @importFrom glue glue_collapse
6 | #' @export
7 | glue::glue_collapse
8 | 


--------------------------------------------------------------------------------
/R/has_endmark.R:
--------------------------------------------------------------------------------
 1 | #' Test for Incomplete Sentences
 2 | #' 
 3 | #' A logical test of missing sentence ending punctuation.
 4 | #' 
 5 | #' @param x A character vector.
 6 | #' @param endmarks The potential ending punctuation marks,
 7 | #' @param \dots ignored.
 8 | #' @return Returns a logical vector.
 9 | #' @keywords incomplete
10 | #' @export
11 | #' @examples
12 | #' x <- c(
13 | #'     "I like it.", 
14 | #'     "Et tu?",  
15 | #'     "Not so much", 
16 | #'     "Oh, I understand.",  
17 | #'     "At 3 p.m., we go",
18 | #'     NA
19 | #' )
20 | #' has_endmark(x)
21 | has_endmark <- function(x, endmarks = c('?', '.', '!'), ...){
22 |     !is.na(x) & grepl(
23 |         sprintf('[%s]\\s*$', paste(endmarks, collapse = "")), 
24 |         x, perl = TRUE, 
25 |         ...
26 |     )
27 | }
28 | 
29 | 
30 | 
31 | 
32 | 
33 | 
34 | 
35 | 


--------------------------------------------------------------------------------
/R/like.R:
--------------------------------------------------------------------------------
 1 | #' SQL Style LIKE
 2 | #' 
 3 | #' Use like as a SQL-esque opertator for pattern matching.  \code{\%like\%} is
 4 | #' case insensitive while \code{\%slike\%} is case sensitive.  This is most useful 
 5 | #' in a \code{dplyr::filter}.
 6 | #' 
 7 | #' @param var A variable/column.
 8 | #' @param pattern A search pattern.
 9 | #' @export
10 | #' @rdname like
11 | #' @examples
12 | #' state.name[state.name %like% 'or']
13 | #' state.name[state.name %LIKE% 'or']
14 | #' state.name[state.name %slike% 'or'] ## No Oregon
15 | `%like%` <- function(var, pattern){
16 |     stringi::stri_detect_regex(var, pattern, case_insensitive = TRUE)
17 | }
18 | 
19 | #' @rdname like
20 | #' @export
21 | `%LIKE%` <- `%like%`
22 | 
23 | #' @rdname like
24 | #' @export
25 | `%slike%` <- function(var, pattern){
26 |     stringi::stri_detect_regex(var, pattern, case_insensitive = FALSE)
27 | }
28 | 
29 | #' @rdname like
30 | #' @export
31 | `%SLIKE%` <- `%slike%`


--------------------------------------------------------------------------------
/R/make_plural.R:
--------------------------------------------------------------------------------
 1 | #' Make Plural (or Verb to Singular) Versions of Words
 2 | #' 
 3 | #' Add -s, -es, or -ies to words.
 4 | #' 
 5 | #' @param x A vector of words to make plural.
 6 | #' @param keep.original logical.  If \code{TRUE} the original words are kept in 
 7 | #' the return vector.
 8 | #' @param irregular A \code{data.frame} of singular and plural conversions for 
 9 | #' irregular nouns.  The first column should be singular and the second plural 
10 | #' form of the irregular noun.
11 | #' @return Returns a vector of plural words.
12 | #' @keywords plural
13 | #' @export
14 | #' @examples 
15 | #' x <- c('fox', 'sky', 'dog', 'church', 'fish', 'miss', 'match', 'deer', 'block')
16 | #' make_plural(x)
17 | make_plural <- function (x, keep.original = FALSE, 
18 |     irregular = lexicon::pos_df_irregular_nouns) {
19 |    
20 |     stopifnot(is.data.frame(irregular))
21 |     
22 |     hits <- match(tolower(x), tolower(irregular[[1]]))
23 |     
24 |     ends <- "(sh?|x|z|ch)$"
25 |     pluralify <- ifelse(grepl(ends, x, perl = TRUE), "es", "s")
26 |     out <- gsub("ys$", "ies", paste0(x, pluralify))
27 |     out[which(!is.na(hits))] <- irregular[[2]][hits[which(!is.na(hits))]]
28 |     
29 |     c(if (keep.original) {
30 |         x
31 |     }, out)
32 |     
33 | }
34 | 


--------------------------------------------------------------------------------
/R/match_tokens.R:
--------------------------------------------------------------------------------
 1 | #' Find Tokens that Match a Regex
 2 | #' 
 3 | #' Given a text, find all the tokens that match a regex(es).  This function is
 4 | #' particularly useful with \code{\link[textclean]{replace_tokens}}.
 5 | #' 
 6 | #' @param x A character vector.
 7 | #' @param pattern Character string(s) to be matched in the given character vector. 
 8 | #' @param ignore.case logical.  If \code{TRUE} the case of the tokens/patterns 
 9 | #' will be ignored.
10 | #' @param \ldots ignored.
11 | #' @return Returns a vector of tokens from a text matching a specific regex 
12 | #' pattern.
13 | #' @export
14 | #' @seealso \code{\link[textclean]{replace_tokens}}
15 | #' @examples
16 | #' with(DATA, match_tokens(state, c('^li', 'ou')))
17 | #' 
18 | #' with(DATA, match_tokens(state, c('^Th', '^I'), ignore.case = TRUE))
19 | #' with(DATA, match_tokens(state, c('^Th', '^I'), ignore.case = FALSE))
20 | match_tokens <- function(x, pattern, ignore.case = TRUE, ...){
21 | 
22 |     if (!is.atomic(x)) stop('`x` should be a character vector')
23 |     y <- rm_na(unique(unlist(textshape::split_token(x, lower = ignore.case))))
24 |     if (isTRUE(ignore.case)) pattern <- tolower(pattern)
25 |     
26 |     y[grepl(paste(paste0('(', pattern, ')'), collapse = '|'), y, perl = TRUE)]
27 |     
28 | }
29 | 
30 | 
31 | 


--------------------------------------------------------------------------------
/R/mgsub.R:
--------------------------------------------------------------------------------
  1 | #' Multiple \code{\link[base]{gsub}}
  2 | #' 
  3 | #' \code{mgsub} - A wrapper for \code{\link[base]{gsub}} that takes a vector 
  4 | #' of search terms and a vector or single value of replacements.
  5 | #' 
  6 | #' @param x A character vector.
  7 | #' @param pattern Character string to be matched in the given character vector. 
  8 | #' @param replacement Character string equal in length to pattern or of length 
  9 | #' one which are a replacement for matched pattern. 
 10 | #' @param leadspace logical.  If \code{TRUE} inserts a leading space in the 
 11 | #' replacements.
 12 | #' @param trailspace logical.  If \code{TRUE} inserts a trailing space in the 
 13 | #' replacements.
 14 | #' @param fixed logical. If \code{TRUE}, pattern is a string to be matched as 
 15 | #' is. 
 16 | #' Overrides all conflicting arguments.
 17 | #' @param trim logical.  If \code{TRUE} leading and trailing white spaces are 
 18 | #' removed and multiple white spaces are reduced to a single white space.
 19 | #' @param order.pattern logical.  If \code{TRUE} and \code{fixed = TRUE}, the 
 20 | #' \code{pattern} string is sorted by number of characters to prevent substrings 
 21 | #' replacing meta strings (e.g., \code{pattern = c("the", "then")} resorts to 
 22 | #' search for "then" first).
 23 | #' @param safe logical.  If \code{TRUE} then the \pkg{mgsub} package is used as
 24 | #' the backend and performs safe substitutions.  The trade-off is that this mode
 25 | #' will slow the replacements down considerably.
 26 | #' @param \dots Additional arguments passed to \code{\link[base]{gsub}}. In 
 27 | #' \code{mgsub_regex_safe} this is other arguments passed to 
 28 | #' \code{\link[mgsub]{mgsub}}.
 29 | #' @return \code{mgsub} - Returns a vector with the pattern replaced.
 30 | #' @seealso \code{\link[textclean]{replace_tokens}}
 31 | #' \code{\link[base]{gsub}}
 32 | #' @export
 33 | #' @rdname mgsub
 34 | #' @examples
 35 | #' mgsub(DATA$state, c("it's", "I'm"), c("it is", "I am"))
 36 | #' mgsub(DATA$state, "[[:punct:]]", "PUNC", fixed = FALSE)
 37 | #' \dontrun{
 38 | #' library(textclean)
 39 | #' hunthou <- replace_number(seq_len(1e5)) 
 40 | #' 
 41 | #' textclean::mgsub(
 42 | #'     "'twenty thousand three hundred five' into 20305", 
 43 | #'     hunthou, 
 44 | #'     seq_len(1e5)
 45 | #' )
 46 | #' ## "'20305' into 20305"
 47 | #' 
 48 | #' ## Larger example from: https://stackoverflow.com/q/18332463/1000343
 49 | #' ## A slower approach
 50 | #' fivehunthou <- replace_number(seq_len(5e5)) 
 51 | #' 
 52 | #' testvect <- c("fifty seven", "four hundred fifty seven", 
 53 | #'     "six thousand four hundred fifty seven", 
 54 | #'     "forty six thousand four hundred fifty seven", 
 55 | #'     "forty six thousand four hundred fifty seven", 
 56 | #'     "three hundred forty six thousand four hundred fifty seven"
 57 | #' )
 58 | #' 
 59 | #' textclean::mgsub(testvect, fivehunthou, seq_len(5e5))
 60 | #' 
 61 | #' ## Safe substitution: Uses the mgsub package as the backend
 62 | #' dubious_string <- "Dopazamine is a fake chemical"
 63 | #' pattern <- c("dopazamin","do.*ne")
 64 | #' replacement <- c("freakout","metazamine")
 65 | #' 
 66 | #' mgsub(dubious_string, pattern, replacement, ignore.case = TRUE, fixed = FALSE)
 67 | #' mgsub(dubious_string, pattern, replacement, safe = TRUE, fixed = FALSE)
 68 | #' }
 69 | mgsub <- function (x, pattern, replacement, leadspace = FALSE, 
 70 |     trailspace = FALSE, fixed = TRUE, trim = FALSE, order.pattern = fixed, 
 71 |     safe = FALSE, ...) {
 72 | 
 73 |     if (!is.null(list(...)$ignore.case) & fixed) {
 74 |         warning(
 75 |             paste0('`ignore.case = TRUE` can\'t be used with `fixed = TRUE`.\n',
 76 |                 'Do you want to set `fixed = FALSE`?'
 77 |             ),
 78 |             call. = FALSE
 79 |         )    
 80 |     }
 81 |     
 82 |     if (safe) {
 83 |         return(mgsub_regex_safe(x = x, pattern = pattern, 
 84 |             replacement = replacement, ...))
 85 |     }
 86 |     
 87 |     if (leadspace | trailspace) {
 88 |         replacement <- spaste(
 89 |             replacement, 
 90 |             trailing = trailspace, 
 91 |             leading = leadspace
 92 |         )
 93 |     }
 94 | 
 95 |     if (fixed && order.pattern) {
 96 |         ord <- rev(order(nchar(pattern)))
 97 |         pattern <- pattern[ord]
 98 |         if (length(replacement) != 1) replacement <- replacement[ord]
 99 |     }
100 |     
101 |     if (length(replacement) == 1) {
102 |         replacement <- rep(replacement, length(pattern))
103 |     }
104 |     
105 |     if (any(!nzchar(pattern))) {
106 |         good_apples <- which(nzchar(pattern))  
107 |         pattern <- pattern[good_apples]
108 |         replacement <- replacement[good_apples]      
109 |         warning(paste0(
110 |             'Empty pattern found (i.e., `pattern = ""`).\n', 
111 |             'This pattern and replacement have been removed.'
112 |         ), call. = FALSE)
113 |     }
114 |     
115 |     for (i in seq_along(pattern)){
116 |         x <- gsub(pattern[i], replacement[i], x, fixed = fixed, ...)
117 |     }
118 | 
119 |     if (trim) {
120 |         x <- gsub("\\s+", " ", gsub("^\\s+|\\s+$", "", x, perl=TRUE), perl=TRUE)
121 |     }
122 |     
123 |     x
124 | }
125 | 
126 | #' Multiple \code{\link[base]{gsub}}
127 | #' 
128 | #' \code{mgsub_fixed} - An alias for \code{mgsub}.
129 | #' 
130 | #' @export
131 | #' @rdname mgsub
132 | mgsub_fixed <- mgsub 
133 | 
134 | #' Multiple \code{\link[base]{gsub}}
135 | #' 
136 | #' \code{mgsub_regex} - An wrapper for \code{mgsub} with \code{fixed = FALSE}.
137 | #' 
138 | #' @export
139 | #' @rdname mgsub
140 | mgsub_regex <- function(x, pattern, replacement, leadspace = FALSE, 
141 |     trailspace = FALSE, fixed = FALSE, trim = FALSE, order.pattern = fixed, 
142 |     ...) {
143 |     
144 |     mgsub(x = x, pattern = pattern, replacement = replacement, 
145 |         leadspace = leadspace, trailspace = trailspace, fixed = fixed, 
146 |         trim = trim, order.pattern = order.pattern, ...
147 |     )
148 |     
149 | }
150 | 
151 | #' Multiple \code{\link[base]{gsub}}
152 | #' 
153 | #' \code{mgsub_regex_safe} - An wrapper for \code{\link[mgsub]{mgsub}}.
154 | #' 
155 | #' @export
156 | #' @rdname mgsub
157 | mgsub_regex_safe <- function(x, pattern, replacement, ...){
158 |     mgsub::mgsub(string = x, pattern = pattern, replacement = replacement, ...)
159 | }
160 |     
161 |     
162 | spaste <-
163 | function (terms, trailing = TRUE, leading = TRUE) {
164 |     if (leading) {
165 |         s1 <- " "
166 |     }     else {
167 |         s1 <- ""
168 |     }
169 |     if (trailing) {
170 |         s2 <- " "
171 |     } else {
172 |         s2 <- ""
173 |     }
174 |     pas <- function(x) paste0(s1, x, s2)
175 |     if (is.list(terms)) {
176 |         z <- lapply(terms, pas)
177 |     } else {
178 |         z <- pas(terms)
179 |     }
180 |     return(z)
181 | }
182 | 
183 | 
184 | 


--------------------------------------------------------------------------------
/R/replace_contraction.R:
--------------------------------------------------------------------------------
 1 | #' Replace Contractions
 2 | #'
 3 | #' This function replaces contractions with long form.
 4 | #' 
 5 | #' @param x The text variable.
 6 | #' @param contraction.key A two column hash of contractions (column 1) and 
 7 | #' expanded form replacements (column 2).  Default is to use 
 8 | #' \code{\link[lexicon]{key_contractions}} data set.
 9 | #' @param ignore.case logical.  Should case be ignored?
10 | #' @param \dots ignored.
11 | #' @return Returns a vector with contractions replaced.
12 | #' @keywords contraction
13 | #' @export
14 | #' @examples
15 | #' \dontrun{
16 | #' x <- c("Mr. Jones isn't going.",  
17 | #'     "Check it out what's going on.",
18 | #'     "He's here but didn't go.",
19 | #'     "the robot at t.s. wasn't nice", 
20 | #'     "he'd like it if i'd go away")
21 | #' 
22 | #' replace_contraction(x)
23 | #' }
24 | replace_contraction <- 
25 | function(x, contraction.key = lexicon::key_contractions, ignore.case=TRUE, 
26 |     ...) {
27 | 
28 |     mgsub(x, contraction.key[[1]], contraction.key[[2]], 
29 |         fixed = FALSE, ignore.case=TRUE)
30 | 
31 | }
32 | 
33 | 


--------------------------------------------------------------------------------
/R/replace_date.R:
--------------------------------------------------------------------------------
 1 | #' Replace Dates With Words
 2 | #'
 3 | #' Replaces dates with word equivalents.
 4 | #'
 5 | #' @param x The text variable.
 6 | #' @param pattern Character date regex string to be matched in the given 
 7 | #' character vector. 
 8 | #' @param replacement A function to operate on the extracted matches or a 
 9 | #' character string which is a replacement for the matched pattern.
10 | #' @param \ldots ignored.
11 | #' @return Returns a vector with the pattern replaced.
12 | #' @export
13 | #' @examples
14 | #' x <- c(
15 | #'     NA, '11-16-1980 and 11/16/1980', 
16 | #'     "and 2017-02-08 but then there's 2/8/2017 too"
17 | #' )
18 | #' 
19 | #' replace_date(x)
20 | #' replace_date(x, replacement = '<<DATE>>')
21 | replace_date <- function(x, 
22 |     pattern = NULL, 
23 |     replacement = NULL, ...){
24 | 
25 |     if (is.null(pattern)) pattern <- replace_date_pattern
26 |     if (is.null(replacement)) replacement <- replace_date_fun 
27 |     
28 |     if (is.function(replacement)) {
29 |         f_gsub <- fgsub
30 |     } else { 
31 |         f_gsub <- stringi::stri_replace_all_regex
32 |     }
33 | 
34 |     f_gsub(x, pattern, replacement)
35 | 
36 | }
37 | 
38 | replace_date_pattern <- paste0(
39 |     '([01]?[0-9])[/-]([0-2]?[0-9]|3[01])[/-]\\d{4}|\\d{4}[/-]', 
40 |     '([01]?[0-9])[/-]([0-2]?[0-9]|3[01])'
41 | )
42 | 
43 | replace_date_fun <- function(x){
44 | 
45 |         parts <- strsplit(
46 |             gsub('(^.+)([/-])(\\d{4})', '\\3\\2\\1', x, perl = TRUE), 
47 |             '[/-]'
48 |         )[[1]]
49 | 
50 |         y <- replace_number(parts[1])
51 |         m <- month.name[as.integer(parts[2])]
52 |         d <- english::ordinal(as.integer(parts[3]))
53 |         paste0(m, ' ', d, ', ', y)
54 | 
55 | }
56 | 


--------------------------------------------------------------------------------
/R/replace_email.R:
--------------------------------------------------------------------------------
 1 | #' Replace Email Addresses
 2 | #' 
 3 | #' Replaces email addresses.  
 4 | #' 
 5 | #' @param x The text variable.
 6 | #' @param pattern Character time regex string to be matched in the given 
 7 | #' character vector. 
 8 | #' @param replacement A function to operate on the extracted matches or a 
 9 | #' character string which is a replacement for the matched pattern.
10 | #' @param \ldots ignored.
11 | #' @return Returns a vector with email addresses replaced.
12 | #' @export
13 | #' @importFrom qdapRegex grab
14 | #' @examples
15 | #' x <- c(
16 | #'     "fred is fred@@foo.com and joe is joe@@example.com - but @@this is a", 
17 | #'     "twitter handle for twit@@here.com or foo+bar@@google.com/fred@@foo.fnord", 
18 | #'     "hello world", 
19 | #'     NA
20 | #' )
21 | #' 
22 | #' replace_email(x)
23 | #' replace_email(x, replacement = '<<EMAIL>>')
24 | #' replace_email(x, replacement = '<a href="mailto:$1" target="_blank">$1</a>')
25 | #' 
26 | #' ## Replacement with a function
27 | #' replace_email(x, 
28 | #'     replacement = function(x){
29 | #'         sprintf('<a href="mailto:%s" target="_blank">%s</a>', x, x)
30 | #'     }
31 | #' )
32 | #' 
33 | #' 
34 | #' replace_email(x, 
35 | #'     replacement = function(x){
36 | #'         gsub('@@.+$', ' {{at domain}}', x)
37 | #'     }
38 | #' )
39 | replace_email <- function(x, pattern = qdapRegex::grab('rm_email'), 
40 |     replacement = '', ...){
41 | 
42 |     if (is.function(replacement)) {
43 |         f_gsub <- fgsub
44 |     } else { 
45 |         f_gsub <- stringi::stri_replace_all_regex
46 |     }
47 | 
48 |     f_gsub(x, pattern, replacement)
49 | 
50 | }
51 | 


--------------------------------------------------------------------------------
/R/replace_emoji.R:
--------------------------------------------------------------------------------
 1 | #' Replace Emojis With Words/Identifier
 2 | #'
 3 | #' Replaces emojis with word equivalents or a token identifier for use in the
 4 | #' \pkg{sentimentr} package.  Not that this function will coerce the text to 
 5 | #' ASCII using 
 6 | #' \code{Encoding(x) <- "latin1"; iconv(x, "latin1", "ASCII", "byte")}.
 7 | #' The function \code{replace_emoji} replaces emojis with text representations
 8 | #' while \code{replace_emoji_identifier} replaces with a unique identifier that
 9 | #' corresponds to \code{lexicon::hash_sentiment_emoji} for use in the 
10 | #' \pkg{sentimentr} package.
11 | #' 
12 | #' @param x The text variable.
13 | #' @param emoji_dt A \pkg{data.table} of emojis (ASCII byte representations)
14 | #' and corresponding word/identifier meanings.  
15 | #' @param \ldots Other arguments passed to \code{.mgsub} (see
16 | #' \code{textclean:::.mgsub} for details).
17 | #' @return Returns a vector of strings with emojis replaced with word
18 | #' equivalents.
19 | #' @keywords emoji
20 | #' @export
21 | #' @rdname replace_emoji
22 | #' @examples
23 | #' fls <- system.file("docs/emoji_sample.txt", package = "textclean")
24 | #' x <- readLines(fls)[1]
25 | #' replace_emoji(x)
26 | #' replace_emoji_identifier(x)
27 | replace_emoji <- function(x, emoji_dt = lexicon::hash_emojis, ...){
28 |     
29 |     gsub("\\s+", " ", .mgsub(emoji_dt[["x"]], paste0(" ", emoji_dt[["y"]], " "), 
30 |         to_byte(x), ...))
31 |     
32 | }
33 | 
34 | lexicon_available_data <- lexicon::available_data
35 | 
36 | #' @export
37 | #' @rdname replace_emoji
38 | replace_emoji_identifier <- function(x, 
39 |     emoji_dt = lexicon::hash_emojis_identifier, ...){
40 |     
41 |     gsub("\\s+", " ", .mgsub(emoji_dt[["x"]], paste0(" ", emoji_dt[["y"]], " "), 
42 |         to_byte(x), ...))
43 |     
44 | }
45 | 
46 | 
47 | 


--------------------------------------------------------------------------------
/R/replace_emoticon.R:
--------------------------------------------------------------------------------
 1 | #' Replace Emoticons With Words
 2 | #'
 3 | #' Replaces emoticons with word equivalents.
 4 | #'
 5 | #' @param x The text variable.
 6 | #' @param emoticon_dt A \pkg{data.table} of emoticons (graphical representations)
 7 | #' and corresponding word meanings.
 8 | #' @param \ldots Other arguments passed to \code{.mgsub} (see
 9 | #' \code{textclean:::.mgsub} for details).
10 | #' @return Returns a vector of strings with emoticons replaced with word
11 | #' equivalents.
12 | #' @keywords emoticon
13 | #' @export
14 | #' @examples
15 | #' x <- c(
16 | #'     paste(
17 | #'         "text from:", 
18 | #'         "http://www.webopedia.com/quick_ref/textmessageabbreviations_02.asp"
19 | #'     ),
20 | #'     "... understanding what different characters used in smiley faces mean:",
21 | #'     "The close bracket represents a sideways smile  )",
22 | #'     "Add in the colon and you have sideways eyes   :",
23 | #'     "Put them together to make a smiley face  :)",
24 | #'     "Use the dash -  to add a nose   :-)",
25 | #'     paste(
26 | #'         "Change the colon to a semi-colon ;", 
27 | #'         "and you have a winking face ;)  with a nose  ;-)"
28 | #'     ),
29 | #'     paste(
30 | #'         "Put a zero 0 (halo) on top and now you have a winking,", 
31 | #'         "smiling angel 0;) with a nose 0;-)"
32 | #'     ),
33 | #'     "Use the letter 8 in place of the colon for sunglasses 8-)",
34 | #'     "Use the open bracket ( to turn the smile into a frown  :-(",
35 | #'     "I have experience with using the xp emoticon"
36 | #' )
37 | #'
38 | #' replace_emoticon(x)
39 | replace_emoticon <- function(x, emoticon_dt = lexicon::hash_emoticons, ...){
40 | 
41 |     trimws(gsub(
42 |         "\\s+", 
43 |         " ", 
44 |         mgsub_regex(x, paste0('\\b\\Q', emoticon_dt[['x']], '\\E\\b'), paste0(" ", emoticon_dt[['y']], " "))
45 |     ))
46 |     
47 | }
48 | 


--------------------------------------------------------------------------------
/R/replace_grade.R:
--------------------------------------------------------------------------------
 1 | #' Replace Grades With Words
 2 | #'
 3 | #' Replaces grades with word equivalents.
 4 | #'
 5 | #' @param x The text variable.
 6 | #' @param grade_dt A \pkg{data.table} of grades and corresponding word meanings.
 7 | #' @param \ldots ignored.
 8 | #' @return Returns a vector of strings with grades replaced with word
 9 | #' equivalents.
10 | #' @keywords grade
11 | #' @export
12 | #' @examples
13 | #' (text <- replace_grade(c(
14 | #'     "I give an A+",
15 | #'     "He deserves an F",
16 | #'     "It's C+ work",
17 | #'     "A poor example deserves a C!"
18 | #' )))
19 | replace_grade <- function (x, grade_dt = lexicon::key_grade, ...) {
20 |     
21 |     mgsub(
22 |         x, 
23 |         paste0(' ', grade_dt[["x"]]), 
24 |         grade_dt[["y"]], 
25 |         fixed = TRUE, 
26 |         leadspace = TRUE
27 |     )
28 | }
29 | 
30 | 


--------------------------------------------------------------------------------
/R/replace_hash.R:
--------------------------------------------------------------------------------
 1 | #' Replace Hashes
 2 | #' 
 3 | #' Replaces Twitter style hash tags (e.g., '#rstats').  
 4 | #' 
 5 | #' @param x The text variable.
 6 | #' @param pattern Character time regex string to be matched in the given 
 7 | #' character vector. 
 8 | #' @param replacement A function to operate on the extracted matches or a 
 9 | #' character string which is a replacement for the matched pattern.
10 | #' @param \ldots ignored.
11 | #' @return Returns a vector with hashes replaced.
12 | #' @export
13 | #' @importFrom qdapRegex grab
14 | #' @examples
15 | #' x <- c("@@hadley I like #rstats for #ggplot2 work.",
16 | #'     "Difference between #magrittr and #pipeR, both implement pipeline operators for #rstats: 
17 | #'         http://renkun.me/r/2014/07/26/difference-between-magrittr-and-pipeR.html @@timelyportfolio",
18 | #'     "Slides from great talk: @@ramnath_vaidya: Interactive slides from Interactive Visualization 
19 | #'         presentation #user2014. http://ramnathv.github.io/user2014-rcharts/#1"
20 | #' )
21 | #' 
22 | #' replace_hash(x)
23 | #' replace_hash(x, replacement = '<<HASH>>')
24 | #' replace_hash(x, replacement = '$3')
25 | #' 
26 | #' ## Replacement with a function
27 | #' replace_hash(x, 
28 | #'     replacement = function(x){
29 | #'         paste0('{{', gsub('^#', 'TOPIC: ', x), '}}')
30 | #'     }
31 | #' )
32 | replace_hash <- function(x, pattern = qdapRegex::grab('rm_hash'), 
33 |     replacement = '', ...){
34 | 
35 |     if (is.function(replacement)) {
36 |         f_gsub <- fgsub
37 |     } else { 
38 |         f_gsub <- stringi::stri_replace_all_regex
39 |     }
40 | 
41 |     f_gsub(x, pattern, replacement)
42 | 
43 | }
44 | 


--------------------------------------------------------------------------------
/R/replace_html.R:
--------------------------------------------------------------------------------
 1 | #' Replace HTML Markup
 2 | #' 
 3 | #' Replaces HTML markup.  The angle braces are removed and the HTML symbol 
 4 | #' markup is replaced with equivalent symbols.  
 5 | #'
 6 | #' @details Replacements for symbols are as follows:
 7 | #' 
 8 | #' \tabular{lr}{
 9 | #'  \bold{html} \tab  \bold{symbol} \cr
10 | #'   &copy;   \tab (c) \cr
11 | #'   &reg;   \tab (r) \cr
12 | #'   &trade;   \tab tm \cr
13 | #'   &ldquo;   \tab " \cr
14 | #'   &rdquo;   \tab " \cr
15 | #'   &lsquo;   \tab ' \cr
16 | #'   &rsquo;   \tab ' \cr
17 | #'   &bull;   \tab - \cr
18 | #'   &middot;   \tab - \cr
19 | #'   &sdot;   \tab [] \cr
20 | #'   &ndash;   \tab - \cr
21 | #'   &mdash;   \tab - \cr
22 | #'   &cent;   \tab cents \cr
23 | #'   &pound;   \tab pounds \cr
24 | #'   &euro;   \tab euro \cr
25 | #'   &ne;   \tab != \cr
26 | #'   &frac12;   \tab half \cr
27 | #'   &frac14;   \tab quarter \cr
28 | #'   &frac34;   \tab three fourths \cr
29 | #'   &deg;   \tab degrees \cr
30 | #'   &larr;   \tab <- \cr
31 | #'   &rarr;   \tab -> \cr
32 | #'   &hellip;   \tab ... \cr
33 | #'   &nbsp;   \tab   \cr
34 | #'   &lt;   \tab < \cr
35 | #'   &gt;   \tab > \cr
36 | #'   &laquo; \tab << \cr
37 | #'   &raquo; \tab >> \cr
38 | #'   &amp;   \tab & \cr
39 | #'   &quot;   \tab " \cr
40 | #'   &apos;   \tab ' \cr
41 | #'   &yen;   \tab yen \cr
42 | #' }
43 | #' 
44 | #' @param x The text variable.
45 | #' @param symbol logical.  If code{TRUE} the symbols are retained with appropriate
46 | #' replacements.  If \code{FALSE} they are removed.
47 | #' @param \ldots Ignored.
48 | #' @return Returns a vector with HTML markup replaced.
49 | #' @keywords html
50 | #' @export
51 | #' @examples
52 | #' x <- c(
53 | #'     "<bold>Random</bold> text with symbols: &nbsp; &lt; &gt; &amp; &quot; &apos;",
54 | #'     "<p>More text</p> &cent; &pound; &yen; &euro; &copy; &reg; &laquo; &raquo;"
55 | #' )
56 | #' 
57 | #' replace_html(x)
58 | #' replace_html(x, FALSE)
59 | #' replace_white(replace_html(x, FALSE))
60 | replace_html <- function(x, symbol = TRUE, ...){
61 |     if (isTRUE(symbol)) {
62 |         reps <-  html_symbols[['symbol']]
63 |     } else {
64 |         reps <- " "
65 |     }
66 |     mgsub(gsub('<[^>]+>', ' ', x), html_symbols[['html']],reps)
67 | }
68 | 
69 | 
70 | html_symbols <- data.frame(
71 |     html = c("&copy;", "&reg;", "&trade;", "&ldquo;", 
72 |         "&rdquo;", "&lsquo;", "&rsquo;", "&bull;", "&middot;", "&sdot;", 
73 |         "&ndash;", "&mdash;", "&cent;", "&pound;", "&euro;", "&ne;", 
74 |         "&frac12;", "&frac14;", "&frac34;", "&deg;", "&larr;", "&rarr;", 
75 |         "&hellip;", "&nbsp;", "&lt;", "&gt;", "&amp;", "&quot;", "&apos;",
76 |         "&yen;", "&laquo;", "&raquo;"
77 |     ), 
78 |     symbol = c("(c)", "(r)", "tm", "\"", "\"", "'", 
79 |         "'", "-", "-", "[]", "-", "-", "cents", "pounds", "euro", "!=", 
80 |         "half", "quarter", "three fourths", "degrees", "<-", "->", "...",
81 |         " ", "<", ">", "&", '"', "'", "yen", "<<", ">>"
82 |     ), 
83 |     stringsAsFactors = FALSE
84 | )
85 | 
86 | ## clipr::write_clip(textclean::glue("#'   {html}   \\tab {symb} \\cr\n", html = html_table[[1]], symb = html_table[[2]]))
87 | 
88 | 


--------------------------------------------------------------------------------
/R/replace_incomplete.R:
--------------------------------------------------------------------------------
 1 | #' Denote Incomplete End Marks With "|"
 2 | #' 
 3 | #' Replaces incomplete sentence end marks (.., ..., .?, ..?, en & em dash etc.)
 4 | #' with \code{"|"}.
 5 | #' 
 6 | #' @param x  The text variable.
 7 | #' @param replacement A string to replace incomplete punctuation marks with.
 8 | #' @param \dots ignored.
 9 | #' @return Returns a text variable (character sting) with incomplete sentence 
10 | #' marks (.., ..., .?, ..?, en & em dash etc.) replaced with "|".  
11 | #' @keywords incomplete-sentence
12 | #' @export
13 | #' @examples
14 | #' x <- c("the...",  "I.?", "you.", "threw..", "we?")
15 | #' replace_incomplete(x)
16 | #' replace_incomplete(x, '...')
17 | replace_incomplete <- function(x, replacement = "|", ...) {
18 |     gsub(sprintf('%s\\s*$', pat), replacement, x)
19 | }
20 | 
21 | pat <- paste0("\\?*\\?[.]+|[.?!]*\\? [.][.?!]+|[.?!]*\\. [.?!]+|",
22 |     "[.?!]+\\. [.?!]*|[.?!]+\\.[.?!]*|[.?!]*\\.[.?!]+")
23 | 
24 | 
25 | 


--------------------------------------------------------------------------------
/R/replace_internet_slang.R:
--------------------------------------------------------------------------------
 1 | #' Replace Internet Slang
 2 | #' 
 3 | #' Replaces Internet slang.  
 4 | #' 
 5 | #' @param x  The text variable.
 6 | #' @param slang A vector of slang strings to replace. 
 7 | #' @param replacement A vector of string to replace slang with.
 8 | #' @param ignore.case logical.  If \code{TRUE} the case of \code{slang} will be 
 9 | #' ignored (replacement regardless of case).
10 | #' @param \dots Other arguments passed to \code{\link[textclean]{replace_tokens}}.
11 | #' @return Returns a vector with names replaced.
12 | #' @export
13 | #' @examples
14 | #' x <- c(
15 | #'     "Marc the n00b needs to RTFM otherwise ymmv.",
16 | #'     "TGIF and a big w00t!  The weekend is GR8!",
17 | #'     "Will will do it",
18 | #'     'w8...this PITA needs me to say LMGTFY...lmao.',
19 | #'     NA
20 | #' )
21 | #' 
22 | #' replace_internet_slang(x)
23 | #' replace_internet_slang(x, ignore.case = FALSE)
24 | #' replace_internet_slang(x, replacement = '<<SLANG>>')
25 | #' replace_internet_slang(
26 | #'     x, 
27 | #'     replacement = paste0('{{ ', lexicon::hash_internet_slang[[2]], ' }}')
28 | #' )
29 | replace_internet_slang  <- function(x, 
30 |     slang = paste0('\\b', lexicon::hash_internet_slang[[1]], '\\b'),
31 |     replacement = lexicon::hash_internet_slang[[2]], ignore.case = TRUE, ...) {
32 | 
33 |     mgsub(x, slang, replacement, fixed = FALSE, ignore.case = ignore.case, ...)
34 | }
35 | 
36 | im_his <- lexicon::hash_internet_slang
37 | 
38 | 
39 | 


--------------------------------------------------------------------------------
/R/replace_kerning.R:
--------------------------------------------------------------------------------
 1 | #' Replace Kerned (Spaced) with No Space Version
 2 | #' 
 3 | #' In typography kerning is the adjustment of spacing.  Often, in informal 
 4 | #' writing, adding manual spaces (a form of kerning) coupled with all capital 
 5 | #' letters is used for emphasis.  This tool looks for 3 or more consecutive 
 6 | #' capital letters with spaces in between and removes the spaces.  Essentially, 
 7 | #' the capitalized, kerned version is replaced with the word equivalent.
 8 | #' 
 9 | #' @param x  The text variable.
10 | #' @param \ldots ignored.
11 | #' @return Returns a vector with kern spaces removed.
12 | #' @references \url{https://stackoverflow.com/a/47438305/1000343}
13 | #' @author StackOverflow user @@ctwheels 
14 | #' @export
15 | #' @examples
16 | #' x <- c(
17 | #'     "Welcome to A I: the best W O R L D!",
18 | #'     "Hi I R is the B O M B for sure: we A G R E E indeed.",
19 | #'     "A sort C A T indeed!",
20 | #'     NA
21 | #' )
22 | #' 
23 | #' replace_kern(x)
24 | replace_kern <- function(x, ...){
25 |     ## a possible second approach from: 
26 |     ##   https://stackoverflow.com/a/47438305/1000343
27 |     ## paste0(
28 |     ##     '(?:(?<=\\P{L})(?=(?:\\p{Lu}\\h+){2}\\p{Lu})|', 
29 |     ##     '\\G(?!\\A))\\p{Lu}\\K\\h+(?=\\p{Lu}(?!\\p{L}))'
30 |     ## )
31 |     gsub(
32 |         paste0(
33 |             "(?:(?=\\b(?:\\p{Lu}\\h+){2}\\p{Lu})|", 
34 |             "\\G(?!\\A))\\p{Lu}\\K\\h+(?=\\p{Lu})"
35 |         ), 
36 |         "", 
37 |         x, 
38 |         perl=TRUE
39 |     )
40 | }
41 | 


--------------------------------------------------------------------------------
/R/replace_misspelling.R:
--------------------------------------------------------------------------------
 1 | #' Replace Misspelled Words
 2 | #' 
 3 | #' Replace misspelled words with their most likely replacement.  This function 
 4 | #' uses \pkg{hunspell} in the backend.  \pkg{hunspell}  must be installed in 
 5 | #' order to use this feature.
 6 | #' 
 7 | #' @param x A character vector.
 8 | #' @param \ldots ignored..
 9 | #' @return Returns a vector of strings with misspellings replaced.
10 | #' @note The function splits the string apart into tokens for speed
11 | #' optimization.  After the replacement occurs the strings are pasted back
12 | #' together.  The strings are not guaranteed to retain exact spacing of the
13 | #' original.
14 | #' @export
15 | #' @author Surin Space and Tyler Rinker <tyler.rinker@@gmail.com>.
16 | #' @examples
17 | #' \dontrun{
18 | #' bad_string <- c("I cant spelll rigtt noow.", '', NA, 
19 | #'     'Thiss is aslo mispelled?', 'this is 6$ and 38 cents in back2back!')
20 | #' replace_misspelling(bad_string)
21 | #' }
22 | replace_misspelling <- function(x, ...){
23 | 
24 |     lower <- text <- replacement <- is_cap <- final <- element_id <- token_id <- NULL
25 |     
26 |     check_install('hunspell')
27 | 
28 |     if (!(is.character(x) | is.factor(x))) stop('`x` must be a character vector')
29 |     is_na <- is.na(x)
30 |     dat <- data.frame(text = as.character(x), stringsAsFactors = FALSE)
31 | 
32 |     token_df <- textshape::split_token(dat, lower = FALSE)[, 
33 |         lower := tolower(text)]
34 | 
35 |     tokens <- grep('[a-z]', rm_na(unique(token_df[['lower']])), value = TRUE)
36 |     hits <- !hunspell::hunspell_check(tokens)
37 | 
38 |     misspelled <- tokens[hits]
39 | 
40 |     map <- data.table::data.table(
41 |         lower = misspelled,
42 |         replacement = unlist(lapply(hunspell::hunspell_suggest(misspelled), `[`, 1))
43 |     )
44 | 
45 |     fixed_df <- map[token_df, on = "lower"]
46 | 
47 |     fixed_df_a <- fixed_df[!is.na(replacement),][,
48 |         is_cap := substring(text, 1, 1) %in% LETTERS][,
49 |         final := ifelse(is_cap,  upper_first_letter(replacement), replacement)][]
50 | 
51 |     fixed_df_b <- fixed_df[is.na(replacement),][, final := text][]
52 | 
53 |     bound <- rbind(fixed_df_a, fixed_df_b, fill = TRUE)
54 | 
55 |     out <- data.table::setorder(bound, element_id, token_id)[, 
56 |         list(`final` = paste(final, collapse = ' ')), by = 'element_id'][,
57 |         `final` := gsub("(\\s+)([.!?,;:])", "\\2", final, perl = TRUE)][['final']]
58 |     out[is_na] <- NA
59 |     out
60 | }
61 | 
62 | 
63 | upper_first_letter <- function(x){
64 |     substring(x, 1, 1) <- toupper(substring(x, 1, 1))
65 |     x
66 | }
67 | 


--------------------------------------------------------------------------------
/R/replace_money.R:
--------------------------------------------------------------------------------
 1 | #' Replace Money With Words
 2 | #'
 3 | #' Replaces money with word equivalents.
 4 | #'
 5 | #' @param x The text variable.
 6 | #' @param pattern Character money regex string to be matched in the given 
 7 | #' character vector. 
 8 | #' @param replacement A function to operate on the extracted matches or a 
 9 | #' character string which is a replacement for the matched pattern.
10 | #' @param \ldots ignored.
11 | #' @return Returns a vector with the pattern replaced.
12 | #' @export
13 | #' @examples
14 | #' x <- c(
15 | #'     NA, 
16 | #'     '$3.16 into "three dollars, sixteen cents"', 
17 | #'     "-$20,333.18 too", 'fff'
18 | #' )
19 | #' 
20 | #' replace_money(x)
21 | #' replace_money(x, replacement = '<<MONEY>>')
22 | replace_money <- function(x, pattern = '(-?)([$])([0-9,]+)(\\.\\d{2})?', 
23 |     replacement = NULL, ...){
24 | 
25 |     #if (is.null(pattern)) pattern <- replace_money_pattern
26 |     if (is.null(replacement)) replacement <- replace_money_fun 
27 |     
28 |     if (is.function(replacement)) {
29 |         f_gsub <- fgsub
30 |     } else { 
31 |         f_gsub <- stringi::stri_replace_all_regex
32 |     }
33 | 
34 |     f_gsub(x, pattern, replacement)
35 | 
36 | }
37 | 
38 | replace_money_fun <- function(x, decimal = ' and '){
39 | 
40 |     sign <- ifelse(grepl('^-', x, perl = TRUE), 'negative ', '')
41 |     if (grepl('\\.', x, perl = TRUE)) {
42 |         number <- replace_number(
43 |             gsub(
44 |                 '\\.', 
45 |                 paste0(' dollars', decimal), 
46 |                 gsub('(-?)([$])', '', x)
47 |             )
48 |         )
49 |         paste0(sign, number, ' cents')
50 |     } else {
51 |         number <- replace_number(gsub('(-?)([$])', '', x))
52 |         paste0(sign, number)
53 |     }
54 | 
55 | }
56 | 


--------------------------------------------------------------------------------
/R/replace_names.R:
--------------------------------------------------------------------------------
 1 | #' Replace First/Last Names
 2 | #' 
 3 | #' Replaces first/last names.  
 4 | #' 
 5 | #' @param x  The text variable.
 6 | #' @param names A vector of names to replace.  This may be made more custom 
 7 | #' through a vector provided from a named entity extractor.
 8 | #' @param replacement A string to replace names with.
 9 | #' @param \dots Other arguments passed to 
10 | #' \code{\link[textclean]{replace_tokens}}.
11 | #' @return Returns a vector with names replaced.
12 | #' @export
13 | #' @examples
14 | #' x <- c(
15 | #'     "Mary Smith is not here",
16 | #'     "Karen is not a nice person",
17 | #'     "Will will do it",
18 | #'     NA
19 | #' ) 
20 | #' 
21 | #' replace_names(x)
22 | #' replace_names(x, replacement = '<<NAME>>')
23 | replace_names  <- function(x, 
24 |     names = textclean::drop_element(
25 |         gsub(
26 |             "(^.)(.*)", "\\U\\1\\L\\2", 
27 |             c(lexicon::freq_last_names[[1]], 
28 |             lexicon::common_names
29 |         ), perl = TRUE), 
30 |         "^([AIU]n|[TSD]o|H[ea]Pa|Oh)$"
31 |     ), 
32 |     replacement = "",  ...) {
33 | 
34 |     replace_tokens(x, names, replacement, ...)
35 | }
36 | 
37 | im_ad <- lexicon::available_data
38 | im_cmn <- lexicon::common_names
39 | 


--------------------------------------------------------------------------------
/R/replace_non_ascii.R:
--------------------------------------------------------------------------------
  1 | #' Replace Common Non-ASCII Characters
  2 | #' 
  3 | #' \code{replace_non_ascii} - Replaces common non-ASCII characters.
  4 | #' 
  5 | #' @param x The text variable.
  6 | #' @param replacement Character string equal in length to pattern or of length 
  7 | #' one which are a replacement for matched pattern. 
  8 | #' @param remove.nonconverted logical.  If \code{TRUE} unmapped encodings are
  9 | #' deleted from the string.
 10 | #' @param \dots ignored.
 11 | #' @return Returns a text variable (character sting) with non-ASCII characters 
 12 | #' replaced.
 13 | #' @keywords ascii
 14 | #' @rdname replace_non_ascii
 15 | #' @export
 16 | #' @examples
 17 | #' x <- c(
 18 | #'     "Hello World", "6 Ekstr\xf8m", "J\xf6reskog", "bi\xdfchen Z\xfcrcher",
 19 | #'     'This is a \xA9 but not a \xAE', '6 \xF7 2 = 3', 
 20 | #'     'fractions \xBC, \xBD, \xBE', 'cows go \xB5', '30\xA2'
 21 | #' )
 22 | #' Encoding(x) <- "latin1"
 23 | #' x
 24 | #' 
 25 | #' replace_non_ascii(x)
 26 | #' replace_non_ascii(x, remove.nonconverted = FALSE)
 27 | #' 
 28 | #' z <- '\x95He said, \x93Gross, I am going to!\x94'
 29 | #' Encoding(z) <- "latin1"
 30 | #' z
 31 | #' 
 32 | #' replace_curly_quote(z)
 33 | #' replace_non_ascii(z)
 34 | replace_non_ascii <- function (x, replacement = '',  
 35 |     remove.nonconverted = TRUE, ...) {
 36 |     
 37 |     x <- replace_curly_quote(x)
 38 |     x <- stringi::stri_trans_general(x, "latin-ascii")
 39 |     x <- iconv(as.character(x), "", "ASCII", "byte")
 40 |     Encoding(x) <- "latin1"
 41 |     x <- mgsub(x, ser, reps)
 42 |     
 43 |     if (isTRUE(remove.nonconverted)) {
 44 |         x <- qdapRegex::rm_angle(x, replacement = replacement)
 45 |         x <- stringi::stri_replace_all_regex(x, '[^ -~]+', 
 46 |             replacement = replacement)
 47 |     }
 48 |     
 49 |     x
 50 |     
 51 | }
 52 | 
 53 | 
 54 | #' Replace Common Non-ASCII Characters
 55 | #' 
 56 | #' \code{place_non_ascii2} - Replaces all non-ASCII (defined as \code{'[^ -~]+'}).  
 57 | #' This provides a subset of functionality found in \code{replace_non_ascii} that
 58 | #' is faster and likely less accurate.
 59 | #' 
 60 | #' @rdname replace_non_ascii
 61 | #' @export
 62 | replace_non_ascii2 <- function (x, replacement = '',  ...) {
 63 |     
 64 |     stringi::stri_replace_all_regex(x, '[^ -~]+', replacement = replacement)
 65 |     
 66 | }
 67 | 
 68 | # replace_non_ascii <- function(x, remove.nonconverted = TRUE, ...) {
 69 | #     x <- replace_curly_quote(x)
 70 | #     x <- stringi::stri_trans_general(x, "latin-ascii")
 71 | #     x <- iconv(as.character(x), "", "ASCII", "byte")
 72 | #     Encoding(x) <-"latin1"    
 73 | #     x <- mgsub(x, ser, reps)
 74 | #     if (isTRUE(remove.nonconverted)) x <- qdapRegex::rm_angle(x)
 75 | #     x
 76 | # }
 77 | 
 78 | #' Replace Common Non-ASCII Characters
 79 | #' 
 80 | #' \code{replace_curly_quote} - Replaces curly single and double quotes.  This 
 81 | #' provides a subset of functionality found in \code{replace_non_ascii} specific 
 82 | #' to quotes.
 83 | #' 
 84 | #' @rdname replace_non_ascii
 85 | #' @export
 86 | replace_curly_quote <- function(x, ...){
 87 |     replaces <- c('\x91', '\x92', '\x93', '\x94')
 88 |     Encoding(replaces) <- "latin1"
 89 |     for (i in 1:4) {
 90 |         x <- gsub(replaces[i], c("'", "'", "\"", "\"")[i], x, fixed = TRUE)
 91 |     }
 92 |     x
 93 | }
 94 | 
 95 | ser <- c("<e2><80><9c>", "<e2><80><9d>", "<e2><80><98>", "<e2><80><99>",
 96 | 	"<e2><80><9b>", "<ef><bc><87>", "<e2><80><a6>", "<e2><80><93>",
 97 | 	"<e2><80><94>", "<c3><a1>", "<c3><a9>", "<c2><bd>", '<a9>', '<ae>',
 98 |     '<f7>', '<bc>', '<bd>', '<be>', '<b5>', '<a2>'
 99 |     )
100 | 
101 | reps <- c('"', '"', "'", "'", "'", "'", '...', '-', '-', "a", "e", "1/2", 
102 |     ' copyright ', ' registered trademark ', "/", '1/2', '1/4', '3/4', ' mu ', 
103 |     ' cent '
104 | )
105 | 
106 | 


--------------------------------------------------------------------------------
/R/replace_number.R:
--------------------------------------------------------------------------------
  1 | #' Replace Numbers With Text Representation
  2 | #' 
  3 | #' \code{replace_number} - Replaces numeric represented numbers with words 
  4 | #' (e.g., 1001 becomes one thousand one).
  5 | #' 
  6 | #' @param x The text variable.
  7 | #' @param num.paste logical.  If \code{FALSE} the elements of larger numbers are 
  8 | #' separated with spaces.  If \code{TRUE} the elements will be joined without 
  9 | #' spaces.
 10 | #' @param remove logical.  If \code{TRUE} numbers are removed from the text.
 11 | #' @param \ldots Other arguments passed to  \code{\link[english]{as.english}}
 12 | #' @return Returns a vector with numbers replaced.
 13 | #' @references Fox, J. (2005). Programmer's niche: How do you spell that number? 
 14 | #' R News. Vol. 5(1), pp. 51-55.
 15 | #' @note The user may want to use \code{\link[textclean]{replace_ordinal}} 
 16 | #' first to remove ordinal number notation.  For example 
 17 | #' \code{\link[textclean]{replace_number}} would turn "21st" into 
 18 | #' "twenty onest", whereas \code{\link[textclean]{replace_ordinal}} would 
 19 | #' generate "twenty first".
 20 | #' @keywords number-to-word
 21 | #' @rdname replace_number
 22 | #' @export
 23 | #' @examples
 24 | #' x <- c(
 25 | #'     NA, 
 26 | #'     'then .456 good', 
 27 | #'     'none', 
 28 | #'     "I like 346,457 ice cream cones.", 
 29 | #'     "I like 123456789 cashes.",     
 30 | #'     "They are 99 percent good and 45678.2345667"
 31 | #' )
 32 | #' replace_number(x)
 33 | #' replace_number(x, num.paste = TRUE)
 34 | #' replace_number(x, remove=TRUE)
 35 | #' \dontrun{
 36 | #' library(textclean)
 37 | #' hunthou <- replace_number(seq_len(1e5)) 
 38 | #' 
 39 | #' textclean::mgsub(
 40 | #'     "'twenty thousand three hundred five' into 20305", 
 41 | #'     hunthou, 
 42 | #'     seq_len(1e5)
 43 | #' )
 44 | #' ## "'20305' into 20305"
 45 | #' 
 46 | #' ## Larger example from: https://stackoverflow.com/q/18332463/1000343
 47 | #' ## A slower approach
 48 | #' fivehunthou <- replace_number(seq_len(5e5)) 
 49 | #' 
 50 | #' testvect <- c("fifty seven", "four hundred fifty seven", 
 51 | #'     "six thousand four hundred fifty seven", 
 52 | #'     "forty six thousand four hundred fifty seven", 
 53 | #'     "forty six thousand four hundred fifty seven", 
 54 | #'     "three hundred forty six thousand four hundred fifty seven"
 55 | #' )
 56 | #' 
 57 | #' textclean::mgsub(testvect, fivehunthou, seq_len(5e5))
 58 | #' 
 59 | #' as_ordinal(1:10)
 60 | #' textclean::mgsub('I want to be 1 in line', 1:10, as_ordinal(1:10))
 61 | #' }
 62 | replace_number  <- function(x, num.paste = FALSE, remove = FALSE, ...) {
 63 | 
 64 |     if (is.numeric(x)){    
 65 |         x <- drop_sci_note(x) ## ensures scientific notation is not used
 66 |     } else {
 67 |         x <- as.character(x)    
 68 |     }
 69 | 
 70 |     if (remove) return(stringi::stri_replace_all_regex(x, num_regex, ""))
 71 | 
 72 |     ## extract the numbers
 73 |     to_replace <- stringi::stri_extract_all_regex(x, num_regex)
 74 |     
 75 |     
 76 | # browser()
 77 |     ## locations of the number strings
 78 |     locs <- which(!sapply2(to_replace, function(x) length(x) == 1 && is.na(x)))
 79 | 
 80 |     ## find locations of decimals
 81 |     decimal_locs <- lapply(to_replace[locs], stringi::stri_detect_regex, "\\.")
 82 | 
 83 |     ## get the numbers/texts tht correspond to number strings
 84 |     replaces <- to_replace[locs]
 85 | 
 86 |     ## lengths of the replacements lists so that it can be  
 87 |     ## unlisted and then relisted later
 88 |     lens <- lengths(replaces)
 89 | # browser()
 90 |     ## Data frame of the number text.  
 91 |     ## This will be disected and put back together
 92 |     num_df <- data.frame(
 93 |         num = gsub(",", "", unlist(replaces)), 
 94 |         stringsAsFactors = FALSE
 95 |     )
 96 |     
 97 |     num_df[['decimal']] <- unlist(
 98 |         stringi::stri_extract_all_regex(num_df[[1]], "\\.\\d+")
 99 |     )
100 |     
101 |     num_df[['integer']] <- floor(as.numeric(num_df[[1]]))
102 |     num_df[['den']] <- num_df[['den1']] <- 10 ^ (nchar(num_df[['decimal']])- 1)
103 |     
104 |     num_df[['den']][!is.na(num_df[['den']])] <- paste0(
105 |         eng(num_df[['den']][!is.na(num_df[['den']])], ...), 'ths'
106 |     ) 
107 |     
108 |     num_df[['numerator']] <- eng(
109 |         num_df[['den1']] * as.numeric(num_df[['decimal']]), ...
110 |     )
111 |     
112 |     num_df[['den']][is.na(num_df[['den']])] <- ""
113 |     num_df[['int']] <- eng(num_df[['integer']], ...)
114 |     
115 |     is_decimal <- grepl("\\.", num_df[[1]], perl = TRUE)  
116 |     not_integer_decimal <- !grepl('\\d\\.', num_df[[1]], perl = TRUE)
117 | 
118 |     num_df[['int']][is_decimal & not_integer_decimal] <- ifelse(grepl('^minus', num_df[['int']][is_decimal & not_integer_decimal]), 'minus', "")
119 |     
120 |     num_df[['numerator']][!not_integer_decimal] <- paste(
121 |         'and', num_df[['numerator']][!not_integer_decimal]
122 |     )
123 | 
124 |     ## the replacements to swap in
125 |     replaces2 <- trimws(paste(
126 |         num_df[['int']], num_df[['numerator']], num_df[['den']]
127 |     ))
128 |     if (num.paste) replaces2 <- gsub("\\s+", "", replaces2)
129 | 
130 |     ## Reconvert to the original list shape that matches replaces
131 |     replaces2 <- textshape::split_index(replaces2, textshape::starts(lens))
132 | 
133 |     ## for loop to do the gsubbing
134 |     for (i in seq_along(locs)) {
135 |         x[locs[i]] <- mgsub(x[locs[i]], replaces[[i]], replaces2[[i]])
136 |     }
137 |     x
138 | }
139 | 
140 | num_regex <- paste0(
141 |     "(?<=^| )-?.?\\d+(?:\\d+)?(?= |\\.?$)|", 
142 |     "(?<=^| )-?\\d+(?:\\.\\d+)?(?= |\\.?$)|",
143 |     "\\d+(?:,\\d{3})+(\\.\\d+)*"
144 | )
145 | 
146 | eng <- function(x, ...) as.character(english::as.english(x, ...))
147 | 
148 | 
149 | #' Replace Numbers With Text Representation
150 | #' 
151 | #' \code{as_ordinal} - A convenience wrapper for \code{english::ordinal} that 
152 | #' takes integers and converts them to ordinal form.
153 | #' 
154 | #' @rdname replace_number
155 | #' @export
156 | as_ordinal <- function(x, ...){
157 |     english::ordinal(x)
158 | }
159 | 
160 | 
161 | 


--------------------------------------------------------------------------------
/R/replace_ordinal.R:
--------------------------------------------------------------------------------
 1 | #' Replace Mixed Ordinal Numbers With Text Representation
 2 | #' 
 3 | #' Replaces mixed text/numeric represented ordinal numbers with words (e.g., 
 4 | #' "1st" becomes "first").
 5 | #' 
 6 | #' @param x The text variable.
 7 | #' @param num.paste logical.  If \code{TRUE} a the elements of larger numbers are 
 8 | #' separated with spaces.  If \code{FALSE} the elements will be joined without 
 9 | #' spaces.
10 | #' @param remove logical.  If \code{TRUE} ordinal numbers are removed from the text.
11 | #' @param \ldots ignored.
12 | #' @keywords ordinal-to-word
13 | #' @note Currently only implemented for ordinal values 1 through 100
14 | #' @export
15 | #' @examples
16 | #' x <- c(
17 | #'     "I like the 1st one not the 22nd one.", 
18 | #'     "For the 100th time stop!"
19 | #' )
20 | #' replace_ordinal(x)
21 | #' replace_ordinal(x, TRUE)
22 | #' replace_ordinal(x, remove = TRUE)
23 | #' replace_number(replace_ordinal("I like the 1st 1 not the 22nd 1."))
24 | replace_ordinal <- function(x, num.paste = FALSE, remove = FALSE, ...) {
25 | 
26 |     symb <- c("1st", "2nd", "3rd", paste0(4:19, "th"),
27 |         paste0(20:100, c("th", "st", "nd", "rd", rep("th", 6))))
28 | 
29 |     if (remove) {
30 |         ordinal <- ""
31 |     } else {
32 |         base_ord <- ordinal <- c("first", "second", "third", "fourth", 
33 |             "fifth", "sixth", "seventh", "eighth", "ninth")
34 |         prefix <- c("twent", "thirt", "fort", "fift", "sixt", 
35 |             "sevent", "eight", "ninet")
36 |         ordinal <- c(base_ord, "tenth", "eleventh", "twelfth", 
37 |             "thirteenth", "fourteenth", "fifteenth", "sixteenth", 
38 |             "seventeenth", "eighteenth", "nineteenth", 
39 |             paste0(rep(prefix, each=10), c("ieth", paste("y", base_ord))), 
40 |             "hundredth")
41 |     }
42 |     if (num.paste & !remove) ordinal <- gsub("\\s+", "", ordinal)
43 |     trimws(mgsub(x, paste0("\\b", symb, "\\b"), spaste(ordinal), fixed=FALSE))
44 | }
45 | 


--------------------------------------------------------------------------------
/R/replace_rating.R:
--------------------------------------------------------------------------------
 1 | #' Replace Ratings With Words
 2 | #'
 3 | #' Replaces ratings with word equivalents.
 4 | #'
 5 | #' @param x The text variable.
 6 | #' @param rating_dt A \pkg{data.table} of ratings and corresponding word meanings.
 7 | #' @param \ldots ignored.
 8 | #' @return Returns a vector of strings with ratings replaced with word
 9 | #' equivalents.
10 | #' @keywords rating
11 | #' @export
12 | #' @examples
13 | #' x <- c("This place receives 5 stars for their APPETIZERS!!!",
14 | #'      "Four stars for the food & the guy in the blue shirt for his great vibe!",
15 | #'      "10 out of 10 for both the movie and trilogy.",
16 | #'      "* Both the Hot & Sour & the Egg Flower Soups were absolutely 5 Stars!",
17 | #'      "For service, I give them no stars.", "This place deserves no stars.",
18 | #'      "10 out of 10 stars.",
19 | #'      "My rating: just 3 out of 10.",
20 | #'      "If there were zero stars I would give it zero stars.",
21 | #'      "Rating: 1 out of 10.",
22 | #'      "I gave it 5 stars because of the sound quality.",
23 | #'      "If it were possible to give them 0/10, they'd have it."
24 | #' )
25 | #'
26 | #' replace_rating(x)
27 | replace_rating <- function (x, rating_dt = lexicon::key_rating, ...) {
28 |     gsub("\\s+", " ", .mgsub(rating_dt[["x"]], paste0(" ",
29 |         rating_dt[["y"]], " "), x, ...))
30 | }
31 | 
32 | 
33 | IMPORT <- lexicon::available_data


--------------------------------------------------------------------------------
/R/replace_symbol.R:
--------------------------------------------------------------------------------
 1 | #' Replace Symbols With Word Equivalents
 2 | #' 
 3 | #' This function replaces symbols with word equivalents (e.g., \code{@@} becomes 
 4 | #' \code{"at"}.
 5 | #' 
 6 | #' @param x A character vector.
 7 | #' @param dollar logical.  If \code{TRUE} replaces dollar sign ($) with 
 8 | #' \code{"dollar"}.
 9 | #' @param percent logical.  If \code{TRUE} replaces percent sign (\%) with 
10 | #' \code{"percent"}.
11 | #' @param pound logical.  If \code{TRUE} replaces pound sign (#) with 
12 | #' \code{"number"}.
13 | #' @param at logical.  If \code{TRUE} replaces at sign (@@) with \code{"at"}.
14 | #' @param and logical.  If \code{TRUE} replaces and sign (&) with \code{"and"}.
15 | #' @param with logical.  If \code{TRUE} replaces with sign (w/) with 
16 | #' \code{"with"}.
17 | #' @param \ldots ignored.
18 | #' @return Returns a character vector with symbols replaced..
19 | #' @keywords symbol-replace
20 | #' @export
21 | #' @examples
22 | #' x <- c("I am @@ Jon's & Jim's w/ Marry", 
23 | #'     "I owe $41 for food", 
24 | #'     "two is 10% of a #"
25 | #' )
26 | #' replace_symbol(x)
27 | replace_symbol <- function(x, dollar = TRUE, percent = TRUE, 
28 |     pound = TRUE, at = TRUE, and = TRUE, with = TRUE, ...) {
29 |     
30 |     y <- c(dollar, percent, pound, at, and, with, with)
31 |   
32 |     gsub("\\+", " ", mgsub(
33 |         x,
34 |         pattern = symbs[y], 
35 |         replacement = replaces[y], 
36 |         fixed = TRUE,
37 |     ))
38 | }
39 | 
40 | symbs <-  c("%", "$", "#", "&", "@", "w/o", "w/")
41 | replaces <- paste0(" ", c("percent", "dollar", "number", "and", "at", 
42 |         "without", "with"), " ")
43 | 
44 | 


--------------------------------------------------------------------------------
/R/replace_tag.R:
--------------------------------------------------------------------------------
 1 | #' Replace Handle Tags
 2 | #' 
 3 | #' Replaces Twitter style handle tags (e.g., '@@trinker').  
 4 | #' 
 5 | #' @param x The text variable.
 6 | #' @param pattern Character time regex string to be matched in the given 
 7 | #' character vector. 
 8 | #' @param replacement A function to operate on the extracted matches or a 
 9 | #' character string which is a replacement for the matched pattern.
10 | #' @param \ldots ignored.
11 | #' @return Returns a vector with tags replaced.
12 | #' @export
13 | #' @importFrom qdapRegex grab
14 | #' @examples
15 | #' x <- c("@@hadley I like #rstats for #ggplot2 work.",
16 | #'     "Difference between #magrittr and #pipeR, both implement pipeline operators for #rstats: 
17 | #'         http://renkun.me/r/2014/07/26/difference-between-magrittr-and-pipeR.html @@timelyportfolio",
18 | #'     "Slides from great talk: @@ramnath_vaidya: Interactive slides from Interactive Visualization 
19 | #'         presentation #user2014. http://ramnathv.github.io/user2014-rcharts/#1"
20 | #' )
21 | #' 
22 | #' replace_tag(x)
23 | #' replace_tag(x, replacement = '<<TAG>>')
24 | #' replace_tag(x, replacement = '$3')
25 | #' 
26 | #' ## Replacement with a function
27 | #' replace_tag(x, 
28 | #'     replacement = function(x){
29 | #'         gsub('@@', ' <<TO>> ', x)
30 | #'     }
31 | #' )
32 | replace_tag <- function(x, pattern = qdapRegex::grab('rm_tag'), 
33 |     replacement = '', ...){
34 | 
35 |     if (is.function(replacement)) {
36 |         f_gsub <- fgsub
37 |     } else { 
38 |         f_gsub <- stringi::stri_replace_all_regex
39 |     }
40 | 
41 |     f_gsub(x, pattern, replacement)
42 | 
43 | }
44 | 


--------------------------------------------------------------------------------
/R/replace_time.R:
--------------------------------------------------------------------------------
 1 | #' Replace Time Stamps With Words
 2 | #'
 3 | #' Replaces time stamps with word equivalents.
 4 | #'
 5 | #' @param x The text variable.
 6 | #' @param pattern Character time regex string to be matched in the given 
 7 | #' character vector. 
 8 | #' @param replacement A function to operate on the extracted matches or a 
 9 | #' character string which is a replacement for the matched pattern.
10 | #' @param \ldots ignored.
11 | #' @return Returns a vector with the pattern replaced.
12 | #' @export
13 | #' @examples
14 | #' x <- c(
15 | #'     NA, '12:47 to "twelve forty-seven" and also 8:35:02', 
16 | #'     'what about 14:24.5', 'And then 99:99:99?'
17 | #' )
18 | #' 
19 | #' ## Textual: Word version
20 | #' replace_time(x)
21 | #' 
22 | #' ## Normalization: <<TIME>>
23 | #' replace_time(x, replacement = '<<TIME>>')
24 | #' 
25 | #' ## Normalization: hh:mm:ss or hh:mm
26 | #' replace_time(x, replacement = function(y){
27 | #'         z <- unlist(strsplit(y, '[:.]'))
28 | #'         z[1] <- 'hh'
29 | #'         z[2] <- 'mm'
30 | #'         if(!is.na(z[3])) z[3] <- 'ss'
31 | #'         glue_collapse(z, ':')
32 | #'     }
33 | #' )
34 | #' 
35 | #' ## Textual: Word version (forced seconds)
36 | #' replace_time(x, replacement = function(y){
37 | #'         z <- replace_number(unlist(strsplit(y, '[:.]')))
38 | #'         z[3] <- paste0('and ', ifelse(is.na(z[3]), '0', z[3]), ' seconds')
39 | #'         paste(z, collapse = ' ')
40 | #'     }
41 | #' )
42 | #'  
43 | #' ## Normalization: hh:mm:ss
44 | #' replace_time(x, replacement = function(y){
45 | #'         z <- unlist(strsplit(y, '[:.]'))
46 | #'         z[1] <- 'hh'
47 | #'         z[2] <- 'mm'
48 | #'         z[3] <- 'ss'
49 | #'         glue_collapse(z, ':')
50 | #'     }
51 | #' )
52 | replace_time <- function(x, 
53 |     pattern = '(2[0-3]|[01]?[0-9]):([0-5][0-9])[.:]?([0-5]?[0-9])?', 
54 |     replacement = NULL, ...){
55 | 
56 |     if (is.null(replacement)) replacement <- replace_time_fun 
57 |     
58 |     if (is.function(replacement)) {
59 |         f_gsub <- fgsub
60 |     } else { 
61 |         f_gsub <- stringi::stri_replace_all_regex
62 |     }
63 | 
64 |     f_gsub(x, pattern, replacement)
65 | 
66 | }
67 | 
68 | replace_time_fun <- function(y){
69 |     z <- replace_number(unlist(strsplit(y, '[:.]')))
70 |     if(!is.na(z[3])) z[3] <- paste0('and ', z[3], ' seconds')
71 |     paste(z, collapse = ' ')
72 | }
73 | 


--------------------------------------------------------------------------------
/R/replace_to.R:
--------------------------------------------------------------------------------
 1 | #' Grab Begin/End of String to/from Character
 2 | #' 
 3 | #' \code{replace_to} - Grab from beginning of string to a character(s).
 4 | #' 
 5 | #' @param x A character string
 6 | #' @param char The character from which to grab until/from.
 7 | #' @param n Number of times the character appears before the grab.
 8 | #' @param include logical.  If \code{TRUE} includes the character in the grab.
 9 | #' @param \ldots ignored.
10 | #' @return returns a vector of text with begin/end of string to/from character removed.
11 | #' @author Josh O'Brien and Tyler Rinker <tyler.rinker@@gmail.com>.
12 | #' @references \url{https://stackoverflow.com/q/15909626/1000343}
13 | #' @rdname replace_to
14 | #' @export
15 | #' @examples
16 | #' \dontrun{
17 | #' x <- c("a_b_c_d", "1_2_3_4", "<_?_._:")
18 | #' replace_to(x, "_")
19 | #' replace_to(x, "_", 2)
20 | #' replace_to(x, "_", 3)
21 | #' replace_to(x, "_", 4)
22 | #' replace_to(x, "_", 3, include=TRUE)
23 | #' 
24 | #' replace_from(x, "_")
25 | #' replace_from(x, "_", 2)
26 | #' replace_from(x, "_", 3)
27 | #' replace_from(x, "_", 4)
28 | #' replace_from(x, "_", 3, include=TRUE)
29 | #' 
30 | #' x2 <- gsub("_", " ", x)
31 | #' replace_from(x2, " ", 2)
32 | #' replace_to(x2, " ", 2)
33 | #' 
34 | #' x3 <- gsub("_", "\\^", x)
35 | #' replace_from(x3, "^", 2)
36 | #' replace_to(x3, "^", 2)
37 | #'
38 | #' x4 <- c("_a_b", "a__b")
39 | #' replace_from(x4, "_", 1)
40 | #' replace_to(x4, "_", 1)
41 | #' }
42 | replace_to <- function(x, char = " ", n = 1, include = FALSE, ...) {
43 | 
44 |     gsub(match_to_nth(char, n, include = include), "\\1", x)
45 | 	
46 | }
47 | 
48 | 
49 | #' Grab Begin/End of Sting to/from Character
50 | #' 
51 | #' \code{replace_from} - Grab from character(s) to end of string.
52 | #' 
53 | #' @rdname replace_to
54 | #' @export
55 | replace_from <- function(x, char = " ", n = 1, include = FALSE, ...) {
56 | 
57 |     gsub(match_from_nth(char, n, include = include), "\\1", x)
58 | 
59 | }
60 | 
61 | 
62 | 
63 | 
64 | specchar <- c(".", "|", "(", ")", "[", "{", "^", "$", "*", "+", "?", "\\")
65 | 
66 | match_to_nth <- function(char, n, include) {
67 | 
68 |     if (char %in% specchar) char <- paste0("\\", char)
69 | 	
70 |     others <- paste0("[^", char, "]*")
71 |     mainPat <- paste0(c(rep(c(others, char), n - 1), others), collapse = "")
72 |     paste0("(^", mainPat, ifelse(include, char, ''), ")", "(.*$)")
73 | 
74 | }
75 | 
76 | 
77 | match_from_nth <- function(char, n, include) {
78 | 
79 |     if (char %in% specchar) char <- paste0("\\", char)
80 | 
81 |     others <- paste0("[^", char, "]*")
82 |     mainPat <- paste0(c(rep(c(others, char), n - 1), others), collapse = "")
83 |     paste0("^", mainPat, ifelse(include, '', char), "", "(.*$)")
84 | 	
85 | }
86 | 


--------------------------------------------------------------------------------
/R/replace_tokens.R:
--------------------------------------------------------------------------------
 1 | #' Replace Tokens
 2 | #' 
 3 | #' Replace tokens with a single substring.  This is much faster than 
 4 | #' \code{\link[textclean]{mgsub}} if one wants to replace fixed tokens
 5 | #' with a single value or remove them all together.  This can be useful
 6 | #' for quickly replacing tokens like names in string with a single
 7 | #' value in order to reduce noise.
 8 | #' 
 9 | #' @param x A character vector.
10 | #' @param tokens A vector of token to be replaced.
11 | #' @param replacement A single character string to replace the tokens with.
12 | #' The default, \code{NULL}, replaces the tokens with nothing.
13 | #' @param ignore.case logical.  If \code{TRUE} the case of the tokens will 
14 | #' be ignored.
15 | #' @param \ldots ignored.
16 | #' @return Returns a vector of strings with tokens replaced.
17 | #' @note The function splits the string apart into tokens for speed
18 | #' optimization.  After the replacement occurs the strings are pasted back
19 | #' together.  The strings are not guaranteed to retain exact spacing of the
20 | #' original.
21 | #' @export
22 | #' @seealso \code{\link[textclean]{mgsub}}, \code{\link[textclean]{match_tokens}}
23 | #' @examples 
24 | #' replace_tokens(DATA$state, c('No', 'what', "it's"))
25 | #' replace_tokens(DATA$state, c('No', 'what', "it's"), "<<TOKEN>>")
26 | #' replace_tokens(
27 | #'     DATA$state, 
28 | #'     c('No', 'what', "it's"), 
29 | #'     "<<TOKEN>>", 
30 | #'     ignore.case = TRUE
31 | #' )
32 | #' 
33 | #' \dontrun{
34 | #' ## Now let's see the speed
35 | #' ## Set up data
36 | #' library(textshape)
37 | #' data(hamlet)
38 | #' set.seed(11)
39 | #' tokens <- sample(unique(unlist(split_token(hamlet$dialogue))), 2000)
40 | #' 
41 | #' tic <- Sys.time()
42 | #' head(replace_tokens(hamlet$dialogue, tokens))
43 | #' (toc <- Sys.time() - tic)
44 | #' 
45 | #' 
46 | #' tic <- Sys.time()
47 | #' head(mgsub(hamlet$dialogue, tokens, ""))
48 | #' (toc <- Sys.time() - tic)
49 | #' 
50 | #' 
51 | #' ## Amp it up 20x more data
52 | #' tic <- Sys.time()
53 | #' head(replace_tokens(rep(hamlet$dialogue, 20), tokens))
54 | #' (toc <- Sys.time() - tic)
55 | #'
56 | #' ## Replace names example
57 | #'
58 | #' library(lexicon)
59 | #' library(textshape)
60 | #' nms <- gsub("(^.)(.*)", "\\U\\1\\L\\2", common_names, perl = TRUE)
61 | #' x <- split_portion(
62 | #'     sample(c(sample(grady_augmented, 5000), sample(nms, 10000, TRUE))), 
63 | #'     n.words = 12
64 | #' )
65 | #' x$text.var <- paste0(
66 | #'     x$text.var, 
67 | #'     sample(c('.', '!', '?'), length(x$text.var), TRUE)
68 | #'  )
69 | #' replace_tokens(x$text.var, nms, 'NAME')
70 | #' }
71 | replace_tokens  <- function(x, tokens, replacement = NULL, 
72 |     ignore.case = FALSE, ...) {
73 |     
74 |     replace_string_elements_generic(x = x, y = tokens, z = replacement, 
75 |         ignore.case = ignore.case, ...)
76 |     
77 | }
78 | 


--------------------------------------------------------------------------------
/R/replace_url.R:
--------------------------------------------------------------------------------
 1 | #' Replace URLs
 2 | #' 
 3 | #' Replaces URLs.  
 4 | #' 
 5 | #' @param x The text variable.
 6 | #' @param pattern Character time regex string to be matched in the given 
 7 | #' character vector. 
 8 | #' @param replacement A function to operate on the extracted matches or a 
 9 | #' character string which is a replacement for the matched pattern.
10 | #' @param \ldots ignored.
11 | #' @return Returns a vector with URLs replaced.
12 | #' @export
13 | #' @importFrom qdapRegex grab
14 | #' @examples
15 | #' x <- c("@@hadley I like #rstats for #ggplot2 work. ftp://cran.r-project.org/incoming/",
16 | #'     "Difference between #magrittr and #pipeR, both implement pipeline operators for #rstats: 
17 | #'         http://renkun.me/r/2014/07/26/difference-between-magrittr-and-pipeR.html @@timelyportfolio",
18 | #'     "Slides from great talk: @@ramnath_vaidya: Interactive slides from Interactive Visualization 
19 | #'         presentation #user2014. https://ramnathv.github.io/user2014-rcharts/#1",
20 | #'     NA 
21 | #' )
22 | #' 
23 | #' replace_url(x)
24 | #' replace_url(x, replacement = '<<URL>>')
25 | #' 
26 | #' \dontrun{
27 | #' ## Replacement with a function
28 | #' library(urltools)
29 | #' replace_url(x, 
30 | #'     replacement = function(x){
31 | #'         sprintf('{{%s}}', urltools::url_parse(x)$domain)
32 | #'     }
33 | #' )
34 | #' }
35 | replace_url <- function(x, pattern = qdapRegex::grab('rm_url'), 
36 |     replacement = '', ...){
37 | 
38 |     if (is.function(replacement)) {
39 |         f_gsub <- fgsub
40 |     } else { 
41 |         f_gsub <- stringi::stri_replace_all_regex
42 |     }
43 | 
44 |     f_gsub(x, pattern, replacement)
45 | 
46 | }
47 | 


--------------------------------------------------------------------------------
/R/replace_white.R:
--------------------------------------------------------------------------------
 1 | #' Remove White Space Characters
 2 | #' 
 3 | #' Pre-process data to replace one or more white space character with a single 
 4 | #' space (this includes new line characters).
 5 | #' 
 6 | #' @param x The character vector.
 7 | #' @param \dots ignored.
 8 | #' @return Returns a vector of character strings with escaped characters removed.
 9 | #' @keywords escaped character
10 | #' @export
11 | #' @examples
12 | #' x <- "I go \r
13 | #'     to   the \tnext line"
14 | #' x
15 | #' replace_white(x)
16 | replace_white <- function(x, ...) {
17 |     gsub("\\s+", " ",  x)
18 | }
19 | 


--------------------------------------------------------------------------------
/R/replace_word_elongation.R:
--------------------------------------------------------------------------------
  1 | #' Replace Word Elongations
  2 | #' 
  3 | #' In informal writing people may use a form of text embellishment to emphasize 
  4 | #' or alter word meanings called elongation (a.k.a. "word lengthening").  For 
  5 | #' example, the use of "Whyyyyy" conveys frustration.  Other times the usage may 
  6 | #' be to be more sexy (e.g., "Heyyyy there").  Other times it may be used for 
  7 | #' emphasis (e.g., "This is so gooood").  This function uses an augmented form
  8 | #' of Armstrong & Fogarty's (2007) algorithm.  The algorithm first attempts to
  9 | #' replace the elongation with known semantic replacements (optional; default is
 10 | #' \code{FALSE}).  After this the algorithm locates all places where the same 
 11 | #' letter (case insensitive) appears 3 times consecutively.  These elements are
 12 | #' then further processed.  The matches are replaced via \code{fgsub} by first
 13 | #' taking the elongation to it's canonical form (drop all > 1 consecutive 
 14 | #' letters to a single letter) and then replacing with the most common word 
 15 | #' used in 2008 in Google's ngram data set that takes the canonical form.  If 
 16 | #' the canonical form is not found in the Google data set then the canonical 
 17 | #' form is used as the replacement.
 18 | #' 
 19 | #' @param x  The text variable.
 20 | #' @param impart.meaning logical.  If \code{TRUE}, known elongation semantics
 21 | #' are used as replacements (see \code{textclean:::meaning_elongations} for 
 22 | #' known elongation semantics and replacements).
 23 | #' @param elongation.search.pattern The elongation pattern to search for.  The default
 24 | #' only considers a repeat of \code{'[A-Za-z]'} within a "word" that is bounded
 25 | #' by a word boundary or the beginning or end of the string and contains only
 26 | #' \code{'\\w'} characters.  This means "words" with non-ASCII characters will 
 27 | #' not be considered.
 28 | #' @param conservative By default the \code{elongation.search.pattern} will find3 or 
 29 | #' more of the same character in a row after in initial word character as the 
 30 | #' starting boundary to pull out words with 3 or more of the same character in a 
 31 | #' row.  You can choose to replace all letters that appear 3 or more times in a 
 32 | #' row with the single character replacement (conservative) or any letters that 
 33 | #' appear 2 or more times in a row (not conservative).  This is most important in 
 34 | #' words that can contain two of the same letter as the correct spelling that 
 35 | #' would not be found in the canonical lookup table.  For example 'Lookkkkkk!'
 36 | #' is in the lookup table and would be corrected to 'Look!' regardless, while
 37 | #' the workd 'mook' (that is then elongated into the word 'Mookkkkkk') would not 
 38 | #' be found in the lookup table.
 39 | #' @param elongation.pattern The actual pattern used for replacement.  We use a 
 40 | #' search pattern and then this pattern with the assumption that an elongated 
 41 | #' word must have 3 or more letters in a row but often these elongations can 
 42 | #' also contain 2 or more letters in a row as well.
 43 | #' @param \ldots ignored.
 44 | #' @return Returns a vector with word elongations replaced.
 45 | #' @references
 46 | #' Armstrong, D. B., Fogarty, G. J., & Dingsdag, D. (2007). Scales measuring 
 47 | #' characteristics of small business information systems. Proceedings of the 
 48 | #' 2011 Conference on Empirical Methods in Natural Language Processing (pp. 
 49 | #' 562-570). Edinburgh, Scotland. Retrieved from 
 50 | #' http://www.aclweb.org/anthology/D11-1052 \cr \cr
 51 | #' \url{https://storage.googleapis.com/books/ngrams/books/datasetsv2.html} \cr \cr
 52 | #' \url{https://www.theatlantic.com/magazine/archive/2013/03/dragging-it-out/309220} \cr \cr
 53 | #' \url{https://english.stackexchange.com/questions/189517/is-there-a-name-term-for-multiplied-vowels}
 54 | #' @export
 55 | #' @examples
 56 | #' x <- c('look', 'noooooo!', 'real coooool!', "it's sooo goooood", 'fsdfds', 
 57 | #'     'fdddf', 'as', "aaaahahahahaha", "aabbccxccbbaa", 'I said heyyy!',
 58 | #'     "I'm liiiike whyyyyy me?", "WwwhhaTttt!", NA)
 59 | #' 
 60 | #' replace_word_elongation(x)                      #Look at "WwwhhaTttt!" as "what!"
 61 | #' replace_word_elongation(x, conservative = TRUE) #Look at "WwwhhaTttt!" as "whhat!"
 62 | #' replace_word_elongation(x, impart.meaning = TRUE)
 63 | #' replace_word_elongation(c('online mookkkkk!', "WwwhhaTttt!"))
 64 | #' replace_word_elongation(c('online mookkkkk!', "WwwhhaTttt!"), conservative = TRUE)
 65 | replace_word_elongation <- function(x, impart.meaning = FALSE, 
 66 |     elongation.search.pattern = "(?i)(?:^|\\b)\\w+([a-z])(\\1{2,})\\w*(?:$|\\b)", 
 67 |     conservative = FALSE, 
 68 |     elongation.pattern = sprintf("([a-z])(\\1{%s,})", as.integer(conservative) + 1), ...){
 69 | 
 70 |     ## replace with meaningful
 71 |     if (isTRUE(impart.meaning)){
 72 |         x <- mgsub(x, meaning_elongations[['x']], meaning_elongations[['y']], 
 73 |             fixed = FALSE, perl = TRUE, ignore.case = TRUE)
 74 |     }
 75 | 
 76 |     ## consider only groupings with a triple letter
 77 |     locs <- stringi::stri_detect_regex(x, elongation.search.pattern, 
 78 |         opts_regex = list(case_insensitive = TRUE))
 79 |     
 80 |     locs[is.na(locs)] <- FALSE
 81 |     
 82 |     if (sum(locs) == 0) return(x)
 83 |     
 84 |     txt <- x[locs]
 85 |     canonicalk <- data.table::data.table(canonical)
 86 | # browser()
 87 |     ## replace triple letter words with most common form or else canonical form
 88 |     x[locs] <- .fgsub(txt, elongation.search.pattern, function(x, can = canonical){
 89 | 
 90 |         y <- gsub(elongation.pattern, '\\1', tolower(x), perl = TRUE)
 91 |     
 92 |         z <- data.table::data.table(canonical = y)
 93 |         out <- merge(z, can, by = 'canonical')$word
 94 | 
 95 |         if ((length(out) == 0 || is.na(out))) {
 96 |             if (!is.na(y)){
 97 |                 out <- y
 98 |             } else {
 99 |                 warning(sprintf("Elongation detected for '%s' but could not be replaced", x))
100 |                 out <- x
101 |             }                
102 |         } 
103 |         out
104 |         
105 |     })
106 |     
107 |     x
108 | 
109 | }
110 | 
111 | 
112 | 
113 | 
114 | 
115 | # Known with meaning
116 | b2 <- "(?<=^|[^A-Za-z'-])(%s)(?=$|[^A-Za-z'-])"
117 | meaning_elongations <- data.frame(
118 |     x = sprintf(
119 |         b2, 
120 |         c(
121 |             'hey{2,}', 'fi{3,}ne', 'no{3,}', 'sor{3,}y|sory{2,}|sor{3,}y{2,}',
122 |             'thanks{2,}', 'tha{2,}nks', 'ri{3,}ght', 'why{3,}', 'real{2,}y'
123 |         )
124 |     ),
125 |     y = c(
126 |         'hey sexy', 'not fine', 'sarcastic', 'not sorry', 'not thankful', 
127 |         'very thankful', 'not correct', 'frustration', 'surprised'
128 |     ), 
129 |     stringsAsFactors = FALSE
130 | )
131 | 
132 | #elongation_search_pattern <- "(?i)([a-z])(\\1{2,})"
133 | #elongation_pattern <- "(?i)(^|\\b)\\w*([a-z])(\\1{2,})\\w*($|\\b)"
134 | 
135 | 
136 | 
137 | 
138 | 


--------------------------------------------------------------------------------
/R/strip.R:
--------------------------------------------------------------------------------
 1 | #' Strip Text 
 2 | #' 
 3 | #' Strip text of unwanted characters.
 4 | #' 
 5 | #' @param x The text variable.
 6 | #' @param char.keep A character vector of symbols (i.e., punctuation) that 
 7 | #' \code{\link[textclean]{strip}} should keep.  The default is to strip every 
 8 | #' symbol except apostrophes and a double tilde \code{"~~"}.  The double tilde 
 9 | #' \code{"~~"} is included for a convenient means of keeping word groups 
10 | #' together in functions that split text apart based on spaces.  To remove 
11 | #' double tildes \code{"~~"} set \code{char.keep} to \code{NULL}.
12 | #' @param digit.remove logical.  If \code{TRUE} strips digits from the text.
13 | #' @param apostrophe.remove logical.  If \code{TRUE} removes apostrophes from 
14 | #' the output.
15 | #' @param lower.case logical.  If \code{TRUE} forces all alpha characters to 
16 | #' lower case.
17 | #' @return Returns a vector of text that has been stripped of unwanted 
18 | #' characters.
19 | #' @export
20 | #' @rdname strip
21 | #' @examples
22 | #' \dontrun{
23 | #' DATA$state #no strip applied
24 | #' strip(DATA$state)
25 | #' strip(DATA$state, apostrophe.remove=TRUE)
26 | #' strip(DATA$state, char.keep = c("?", "."))
27 | #' }
28 | strip <- function(x, char.keep = "~~", digit.remove = TRUE, 
29 |     apostrophe.remove = FALSE, lower.case = TRUE){
30 | 
31 |     UseMethod("strip")
32 | }
33 | 
34 | #' \code{strip.character} - factor method for \code{strip}.
35 | #' @rdname strip
36 | #' @export
37 | #' @method strip character
38 | strip.character <- function(x, char.keep = "~~", digit.remove = TRUE, 
39 |     apostrophe.remove = FALSE, lower.case = TRUE){
40 | 
41 |     x <- gsub(
42 |         paste0(
43 |             ifelse(digit.remove, "[0-9]|", ""), "\\\\r|\\\\n|\\n|\\\\t"), 
44 |         " ", 
45 |             x
46 |         )
47 | 
48 |     regex1 <- sprintf(".*?($%s%s|[^[:punct:]]).*?",
49 |         ifelse(apostrophe.remove, "", "|'"),
50 |         ifelse(
51 |             is.null(char.keep), 
52 |             "", 
53 |             paste0("|", paste(paste0("\\", char.keep), collapse="|"))
54 |         )
55 |     )
56 | 
57 |     white <- paste0(
58 |         "^\\s+|\\s+$|\\s+(?=[.](?:\\D|$))|(\\s+)(?=[,]|[;:?!\\]\\}\\)]+)|", 
59 |         "(?<=[\\(\\[\\{])(\\s+)|(\\s+)(?=[\\s])"
60 |     )
61 | 
62 |     x <- gsub(regex1, "\\1", ifelse(lower.case, tolower, c)(x))
63 |     gsub("\\s+", " ", gsub("^\\s+|\\s+$", "", x))
64 | }
65 | 
66 | 
67 | #' \code{strip.factor} - factor method for \code{strip}.
68 | #' @rdname strip
69 | #' @export
70 | #' @method strip factor 
71 | strip.factor <- function(x, char.keep = "~~", digit.remove = TRUE, 
72 |     apostrophe.remove = TRUE, lower.case = TRUE){
73 | 
74 |     strip(as.character(x), char.keep = char.keep, digit.remove = digit.remove, 
75 |         apostrophe.remove = apostrophe.remove, lower.case = lower.case)
76 | }
77 | 
78 | #' \code{strip.default} - factor method for \code{strip}.
79 | #' @rdname strip
80 | #' @export
81 | #' @method strip default
82 | strip.default <- function(x, char.keep = "~~", digit.remove = TRUE, 
83 |     apostrophe.remove = TRUE, lower.case = TRUE){
84 | 
85 |     strip(as.character(x), char.keep = char.keep, digit.remove = digit.remove, 
86 |         apostrophe.remove = apostrophe.remove, lower.case = lower.case)
87 | }
88 | 
89 | #' \code{strip.list} - factor method for \code{strip}.
90 | #' @rdname strip
91 | #' @export
92 | #' @method strip list
93 | strip.list <- function(x, char.keep = "~~", digit.remove = TRUE, 
94 |     apostrophe.remove = TRUE, lower.case = TRUE){
95 | 
96 |     unlist(lapply(x, strip))
97 | }
98 | 
99 | 


--------------------------------------------------------------------------------
/R/sub_holder.R:
--------------------------------------------------------------------------------
  1 | #' Hold the Place of Characters Prior to Subbing
  2 | #' 
  3 | #' This function holds the place for particular character values, allowing the 
  4 | #' user to manipulate the vector and then revert the place holders back to the 
  5 | #' original values.
  6 | #' 
  7 | #' @param x A character vector.
  8 | #' @param pattern Character string to be matched in the given character vector.
  9 | #' @param alpha.type logical.  If \code{TRUE} alpha (lower case letters) are 
 10 | #' used for the key.  If \code{FALSE} numbers are used as the key.
 11 | #' @param holder.prefix The prefix to use before the alpha key in the palce 
 12 | #' holder when \code{alpha.type = TRUE}; this ensures uniqueness.
 13 | #' @param holder.suffix The suffix to use after the alpha key in the palce 
 14 | #' holder when \code{alpha.type = TRUE}; this ensures uniqueness.
 15 | #' @param \dots Additional arguments passed to \code{\link[base]{gsub}}.
 16 | #' @return Returns a list with the following:
 17 | #' \item{output}{keyed place holder character vector} 
 18 | #' \item{unhold}{A function used to revert back to the original values}
 19 | #' @note The \code{unhold} function for \code{sub_holder} will only work on keys
 20 | #' that have not been disturbed by subsequent alterations.  The key follows the 
 21 | #' pattern of holder.prefix (`zzzplaceholder`) followed by lower case letter 
 22 | #' keys followed by holder.suffix (`zzz`) when \code{alpha.type = TRUE}, 
 23 | #' otherwise the holder is numeric.
 24 | #' @export
 25 | #' @examples
 26 | #' ## `alpha.type` as TRUE
 27 | #' library(lexicon); library(textshape)
 28 | #' (fake_dat <- paste(hash_emoticons[1:11, 1, with=FALSE][[1]], DATA$state))
 29 | #' (m <- sub_holder(fake_dat, hash_emoticons[[1]]))
 30 | #' m$unhold(strip(m$output))
 31 | #' 
 32 | #' ## `alpha.type` as FALSE (numeric keys)
 33 | #' vowels <- LETTERS[c(1, 5, 9, 15, 21)]
 34 | #' (m2 <- sub_holder(toupper(DATA$state), vowels, alpha.type = FALSE))
 35 | #' m2$unhold(gsub("[^0-9]", "", m2$output))
 36 | #' mtabulate(strsplit(m2$unhold(gsub("[^0-9]", "", m2$output)), ""))
 37 | sub_holder <- function(x, pattern, alpha.type = TRUE, 
 38 |     holder.prefix = 'zzzplaceholder', holder.suffix = 'zzz', ...) {
 39 | 
 40 |     if (!is.character(pattern)) pattern <- as.character(pattern)
 41 |     y <- length(pattern)
 42 | 
 43 |     if (alpha.type) {
 44 |   
 45 |         # counter <- 0
 46 |         # while(y > 26) {
 47 |         #     y <- y/26
 48 |         #     counter <- counter + 1
 49 |         # }
 50 |         # if (y > 0) counter <- counter + 1
 51 |  
 52 |         ## replaced the above:https://www.youtube.com/watch?v=zJmTJR6s4QU
 53 |         counter <- max(ceiling(log(y, 26)), 0L)
 54 |     
 55 |         keys <- apply(
 56 |             expand.grid(lapply(seq_len(counter), function(i) letters)), 
 57 |             1, 
 58 |             paste, 
 59 |             collapse=""
 60 |         )[seq_len(y)]
 61 |         reps <- paste0(holder.prefix, keys, holder.suffix)
 62 |     } else {
 63 |         counter <- NULL
 64 |         keys <- reps <- seq_len(y)
 65 |     }
 66 | 
 67 |     if (!is.null(counter) && counter == 0) {
 68 |         output <- x
 69 |     } else {
 70 |         output <- mgsub(x, pattern, reps, ...)
 71 |     }
 72 | 
 73 | 
 74 |     FUN <- function(x, ...) {
 75 |         mgsub(x, reps, pattern, ...)
 76 |     }
 77 | 
 78 |     out <- list(output = output, unhold = FUN)
 79 | 
 80 |     attributes(out) <- list(
 81 |         class = c("sub_holder", "list"), 
 82 |         names = names(out),
 83 |         pattern = pattern, 
 84 |         keys = keys, 
 85 |         len = y
 86 |     )
 87 |     out
 88 | 
 89 | }
 90 | 
 91 | 
 92 | #' Prints a sub_holder object
 93 | #' 
 94 | #' Prints a sub_holder object
 95 | #' 
 96 | #' @param x The sub_holder object
 97 | #' @param \ldots ignored
 98 | #' @export
 99 | #' @method print sub_holder
100 | print.sub_holder <- function(x, ...) {
101 |     print(x[["output"]])
102 | }
103 | 


--------------------------------------------------------------------------------
/R/swap.R:
--------------------------------------------------------------------------------
 1 | #' Swap Two Patterns Simultaneously
 2 | #' 
 3 | #' Swap pattern x for pattern y and pattern y for pattern x in one fell swoop.
 4 | #' 
 5 | #' @param x A text variable.
 6 | #' @param pattern1 Character string to be matched in the given character vector.
 7 | #' This will be replaced by \code{pattern2}.
 8 | #' @param pattern2 Character string to be matched in the given character vector.
 9 | #' This will be replaced by \code{pattern1}.
10 | #' @param \ldots ignored.
11 | #' @return Returns a vector with patterns 1 & 2 swapped.
12 | #' @export
13 | #' @examples 
14 | #' x <- c("hash_abbreviation", "hash_contractions", "hash_grade", "key_emoticons", 
15 | #'     "key_power", "key_sentiment", "key_sentiment_nrc", "key_strength", 
16 | #'     "key_syllable", "key_valence_shifters")
17 | #' 
18 | #' x
19 | #' swap(x, 'hash_', 'key_')
20 | swap <- function(x, pattern1, pattern2, ...){
21 | 
22 |     y <- mgsub(x, pattern1, "zzzplaceholderaazzz", ...)
23 |     y <- mgsub(y, pattern2, pattern1,...)
24 |     mgsub(y, "zzzplaceholderaazzz", pattern2,...)
25 | 
26 | }
27 | 


--------------------------------------------------------------------------------
/R/sysdata.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/trinker/textclean/5443d7484cd798e7494a63f794e6fb30834e5ec2/R/sysdata.rda


--------------------------------------------------------------------------------
/R/textclean-package.R:
--------------------------------------------------------------------------------
 1 | #' Text Cleaning Tools
 2 | #'
 3 | #' Tools to clean and process text.
 4 | #' @docType package
 5 | #' @name textclean
 6 | #' @aliases textclean package-textclean
 7 | NULL
 8 | 
 9 | #' Fictitious Classroom Dialogue
10 | #'
11 | #' A fictitious dataset useful for small demonstrations.
12 | #'
13 | #' @details
14 | #' \itemize{
15 | #'   \item person. Speaker
16 | #'   \item sex. Gender
17 | #'   \item adult. Dummy coded adult (0-no; 1-yes)
18 | #'   \item state. Statement (dialogue)
19 | #'   \item code. Dialogue coding scheme
20 | #' }
21 | #'
22 | #' @docType data
23 | #' @keywords datasets
24 | #' @name DATA
25 | #' @usage data(DATA)
26 | #' @format A data frame with 11 rows and 5 variables
27 | NULL
28 | 
29 | 


--------------------------------------------------------------------------------
/R/utils.R:
--------------------------------------------------------------------------------
  1 | abbr_rep <- lapply(list(
  2 |   Titles   = c('jr', 'mr', 'mrs', 'ms', 'dr', 'prof', 'sr', 'sen', 'rep',
  3 |          'rev', 'gov', 'atty', 'supt', 'det', 'rev', 'col','gen', 'lt',
  4 |          'cmdr', 'adm', 'capt', 'sgt', 'cpl', 'maj'),
  5 | 
  6 |   Entities = c('dept', 'univ', 'uni', 'assn', 'bros', 'inc', 'ltd', 'co',
  7 |          'corp', 'plc'),
  8 | 
  9 |   Months   = c('jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul',
 10 |          'aug', 'sep', 'oct', 'nov', 'dec', 'sept'),
 11 | 
 12 |   Days     = c('mon', 'tue', 'wed', 'thu', 'fri', 'sat', 'sun'),
 13 | 
 14 |   Misc     = c('vs', 'etc', 'no', 'esp', 'cf', 'al', 'mt'),
 15 | 
 16 |   Streets  = c('ave', 'bld', 'blvd', 'cl', 'ct', 'cres', 'dr', 'rd', 'st')
 17 | ), function(x){
 18 |     fl <- sub("(^[a-z])(.+)", "\\1", x)
 19 |     sprintf("[%s%s]%s", fl, toupper(fl), sub("(^[a-z])(.+)", "\\2", x))
 20 | })
 21 | 
 22 | period_reg <- paste0(
 23 |     "(?:(?<=[a-z])\\.\\s(?=[a-z]\\.))",
 24 |         "|",
 25 |     "(?:(?<=([ .][a-z]))\\.)(?!(?:\\s[A-Z]|$)|(?:\\s\\s))",
 26 |         "|",
 27 |     "(?:(?<=[A-Z])\\.(?=\\s??[A-Z]\\.))",
 28 |         "|",
 29 |     "(?:(?<=[A-Z])\\.(?!\\s+[A-Z][A-Za-z]))"
 30 | )
 31 | 
 32 | 
 33 | sent_regex <- sprintf("((?<=\\b(%s))\\.)|%s|(%s)",
 34 |     paste(unlist(abbr_rep), collapse = "|"),
 35 |     period_reg,
 36 | 	'\\.(?=\\d+)'
 37 | )
 38 | 
 39 | 
 40 | count_endmark <- function(x) {
 41 |     y <- stringi::stri_replace_all_regex(trimws(x), sent_regex, "<<<TEMP>>>")
 42 |     stringi::stri_count_regex(
 43 |         y, 
 44 |         paste0(
 45 |             "(?<!\\w\\.\\w.)(?<![A-Z][a-z]\\.)(?<=\\.|", 
 46 |             "\\?|\\!)(\\s|(?=[a-zA-Z][a-zA-Z]*\\s))"
 47 |         )
 48 |     )
 49 | }
 50 | 
 51 | 
 52 | check_install <- function(x, fun = 'function'){
 53 | 
 54 |     found <- TRUE
 55 |     path <- try(find.package(x), silent = TRUE)
 56 |     
 57 |     if (inherits(path, "try-error")) found <- FALSE
 58 | 
 59 |     if (!found) {
 60 |         if (interactive()){
 61 |             message(paste(x, "package not found.  Do you want to install?\n"))
 62 |             ans <- utils::menu(c("Yes", "No"))
 63 |             if (ans == "1") {
 64 |                 utils::install.packages(x)
 65 |             } else {
 66 |                 stop(
 67 |                     paste(
 68 |                         fun, 
 69 |                         'requires', 
 70 |                         x, 
 71 |                         'package to be install.  Please install before using.'
 72 |                     ), 
 73 |                     call. = FALSE, 
 74 |                 )
 75 |             }   
 76 |         } else {
 77 |             stop(
 78 |                 paste(
 79 |                     fun, 
 80 |                     'requires', 
 81 |                     x, 
 82 |                     'package to be install.  Please install before using.'
 83 |                 ), 
 84 |                 call. = FALSE
 85 |             )
 86 |         }     
 87 |     }
 88 | 
 89 |     path <- try(find.package(x), silent = TRUE)
 90 |     if (inherits(path, "try-error")) {
 91 |         stop(paste(
 92 |             'Could not install.', 
 93 |             fun, 
 94 |             'requires', 
 95 |             x, 
 96 |             'package to be installed.  Please install before using.'
 97 |         ), call. = FALSE)
 98 |     }  
 99 | 
100 | }
101 | 
102 | .mgsub <- function (pattern, replacement, text.var, ...) {
103 | 
104 |     ord <- rev(order(nchar(pattern)))
105 |     pattern <- pattern[ord]
106 |     if (length(replacement) != 1) replacement <- replacement[ord]
107 | 
108 |     if (length(replacement) == 1) replacement <- rep(replacement, length(pattern))
109 | 
110 |     text.var <- stringi::stri_replace_all_fixed(text.var, pattern, replacement,
111 |         vectorize_all=FALSE, opts_fixed = list(case_insensitive = TRUE)
112 |     )
113 | 
114 |     text.var
115 | }
116 | 
117 | 
118 | replace_string_elements_generic  <- function(x, y, z = NULL, 
119 |     ignore.case = FALSE, ...) {
120 | 
121 |     z_null <- is.null(z)
122 |     if(isTRUE(z_null)) z <- 'replacermentfunctionstringholder'
123 | 
124 |     na_locs <- is.na(x)
125 |     tokens <- textshape::split_token(x, lower = FALSE, ...)
126 |     locs <- textshape::starts(lengths(tokens))[-1]
127 | 
128 |     tokens <- unlist(tokens)
129 |     fun <- ifelse(ignore.case, tolower, c)
130 |     #match(fun(tokens), fun(y))
131 |     tokens[which(fun(tokens) %in% fun(y))] <- z
132 | 
133 |     replaced <- textshape::split_index(tokens, locs)
134 |     replaced[na_locs] <- x[na_locs]
135 |     replaced[!na_locs] <- unlist(lapply(replaced[!na_locs],function(x) {
136 |          paste(x, collapse = " ")
137 |     }))
138 |     out <- unlist(replaced)
139 | 
140 |     if(isTRUE(z_null)) {
141 |         out <- trimws(gsub("\\s+", " ", gsub(z, "", out, fixed = TRUE)))
142 |     }
143 | 
144 |     gsub("(\\s+)([.!?,;:])", "\\2", out, perl = TRUE)
145 | }
146 | 
147 | 
148 | to_byte <- function(x){
149 |     Encoding(x) <- "latin1"
150 |     iconv(x, "latin1", "ASCII", "byte")
151 | }
152 | 
153 | 
154 | .fgsub <- function(x, pattern, fun, ...){
155 |     
156 | 
157 |     hits <- stringi::stri_extract_all_regex(x, pattern)
158 |     pats <- unique(unlist(hits))
159 |     reps <- paste0('textcleanholder', seq_along(pats), 'textcleanholder')
160 |     freps <- unlist(lapply(pats, fun))
161 |         
162 |     x <- mgsub(x, pats, reps)
163 |     
164 |     mgsub(x, reps, freps)
165 | 
166 | }
167 | 
168 | 
169 | set_names <- function(x, nms){
170 |     names(x) <- nms
171 |     x
172 | }
173 | 
174 | rm_na <- function(x) x[!is.na(x)]
175 | 
176 | rm_class <- function(x, cls){
177 |     class(x) <- class(x)[!class(x) %in% cls]    
178 |     x
179 | }
180 | 
181 | ## function to detect text columns
182 | detect_text_column <- function(dat, text.var){
183 |     
184 |     if (isTRUE(text.var)) {
185 |     
186 |         dat <- as.data.frame(dat, stringsAsFactors = FALSE)
187 |         
188 |         mean_lens <- unlist(lapply(dat, function(y) {
189 |          
190 |             if(!is.character(y) && !is.factor(y)) return(0)
191 |             mean(nchar(as.character(y)), na.rm = TRUE)
192 |             
193 |         }))
194 |     
195 |         max_cols <- which.max(mean_lens)
196 |         
197 |         text.var <- colnames(dat)[max_cols[1]]
198 |         
199 |         if (length(text.var) == 0 | sum(as.integer(mean_lens)) == 0) {
200 |             stop(
201 |                 paste(
202 |                     "Could not detect ` text.var`.", 
203 |                     "Please supply `text.var` explicitly."
204 |                 ),
205 |                 call. = FALSE
206 |             )
207 |         }
208 |         
209 |         if (length(max_cols) > 1) {
210 |             warning(
211 |                 sprintf(
212 |                     'More than one text column detected...using `%s`', 
213 |                     text.var
214 |                 ), 
215 |                 call. = FALSE
216 |             )    
217 |         }
218 |     } 
219 |     
220 |     text.var
221 |     
222 | }
223 | 
224 | ## safer type sapply
225 | sapply2 <- function (X, FUN, ...) {
226 |     unlist(lapply(X, FUN, ...))
227 | }
228 | 
229 | 
230 | avail <- lexicon::available_data
231 | 
232 | drop_sci_note <- function(x, ...){
233 | 
234 |     if (!is.numeric(x)) return(x)
235 | 
236 |     x <- as.character(as.numeric(x))
237 | 
238 |     locs <- grepl('e\\+', x, perl = TRUE)
239 | 
240 |     x[locs] <- unlist(Map(function(b, e) {
241 | 
242 |             subs <- nchar(gsub('^.*\\.', '', b))
243 | 
244 |             paste0(gsub('[.]', '', b), paste(rep('0', e - subs), collapse = ''))
245 | 
246 |         }, gsub('e\\+.+', '', x[locs]), as.integer(gsub('^.+?e\\+', '', x[locs]))
247 |     ))
248 | 
249 | 
250 |     x
251 | }
252 | 
253 | 


--------------------------------------------------------------------------------
/data/DATA.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/trinker/textclean/5443d7484cd798e7494a63f794e6fb30834e5ec2/data/DATA.rda


--------------------------------------------------------------------------------
/inst/CITATION:
--------------------------------------------------------------------------------
 1 | citHeader("To cite textclean in publications, please use:")
 2 | 
 3 | 
 4 | citEntry(entry = "manual",
 5 |     title = "{textclean}: Text Cleaning Tools",
 6 |     author = "Tyler W. Rinker",
 7 |     address = "Buffalo, New York",
 8 |     note = "version 0.9.6",
 9 |     year = "2021",
10 |     url = "https://github.com/trinker/textclean", 
11 |     textVersion  = paste("Rinker, T. W. (2021).",
12 |         "textclean: Text Cleaning Tools",
13 |         "version 0.9.6. Buffalo, New York.",
14 |         "https://github.com/trinker/textclean")
15 | )


--------------------------------------------------------------------------------
/inst/articles/Clark2011.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/trinker/textclean/5443d7484cd798e7494a63f794e6fb30834e5ec2/inst/articles/Clark2011.pdf


--------------------------------------------------------------------------------
/inst/articles/Jurafsky2016.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/trinker/textclean/5443d7484cd798e7494a63f794e6fb30834e5ec2/inst/articles/Jurafsky2016.pdf


--------------------------------------------------------------------------------
/inst/articles/Sproat2001.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/trinker/textclean/5443d7484cd798e7494a63f794e6fb30834e5ec2/inst/articles/Sproat2001.pdf


--------------------------------------------------------------------------------
/inst/build.R:
--------------------------------------------------------------------------------
 1 | root <- Sys.getenv("USERPROFILE")
 2 | pack <- basename(getwd())
 3 | 
 4 | quick <-  TRUE
 5 | pdf <- FALSE
 6 | 
 7 | unlink(paste0(pack, ".pdf"), recursive = TRUE, force = TRUE)
 8 | devtools::document()
 9 | devtools::install(quick = quick, build_vignettes = FALSE, dependencies = TRUE, upgrade = 'never')
10 | 
11 | if(pdf){
12 |     path <- find.package(pack)
13 |     system(paste(shQuote(file.path(R.home("bin"), "R")), "CMD", "Rd2pdf", shQuote(path)))
14 |     file.copy(paste0(pack, '.pdf'), file.path(root,"Desktop", paste0(pack, '.pdf')))
15 |     while (file.exists(paste0(pack, ".pdf"))) {unlink(paste0(pack, ".pdf"), recursive = TRUE, force = TRUE)}
16 |     empts <- grep("^\\.Rd", dir(all.files = TRUE), value = TRUE)
17 |     unlink(empts, recursive = TRUE, force = TRUE)    
18 | }
19 | 
20 | message("Done!")
21 | 
22 | 
23 | nh <- function() cat(paste(c("BUG FIXES", "NEW FEATURES", "MINOR FEATURES", "IMPROVEMENTS", "CHANGES"), collapse = "\n\n"), file="clipboard")
24 | 
25 | 
26 | 
27 | update_news <- function(repo = basename(getwd())) {
28 | 
29 |     News <- readLines("NEWS")
30 | 
31 |     News <- textclean::mgsub(News, 
32 |         c("<", ">", "&lt;major&gt;.&lt;minor&gt;.&lt;patch&gt;", "BUG FIXES",
33 |             "NEW FEATURES", "MINOR FEATURES", "CHANGES", "IMPROVEMENTS", " TRUE ", " FALSE ",
34 |             " NULL ", "TRUE.", "FALSE.", "NULL.", ":m:"),
35 |         c("&lt;", "&gt;", "**&lt;major&gt;.&lt;minor&gt;.&lt;patch&gt;**",
36 |             "**BUG FIXES**", "**NEW FEATURES**", "**MINOR FEATURES**",
37 |             "**CHANGES**", "**IMPROVEMENTS**", " `TRUE` ", "`FALSE`.", "`NULL`.", "`TRUE`.",
38 |             " `FALSE` ", " `NULL` ", " : m : "),
39 |             trim = FALSE, fixed=TRUE)
40 | 
41 |     News <- sub(pattern="issue *# *([0-9]+)",
42 |         replacement=sprintf("<a href=\"https://github.com/trinker/%s/issues/\\1\">issue #\\1</a>",
43 |         repo),
44 |         x=News)
45 | 
46 |     News <- sub(pattern="pull request *# *([0-9]+)",
47 |         replacement=sprintf("<a href=\"https://github.com/trinker/%s/issues/\\1\">pull request #\\1</a>",
48 |         repo),
49 |         x=News)
50 | 
51 |     News <- gsub(sprintf(" %s", repo),
52 |         sprintf(" <a href=\"https://github.com/trinker/%s\" target=\"_blank\">%s</a>",
53 |         repo, repo), News)
54 |     
55 |     News <- gsub(pattern="(#)([0-9]+)",
56 |         replacement=sprintf("<a href=\"https://github.com/trinker/%s/issues/\\2\">#\\2</a>", repo),
57 |         x=News)    
58 | 
59 |     cat(paste(News, collapse = "\n"), file = "NEWS.md")
60 |     message("news.md updated")
61 | }
62 | 


--------------------------------------------------------------------------------
/inst/docs/emoji_sample.txt:
--------------------------------------------------------------------------------
1 | Proin 😍 ut maecenas 😏 condimentum 😔 purus eget. Erat, 😂vitae nunc elit. Condimentum 😢 semper iaculis bibendum sed tellus. Ut suscipit interdum😑 in. Faucib😞 us nunc quis a vitae posuere. 😛 Eget amet sit condimentum non. Nascetur vitae ☹ et. Auctor ornare ☺ vestibulum primis justo congue 😀urna ac magna. Quam 😥 pharetra 😟 eros 😒facilisis ac lectus nibh est 😙vehicula 😐 ornare! Vitae, malesuada 😎 erat sociosqu urna, 😏 nec sed ad aliquet 😮 .
2 | 
3 | 
4 | 


--------------------------------------------------------------------------------
/inst/docs/r_tweets.txt:
--------------------------------------------------------------------------------
 1 | Person	Handle	Tweet
 2 | Mara Averick‏	@dataandme	Hello, helpful! 📦❌👾 "debugme: Easy & efficient debugging for R packages" 👨🏻‍💻 @GaborCsardi https://buff.ly/2nNKcps  #rstats
 3 | Mark Sellors‏	@sellorm	Did you ever get bored and accidentally create a 📦 to make #Rstats speak on a Mac? I have -> 
 4 | Alex Bresler‏	@abresler	A gift to my fellow nfl loving #rstats folks this package is 💥💥
 5 | Julia Silge‏	@juliasilge	Julia Silge Retweeted Jesse Sadler Ah, the REAL JOY of sitting and coding with another live person... 😅
 6 | Julia Silge‏	@juliasilge	💯 talk on algorithmic fairness by @geomblog at our SLC data science meetup
 7 | Julia Silge‏	@juliasilge	I ❤️💛💚💙💜 emoji, but 😭😭😭 also this:
 8 | Julia Silge‏	@juliasilge	NEW POST: Inspired by xkcd, generate new emoji sports in #rstats, like ⭐️🐣🔪  (so scary, if you ask me) and 🤷🏻🏃🏻‍♂️ (already popular???) https://juliasilge.com/blog/emoji-sports/ …
 9 | Maëlle Salmon	@ma_salmon	I had the honour to join @rweekly_org team last week. 😍 You too should contribute by sharing the fantastic #rstats content you stumble upon! How? ➡️
10 | Lucy	@LucyStats	Excited about the new R Release? #rstats 😋🎋 Welcome "Kite-Eating Tree"! 🙌🎥 gif courtesy of @pdalgd! 🤔👇 Curious about the other R release name origins? Check out our post! 🔗 http://livefreeordichotomize.com/2017/09/28/r-release-names/ …
11 | Julia Silge‏	@juliasilge	This post has helpful examples of how to find and fix various encoding issues in #rstats 👍 “Functions with R and rvest: A Laymen’s Guide” by @stankyy_pete
12 | Hadley Wickham‏	@hadleywickham	So 🎉😁👍🤗 to announce that @dataandme has joined my team as #tidyverse developer advocate #rstats


--------------------------------------------------------------------------------
/inst/extra_statdoc/readme.R:
--------------------------------------------------------------------------------
1 | <p><img src="https://raw.githubusercontent.com/trinker/textmod/master/inst/textmod_logo/r_textmod.png" width="300"/><br/>
2 | <p><a href="http://trinker.github.com/textmod_dev">textmod</a> is a...</p>
3 | <p>Download the development version of textmod <a href="https://github.com/trinker/textmod/">here</a>
4 | 


--------------------------------------------------------------------------------
/inst/maintenance.R:
--------------------------------------------------------------------------------
 1 | #========
 2 | # BUILD
 3 | #========
 4 | update_news(); source("inst/build.R")
 5 | 
 6 | #==========================
 7 | # Run unit tests
 8 | #==========================
 9 | devtools::test()
10 | 
11 | #==========================
12 | # knit README.md
13 | #==========================
14 | rmarkdown::render("README.Rmd", "all"); md_toc()
15 | 
16 | #==========================
17 | # UPDATE NEWS
18 | #==========================
19 | update_news()
20 | 
21 | #==========================
22 | # UPDATE VERSION
23 | #==========================
24 | update_version()
25 | 
26 | #========================
27 | #staticdocs dev version
28 | #========================
29 | 
30 | if (!require("pacman")) install.packages("pacman")
31 | pacman::p_load_gh("hadley/staticdocs", "trinker/acc.roxygen2")
32 | p_load(rstudioapi, qdap)
33 | 
34 | R_USER <-  switch(Sys.info()[["user"]],
35 |     Tyler = "C:/Users/Tyler",
36 |     trinker = "C:/Users/trinker",
37 |     message("Computer name not found")
38 | )
39 | build_site(pkg=file.path(R_USER, "GitHub", basename(getwd())), launch = FALSE)
40 | 
41 | #STEP 2: reshape index
42 | path <- "inst/web"
43 | path2 <- file.path(path, "/index.html")
44 | rdme <- file.path(R_USER, "GitHub", basename(getwd()), "inst/extra_statdoc/readme.R")
45 | 
46 | extras <- qcv("")
47 | ## drops <- qcv()
48 | expand_statdoc(path2, to.icon = extras, readme = rdme)
49 | 
50 | x <- readLines(path2)
51 | x[grepl("<h2>Authors</h2>", x)] <- paste(
52 |     c("<h2>Author</h2>" 
53 |     #rep("<h2>Contributor</h2>", 1)
54 |     ),
55 |     c("Tyler W. Rinker")
56 | )
57 | 
58 | cat(paste(x, collapse="\n"), file=path2)
59 | 
60 | #STEP 3: move to trinker.guthub
61 | library(reports)
62 | file <- file.path(R_USER, "/GitHub/trinker.github.com")
63 | # incoming <- file.path(file, basename(getwd()))
64 | delete(incoming)
65 | file.copy(path, file, TRUE, TRUE)
66 | file.rename(file.path(file, "web"), incoming)
67 | ## delete(path)
68 | 
69 | #==========================
70 | #staticdocs current version
71 | #==========================
72 | 
73 | #STEP 3: move to trinker.guthub
74 | library(reports)
75 | file <- file.path(R_USER, "/GitHub/trinker.github.com")
76 | incoming <- file.path(file, "discon")
77 | ##  delete(incoming); file.copy(path, file, TRUE, TRUE); file.rename(file.path(file, "web"), incoming)
78 | 
79 | #==========================
80 | # NEWS new version
81 | #==========================
82 | x <- c("BUG FIXES", "NEW FEATURES", "MINOR FEATURES", "IMPROVEMENTS", "CHANGES")
83 | cat(paste(x, collapse = "\n\n"), file="clipboard")
84 | 
85 | 
86 | 
87 | 


--------------------------------------------------------------------------------
/inst/scraping_scripts/google_ngram_to_canonical.R:
--------------------------------------------------------------------------------
 1 | if (!require("pacman")) install.packages("pacman")
 2 | pacman::p_load(dplyr, data.table, stringi, R.utils)
 3 | 
 4 | if (!dir.exists('google_ngram')) dir.create('google_ngram')
 5 | 
 6 | #letter <- 'b'
 7 | 
 8 | get_google_ngram_data <- function(letter, ...){
 9 |     
10 |     loc <- sprintf('http://storage.googleapis.com/books/ngrams/books/googlebooks-eng-all-1gram-20120701-%s.gz', letter) %>%
11 |         textreadr::download() 
12 | 
13 |     R.utils::gunzip(loc, destname = sprintf('google_ngram/googlebooks-eng-all-1gram-20120701-%s', letter), remove = FALSE)
14 | 
15 | }
16 | 
17 | 
18 | make_canonical_hash <- function(letter, ...){
19 | 
20 | 	dat <- fread(sprintf('google_ngram/googlebooks-eng-all-1gram-20120701-%s', letter), sep = '\t', header = FALSE)[, 1:3
21 | 		][V2 == 2008,
22 | 		][, V1 := tolower(stri_replace_all_regex(V1, '_[A-Z]+$', ''))
23 | 		][stri_detect_regex(V1, '[^a-z\'-]', negate = TRUE),
24 | 		][, list(V3 = sum(V3)), by = c('V1')
25 | 		][, canonical := gsub("([a-z])(\\1+)", '\\1', V1, perl = TRUE)
26 | 		][, cnt := .N, by = 'canonical'
27 | 		][cnt > 1,
28 | 		][, cnt := NULL
29 | 		#][order(canonical, -V3)
30 | 		][, .SD[which.max(V3)], by = 'canonical'
31 | 		][, V3 := NULL
32 | 		][]
33 | 
34 | 	setnames(dat, c('V1'), c('word'))
35 | 	setkey(dat, 'canonical')
36 | 
37 | 	#setnames(dat, c('V1', 'V3'), c('word', 'n'))
38 | 	#setcolorder(dat, c("canonical", "word", "n"))
39 | 
40 | 	dat
41 | }
42 | 
43 | canonical <- lapply(letters, function(letter){
44 |     gc()
45 |     print(letter); flush.console()
46 |     try(get_google_ngram_data(letter))
47 |     try(make_canonical_hash(letter))
48 | })
49 | 
50 | ## check for errors
51 | canonical %>%
52 |     sapply(inherits, 'try-error') %>%
53 |     sum()
54 | 
55 | 
56 | canonical %>%
57 |     rbindlist() %>%
58 |     unique() %>%
59 |     saveRDS('canonical.rds')
60 | 
61 | 
62 | canonical <- readRDS('C:\\Users\\Tyler\\Desktop/canonical.rds')
63 | ## canonical <- textclean:::canonical
64 | data.table::setkey(canonical, 'canonical')
65 | devtools::use_data(canonical, internal = TRUE)
66 | 
67 | 
68 | 
69 | 
70 | 


--------------------------------------------------------------------------------
/inst/scraping_scripts/scrape_leet.R:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/trinker/textclean/5443d7484cd798e7494a63f794e6fb30834e5ec2/inst/scraping_scripts/scrape_leet.R


--------------------------------------------------------------------------------
/inst/staticdocs/index.R:
--------------------------------------------------------------------------------
1 | library(staticdocs)
2 | 
3 | sd_section("",
4 |   "Function for...",
5 |   c(
6 |       "myfun"
7 |   )
8 | )


--------------------------------------------------------------------------------
/man/DATA.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/textclean-package.R
 3 | \docType{data}
 4 | \name{DATA}
 5 | \alias{DATA}
 6 | \title{Fictitious Classroom Dialogue}
 7 | \format{
 8 | A data frame with 11 rows and 5 variables
 9 | }
10 | \usage{
11 | data(DATA)
12 | }
13 | \description{
14 | A fictitious dataset useful for small demonstrations.
15 | }
16 | \details{
17 | \itemize{
18 |   \item person. Speaker
19 |   \item sex. Gender
20 |   \item adult. Dummy coded adult (0-no; 1-yes)
21 |   \item state. Statement (dialogue)
22 |   \item code. Dialogue coding scheme
23 | }
24 | }
25 | \keyword{datasets}
26 | 


--------------------------------------------------------------------------------
/man/add_comma_space.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/add_comma_space.R
 3 | \name{add_comma_space}
 4 | \alias{add_comma_space}
 5 | \title{Ensure Space After Comma}
 6 | \usage{
 7 | add_comma_space(x)
 8 | }
 9 | \arguments{
10 | \item{x}{The text variable.}
11 | }
12 | \value{
13 | Returns a vector of strings with commas that have a space after them.
14 | }
15 | \description{
16 | Adds a space after a comma as \code{strip} and many other functions may consider a 
17 | comma separated string as one word (i.e., \code{"one,two,three"} becomes 
18 | \code{"onetwothree"}  rather than \code{"one two three"}).
19 | }
20 | \examples{
21 | \dontrun{
22 | x <- c("the,  dog,went", "I,like,it", "where are you", NA, "why", ",", ",f")
23 | add_comma_space(x)
24 | }
25 | }
26 | \keyword{comma}
27 | \keyword{space}
28 | 


--------------------------------------------------------------------------------
/man/add_missing_endmark.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/add_missing_endmark.R
 3 | \name{add_missing_endmark}
 4 | \alias{add_missing_endmark}
 5 | \title{Add Missing Endmarks}
 6 | \usage{
 7 | add_missing_endmark(x, replacement = "|", endmarks = c("?", ".", "!"), ...)
 8 | }
 9 | \arguments{
10 | \item{x}{The text variable.}
11 | 
12 | \item{replacement}{Character string equal in length to pattern or of length 
13 | one which are  a replacement for matched pattern.}
14 | 
15 | \item{endmarks}{The potential ending punctuation marks.}
16 | 
17 | \item{\dots}{Additional arguments passed to 
18 | \code{\link[textclean]{has_endmark}}.}
19 | }
20 | \value{
21 | Returns a vector with missing endmarks added.
22 | }
23 | \description{
24 | Detect missing endmarks and replace with the desired symbol.
25 | }
26 | \examples{
27 | x <- c(
28 |     "This in a", 
29 |     "I am funny!", 
30 |     "An ending of sorts\%", 
31 |     "What do you want?"
32 | )
33 | 
34 | add_missing_endmark(x)
35 | }
36 | 


--------------------------------------------------------------------------------
/man/check_text.Rd:
--------------------------------------------------------------------------------
  1 | % Generated by roxygen2: do not edit by hand
  2 | % Please edit documentation in R/check_text.R
  3 | \name{check_text}
  4 | \alias{check_text}
  5 | \alias{available_checks}
  6 | \title{Check Text For Potential Problems}
  7 | \usage{
  8 | check_text(x, file = NULL, checks = NULL, n = 10, ...)
  9 | 
 10 | available_checks()
 11 | }
 12 | \arguments{
 13 | \item{x}{The text variable.}
 14 | 
 15 | \item{file}{A connection, or a character string naming the file to print to.
 16 | If \code{NULL} prints to the console.  Note that this is assigned as an 
 17 | attribute and passed to \code{print}.}
 18 | 
 19 | \item{checks}{A vector of checks to include from \code{which_are}.  If 
 20 | \code{checks = NULL}, all checks from \code{which_are} which be used.  Note
 21 | that all meta checks will be conducted (see \code{which_are} for details on
 22 | meta checks).}
 23 | 
 24 | \item{n}{The number of affected elements to print out (the rest are truncated).}
 25 | 
 26 | \item{\ldots}{ignored.}
 27 | }
 28 | \value{
 29 | Returns a list with the following potential text faults report:\cr
 30 | \itemize{
 31 |   \item{contraction}{- Text elements that contain contractions}
 32 |   \item{date}{- Text elements that contain dates}
 33 |   \item{digit}{- Text elements that contain digits/numbers}
 34 |   \item{email}{- Text elements that contain email addresses}
 35 |   \item{emoticon}{- Text elements that contain emoticons}
 36 |   \item{empty}{- Text elements that contain empty text cells (all white space)}
 37 |   \item{escaped}{- Text elements that contain escaped back spaced characters}
 38 |   \item{hash}{- Text elements that contain Twitter style hash tags (e.g., #rstats)}
 39 |   \item{html}{- Text elements that contain HTML markup}
 40 |   \item{incomplete}{- Text elements that contain incomplete sentences (e.g., uses ending punctuation like ...)}
 41 |   \item{kern}{- Text elements that contain kerning (e.g., 'The B O M B!')}
 42 |   \item{list_column}{- Text variable that is a list column}
 43 |   \item{missing_value}{- Text elements that contain missing values}
 44 |   \item{misspelled}{- Text elements that contain potentially misspelled words}
 45 |   \item{no_alpha}{- Text elements that contain elements with no alphabetic (a-z) letters}
 46 |   \item{no_endmark}{- Text elements that contain elements with missing ending punctuation}
 47 |   \item{no_space_after_comma}{- Text elements that contain commas with no space afterwards}
 48 |   \item{non_ascii}{- Text elements that contain non-ASCII text}
 49 |   \item{non_character}{- Text variable that is not a character column (likely \code{factor})}
 50 |   \item{non_split_sentence}{- Text elements that contain unsplit sentences (more than one sentence per element)}
 51 |   \item{tag}{- Text elements that contain Twitter style handle tags (e.g., @trinker)}
 52 |   \item{time}{- Text elements that contain timestamps}
 53 |   \item{url}{- Text elements that contain URLs}
 54 | }
 55 | }
 56 | \description{
 57 | \code{check_text} - Uncleaned text may result in errors, warnings, and 
 58 | incorrect results in subsequent analysis.  \code{check_text} checks text for 
 59 | potential problems and suggests possible fixes.  Potential text anomalies 
 60 | that are detected include: factors, missing ending punctuation, empty cells, 
 61 | double punctuation, non-space after comma, no alphabetic characters, 
 62 | non-ASCII, missing value, and potentially misspelled words.
 63 | 
 64 | \code{available_check} - Provide a data.frame view of all the available 
 65 | checks in the \code{check_text} function.
 66 | }
 67 | \note{
 68 | The output is a list containing meta checks and elemental checks
 69 | but prints as a pretty formatted output with potential problem elements, the 
 70 | accompanying text, and possible suggestions to fix the text.
 71 | }
 72 | \examples{
 73 | \dontrun{
 74 | v <- list(c('foo', 'bar'), NA, c('hello', 'world'))
 75 | check_text(v)
 76 | 
 77 | w <- factor(unlist(v))
 78 | check_text(w)
 79 | 
 80 | x <- c("i like", "<p>i want. </p>thet them ther .", "I am ! that|", "", NA, 
 81 |     "&quot;they&quot;,were there", ".", "   ", "?", "3;", "I like goud eggs!", 
 82 |     "i 4like...", "\\\\tgreat",  'She said "yes"')
 83 | check_text(x)
 84 | print(check_text(x), include.text=FALSE)
 85 | check_text(x, checks = c('non_split_sentence', 'no_endmark'))
 86 | elementals <- available_checks()[is_meta != TRUE,][['fun']]
 87 | check_text(
 88 |     x, 
 89 |     checks = elementals[
 90 |         !elementals \%in\% c('non_split_sentence', 'no_endmark')
 91 |     ]
 92 | )
 93 | 
 94 | y <- c("A valid sentence.", "yet another!")
 95 | check_text(y)
 96 | 
 97 | z <- rep("dfsdsd'nt", 120)
 98 | check_text(z)
 99 | }
100 | }
101 | \keyword{check}
102 | \keyword{spelling}
103 | \keyword{text}
104 | 


--------------------------------------------------------------------------------
/man/drop_element.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/drop_element.R
 3 | \name{drop_element}
 4 | \alias{drop_element}
 5 | \alias{drop_element_regex}
 6 | \alias{drop_element_fixed}
 7 | \alias{keep_element}
 8 | \alias{keep_element_fixed}
 9 | \alias{keep_element_regex}
10 | \title{Filter Elements in a Vetor}
11 | \usage{
12 | drop_element(x, pattern, regex = TRUE, ...)
13 | 
14 | drop_element_regex(x, pattern, ...)
15 | 
16 | drop_element_fixed(x, ...)
17 | 
18 | keep_element(x, pattern, regex = TRUE, ...)
19 | 
20 | keep_element_fixed(x, ...)
21 | 
22 | keep_element_regex(x, pattern, ...)
23 | }
24 | \arguments{
25 | \item{x}{A character vector.}
26 | 
27 | \item{pattern}{A regex pattern to match for exclusion.}
28 | 
29 | \item{regex}{logical.  If setting this to \code{TRUE} please use 
30 | \code{drop_element_regex} or \code{keep_element_regex} directly as this will
31 | provide better control and optimization.}
32 | 
33 | \item{\ldots}{Other arguments passed to \code{\link[base]{grep}} if 
34 | \code{regex}.  If \code{fixed}, then elements to drop/keep.}
35 | }
36 | \value{
37 | Returns a vector with matching elements removed.
38 | }
39 | \description{
40 | \code{drop_element} - Filter to drop the matching elements of a vector.
41 | 
42 | \code{keep_element} - Filter to keep the matching elements of a vector.
43 | }
44 | \examples{
45 | x <- c('dog', 'cat', 'bat', 'dingo', 'dragon', 'dino')
46 | drop_element(x, '^d.+?g')
47 | keep_element(x, '^d.+?g')
48 | drop_element(x, 'at$')
49 | drop_element(x, '^d')
50 | drop_element(x, '\\\\b(dog|cat)\\\\b')
51 | 
52 | drop_element_fixed(x, 'bat', 'cat')
53 | drops <- c('bat', 'cat')
54 | drop_element_fixed(x, drops)
55 | }
56 | 


--------------------------------------------------------------------------------
/man/drop_row.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/drop_row.R
 3 | \name{drop_row}
 4 | \alias{drop_row}
 5 | \alias{keep_row}
 6 | \alias{drop_empty_row}
 7 | \alias{drop_NA}
 8 | \title{Filter Rows That Contain Markers}
 9 | \usage{
10 | drop_row(dataframe, column, terms, ...)
11 | 
12 | keep_row(dataframe, column, terms, ...)
13 | 
14 | drop_empty_row(dataframe)
15 | 
16 | drop_NA(dataframe, column = TRUE, ...)
17 | }
18 | \arguments{
19 | \item{dataframe}{A dataframe object.}
20 | 
21 | \item{column}{Column name to search for markers/terms.}
22 | 
23 | \item{terms}{The regex terms/markers of the rows that are to be removed from 
24 | the dataframe.}
25 | 
26 | \item{\ldots}{Other arguments passed to \code{\link[base]{grepl}}.}
27 | }
28 | \value{
29 | \code{drop_row} - returns a dataframe with the termed/markered rows 
30 | removed.
31 | 
32 | \code{drop_empty_row} - returns a dataframe with empty rows removed.
33 | 
34 | \code{drop_NA} - returns a dataframe with \code{NA} rows removed.
35 | }
36 | \description{
37 | \code{drop_row} - Remove rows from a data set that contain a given 
38 | marker/term.
39 | 
40 | \code{keep_row} - Keep rows from a data set that contain a given marker/term.
41 | 
42 | \code{drop_empty_row} - Removes the empty rows of a data set that are common in 
43 | reading in data.
44 | 
45 | \code{drop_NA} - Removes the \code{NA} rows of a data set.
46 | }
47 | \examples{
48 | \dontrun{
49 | ## drop_row EXAMPLE:
50 | drop_row(DATA, "person", c("sam", "greg"))
51 | keep_row(DATA, "person", c("sam", "greg"))
52 | drop_row(DATA, 1, c("sam", "greg"))
53 | drop_row(DATA, "state", c("Comp"))
54 | drop_row(DATA, "state", c("I "))
55 | drop_row(DATA, "state", c("you"), ignore.case=TRUE)
56 | 
57 | ## drop_empty_row EXAMPLE:
58 | (dat <- rbind.data.frame(DATA[, c(1, 4)], matrix(rep(" ", 4), 
59 |    ncol =2, dimnames=list(12:13, colnames(DATA)[c(1, 4)]))))
60 | drop_empty_row(dat)
61 | 
62 | ## drop_NA EXAMPLE:
63 | DATA[1:3, "state"] <- NA
64 | drop_NA(DATA)
65 | }
66 | }
67 | 


--------------------------------------------------------------------------------
/man/fgsub.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/fgsub.R
 3 | \name{fgsub}
 4 | \alias{fgsub}
 5 | \title{Replace a Regex with an Functional Operation on the Regex Match}
 6 | \usage{
 7 | fgsub(x, pattern, fun, ...)
 8 | }
 9 | \arguments{
10 | \item{x}{A character vector.}
11 | 
12 | \item{pattern}{Character string to be matched in the given character vector.}
13 | 
14 | \item{fun}{A function to operate on the extracted matches.}
15 | 
16 | \item{\ldots}{ignored.}
17 | }
18 | \value{
19 | Returns a vector with the pattern replaced.
20 | }
21 | \description{
22 | This is a stripped down version of \code{gsubfn} from the \pkg{gsubfn} 
23 | package.  It finds a regex match, and then uses a function to operate on
24 | these matches and uses them to replace the original matches.  Note that
25 | the \pkg{stringi} packages is used for matching and extracting the regex 
26 | matches.  For more powerful or flexible needs please see the \pkg{gsubfn} 
27 | package.
28 | }
29 | \examples{
30 | ## In this example the regex looks for words that contain a lower case letter 
31 | ## followed by the same letter at least 2 more times.  It then extracts these
32 | ## words, splits them appart into letters, reverses the string, pastes them
33 | ## back together, wraps them with double angle braces, and then puts them back 
34 | ## at the original locations.
35 | fgsub(
36 |     x = c(NA, 'df dft sdf', 'sd fdggg sd dfhhh d', 'ddd'),
37 |     pattern = "\\\\b\\\\w*([a-z])(\\\\1{2,})\\\\w*\\\\b",
38 |     fun = function(x) {
39 |         paste0('<<', paste(rev(strsplit(x, '')[[1]]), collapse =''), '>>')
40 |     }    
41 | )
42 | 
43 | ## In this example we extract numbers, strip out non-digits, coerce them to 
44 | ## numeric, cut them in half, round up to the closest integer, add the commas 
45 | ## back, and replace back into the original locations.
46 | fgsub(
47 |     x = c(NA, 'I want 32 grapes', 'he wants 4 ice creams', 
48 |         'they want 1,234,567 dollars'
49 |     ),
50 |     pattern = "[\\\\d,]+",
51 |     fun = function(x) {
52 |         prettyNum(
53 |             ceiling(as.numeric(gsub('[^0-9]', '', x))/2), 
54 |             big.mark = ','
55 |         )
56 |     }    
57 | )
58 | 
59 | ## In this example we extract leading zeros, convert to an equal number of 
60 | ## spaces. 
61 | fgsub(
62 |     x = c(NA, "00:04", "00:08", "00:01", "06:14", "00:02", "00:04"),
63 |     pattern = '^0+',
64 |     fun = function(x) {gsub('0', ' ', x)}
65 | )
66 | }
67 | \seealso{
68 | \code{\link[gsubfn]{gsubfn}}
69 | }
70 | 


--------------------------------------------------------------------------------
/man/fix_mdyyyy.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/fix_mdyyyy.R
 3 | \name{fix_mdyyyy}
 4 | \alias{fix_mdyyyy}
 5 | \title{Coerce Character m/d/yyyy to Date}
 6 | \usage{
 7 | fix_mdyyyy(x, ...)
 8 | }
 9 | \arguments{
10 | \item{x}{A character date in the form of m/d/yyyy where m and d can be single 
11 | integers like 1 for January.}
12 | 
13 | \item{\ldots}{ignored.}
14 | }
15 | \value{
16 | Returns a data vector
17 | }
18 | \description{
19 | Uses regular expressions to sub out a single day or month with a leading zero
20 | and then coerces to a date object.
21 | }
22 | \examples{
23 | fix_mdyyyy(c('4/23/2017', '12/1/2016', '3/3/2013', '12/12/2012', '2013-01-01'))
24 | \dontrun{
25 | library(dplyr)
26 | data_frame(
27 |     x = 1:4,
28 |     y = LETTERS[1:4],
29 |     start_date = c('4/23/2017', '12/1/2016', '3/3/2013', '12/12/2012'),
30 |     end_date = c('5/23/2017', '12/9/2016', '3/3/2016', '2/01/2012')
31 | ) \%>\%
32 | mutate_at(vars(ends_with('_date')), fix_mdyyyy)
33 | }
34 | }
35 | 


--------------------------------------------------------------------------------
/man/has_endmark.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/has_endmark.R
 3 | \name{has_endmark}
 4 | \alias{has_endmark}
 5 | \title{Test for Incomplete Sentences}
 6 | \usage{
 7 | has_endmark(x, endmarks = c("?", ".", "!"), ...)
 8 | }
 9 | \arguments{
10 | \item{x}{A character vector.}
11 | 
12 | \item{endmarks}{The potential ending punctuation marks,}
13 | 
14 | \item{\dots}{ignored.}
15 | }
16 | \value{
17 | Returns a logical vector.
18 | }
19 | \description{
20 | A logical test of missing sentence ending punctuation.
21 | }
22 | \examples{
23 | x <- c(
24 |     "I like it.", 
25 |     "Et tu?",  
26 |     "Not so much", 
27 |     "Oh, I understand.",  
28 |     "At 3 p.m., we go",
29 |     NA
30 | )
31 | has_endmark(x)
32 | }
33 | \keyword{incomplete}
34 | 


--------------------------------------------------------------------------------
/man/like.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/like.R
 3 | \name{\%like\%}
 4 | \alias{\%like\%}
 5 | \alias{\%LIKE\%}
 6 | \alias{\%slike\%}
 7 | \alias{\%SLIKE\%}
 8 | \title{SQL Style LIKE}
 9 | \usage{
10 | var \%like\% pattern
11 | 
12 | var \%LIKE\% pattern
13 | 
14 | var \%slike\% pattern
15 | 
16 | var \%SLIKE\% pattern
17 | }
18 | \arguments{
19 | \item{var}{A variable/column.}
20 | 
21 | \item{pattern}{A search pattern.}
22 | }
23 | \description{
24 | Use like as a SQL-esque opertator for pattern matching.  \code{\%like\%} is
25 | case insensitive while \code{\%slike\%} is case sensitive.  This is most useful 
26 | in a \code{dplyr::filter}.
27 | }
28 | \examples{
29 | state.name[state.name \%like\% 'or']
30 | state.name[state.name \%LIKE\% 'or']
31 | state.name[state.name \%slike\% 'or'] ## No Oregon
32 | }
33 | 


--------------------------------------------------------------------------------
/man/make_plural.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/make_plural.R
 3 | \name{make_plural}
 4 | \alias{make_plural}
 5 | \title{Make Plural (or Verb to Singular) Versions of Words}
 6 | \usage{
 7 | make_plural(
 8 |   x,
 9 |   keep.original = FALSE,
10 |   irregular = lexicon::pos_df_irregular_nouns
11 | )
12 | }
13 | \arguments{
14 | \item{x}{A vector of words to make plural.}
15 | 
16 | \item{keep.original}{logical.  If \code{TRUE} the original words are kept in 
17 | the return vector.}
18 | 
19 | \item{irregular}{A \code{data.frame} of singular and plural conversions for 
20 | irregular nouns.  The first column should be singular and the second plural 
21 | form of the irregular noun.}
22 | }
23 | \value{
24 | Returns a vector of plural words.
25 | }
26 | \description{
27 | Add -s, -es, or -ies to words.
28 | }
29 | \examples{
30 | x <- c('fox', 'sky', 'dog', 'church', 'fish', 'miss', 'match', 'deer', 'block')
31 | make_plural(x)
32 | }
33 | \keyword{plural}
34 | 


--------------------------------------------------------------------------------
/man/match_tokens.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/match_tokens.R
 3 | \name{match_tokens}
 4 | \alias{match_tokens}
 5 | \title{Find Tokens that Match a Regex}
 6 | \usage{
 7 | match_tokens(x, pattern, ignore.case = TRUE, ...)
 8 | }
 9 | \arguments{
10 | \item{x}{A character vector.}
11 | 
12 | \item{pattern}{Character string(s) to be matched in the given character vector.}
13 | 
14 | \item{ignore.case}{logical.  If \code{TRUE} the case of the tokens/patterns 
15 | will be ignored.}
16 | 
17 | \item{\ldots}{ignored.}
18 | }
19 | \value{
20 | Returns a vector of tokens from a text matching a specific regex 
21 | pattern.
22 | }
23 | \description{
24 | Given a text, find all the tokens that match a regex(es).  This function is
25 | particularly useful with \code{\link[textclean]{replace_tokens}}.
26 | }
27 | \examples{
28 | with(DATA, match_tokens(state, c('^li', 'ou')))
29 | 
30 | with(DATA, match_tokens(state, c('^Th', '^I'), ignore.case = TRUE))
31 | with(DATA, match_tokens(state, c('^Th', '^I'), ignore.case = FALSE))
32 | }
33 | \seealso{
34 | \code{\link[textclean]{replace_tokens}}
35 | }
36 | 


--------------------------------------------------------------------------------
/man/mgsub.Rd:
--------------------------------------------------------------------------------
  1 | % Generated by roxygen2: do not edit by hand
  2 | % Please edit documentation in R/mgsub.R
  3 | \name{mgsub}
  4 | \alias{mgsub}
  5 | \alias{mgsub_fixed}
  6 | \alias{mgsub_regex}
  7 | \alias{mgsub_regex_safe}
  8 | \title{Multiple \code{\link[base]{gsub}}}
  9 | \usage{
 10 | mgsub(
 11 |   x,
 12 |   pattern,
 13 |   replacement,
 14 |   leadspace = FALSE,
 15 |   trailspace = FALSE,
 16 |   fixed = TRUE,
 17 |   trim = FALSE,
 18 |   order.pattern = fixed,
 19 |   safe = FALSE,
 20 |   ...
 21 | )
 22 | 
 23 | mgsub_fixed(
 24 |   x,
 25 |   pattern,
 26 |   replacement,
 27 |   leadspace = FALSE,
 28 |   trailspace = FALSE,
 29 |   fixed = TRUE,
 30 |   trim = FALSE,
 31 |   order.pattern = fixed,
 32 |   safe = FALSE,
 33 |   ...
 34 | )
 35 | 
 36 | mgsub_regex(
 37 |   x,
 38 |   pattern,
 39 |   replacement,
 40 |   leadspace = FALSE,
 41 |   trailspace = FALSE,
 42 |   fixed = FALSE,
 43 |   trim = FALSE,
 44 |   order.pattern = fixed,
 45 |   ...
 46 | )
 47 | 
 48 | mgsub_regex_safe(x, pattern, replacement, ...)
 49 | }
 50 | \arguments{
 51 | \item{x}{A character vector.}
 52 | 
 53 | \item{pattern}{Character string to be matched in the given character vector.}
 54 | 
 55 | \item{replacement}{Character string equal in length to pattern or of length 
 56 | one which are a replacement for matched pattern.}
 57 | 
 58 | \item{leadspace}{logical.  If \code{TRUE} inserts a leading space in the 
 59 | replacements.}
 60 | 
 61 | \item{trailspace}{logical.  If \code{TRUE} inserts a trailing space in the 
 62 | replacements.}
 63 | 
 64 | \item{fixed}{logical. If \code{TRUE}, pattern is a string to be matched as 
 65 | is. 
 66 | Overrides all conflicting arguments.}
 67 | 
 68 | \item{trim}{logical.  If \code{TRUE} leading and trailing white spaces are 
 69 | removed and multiple white spaces are reduced to a single white space.}
 70 | 
 71 | \item{order.pattern}{logical.  If \code{TRUE} and \code{fixed = TRUE}, the 
 72 | \code{pattern} string is sorted by number of characters to prevent substrings 
 73 | replacing meta strings (e.g., \code{pattern = c("the", "then")} resorts to 
 74 | search for "then" first).}
 75 | 
 76 | \item{safe}{logical.  If \code{TRUE} then the \pkg{mgsub} package is used as
 77 | the backend and performs safe substitutions.  The trade-off is that this mode
 78 | will slow the replacements down considerably.}
 79 | 
 80 | \item{\dots}{Additional arguments passed to \code{\link[base]{gsub}}. In 
 81 | \code{mgsub_regex_safe} this is other arguments passed to 
 82 | \code{\link[mgsub]{mgsub}}.}
 83 | }
 84 | \value{
 85 | \code{mgsub} - Returns a vector with the pattern replaced.
 86 | }
 87 | \description{
 88 | \code{mgsub} - A wrapper for \code{\link[base]{gsub}} that takes a vector 
 89 | of search terms and a vector or single value of replacements.
 90 | 
 91 | \code{mgsub_fixed} - An alias for \code{mgsub}.
 92 | 
 93 | \code{mgsub_regex} - An wrapper for \code{mgsub} with \code{fixed = FALSE}.
 94 | 
 95 | \code{mgsub_regex_safe} - An wrapper for \code{\link[mgsub]{mgsub}}.
 96 | }
 97 | \examples{
 98 | mgsub(DATA$state, c("it's", "I'm"), c("it is", "I am"))
 99 | mgsub(DATA$state, "[[:punct:]]", "PUNC", fixed = FALSE)
100 | \dontrun{
101 | library(textclean)
102 | hunthou <- replace_number(seq_len(1e5)) 
103 | 
104 | textclean::mgsub(
105 |     "'twenty thousand three hundred five' into 20305", 
106 |     hunthou, 
107 |     seq_len(1e5)
108 | )
109 | ## "'20305' into 20305"
110 | 
111 | ## Larger example from: https://stackoverflow.com/q/18332463/1000343
112 | ## A slower approach
113 | fivehunthou <- replace_number(seq_len(5e5)) 
114 | 
115 | testvect <- c("fifty seven", "four hundred fifty seven", 
116 |     "six thousand four hundred fifty seven", 
117 |     "forty six thousand four hundred fifty seven", 
118 |     "forty six thousand four hundred fifty seven", 
119 |     "three hundred forty six thousand four hundred fifty seven"
120 | )
121 | 
122 | textclean::mgsub(testvect, fivehunthou, seq_len(5e5))
123 | 
124 | ## Safe substitution: Uses the mgsub package as the backend
125 | dubious_string <- "Dopazamine is a fake chemical"
126 | pattern <- c("dopazamin","do.*ne")
127 | replacement <- c("freakout","metazamine")
128 | 
129 | mgsub(dubious_string, pattern, replacement, ignore.case = TRUE, fixed = FALSE)
130 | mgsub(dubious_string, pattern, replacement, safe = TRUE, fixed = FALSE)
131 | }
132 | }
133 | \seealso{
134 | \code{\link[textclean]{replace_tokens}}
135 | \code{\link[base]{gsub}}
136 | }
137 | 


--------------------------------------------------------------------------------
/man/print.check_text.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/check_text.R
 3 | \name{print.check_text}
 4 | \alias{print.check_text}
 5 | \title{Prints a check_text Object}
 6 | \usage{
 7 | \method{print}{check_text}(x, include.text = TRUE, file = NULL, n = NULL, ...)
 8 | }
 9 | \arguments{
10 | \item{x}{The check_text object.}
11 | 
12 | \item{include.text}{logical.  If \code{TRUE} the offending text is printed as 
13 | well.}
14 | 
15 | \item{file}{A connection, or a character string naming the file to print to.
16 | If \code{NULL} prints to the console.}
17 | 
18 | \item{n}{The number of affected elements to print out (the rest are 
19 | truncated)}
20 | 
21 | \item{\ldots}{ignored}
22 | }
23 | \description{
24 | Prints a check_text object.
25 | }
26 | 


--------------------------------------------------------------------------------
/man/print.sub_holder.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/sub_holder.R
 3 | \name{print.sub_holder}
 4 | \alias{print.sub_holder}
 5 | \title{Prints a sub_holder object}
 6 | \usage{
 7 | \method{print}{sub_holder}(x, ...)
 8 | }
 9 | \arguments{
10 | \item{x}{The sub_holder object}
11 | 
12 | \item{\ldots}{ignored}
13 | }
14 | \description{
15 | Prints a sub_holder object
16 | }
17 | 


--------------------------------------------------------------------------------
/man/print.which_are_locs.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/check_text_logicals.R
 3 | \name{print.which_are_locs}
 4 | \alias{print.which_are_locs}
 5 | \title{Prints a which_are_locs Object}
 6 | \usage{
 7 | \method{print}{which_are_locs}(x, n = 100, file = NULL, ...)
 8 | }
 9 | \arguments{
10 | \item{x}{A which_are_locs object}
11 | 
12 | \item{n}{The number of affected elements to print out (the rest are 
13 | truncated)}
14 | 
15 | \item{file}{Path to an external file to print to}
16 | 
17 | \item{\ldots}{ignored.}
18 | }
19 | \description{
20 | Prints a which_are_locs object
21 | }
22 | 


--------------------------------------------------------------------------------
/man/reexports.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/glue-reexports.R
 3 | \docType{import}
 4 | \name{reexports}
 5 | \alias{reexports}
 6 | \alias{glue}
 7 | \alias{glue_collapse}
 8 | \title{Objects exported from other packages}
 9 | \keyword{internal}
10 | \description{
11 | These objects are imported from other packages. Follow the links
12 | below to see their documentation.
13 | 
14 | \describe{
15 |   \item{glue}{\code{\link[glue]{glue}}, \code{\link[glue]{glue_collapse}}}
16 | }}
17 | 
18 | 


--------------------------------------------------------------------------------
/man/replace_contraction.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/replace_contraction.R
 3 | \name{replace_contraction}
 4 | \alias{replace_contraction}
 5 | \title{Replace Contractions}
 6 | \usage{
 7 | replace_contraction(
 8 |   x,
 9 |   contraction.key = lexicon::key_contractions,
10 |   ignore.case = TRUE,
11 |   ...
12 | )
13 | }
14 | \arguments{
15 | \item{x}{The text variable.}
16 | 
17 | \item{contraction.key}{A two column hash of contractions (column 1) and 
18 | expanded form replacements (column 2).  Default is to use 
19 | \code{\link[lexicon]{key_contractions}} data set.}
20 | 
21 | \item{ignore.case}{logical.  Should case be ignored?}
22 | 
23 | \item{\dots}{ignored.}
24 | }
25 | \value{
26 | Returns a vector with contractions replaced.
27 | }
28 | \description{
29 | This function replaces contractions with long form.
30 | }
31 | \examples{
32 | \dontrun{
33 | x <- c("Mr. Jones isn't going.",  
34 |     "Check it out what's going on.",
35 |     "He's here but didn't go.",
36 |     "the robot at t.s. wasn't nice", 
37 |     "he'd like it if i'd go away")
38 | 
39 | replace_contraction(x)
40 | }
41 | }
42 | \keyword{contraction}
43 | 


--------------------------------------------------------------------------------
/man/replace_date.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/replace_date.R
 3 | \name{replace_date}
 4 | \alias{replace_date}
 5 | \title{Replace Dates With Words}
 6 | \usage{
 7 | replace_date(x, pattern = NULL, replacement = NULL, ...)
 8 | }
 9 | \arguments{
10 | \item{x}{The text variable.}
11 | 
12 | \item{pattern}{Character date regex string to be matched in the given 
13 | character vector.}
14 | 
15 | \item{replacement}{A function to operate on the extracted matches or a 
16 | character string which is a replacement for the matched pattern.}
17 | 
18 | \item{\ldots}{ignored.}
19 | }
20 | \value{
21 | Returns a vector with the pattern replaced.
22 | }
23 | \description{
24 | Replaces dates with word equivalents.
25 | }
26 | \examples{
27 | x <- c(
28 |     NA, '11-16-1980 and 11/16/1980', 
29 |     "and 2017-02-08 but then there's 2/8/2017 too"
30 | )
31 | 
32 | replace_date(x)
33 | replace_date(x, replacement = '<<DATE>>')
34 | }
35 | 


--------------------------------------------------------------------------------
/man/replace_email.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/replace_email.R
 3 | \name{replace_email}
 4 | \alias{replace_email}
 5 | \title{Replace Email Addresses}
 6 | \usage{
 7 | replace_email(x, pattern = qdapRegex::grab("rm_email"), replacement = "", ...)
 8 | }
 9 | \arguments{
10 | \item{x}{The text variable.}
11 | 
12 | \item{pattern}{Character time regex string to be matched in the given 
13 | character vector.}
14 | 
15 | \item{replacement}{A function to operate on the extracted matches or a 
16 | character string which is a replacement for the matched pattern.}
17 | 
18 | \item{\ldots}{ignored.}
19 | }
20 | \value{
21 | Returns a vector with email addresses replaced.
22 | }
23 | \description{
24 | Replaces email addresses.
25 | }
26 | \examples{
27 | x <- c(
28 |     "fred is fred@foo.com and joe is joe@example.com - but @this is a", 
29 |     "twitter handle for twit@here.com or foo+bar@google.com/fred@foo.fnord", 
30 |     "hello world", 
31 |     NA
32 | )
33 | 
34 | replace_email(x)
35 | replace_email(x, replacement = '<<EMAIL>>')
36 | replace_email(x, replacement = '<a href="mailto:$1" target="_blank">$1</a>')
37 | 
38 | ## Replacement with a function
39 | replace_email(x, 
40 |     replacement = function(x){
41 |         sprintf('<a href="mailto:\%s" target="_blank">\%s</a>', x, x)
42 |     }
43 | )
44 | 
45 | 
46 | replace_email(x, 
47 |     replacement = function(x){
48 |         gsub('@.+$', ' {{at domain}}', x)
49 |     }
50 | )
51 | }
52 | 


--------------------------------------------------------------------------------
/man/replace_emoji.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/replace_emoji.R
 3 | \name{replace_emoji}
 4 | \alias{replace_emoji}
 5 | \alias{replace_emoji_identifier}
 6 | \title{Replace Emojis With Words/Identifier}
 7 | \usage{
 8 | replace_emoji(x, emoji_dt = lexicon::hash_emojis, ...)
 9 | 
10 | replace_emoji_identifier(x, emoji_dt = lexicon::hash_emojis_identifier, ...)
11 | }
12 | \arguments{
13 | \item{x}{The text variable.}
14 | 
15 | \item{emoji_dt}{A \pkg{data.table} of emojis (ASCII byte representations)
16 | and corresponding word/identifier meanings.}
17 | 
18 | \item{\ldots}{Other arguments passed to \code{.mgsub} (see
19 | \code{textclean:::.mgsub} for details).}
20 | }
21 | \value{
22 | Returns a vector of strings with emojis replaced with word
23 | equivalents.
24 | }
25 | \description{
26 | Replaces emojis with word equivalents or a token identifier for use in the
27 | \pkg{sentimentr} package.  Not that this function will coerce the text to 
28 | ASCII using 
29 | \code{Encoding(x) <- "latin1"; iconv(x, "latin1", "ASCII", "byte")}.
30 | The function \code{replace_emoji} replaces emojis with text representations
31 | while \code{replace_emoji_identifier} replaces with a unique identifier that
32 | corresponds to \code{lexicon::hash_sentiment_emoji} for use in the 
33 | \pkg{sentimentr} package.
34 | }
35 | \examples{
36 | fls <- system.file("docs/emoji_sample.txt", package = "textclean")
37 | x <- readLines(fls)[1]
38 | replace_emoji(x)
39 | replace_emoji_identifier(x)
40 | }
41 | \keyword{emoji}
42 | 


--------------------------------------------------------------------------------
/man/replace_emoticon.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/replace_emoticon.R
 3 | \name{replace_emoticon}
 4 | \alias{replace_emoticon}
 5 | \title{Replace Emoticons With Words}
 6 | \usage{
 7 | replace_emoticon(x, emoticon_dt = lexicon::hash_emoticons, ...)
 8 | }
 9 | \arguments{
10 | \item{x}{The text variable.}
11 | 
12 | \item{emoticon_dt}{A \pkg{data.table} of emoticons (graphical representations)
13 | and corresponding word meanings.}
14 | 
15 | \item{\ldots}{Other arguments passed to \code{.mgsub} (see
16 | \code{textclean:::.mgsub} for details).}
17 | }
18 | \value{
19 | Returns a vector of strings with emoticons replaced with word
20 | equivalents.
21 | }
22 | \description{
23 | Replaces emoticons with word equivalents.
24 | }
25 | \examples{
26 | x <- c(
27 |     paste(
28 |         "text from:", 
29 |         "http://www.webopedia.com/quick_ref/textmessageabbreviations_02.asp"
30 |     ),
31 |     "... understanding what different characters used in smiley faces mean:",
32 |     "The close bracket represents a sideways smile  )",
33 |     "Add in the colon and you have sideways eyes   :",
34 |     "Put them together to make a smiley face  :)",
35 |     "Use the dash -  to add a nose   :-)",
36 |     paste(
37 |         "Change the colon to a semi-colon ;", 
38 |         "and you have a winking face ;)  with a nose  ;-)"
39 |     ),
40 |     paste(
41 |         "Put a zero 0 (halo) on top and now you have a winking,", 
42 |         "smiling angel 0;) with a nose 0;-)"
43 |     ),
44 |     "Use the letter 8 in place of the colon for sunglasses 8-)",
45 |     "Use the open bracket ( to turn the smile into a frown  :-(",
46 |     "I have experience with using the xp emoticon"
47 | )
48 | 
49 | replace_emoticon(x)
50 | }
51 | \keyword{emoticon}
52 | 


--------------------------------------------------------------------------------
/man/replace_grade.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/replace_grade.R
 3 | \name{replace_grade}
 4 | \alias{replace_grade}
 5 | \title{Replace Grades With Words}
 6 | \usage{
 7 | replace_grade(x, grade_dt = lexicon::key_grade, ...)
 8 | }
 9 | \arguments{
10 | \item{x}{The text variable.}
11 | 
12 | \item{grade_dt}{A \pkg{data.table} of grades and corresponding word meanings.}
13 | 
14 | \item{\ldots}{ignored.}
15 | }
16 | \value{
17 | Returns a vector of strings with grades replaced with word
18 | equivalents.
19 | }
20 | \description{
21 | Replaces grades with word equivalents.
22 | }
23 | \examples{
24 | (text <- replace_grade(c(
25 |     "I give an A+",
26 |     "He deserves an F",
27 |     "It's C+ work",
28 |     "A poor example deserves a C!"
29 | )))
30 | }
31 | \keyword{grade}
32 | 


--------------------------------------------------------------------------------
/man/replace_hash.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/replace_hash.R
 3 | \name{replace_hash}
 4 | \alias{replace_hash}
 5 | \title{Replace Hashes}
 6 | \usage{
 7 | replace_hash(x, pattern = qdapRegex::grab("rm_hash"), replacement = "", ...)
 8 | }
 9 | \arguments{
10 | \item{x}{The text variable.}
11 | 
12 | \item{pattern}{Character time regex string to be matched in the given 
13 | character vector.}
14 | 
15 | \item{replacement}{A function to operate on the extracted matches or a 
16 | character string which is a replacement for the matched pattern.}
17 | 
18 | \item{\ldots}{ignored.}
19 | }
20 | \value{
21 | Returns a vector with hashes replaced.
22 | }
23 | \description{
24 | Replaces Twitter style hash tags (e.g., '#rstats').
25 | }
26 | \examples{
27 | x <- c("@hadley I like #rstats for #ggplot2 work.",
28 |     "Difference between #magrittr and #pipeR, both implement pipeline operators for #rstats: 
29 |         http://renkun.me/r/2014/07/26/difference-between-magrittr-and-pipeR.html @timelyportfolio",
30 |     "Slides from great talk: @ramnath_vaidya: Interactive slides from Interactive Visualization 
31 |         presentation #user2014. http://ramnathv.github.io/user2014-rcharts/#1"
32 | )
33 | 
34 | replace_hash(x)
35 | replace_hash(x, replacement = '<<HASH>>')
36 | replace_hash(x, replacement = '$3')
37 | 
38 | ## Replacement with a function
39 | replace_hash(x, 
40 |     replacement = function(x){
41 |         paste0('{{', gsub('^#', 'TOPIC: ', x), '}}')
42 |     }
43 | )
44 | }
45 | 


--------------------------------------------------------------------------------
/man/replace_html.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/replace_html.R
 3 | \name{replace_html}
 4 | \alias{replace_html}
 5 | \title{Replace HTML Markup}
 6 | \usage{
 7 | replace_html(x, symbol = TRUE, ...)
 8 | }
 9 | \arguments{
10 | \item{x}{The text variable.}
11 | 
12 | \item{symbol}{logical.  If code{TRUE} the symbols are retained with appropriate
13 | replacements.  If \code{FALSE} they are removed.}
14 | 
15 | \item{\ldots}{Ignored.}
16 | }
17 | \value{
18 | Returns a vector with HTML markup replaced.
19 | }
20 | \description{
21 | Replaces HTML markup.  The angle braces are removed and the HTML symbol 
22 | markup is replaced with equivalent symbols.
23 | }
24 | \details{
25 | Replacements for symbols are as follows:
26 | 
27 | \tabular{lr}{
28 |  \bold{html} \tab  \bold{symbol} \cr
29 |   &copy;   \tab (c) \cr
30 |   &reg;   \tab (r) \cr
31 |   &trade;   \tab tm \cr
32 |   &ldquo;   \tab " \cr
33 |   &rdquo;   \tab " \cr
34 |   &lsquo;   \tab ' \cr
35 |   &rsquo;   \tab ' \cr
36 |   &bull;   \tab - \cr
37 |   &middot;   \tab - \cr
38 |   &sdot;   \tab [] \cr
39 |   &ndash;   \tab - \cr
40 |   &mdash;   \tab - \cr
41 |   &cent;   \tab cents \cr
42 |   &pound;   \tab pounds \cr
43 |   &euro;   \tab euro \cr
44 |   &ne;   \tab != \cr
45 |   &frac12;   \tab half \cr
46 |   &frac14;   \tab quarter \cr
47 |   &frac34;   \tab three fourths \cr
48 |   &deg;   \tab degrees \cr
49 |   &larr;   \tab <- \cr
50 |   &rarr;   \tab -> \cr
51 |   &hellip;   \tab ... \cr
52 |   &nbsp;   \tab   \cr
53 |   &lt;   \tab < \cr
54 |   &gt;   \tab > \cr
55 |   &laquo; \tab << \cr
56 |   &raquo; \tab >> \cr
57 |   &amp;   \tab & \cr
58 |   &quot;   \tab " \cr
59 |   &apos;   \tab ' \cr
60 |   &yen;   \tab yen \cr
61 | }
62 | }
63 | \examples{
64 | x <- c(
65 |     "<bold>Random</bold> text with symbols: &nbsp; &lt; &gt; &amp; &quot; &apos;",
66 |     "<p>More text</p> &cent; &pound; &yen; &euro; &copy; &reg; &laquo; &raquo;"
67 | )
68 | 
69 | replace_html(x)
70 | replace_html(x, FALSE)
71 | replace_white(replace_html(x, FALSE))
72 | }
73 | \keyword{html}
74 | 


--------------------------------------------------------------------------------
/man/replace_incomplete.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/replace_incomplete.R
 3 | \name{replace_incomplete}
 4 | \alias{replace_incomplete}
 5 | \title{Denote Incomplete End Marks With "|"}
 6 | \usage{
 7 | replace_incomplete(x, replacement = "|", ...)
 8 | }
 9 | \arguments{
10 | \item{x}{The text variable.}
11 | 
12 | \item{replacement}{A string to replace incomplete punctuation marks with.}
13 | 
14 | \item{\dots}{ignored.}
15 | }
16 | \value{
17 | Returns a text variable (character sting) with incomplete sentence 
18 | marks (.., ..., .?, ..?, en & em dash etc.) replaced with "|".
19 | }
20 | \description{
21 | Replaces incomplete sentence end marks (.., ..., .?, ..?, en & em dash etc.)
22 | with \code{"|"}.
23 | }
24 | \examples{
25 | x <- c("the...",  "I.?", "you.", "threw..", "we?")
26 | replace_incomplete(x)
27 | replace_incomplete(x, '...')
28 | }
29 | \keyword{incomplete-sentence}
30 | 


--------------------------------------------------------------------------------
/man/replace_internet_slang.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/replace_internet_slang.R
 3 | \name{replace_internet_slang}
 4 | \alias{replace_internet_slang}
 5 | \title{Replace Internet Slang}
 6 | \usage{
 7 | replace_internet_slang(
 8 |   x,
 9 |   slang = paste0("\\\\b", lexicon::hash_internet_slang[[1]], "\\\\b"),
10 |   replacement = lexicon::hash_internet_slang[[2]],
11 |   ignore.case = TRUE,
12 |   ...
13 | )
14 | }
15 | \arguments{
16 | \item{x}{The text variable.}
17 | 
18 | \item{slang}{A vector of slang strings to replace.}
19 | 
20 | \item{replacement}{A vector of string to replace slang with.}
21 | 
22 | \item{ignore.case}{logical.  If \code{TRUE} the case of \code{slang} will be 
23 | ignored (replacement regardless of case).}
24 | 
25 | \item{\dots}{Other arguments passed to \code{\link[textclean]{replace_tokens}}.}
26 | }
27 | \value{
28 | Returns a vector with names replaced.
29 | }
30 | \description{
31 | Replaces Internet slang.
32 | }
33 | \examples{
34 | x <- c(
35 |     "Marc the n00b needs to RTFM otherwise ymmv.",
36 |     "TGIF and a big w00t!  The weekend is GR8!",
37 |     "Will will do it",
38 |     'w8...this PITA needs me to say LMGTFY...lmao.',
39 |     NA
40 | )
41 | 
42 | replace_internet_slang(x)
43 | replace_internet_slang(x, ignore.case = FALSE)
44 | replace_internet_slang(x, replacement = '<<SLANG>>')
45 | replace_internet_slang(
46 |     x, 
47 |     replacement = paste0('{{ ', lexicon::hash_internet_slang[[2]], ' }}')
48 | )
49 | }
50 | 


--------------------------------------------------------------------------------
/man/replace_kern.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/replace_kerning.R
 3 | \name{replace_kern}
 4 | \alias{replace_kern}
 5 | \title{Replace Kerned (Spaced) with No Space Version}
 6 | \usage{
 7 | replace_kern(x, ...)
 8 | }
 9 | \arguments{
10 | \item{x}{The text variable.}
11 | 
12 | \item{\ldots}{ignored.}
13 | }
14 | \value{
15 | Returns a vector with kern spaces removed.
16 | }
17 | \description{
18 | In typography kerning is the adjustment of spacing.  Often, in informal 
19 | writing, adding manual spaces (a form of kerning) coupled with all capital 
20 | letters is used for emphasis.  This tool looks for 3 or more consecutive 
21 | capital letters with spaces in between and removes the spaces.  Essentially, 
22 | the capitalized, kerned version is replaced with the word equivalent.
23 | }
24 | \examples{
25 | x <- c(
26 |     "Welcome to A I: the best W O R L D!",
27 |     "Hi I R is the B O M B for sure: we A G R E E indeed.",
28 |     "A sort C A T indeed!",
29 |     NA
30 | )
31 | 
32 | replace_kern(x)
33 | }
34 | \references{
35 | \url{https://stackoverflow.com/a/47438305/1000343}
36 | }
37 | \author{
38 | StackOverflow user @ctwheels
39 | }
40 | 


--------------------------------------------------------------------------------
/man/replace_misspelling.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/replace_misspelling.R
 3 | \name{replace_misspelling}
 4 | \alias{replace_misspelling}
 5 | \title{Replace Misspelled Words}
 6 | \usage{
 7 | replace_misspelling(x, ...)
 8 | }
 9 | \arguments{
10 | \item{x}{A character vector.}
11 | 
12 | \item{\ldots}{ignored..}
13 | }
14 | \value{
15 | Returns a vector of strings with misspellings replaced.
16 | }
17 | \description{
18 | Replace misspelled words with their most likely replacement.  This function 
19 | uses \pkg{hunspell} in the backend.  \pkg{hunspell}  must be installed in 
20 | order to use this feature.
21 | }
22 | \note{
23 | The function splits the string apart into tokens for speed
24 | optimization.  After the replacement occurs the strings are pasted back
25 | together.  The strings are not guaranteed to retain exact spacing of the
26 | original.
27 | }
28 | \examples{
29 | \dontrun{
30 | bad_string <- c("I cant spelll rigtt noow.", '', NA, 
31 |     'Thiss is aslo mispelled?', 'this is 6$ and 38 cents in back2back!')
32 | replace_misspelling(bad_string)
33 | }
34 | }
35 | \author{
36 | Surin Space and Tyler Rinker <tyler.rinker@gmail.com>.
37 | }
38 | 


--------------------------------------------------------------------------------
/man/replace_money.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/replace_money.R
 3 | \name{replace_money}
 4 | \alias{replace_money}
 5 | \title{Replace Money With Words}
 6 | \usage{
 7 | replace_money(
 8 |   x,
 9 |   pattern = "(-?)([$])([0-9,]+)(\\\\.\\\\d{2})?",
10 |   replacement = NULL,
11 |   ...
12 | )
13 | }
14 | \arguments{
15 | \item{x}{The text variable.}
16 | 
17 | \item{pattern}{Character money regex string to be matched in the given 
18 | character vector.}
19 | 
20 | \item{replacement}{A function to operate on the extracted matches or a 
21 | character string which is a replacement for the matched pattern.}
22 | 
23 | \item{\ldots}{ignored.}
24 | }
25 | \value{
26 | Returns a vector with the pattern replaced.
27 | }
28 | \description{
29 | Replaces money with word equivalents.
30 | }
31 | \examples{
32 | x <- c(
33 |     NA, 
34 |     '$3.16 into "three dollars, sixteen cents"', 
35 |     "-$20,333.18 too", 'fff'
36 | )
37 | 
38 | replace_money(x)
39 | replace_money(x, replacement = '<<MONEY>>')
40 | }
41 | 


--------------------------------------------------------------------------------
/man/replace_names.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/replace_names.R
 3 | \name{replace_names}
 4 | \alias{replace_names}
 5 | \title{Replace First/Last Names}
 6 | \usage{
 7 | replace_names(
 8 |   x,
 9 |   names = textclean::drop_element(gsub("(^.)(.*)", "\\\\U\\\\1\\\\L\\\\2",
10 |     c(lexicon::freq_last_names[[1]], lexicon::common_names), perl = TRUE),
11 |     "^([AIU]n|[TSD]o|H[ea]Pa|Oh)$"),
12 |   replacement = "",
13 |   ...
14 | )
15 | }
16 | \arguments{
17 | \item{x}{The text variable.}
18 | 
19 | \item{names}{A vector of names to replace.  This may be made more custom 
20 | through a vector provided from a named entity extractor.}
21 | 
22 | \item{replacement}{A string to replace names with.}
23 | 
24 | \item{\dots}{Other arguments passed to 
25 | \code{\link[textclean]{replace_tokens}}.}
26 | }
27 | \value{
28 | Returns a vector with names replaced.
29 | }
30 | \description{
31 | Replaces first/last names.
32 | }
33 | \examples{
34 | x <- c(
35 |     "Mary Smith is not here",
36 |     "Karen is not a nice person",
37 |     "Will will do it",
38 |     NA
39 | ) 
40 | 
41 | replace_names(x)
42 | replace_names(x, replacement = '<<NAME>>')
43 | }
44 | 


--------------------------------------------------------------------------------
/man/replace_non_ascii.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/replace_non_ascii.R
 3 | \name{replace_non_ascii}
 4 | \alias{replace_non_ascii}
 5 | \alias{replace_non_ascii2}
 6 | \alias{replace_curly_quote}
 7 | \title{Replace Common Non-ASCII Characters}
 8 | \usage{
 9 | replace_non_ascii(x, replacement = "", remove.nonconverted = TRUE, ...)
10 | 
11 | replace_non_ascii2(x, replacement = "", ...)
12 | 
13 | replace_curly_quote(x, ...)
14 | }
15 | \arguments{
16 | \item{x}{The text variable.}
17 | 
18 | \item{replacement}{Character string equal in length to pattern or of length 
19 | one which are a replacement for matched pattern.}
20 | 
21 | \item{remove.nonconverted}{logical.  If \code{TRUE} unmapped encodings are
22 | deleted from the string.}
23 | 
24 | \item{\dots}{ignored.}
25 | }
26 | \value{
27 | Returns a text variable (character sting) with non-ASCII characters 
28 | replaced.
29 | }
30 | \description{
31 | \code{replace_non_ascii} - Replaces common non-ASCII characters.
32 | 
33 | \code{place_non_ascii2} - Replaces all non-ASCII (defined as \code{'[^ -~]+'}).  
34 | This provides a subset of functionality found in \code{replace_non_ascii} that
35 | is faster and likely less accurate.
36 | 
37 | \code{replace_curly_quote} - Replaces curly single and double quotes.  This 
38 | provides a subset of functionality found in \code{replace_non_ascii} specific 
39 | to quotes.
40 | }
41 | \examples{
42 | x <- c(
43 |     "Hello World", "6 Ekstr\xf8m", "J\xf6reskog", "bi\xdfchen Z\xfcrcher",
44 |     'This is a \xA9 but not a \xAE', '6 \xF7 2 = 3', 
45 |     'fractions \xBC, \xBD, \xBE', 'cows go \xB5', '30\xA2'
46 | )
47 | Encoding(x) <- "latin1"
48 | x
49 | 
50 | replace_non_ascii(x)
51 | replace_non_ascii(x, remove.nonconverted = FALSE)
52 | 
53 | z <- '\x95He said, \x93Gross, I am going to!\x94'
54 | Encoding(z) <- "latin1"
55 | z
56 | 
57 | replace_curly_quote(z)
58 | replace_non_ascii(z)
59 | }
60 | \keyword{ascii}
61 | 


--------------------------------------------------------------------------------
/man/replace_number.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/replace_number.R
 3 | \name{replace_number}
 4 | \alias{replace_number}
 5 | \alias{as_ordinal}
 6 | \title{Replace Numbers With Text Representation}
 7 | \usage{
 8 | replace_number(x, num.paste = FALSE, remove = FALSE, ...)
 9 | 
10 | as_ordinal(x, ...)
11 | }
12 | \arguments{
13 | \item{x}{The text variable.}
14 | 
15 | \item{num.paste}{logical.  If \code{FALSE} the elements of larger numbers are 
16 | separated with spaces.  If \code{TRUE} the elements will be joined without 
17 | spaces.}
18 | 
19 | \item{remove}{logical.  If \code{TRUE} numbers are removed from the text.}
20 | 
21 | \item{\ldots}{Other arguments passed to  \code{\link[english]{as.english}}}
22 | }
23 | \value{
24 | Returns a vector with numbers replaced.
25 | }
26 | \description{
27 | \code{replace_number} - Replaces numeric represented numbers with words 
28 | (e.g., 1001 becomes one thousand one).
29 | 
30 | \code{as_ordinal} - A convenience wrapper for \code{english::ordinal} that 
31 | takes integers and converts them to ordinal form.
32 | }
33 | \note{
34 | The user may want to use \code{\link[textclean]{replace_ordinal}} 
35 | first to remove ordinal number notation.  For example 
36 | \code{\link[textclean]{replace_number}} would turn "21st" into 
37 | "twenty onest", whereas \code{\link[textclean]{replace_ordinal}} would 
38 | generate "twenty first".
39 | }
40 | \examples{
41 | x <- c(
42 |     NA, 
43 |     'then .456 good', 
44 |     'none', 
45 |     "I like 346,457 ice cream cones.", 
46 |     "I like 123456789 cashes.",     
47 |     "They are 99 percent good and 45678.2345667"
48 | )
49 | replace_number(x)
50 | replace_number(x, num.paste = TRUE)
51 | replace_number(x, remove=TRUE)
52 | \dontrun{
53 | library(textclean)
54 | hunthou <- replace_number(seq_len(1e5)) 
55 | 
56 | textclean::mgsub(
57 |     "'twenty thousand three hundred five' into 20305", 
58 |     hunthou, 
59 |     seq_len(1e5)
60 | )
61 | ## "'20305' into 20305"
62 | 
63 | ## Larger example from: https://stackoverflow.com/q/18332463/1000343
64 | ## A slower approach
65 | fivehunthou <- replace_number(seq_len(5e5)) 
66 | 
67 | testvect <- c("fifty seven", "four hundred fifty seven", 
68 |     "six thousand four hundred fifty seven", 
69 |     "forty six thousand four hundred fifty seven", 
70 |     "forty six thousand four hundred fifty seven", 
71 |     "three hundred forty six thousand four hundred fifty seven"
72 | )
73 | 
74 | textclean::mgsub(testvect, fivehunthou, seq_len(5e5))
75 | 
76 | as_ordinal(1:10)
77 | textclean::mgsub('I want to be 1 in line', 1:10, as_ordinal(1:10))
78 | }
79 | }
80 | \references{
81 | Fox, J. (2005). Programmer's niche: How do you spell that number? 
82 | R News. Vol. 5(1), pp. 51-55.
83 | }
84 | \keyword{number-to-word}
85 | 


--------------------------------------------------------------------------------
/man/replace_ordinal.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/replace_ordinal.R
 3 | \name{replace_ordinal}
 4 | \alias{replace_ordinal}
 5 | \title{Replace Mixed Ordinal Numbers With Text Representation}
 6 | \usage{
 7 | replace_ordinal(x, num.paste = FALSE, remove = FALSE, ...)
 8 | }
 9 | \arguments{
10 | \item{x}{The text variable.}
11 | 
12 | \item{num.paste}{logical.  If \code{TRUE} a the elements of larger numbers are 
13 | separated with spaces.  If \code{FALSE} the elements will be joined without 
14 | spaces.}
15 | 
16 | \item{remove}{logical.  If \code{TRUE} ordinal numbers are removed from the text.}
17 | 
18 | \item{\ldots}{ignored.}
19 | }
20 | \description{
21 | Replaces mixed text/numeric represented ordinal numbers with words (e.g., 
22 | "1st" becomes "first").
23 | }
24 | \note{
25 | Currently only implemented for ordinal values 1 through 100
26 | }
27 | \examples{
28 | x <- c(
29 |     "I like the 1st one not the 22nd one.", 
30 |     "For the 100th time stop!"
31 | )
32 | replace_ordinal(x)
33 | replace_ordinal(x, TRUE)
34 | replace_ordinal(x, remove = TRUE)
35 | replace_number(replace_ordinal("I like the 1st 1 not the 22nd 1."))
36 | }
37 | \keyword{ordinal-to-word}
38 | 


--------------------------------------------------------------------------------
/man/replace_rating.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/replace_rating.R
 3 | \name{replace_rating}
 4 | \alias{replace_rating}
 5 | \title{Replace Ratings With Words}
 6 | \usage{
 7 | replace_rating(x, rating_dt = lexicon::key_rating, ...)
 8 | }
 9 | \arguments{
10 | \item{x}{The text variable.}
11 | 
12 | \item{rating_dt}{A \pkg{data.table} of ratings and corresponding word meanings.}
13 | 
14 | \item{\ldots}{ignored.}
15 | }
16 | \value{
17 | Returns a vector of strings with ratings replaced with word
18 | equivalents.
19 | }
20 | \description{
21 | Replaces ratings with word equivalents.
22 | }
23 | \examples{
24 | x <- c("This place receives 5 stars for their APPETIZERS!!!",
25 |      "Four stars for the food & the guy in the blue shirt for his great vibe!",
26 |      "10 out of 10 for both the movie and trilogy.",
27 |      "* Both the Hot & Sour & the Egg Flower Soups were absolutely 5 Stars!",
28 |      "For service, I give them no stars.", "This place deserves no stars.",
29 |      "10 out of 10 stars.",
30 |      "My rating: just 3 out of 10.",
31 |      "If there were zero stars I would give it zero stars.",
32 |      "Rating: 1 out of 10.",
33 |      "I gave it 5 stars because of the sound quality.",
34 |      "If it were possible to give them 0/10, they'd have it."
35 | )
36 | 
37 | replace_rating(x)
38 | }
39 | \keyword{rating}
40 | 


--------------------------------------------------------------------------------
/man/replace_symbol.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/replace_symbol.R
 3 | \name{replace_symbol}
 4 | \alias{replace_symbol}
 5 | \title{Replace Symbols With Word Equivalents}
 6 | \usage{
 7 | replace_symbol(
 8 |   x,
 9 |   dollar = TRUE,
10 |   percent = TRUE,
11 |   pound = TRUE,
12 |   at = TRUE,
13 |   and = TRUE,
14 |   with = TRUE,
15 |   ...
16 | )
17 | }
18 | \arguments{
19 | \item{x}{A character vector.}
20 | 
21 | \item{dollar}{logical.  If \code{TRUE} replaces dollar sign ($) with 
22 | \code{"dollar"}.}
23 | 
24 | \item{percent}{logical.  If \code{TRUE} replaces percent sign (\%) with 
25 | \code{"percent"}.}
26 | 
27 | \item{pound}{logical.  If \code{TRUE} replaces pound sign (#) with 
28 | \code{"number"}.}
29 | 
30 | \item{at}{logical.  If \code{TRUE} replaces at sign (@) with \code{"at"}.}
31 | 
32 | \item{and}{logical.  If \code{TRUE} replaces and sign (&) with \code{"and"}.}
33 | 
34 | \item{with}{logical.  If \code{TRUE} replaces with sign (w/) with 
35 | \code{"with"}.}
36 | 
37 | \item{\ldots}{ignored.}
38 | }
39 | \value{
40 | Returns a character vector with symbols replaced..
41 | }
42 | \description{
43 | This function replaces symbols with word equivalents (e.g., \code{@} becomes 
44 | \code{"at"}.
45 | }
46 | \examples{
47 | x <- c("I am @ Jon's & Jim's w/ Marry", 
48 |     "I owe $41 for food", 
49 |     "two is 10\% of a #"
50 | )
51 | replace_symbol(x)
52 | }
53 | \keyword{symbol-replace}
54 | 


--------------------------------------------------------------------------------
/man/replace_tag.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/replace_tag.R
 3 | \name{replace_tag}
 4 | \alias{replace_tag}
 5 | \title{Replace Handle Tags}
 6 | \usage{
 7 | replace_tag(x, pattern = qdapRegex::grab("rm_tag"), replacement = "", ...)
 8 | }
 9 | \arguments{
10 | \item{x}{The text variable.}
11 | 
12 | \item{pattern}{Character time regex string to be matched in the given 
13 | character vector.}
14 | 
15 | \item{replacement}{A function to operate on the extracted matches or a 
16 | character string which is a replacement for the matched pattern.}
17 | 
18 | \item{\ldots}{ignored.}
19 | }
20 | \value{
21 | Returns a vector with tags replaced.
22 | }
23 | \description{
24 | Replaces Twitter style handle tags (e.g., '@trinker').
25 | }
26 | \examples{
27 | x <- c("@hadley I like #rstats for #ggplot2 work.",
28 |     "Difference between #magrittr and #pipeR, both implement pipeline operators for #rstats: 
29 |         http://renkun.me/r/2014/07/26/difference-between-magrittr-and-pipeR.html @timelyportfolio",
30 |     "Slides from great talk: @ramnath_vaidya: Interactive slides from Interactive Visualization 
31 |         presentation #user2014. http://ramnathv.github.io/user2014-rcharts/#1"
32 | )
33 | 
34 | replace_tag(x)
35 | replace_tag(x, replacement = '<<TAG>>')
36 | replace_tag(x, replacement = '$3')
37 | 
38 | ## Replacement with a function
39 | replace_tag(x, 
40 |     replacement = function(x){
41 |         gsub('@', ' <<TO>> ', x)
42 |     }
43 | )
44 | }
45 | 


--------------------------------------------------------------------------------
/man/replace_time.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/replace_time.R
 3 | \name{replace_time}
 4 | \alias{replace_time}
 5 | \title{Replace Time Stamps With Words}
 6 | \usage{
 7 | replace_time(
 8 |   x,
 9 |   pattern = "(2[0-3]|[01]?[0-9]):([0-5][0-9])[.:]?([0-5]?[0-9])?",
10 |   replacement = NULL,
11 |   ...
12 | )
13 | }
14 | \arguments{
15 | \item{x}{The text variable.}
16 | 
17 | \item{pattern}{Character time regex string to be matched in the given 
18 | character vector.}
19 | 
20 | \item{replacement}{A function to operate on the extracted matches or a 
21 | character string which is a replacement for the matched pattern.}
22 | 
23 | \item{\ldots}{ignored.}
24 | }
25 | \value{
26 | Returns a vector with the pattern replaced.
27 | }
28 | \description{
29 | Replaces time stamps with word equivalents.
30 | }
31 | \examples{
32 | x <- c(
33 |     NA, '12:47 to "twelve forty-seven" and also 8:35:02', 
34 |     'what about 14:24.5', 'And then 99:99:99?'
35 | )
36 | 
37 | ## Textual: Word version
38 | replace_time(x)
39 | 
40 | ## Normalization: <<TIME>>
41 | replace_time(x, replacement = '<<TIME>>')
42 | 
43 | ## Normalization: hh:mm:ss or hh:mm
44 | replace_time(x, replacement = function(y){
45 |         z <- unlist(strsplit(y, '[:.]'))
46 |         z[1] <- 'hh'
47 |         z[2] <- 'mm'
48 |         if(!is.na(z[3])) z[3] <- 'ss'
49 |         glue_collapse(z, ':')
50 |     }
51 | )
52 | 
53 | ## Textual: Word version (forced seconds)
54 | replace_time(x, replacement = function(y){
55 |         z <- replace_number(unlist(strsplit(y, '[:.]')))
56 |         z[3] <- paste0('and ', ifelse(is.na(z[3]), '0', z[3]), ' seconds')
57 |         paste(z, collapse = ' ')
58 |     }
59 | )
60 |  
61 | ## Normalization: hh:mm:ss
62 | replace_time(x, replacement = function(y){
63 |         z <- unlist(strsplit(y, '[:.]'))
64 |         z[1] <- 'hh'
65 |         z[2] <- 'mm'
66 |         z[3] <- 'ss'
67 |         glue_collapse(z, ':')
68 |     }
69 | )
70 | }
71 | 


--------------------------------------------------------------------------------
/man/replace_to.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/replace_to.R
 3 | \name{replace_to}
 4 | \alias{replace_to}
 5 | \alias{replace_from}
 6 | \title{Grab Begin/End of String to/from Character}
 7 | \usage{
 8 | replace_to(x, char = " ", n = 1, include = FALSE, ...)
 9 | 
10 | replace_from(x, char = " ", n = 1, include = FALSE, ...)
11 | }
12 | \arguments{
13 | \item{x}{A character string}
14 | 
15 | \item{char}{The character from which to grab until/from.}
16 | 
17 | \item{n}{Number of times the character appears before the grab.}
18 | 
19 | \item{include}{logical.  If \code{TRUE} includes the character in the grab.}
20 | 
21 | \item{\ldots}{ignored.}
22 | }
23 | \value{
24 | returns a vector of text with begin/end of string to/from character removed.
25 | }
26 | \description{
27 | \code{replace_to} - Grab from beginning of string to a character(s).
28 | 
29 | \code{replace_from} - Grab from character(s) to end of string.
30 | }
31 | \examples{
32 | \dontrun{
33 | x <- c("a_b_c_d", "1_2_3_4", "<_?_._:")
34 | replace_to(x, "_")
35 | replace_to(x, "_", 2)
36 | replace_to(x, "_", 3)
37 | replace_to(x, "_", 4)
38 | replace_to(x, "_", 3, include=TRUE)
39 | 
40 | replace_from(x, "_")
41 | replace_from(x, "_", 2)
42 | replace_from(x, "_", 3)
43 | replace_from(x, "_", 4)
44 | replace_from(x, "_", 3, include=TRUE)
45 | 
46 | x2 <- gsub("_", " ", x)
47 | replace_from(x2, " ", 2)
48 | replace_to(x2, " ", 2)
49 | 
50 | x3 <- gsub("_", "\\\\^", x)
51 | replace_from(x3, "^", 2)
52 | replace_to(x3, "^", 2)
53 | 
54 | x4 <- c("_a_b", "a__b")
55 | replace_from(x4, "_", 1)
56 | replace_to(x4, "_", 1)
57 | }
58 | }
59 | \references{
60 | \url{https://stackoverflow.com/q/15909626/1000343}
61 | }
62 | \author{
63 | Josh O'Brien and Tyler Rinker <tyler.rinker@gmail.com>.
64 | }
65 | 


--------------------------------------------------------------------------------
/man/replace_tokens.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/replace_tokens.R
 3 | \name{replace_tokens}
 4 | \alias{replace_tokens}
 5 | \title{Replace Tokens}
 6 | \usage{
 7 | replace_tokens(x, tokens, replacement = NULL, ignore.case = FALSE, ...)
 8 | }
 9 | \arguments{
10 | \item{x}{A character vector.}
11 | 
12 | \item{tokens}{A vector of token to be replaced.}
13 | 
14 | \item{replacement}{A single character string to replace the tokens with.
15 | The default, \code{NULL}, replaces the tokens with nothing.}
16 | 
17 | \item{ignore.case}{logical.  If \code{TRUE} the case of the tokens will 
18 | be ignored.}
19 | 
20 | \item{\ldots}{ignored.}
21 | }
22 | \value{
23 | Returns a vector of strings with tokens replaced.
24 | }
25 | \description{
26 | Replace tokens with a single substring.  This is much faster than 
27 | \code{\link[textclean]{mgsub}} if one wants to replace fixed tokens
28 | with a single value or remove them all together.  This can be useful
29 | for quickly replacing tokens like names in string with a single
30 | value in order to reduce noise.
31 | }
32 | \note{
33 | The function splits the string apart into tokens for speed
34 | optimization.  After the replacement occurs the strings are pasted back
35 | together.  The strings are not guaranteed to retain exact spacing of the
36 | original.
37 | }
38 | \examples{
39 | replace_tokens(DATA$state, c('No', 'what', "it's"))
40 | replace_tokens(DATA$state, c('No', 'what', "it's"), "<<TOKEN>>")
41 | replace_tokens(
42 |     DATA$state, 
43 |     c('No', 'what', "it's"), 
44 |     "<<TOKEN>>", 
45 |     ignore.case = TRUE
46 | )
47 | 
48 | \dontrun{
49 | ## Now let's see the speed
50 | ## Set up data
51 | library(textshape)
52 | data(hamlet)
53 | set.seed(11)
54 | tokens <- sample(unique(unlist(split_token(hamlet$dialogue))), 2000)
55 | 
56 | tic <- Sys.time()
57 | head(replace_tokens(hamlet$dialogue, tokens))
58 | (toc <- Sys.time() - tic)
59 | 
60 | 
61 | tic <- Sys.time()
62 | head(mgsub(hamlet$dialogue, tokens, ""))
63 | (toc <- Sys.time() - tic)
64 | 
65 | 
66 | ## Amp it up 20x more data
67 | tic <- Sys.time()
68 | head(replace_tokens(rep(hamlet$dialogue, 20), tokens))
69 | (toc <- Sys.time() - tic)
70 | 
71 | ## Replace names example
72 | 
73 | library(lexicon)
74 | library(textshape)
75 | nms <- gsub("(^.)(.*)", "\\\\U\\\\1\\\\L\\\\2", common_names, perl = TRUE)
76 | x <- split_portion(
77 |     sample(c(sample(grady_augmented, 5000), sample(nms, 10000, TRUE))), 
78 |     n.words = 12
79 | )
80 | x$text.var <- paste0(
81 |     x$text.var, 
82 |     sample(c('.', '!', '?'), length(x$text.var), TRUE)
83 |  )
84 | replace_tokens(x$text.var, nms, 'NAME')
85 | }
86 | }
87 | \seealso{
88 | \code{\link[textclean]{mgsub}}, \code{\link[textclean]{match_tokens}}
89 | }
90 | 


--------------------------------------------------------------------------------
/man/replace_url.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/replace_url.R
 3 | \name{replace_url}
 4 | \alias{replace_url}
 5 | \title{Replace URLs}
 6 | \usage{
 7 | replace_url(x, pattern = qdapRegex::grab("rm_url"), replacement = "", ...)
 8 | }
 9 | \arguments{
10 | \item{x}{The text variable.}
11 | 
12 | \item{pattern}{Character time regex string to be matched in the given 
13 | character vector.}
14 | 
15 | \item{replacement}{A function to operate on the extracted matches or a 
16 | character string which is a replacement for the matched pattern.}
17 | 
18 | \item{\ldots}{ignored.}
19 | }
20 | \value{
21 | Returns a vector with URLs replaced.
22 | }
23 | \description{
24 | Replaces URLs.
25 | }
26 | \examples{
27 | x <- c("@hadley I like #rstats for #ggplot2 work. ftp://cran.r-project.org/incoming/",
28 |     "Difference between #magrittr and #pipeR, both implement pipeline operators for #rstats: 
29 |         http://renkun.me/r/2014/07/26/difference-between-magrittr-and-pipeR.html @timelyportfolio",
30 |     "Slides from great talk: @ramnath_vaidya: Interactive slides from Interactive Visualization 
31 |         presentation #user2014. https://ramnathv.github.io/user2014-rcharts/#1",
32 |     NA 
33 | )
34 | 
35 | replace_url(x)
36 | replace_url(x, replacement = '<<URL>>')
37 | 
38 | \dontrun{
39 | ## Replacement with a function
40 | library(urltools)
41 | replace_url(x, 
42 |     replacement = function(x){
43 |         sprintf('{{\%s}}', urltools::url_parse(x)$domain)
44 |     }
45 | )
46 | }
47 | }
48 | 


--------------------------------------------------------------------------------
/man/replace_white.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/replace_white.R
 3 | \name{replace_white}
 4 | \alias{replace_white}
 5 | \title{Remove White Space Characters}
 6 | \usage{
 7 | replace_white(x, ...)
 8 | }
 9 | \arguments{
10 | \item{x}{The character vector.}
11 | 
12 | \item{\dots}{ignored.}
13 | }
14 | \value{
15 | Returns a vector of character strings with escaped characters removed.
16 | }
17 | \description{
18 | Pre-process data to replace one or more white space character with a single 
19 | space (this includes new line characters).
20 | }
21 | \examples{
22 | x <- "I go \r
23 |     to   the \tnext line"
24 | x
25 | replace_white(x)
26 | }
27 | \keyword{character}
28 | \keyword{escaped}
29 | 


--------------------------------------------------------------------------------
/man/replace_word_elongation.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/replace_word_elongation.R
 3 | \name{replace_word_elongation}
 4 | \alias{replace_word_elongation}
 5 | \title{Replace Word Elongations}
 6 | \usage{
 7 | replace_word_elongation(
 8 |   x,
 9 |   impart.meaning = FALSE,
10 |  
11 |     elongation.search.pattern = "(?i)(?:^|\\\\b)\\\\w+([a-z])(\\\\1{2,})\\\\w*(?:$|\\\\b)",
12 |   conservative = FALSE,
13 |   elongation.pattern = sprintf("([a-z])(\\\\1{\%s,})", as.integer(conservative) + 1),
14 |   ...
15 | )
16 | }
17 | \arguments{
18 | \item{x}{The text variable.}
19 | 
20 | \item{impart.meaning}{logical.  If \code{TRUE}, known elongation semantics
21 | are used as replacements (see \code{textclean:::meaning_elongations} for 
22 | known elongation semantics and replacements).}
23 | 
24 | \item{elongation.search.pattern}{The elongation pattern to search for.  The default
25 | only considers a repeat of \code{'[A-Za-z]'} within a "word" that is bounded
26 | by a word boundary or the beginning or end of the string and contains only
27 | \code{'\\w'} characters.  This means "words" with non-ASCII characters will 
28 | not be considered.}
29 | 
30 | \item{conservative}{By default the \code{elongation.search.pattern} will find3 or 
31 | more of the same character in a row after in initial word character as the 
32 | starting boundary to pull out words with 3 or more of the same character in a 
33 | row.  You can choose to replace all letters that appear 3 or more times in a 
34 | row with the single character replacement (conservative) or any letters that 
35 | appear 2 or more times in a row (not conservative).  This is most important in 
36 | words that can contain two of the same letter as the correct spelling that 
37 | would not be found in the canonical lookup table.  For example 'Lookkkkkk!'
38 | is in the lookup table and would be corrected to 'Look!' regardless, while
39 | the workd 'mook' (that is then elongated into the word 'Mookkkkkk') would not 
40 | be found in the lookup table.}
41 | 
42 | \item{elongation.pattern}{The actual pattern used for replacement.  We use a 
43 | search pattern and then this pattern with the assumption that an elongated 
44 | word must have 3 or more letters in a row but often these elongations can 
45 | also contain 2 or more letters in a row as well.}
46 | 
47 | \item{\ldots}{ignored.}
48 | }
49 | \value{
50 | Returns a vector with word elongations replaced.
51 | }
52 | \description{
53 | In informal writing people may use a form of text embellishment to emphasize 
54 | or alter word meanings called elongation (a.k.a. "word lengthening").  For 
55 | example, the use of "Whyyyyy" conveys frustration.  Other times the usage may 
56 | be to be more sexy (e.g., "Heyyyy there").  Other times it may be used for 
57 | emphasis (e.g., "This is so gooood").  This function uses an augmented form
58 | of Armstrong & Fogarty's (2007) algorithm.  The algorithm first attempts to
59 | replace the elongation with known semantic replacements (optional; default is
60 | \code{FALSE}).  After this the algorithm locates all places where the same 
61 | letter (case insensitive) appears 3 times consecutively.  These elements are
62 | then further processed.  The matches are replaced via \code{fgsub} by first
63 | taking the elongation to it's canonical form (drop all > 1 consecutive 
64 | letters to a single letter) and then replacing with the most common word 
65 | used in 2008 in Google's ngram data set that takes the canonical form.  If 
66 | the canonical form is not found in the Google data set then the canonical 
67 | form is used as the replacement.
68 | }
69 | \examples{
70 | x <- c('look', 'noooooo!', 'real coooool!', "it's sooo goooood", 'fsdfds', 
71 |     'fdddf', 'as', "aaaahahahahaha", "aabbccxccbbaa", 'I said heyyy!',
72 |     "I'm liiiike whyyyyy me?", "WwwhhaTttt!", NA)
73 | 
74 | replace_word_elongation(x)                      #Look at "WwwhhaTttt!" as "what!"
75 | replace_word_elongation(x, conservative = TRUE) #Look at "WwwhhaTttt!" as "whhat!"
76 | replace_word_elongation(x, impart.meaning = TRUE)
77 | replace_word_elongation(c('online mookkkkk!', "WwwhhaTttt!"))
78 | replace_word_elongation(c('online mookkkkk!', "WwwhhaTttt!"), conservative = TRUE)
79 | }
80 | \references{
81 | Armstrong, D. B., Fogarty, G. J., & Dingsdag, D. (2007). Scales measuring 
82 | characteristics of small business information systems. Proceedings of the 
83 | 2011 Conference on Empirical Methods in Natural Language Processing (pp. 
84 | 562-570). Edinburgh, Scotland. Retrieved from 
85 | http://www.aclweb.org/anthology/D11-1052 \cr \cr
86 | \url{https://storage.googleapis.com/books/ngrams/books/datasetsv2.html} \cr \cr
87 | \url{https://www.theatlantic.com/magazine/archive/2013/03/dragging-it-out/309220} \cr \cr
88 | \url{https://english.stackexchange.com/questions/189517/is-there-a-name-term-for-multiplied-vowels}
89 | }
90 | 


--------------------------------------------------------------------------------
/man/strip.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/strip.R
 3 | \name{strip}
 4 | \alias{strip}
 5 | \alias{strip.character}
 6 | \alias{strip.factor}
 7 | \alias{strip.default}
 8 | \alias{strip.list}
 9 | \title{Strip Text}
10 | \usage{
11 | strip(
12 |   x,
13 |   char.keep = "~~",
14 |   digit.remove = TRUE,
15 |   apostrophe.remove = FALSE,
16 |   lower.case = TRUE
17 | )
18 | 
19 | \method{strip}{character}(
20 |   x,
21 |   char.keep = "~~",
22 |   digit.remove = TRUE,
23 |   apostrophe.remove = FALSE,
24 |   lower.case = TRUE
25 | )
26 | 
27 | \method{strip}{factor}(
28 |   x,
29 |   char.keep = "~~",
30 |   digit.remove = TRUE,
31 |   apostrophe.remove = TRUE,
32 |   lower.case = TRUE
33 | )
34 | 
35 | \method{strip}{default}(
36 |   x,
37 |   char.keep = "~~",
38 |   digit.remove = TRUE,
39 |   apostrophe.remove = TRUE,
40 |   lower.case = TRUE
41 | )
42 | 
43 | \method{strip}{list}(
44 |   x,
45 |   char.keep = "~~",
46 |   digit.remove = TRUE,
47 |   apostrophe.remove = TRUE,
48 |   lower.case = TRUE
49 | )
50 | }
51 | \arguments{
52 | \item{x}{The text variable.}
53 | 
54 | \item{char.keep}{A character vector of symbols (i.e., punctuation) that 
55 | \code{\link[textclean]{strip}} should keep.  The default is to strip every 
56 | symbol except apostrophes and a double tilde \code{"~~"}.  The double tilde 
57 | \code{"~~"} is included for a convenient means of keeping word groups 
58 | together in functions that split text apart based on spaces.  To remove 
59 | double tildes \code{"~~"} set \code{char.keep} to \code{NULL}.}
60 | 
61 | \item{digit.remove}{logical.  If \code{TRUE} strips digits from the text.}
62 | 
63 | \item{apostrophe.remove}{logical.  If \code{TRUE} removes apostrophes from 
64 | the output.}
65 | 
66 | \item{lower.case}{logical.  If \code{TRUE} forces all alpha characters to 
67 | lower case.}
68 | }
69 | \value{
70 | Returns a vector of text that has been stripped of unwanted 
71 | characters.
72 | }
73 | \description{
74 | Strip text of unwanted characters.
75 | }
76 | \examples{
77 | \dontrun{
78 | DATA$state #no strip applied
79 | strip(DATA$state)
80 | strip(DATA$state, apostrophe.remove=TRUE)
81 | strip(DATA$state, char.keep = c("?", "."))
82 | }
83 | }
84 | 


--------------------------------------------------------------------------------
/man/sub_holder.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/sub_holder.R
 3 | \name{sub_holder}
 4 | \alias{sub_holder}
 5 | \title{Hold the Place of Characters Prior to Subbing}
 6 | \usage{
 7 | sub_holder(
 8 |   x,
 9 |   pattern,
10 |   alpha.type = TRUE,
11 |   holder.prefix = "zzzplaceholder",
12 |   holder.suffix = "zzz",
13 |   ...
14 | )
15 | }
16 | \arguments{
17 | \item{x}{A character vector.}
18 | 
19 | \item{pattern}{Character string to be matched in the given character vector.}
20 | 
21 | \item{alpha.type}{logical.  If \code{TRUE} alpha (lower case letters) are 
22 | used for the key.  If \code{FALSE} numbers are used as the key.}
23 | 
24 | \item{holder.prefix}{The prefix to use before the alpha key in the palce 
25 | holder when \code{alpha.type = TRUE}; this ensures uniqueness.}
26 | 
27 | \item{holder.suffix}{The suffix to use after the alpha key in the palce 
28 | holder when \code{alpha.type = TRUE}; this ensures uniqueness.}
29 | 
30 | \item{\dots}{Additional arguments passed to \code{\link[base]{gsub}}.}
31 | }
32 | \value{
33 | Returns a list with the following:
34 | \item{output}{keyed place holder character vector} 
35 | \item{unhold}{A function used to revert back to the original values}
36 | }
37 | \description{
38 | This function holds the place for particular character values, allowing the 
39 | user to manipulate the vector and then revert the place holders back to the 
40 | original values.
41 | }
42 | \note{
43 | The \code{unhold} function for \code{sub_holder} will only work on keys
44 | that have not been disturbed by subsequent alterations.  The key follows the 
45 | pattern of holder.prefix (`zzzplaceholder`) followed by lower case letter 
46 | keys followed by holder.suffix (`zzz`) when \code{alpha.type = TRUE}, 
47 | otherwise the holder is numeric.
48 | }
49 | \examples{
50 | ## `alpha.type` as TRUE
51 | library(lexicon); library(textshape)
52 | (fake_dat <- paste(hash_emoticons[1:11, 1, with=FALSE][[1]], DATA$state))
53 | (m <- sub_holder(fake_dat, hash_emoticons[[1]]))
54 | m$unhold(strip(m$output))
55 | 
56 | ## `alpha.type` as FALSE (numeric keys)
57 | vowels <- LETTERS[c(1, 5, 9, 15, 21)]
58 | (m2 <- sub_holder(toupper(DATA$state), vowels, alpha.type = FALSE))
59 | m2$unhold(gsub("[^0-9]", "", m2$output))
60 | mtabulate(strsplit(m2$unhold(gsub("[^0-9]", "", m2$output)), ""))
61 | }
62 | 


--------------------------------------------------------------------------------
/man/swap.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/swap.R
 3 | \name{swap}
 4 | \alias{swap}
 5 | \title{Swap Two Patterns Simultaneously}
 6 | \usage{
 7 | swap(x, pattern1, pattern2, ...)
 8 | }
 9 | \arguments{
10 | \item{x}{A text variable.}
11 | 
12 | \item{pattern1}{Character string to be matched in the given character vector.
13 | This will be replaced by \code{pattern2}.}
14 | 
15 | \item{pattern2}{Character string to be matched in the given character vector.
16 | This will be replaced by \code{pattern1}.}
17 | 
18 | \item{\ldots}{ignored.}
19 | }
20 | \value{
21 | Returns a vector with patterns 1 & 2 swapped.
22 | }
23 | \description{
24 | Swap pattern x for pattern y and pattern y for pattern x in one fell swoop.
25 | }
26 | \examples{
27 | x <- c("hash_abbreviation", "hash_contractions", "hash_grade", "key_emoticons", 
28 |     "key_power", "key_sentiment", "key_sentiment_nrc", "key_strength", 
29 |     "key_syllable", "key_valence_shifters")
30 | 
31 | x
32 | swap(x, 'hash_', 'key_')
33 | }
34 | 


--------------------------------------------------------------------------------
/man/textclean.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/textclean-package.R
 3 | \docType{package}
 4 | \name{textclean}
 5 | \alias{textclean}
 6 | \alias{package-textclean}
 7 | \title{Text Cleaning Tools}
 8 | \description{
 9 | Tools to clean and process text.
10 | }
11 | 


--------------------------------------------------------------------------------
/man/which_are.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/check_text_logicals.R
 3 | \name{which_are}
 4 | \alias{which_are}
 5 | \alias{is_it}
 6 | \title{Detect/Locate Potential Non-Normalized Text}
 7 | \usage{
 8 | which_are()
 9 | 
10 | is_it()
11 | }
12 | \value{
13 | \code{which_are} returns an environment of functions that can be used to 
14 | locate and return the integer locations of the particular non-normalized text
15 | named by the function.
16 | 
17 | \code{is_it} returns an environment of functions that can be used to 
18 | detect and return a logical atomic vector of equal length to the input vector
19 | (except for meta functions) of the particular non-normalized text
20 | named by the function.
21 | }
22 | \description{
23 | Detect/Locate potential issues with text data.  This family of functions 
24 | generates a list of detections/location functions that can be accessed via 
25 | the dollar sign or square bracket operators.  Accessible functions include:
26 | }
27 | \details{
28 | \describe{
29 |   \item{contraction}{Contains contractions}
30 |   \item{date}{Contains dates}
31 |   \item{digit}{Contains digits}
32 |   \item{email}{Contains email addresses}
33 |   \item{emoticon}{Contains emoticons}
34 |   \item{empty}{Contains just white space}
35 |   \item{escaped}{Contains escaped backslash character}
36 |   \item{hash}{Contains Twitter style hash tags}
37 |   \item{html}{Contains html mark-up}
38 |   \item{incomplete}{Contains incomplete sentences (e.g., ends with ...)}
39 |   \item{kern}{Contains kerning (e.g. "The B O M B!")}
40 |   \item{list_column}{Is a list of atomic vectors (Not provided by \code{which_are}))}
41 |   \item{misspelled}{Contains potentially misspelled words}
42 |   \item{no_endmark}{Contains a sentence with no ending punctuation}
43 |   \item{no_space_after_comma}{Contains commas with no space after them}
44 |   \item{non_ascii}{Contains non-ASCII characters}
45 |   \item{non_character}{Is a non-character vector (Not provided by \code{which_are}))}
46 |   \item{non_split_sentence}{Contains non split sentences}
47 |   \item{tag}{Contains a Twitter style handle used to tag others (use of the at symbol)}
48 |   \item{time}{Contains a time stamp}
49 |   \item{url}{Contains a URL}
50 | }
51 | 
52 | The functions above that have a description starting with 'is' rather than 'contains'
53 | are meta functions that describe the attribute of the column/vector being passed
54 | rather than attributes about the individual elements of the column/vector.  The
55 | meta functions will return a logical of length one and are not available under
56 | \code{which_are}.
57 | }
58 | \examples{
59 | wa <- which_are()
60 | it <- is_it()
61 | wa$digit(c('The dog',  "I like 2", NA))
62 | it$digit(c('The dog',  "I like 2", NA))
63 | 
64 | is_it()$list_column(c('the dog', 'ate the chicken'))
65 | 
66 | }
67 | 


--------------------------------------------------------------------------------
/tests/testthat.R:
--------------------------------------------------------------------------------
1 | library("testthat")
2 | library("textclean")
3 | 
4 | test_check("textclean")


--------------------------------------------------------------------------------
/tests/testthat/test-replace_emoticon.R:
--------------------------------------------------------------------------------
 1 | context("Checking replace_emoticon")
 2 | library(lexicon)
 3 | 
 4 | test_that("replace_emoticon converts emoticons to words",{
 5 | 
 6 |     x <- lexicon::hash_emoticons[[1]][16]
 7 | 
 8 |     # expect_equal(replace_emoticon(x), " devilish ")
 9 | 
10 | })
11 | 
12 | 


--------------------------------------------------------------------------------
/tests/testthat/test-replace_grade.R:
--------------------------------------------------------------------------------
 1 | context("Checking replace_rating")
 2 | 
 3 | test_that("replace_rating replaces ratings",{
 4 | 
 5 |     x <- c(
 6 |         "I give an A+",
 7 |         "He deserves an F",
 8 |         "It's C+ work",
 9 |         "A poor example deserves a C!"
10 |     )
11 | 
12 |     x2 <- c("I give an very excellent excellent", 
13 |         "He deserves an very very bad",
14 |         "It's slightly above average work",
15 |         "A poor example deserves a average!"
16 |     )
17 | 
18 |     expect_equal(replace_grade(x), x2)
19 | 
20 | 
21 | })
22 | 
23 | 


--------------------------------------------------------------------------------
/tests/testthat/test-replace_rating.R:
--------------------------------------------------------------------------------
 1 | context("Checking replace_rating")
 2 | 
 3 | test_that("replace_rating replaces ratings",{
 4 | 
 5 |     x <- c("This place receives 5 stars for their APPETIZERS!!!",
 6 |          paste(
 7 |              "Four stars for the food & the guy in the blue", 
 8 |              "shirt for his great vibe!"
 9 |          ),
10 |          "10 out of 10 for both the movie and trilogy.",
11 |          paste(
12 |              "* Both the Hot & Sour & the Egg Flower", 
13 |              "Soups were absolutely 5 Stars!"
14 |         ),
15 |          "For service, I give them no stars.", "This place deserves no stars.",
16 |          "10 out of 10 stars.",
17 |          "My rating: just 3 out of 10.",
18 |          "If there were zero stars I would give it zero stars.",
19 |          "Rating: 1 out of 10.",
20 |          "I gave it 5 stars because of the sound quality.",
21 |          "If it were possible to give them 0/10, they'd have it."
22 |     )
23 |     
24 |     x2 <- c("This place receives best for their APPETIZERS!!!", 
25 |         " better for the food & the guy in the blue shirt for his great vibe!", 
26 |         " best for both the movie and trilogy.", 
27 |         "* Both the Hot & Sour & the Egg Flower Soups were absolutely best !", 
28 |         "For service, I give them terrible .", "This place deserves terrible .", 
29 |         " best stars.", "My rating: just below average .", 
30 |         "If there were terrible I would give it terrible .", 
31 |         "Rating: extremely below average .", 
32 |         "I gave it best because of the sound quality.", 
33 |         "If it were possible to give them terrible , they'd have it."
34 |         )
35 |     
36 |     # expect_equal(replace_rating(x), x2)
37 | 
38 | 
39 | })
40 | 
41 | 


--------------------------------------------------------------------------------
/tests/testthat/test-strip.R:
--------------------------------------------------------------------------------
1 | context("Checking strip")
2 | 
3 | test_that("strip ...",{
4 | 
5 | 
6 | })
7 | 
8 | 


--------------------------------------------------------------------------------
/tools/textclean_logo/r_textclean.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/trinker/textclean/5443d7484cd798e7494a63f794e6fb30834e5ec2/tools/textclean_logo/r_textclean.png


--------------------------------------------------------------------------------
/tools/textclean_logo/r_textclean.pptx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/trinker/textclean/5443d7484cd798e7494a63f794e6fb30834e5ec2/tools/textclean_logo/r_textclean.pptx


--------------------------------------------------------------------------------
/tools/textclean_logo/r_textcleana.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/trinker/textclean/5443d7484cd798e7494a63f794e6fb30834e5ec2/tools/textclean_logo/r_textcleana.png


--------------------------------------------------------------------------------
/tools/textclean_logo/resize_icon.txt:
--------------------------------------------------------------------------------
1 | cd C:\Users\Tyler\GitHub\textclean\tools\textclean_logo
2 | ffmpeg -i r_textcleana.png -vf scale=250:-1 r_textclean.png


--------------------------------------------------------------------------------