├── .gitignore
├── CHANGES
├── LICENSE
├── README.md
├── documentation
│   ├── api-objects.txt
│   ├── class-tree.html
│   ├── crarr.png
│   ├── epydoc.css
│   ├── epydoc.js
│   ├── frames.html
│   ├── help.html
│   ├── identifier-index.html
│   ├── index.html
│   ├── module-tree.html
│   ├── redirect.html
│   ├── toc-everything.html
│   ├── toc-tweetokenize-module.html
│   ├── toc.html
│   ├── tweetokenize-module.html
│   ├── tweetokenize-pysrc.html
│   ├── tweetokenize.Tokenizer-class.html
│   └── tweetokenize.Tokenizer.TokenizerException-class.html
├── setup.py
├── tests
│   ├── __main__.py
│   └── test_tweetokenize.py
└── tweetokenize
    ├── __init__.py
    ├── lexicons
    │   ├── domains.txt
    │   ├── emoticons.txt
    │   └── stopwords.txt
    └── tokenizer.py

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
.DS_Store
*.pyc
.gitignore
build
bench

--------------------------------------------------------------------------------
/CHANGES:
--------------------------------------------------------------------------------
Changes
=======

1.0.1 (2013-08-15)
------------------

- Module docstring
- Changes to `setup.py`
- Refactored: gained ~15% speedup in tokenization


1.0.0 (2013-05-11 - 2013-06-25)
-------------------------------

- First version

--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
Copyright (c) 2013, Jared Suttles.
All rights reserved.

Redistribution and use in source and binary forms, with or without modification,
are permitted provided that the following conditions are met:

1. Redistributions of source code must retain the above copyright notice,
   this list of conditions and the following disclaimer.

2. Redistributions in binary form must reproduce the above copyright
   notice, this list of conditions and the following disclaimer in the
   documentation and/or other materials provided with the distribution.

3. Neither the name of tweetokenize nor the names of its contributors may be
   used to endorse or promote products derived from this software without
   specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
tweetokenize
============

Regular expression based tokenizer for Twitter. Focused on tokenization
and pre-processing to train classifiers for sentiment, emotion, or mood.

Intended as glue between Python wrappers for the Twitter API and the
machine learning algorithms of the Natural Language Toolkit (NLTK), but
probably applicable to tokenizing any short messages of the social
networking variety.

```python
from tweetokenize import Tokenizer
gettokens = Tokenizer()
gettokens.tokenize('hey playa!:):3.....@SHAQ can you still dunk?#old🍕🍔😵LOL')
[u'hey', u'playa', u'!', u':)', u':3', u'...', u'USERNAME', u'can', u'you', u'still', u'dunk', u'?', u'#old', u'🍕', u'🍔', u'😵', u'LOL']
```

Features
--------

* Can easily replace tweet features like usernames, URLs, phone numbers,
  times, etc. with tokens in order to reduce feature-set complexity and
  improve the performance of classifiers
* Allows user-defined sets of emoticons to be used in tokenization
* Correctly separates emoji, written consecutively, into individual tokens

Installation
------------

    python setup.py install

After installation, you can make sure everything is working by running the
following inside the project root folder:

    python tests

Documentation
-------------

http://htmlpreview.github.io/?https://raw.github.com/jaredks/tweetokenize/master/documentation/tweetokenize.Tokenizer-class.html

License
-------

"Modified BSD License". See LICENSE for details. Copyright Jared Suttles, 2013.

--------------------------------------------------------------------------------
/documentation/api-objects.txt:
--------------------------------------------------------------------------------
tweetokenize tweetokenize-module.html
tweetokenize.__package__ tweetokenize-module.html#__package__
tweetokenize.Tokenizer tweetokenize.Tokenizer-class.html
tweetokenize.Tokenizer.repeating_re tweetokenize.Tokenizer-class.html#repeating_re
tweetokenize.Tokenizer._cleanword tweetokenize.Tokenizer-class.html#_cleanword
tweetokenize.Tokenizer.phonenumbers_re tweetokenize.Tokenizer-class.html#phonenumbers_re
tweetokenize.Tokenizer._unicode tweetokenize.Tokenizer-class.html#_unicode
tweetokenize.Tokenizer.usernames_re tweetokenize.Tokenizer-class.html#usernames_re
tweetokenize.Tokenizer._replacetokens tweetokenize.Tokenizer-class.html#_replacetokens
tweetokenize.Tokenizer.quotes_re tweetokenize.Tokenizer-class.html#quotes_re
tweetokenize.Tokenizer._topleveldomains tweetokenize.Tokenizer-class.html#_topleveldomains
tweetokenize.Tokenizer.__init__ tweetokenize.Tokenizer-class.html#__init__
tweetokenize.Tokenizer.emoticons tweetokenize.Tokenizer-class.html#emoticons
tweetokenize.Tokenizer.TokenizerException tweetokenize.Tokenizer.TokenizerException-class.html
tweetokenize.Tokenizer.punctuation tweetokenize.Tokenizer-class.html#punctuation
tweetokenize.Tokenizer._collectset tweetokenize.Tokenizer-class.html#_collectset
tweetokenize.Tokenizer._isemoji tweetokenize.Tokenizer-class.html#_isemoji
tweetokenize.Tokenizer.numbers_re tweetokenize.Tokenizer-class.html#numbers_re
tweetokenize.Tokenizer.times_re tweetokenize.Tokenizer-class.html#times_re
tweetokenize.Tokenizer.tokenize tweetokenize.Tokenizer-class.html#tokenize
tweetokenize.Tokenizer.__call__ tweetokenize.Tokenizer-class.html#__call__
tweetokenize.Tokenizer._converthtmlentities tweetokenize.Tokenizer-class.html#_converthtmlentities
tweetokenize.Tokenizer._number tweetokenize.Tokenizer-class.html#_number
tweetokenize.Tokenizer._separate_emoticons_punctuation tweetokenize.Tokenizer-class.html#_separate_emoticons_punctuation
tweetokenize.Tokenizer.update tweetokenize.Tokenizer-class.html#update
tweetokenize.Tokenizer.word_re tweetokenize.Tokenizer-class.html#word_re
tweetokenize.Tokenizer.tokenize_re tweetokenize.Tokenizer-class.html#tokenize_re
tweetokenize.Tokenizer.html_entities tweetokenize.Tokenizer-class.html#html_entities
tweetokenize.Tokenizer.urls_re tweetokenize.Tokenizer-class.html#urls_re
tweetokenize.Tokenizer._doublequotes tweetokenize.Tokenizer-class.html#_doublequotes
tweetokenize.Tokenizer._token_regexs tweetokenize.Tokenizer-class.html#_token_regexs
tweetokenize.Tokenizer.other_re tweetokenize.Tokenizer-class.html#other_re
tweetokenize.Tokenizer.ellipsis_re tweetokenize.Tokenizer-class.html#ellipsis_re
tweetokenize.Tokenizer.stopwords tweetokenize.Tokenizer-class.html#stopwords
tweetokenize.Tokenizer.html_entities_re tweetokenize.Tokenizer-class.html#html_entities_re
tweetokenize.Tokenizer.hashtags_re tweetokenize.Tokenizer-class.html#hashtags_re
tweetokenize.Tokenizer.__default_args tweetokenize.Tokenizer-class.html#__default_args
tweetokenize.Tokenizer.TokenizerException tweetokenize.Tokenizer.TokenizerException-class.html

--------------------------------------------------------------------------------
/documentation/class-tree.html:
--------------------------------------------------------------------------------
Class Hierarchy

[ Module Hierarchy | Class Hierarchy ]

(generated epydoc page listing the class hierarchy for the tweetokenize
module; navigation markup omitted)
--------------------------------------------------------------------------------
/documentation/crarr.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jaredks/tweetokenize/ad3efd2d62aefcb7ab175933601b2f6d4c4d4c63/documentation/crarr.png

--------------------------------------------------------------------------------
/documentation/epydoc.css:
--------------------------------------------------------------------------------
(stock epydoc stylesheet shipped with the generated documentation;
~320 lines of generic CSS omitted)

--------------------------------------------------------------------------------
/documentation/epydoc.js:
--------------------------------------------------------------------------------
(stock epydoc JavaScript for show/hide-private toggling, source-block
collapsing, and documentation redirects; ~290 lines omitted, partly
corrupted during HTML extraction)

--------------------------------------------------------------------------------
/documentation/frames.html:
--------------------------------------------------------------------------------
(frameset wrapper page titled "API Documentation"; markup omitted)

--------------------------------------------------------------------------------
/documentation/help.html:
--------------------------------------------------------------------------------
Help

(stock epydoc help page describing the generated page layout, the
frames-based table of contents, and the navigation bar; boilerplate
omitted)
--------------------------------------------------------------------------------
/documentation/identifier-index.html:
--------------------------------------------------------------------------------
Identifier Index

(generated epydoc identifier index; alphabetical link tables omitted)

--------------------------------------------------------------------------------
/documentation/index.html:
--------------------------------------------------------------------------------
(frameset entry page titled "API Documentation"; markup omitted)

--------------------------------------------------------------------------------
/documentation/module-tree.html:
--------------------------------------------------------------------------------
Module Hierarchy

[ Module Hierarchy | Class Hierarchy ]

- tweetokenize: Tokenization and pre-processing for social media data
  used to train classifiers.

--------------------------------------------------------------------------------
/documentation/redirect.html:
--------------------------------------------------------------------------------
Epydoc Auto-redirect page

When javascript is enabled, this page will redirect URLs of the form
redirect.html#dotted.name to the documentation for the object with the
given fully-qualified dotted name.

--------------------------------------------------------------------------------
/documentation/toc-everything.html:
--------------------------------------------------------------------------------
Everything

All Classes
    tweetokenize.Tokenizer
    tweetokenize.Tokenizer.TokenizerException

All Variables
    tweetokenize.__package__

--------------------------------------------------------------------------------
/documentation/toc-tweetokenize-module.html:
--------------------------------------------------------------------------------
Module tweetokenize

Classes
    Tokenizer

Variables
    __package__

--------------------------------------------------------------------------------
/documentation/toc.html:
--------------------------------------------------------------------------------
Table of Contents

Everything

Modules
    tweetokenize

--------------------------------------------------------------------------------
/documentation/tweetokenize-module.html:
--------------------------------------------------------------------------------
Module tweetokenize

Tokenization and pre-processing for social media data used to train
classifiers. Focused on classification of sentiment, emotion, or mood.

Intended as glue between Python wrappers for the Twitter API and the
Natural Language Toolkit (NLTK), but probably applicable to tokenizing
any short messages of the social networking variety.

In many cases, reducing feature-set complexity can increase the
performance of classifiers trained for detecting sentiment. The available
settings are based on commonly modified and normalized features in
classification research using content from Twitter.

Classes
    Tokenizer
        Can be used to tokenize a string representation of a message,
        adjusting features based on the given configuration details, to
        enable further processing in feature extraction and training
        stages.

Variables
    __package__ = None
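To make the feature-set claim concrete: with the default replacement
tokens, superficially different tweets collapse onto the same token
sequence. (Illustrative sketch, not part of the generated docs; the
expected output follows from the documented defaults.)

    from tweetokenize import Tokenizer

    gettokens = Tokenizer()  # defaults: usernames='USERNAME', urls='URL', times='TIME'

    # Two different tweets normalize to one shared feature vector,
    # shrinking the vocabulary a downstream classifier must learn.
    a = gettokens.tokenize('@alice check http://example.com at 9:30pm')
    b = gettokens.tokenize('@bob check http://example.org at 10:45am')
    assert a == b  # both: [u'USERNAME', u'check', u'URL', u'at', u'TIME']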
--------------------------------------------------------------------------------
/documentation/tweetokenize.Tokenizer-class.html:
--------------------------------------------------------------------------------
Class Tokenizer

object --+
         |
        Tokenizer

Can be used to tokenize a string representation of a message, adjusting
features based on the given configuration details, to enable further
processing in feature extraction and training stages.

An example usage:

    >>> from tweetokenize import Tokenizer
    >>> gettokens = Tokenizer(usernames='USER', urls='')
    >>> gettokens.tokenize('@justinbeiber yo man!love you#inlove#wantyou in a totally straight way #brotime <3:p:D www.justinbeiber.com')
    [u'USER', u'yo', u'man', u'!', u'love', u'you', u'#inlove', u'#wantyou', u'in', u'a', u'totally', u'straight', u'way', u'#brotime', u'<3', u':p', u':D']
Nested Classes
    TokenizerException

Instance Methods
    __init__(self, lowercase=True, allcapskeep=True, normalize=3,
             usernames='USERNAME', urls='URL', hashtags=False,
             phonenumbers='PHONENUMBER', times='TIME', numbers='NUMBER',
             ignorequotes=False, ignorestopwords=False)
        Constructs a new Tokenizer.
    __call__(self, iterable) -> list of str
        Iterator for the tokenization of given messages.
    update(self, **kwargs)
        Adjust any settings of the Tokenizer.
    tokenize(self, message) -> list of str
        Tokenize the given string into a list of strings representing the
        constituent words of the message.
    emoticons(self, iterable=None, filename=None)
        Consumes an iterable of emoticons that the tokenizer will
        tokenize on.
    stopwords(self, iterable=None, filename=None)
        Consumes an iterable of stopwords that the tokenizer will ignore
        if the stopwords setting is True.

    Inherited from object: __delattr__, __format__, __getattribute__,
    __hash__, __new__, __reduce__, __reduce_ex__, __repr__, __setattr__,
    __sizeof__, __str__, __subclasshook__

Class Variables
    html_entities = {'AElig': u'Æ', 'Aacute': u'Á', 'Acirc': u'Â',...
    usernames_re = re.compile(r'@\w{1,15}')
    urls_re = re.compile(r'(?:(?:https?://[A-Za-z0-9\.]+)|(?:(?:ww...
    hashtags_re = re.compile(r'#\w+[\w\'-]*\w+')
    ellipsis_re = re.compile(r'\.\.+')
    word_re = re.compile(r'(?:[a-zA-Z0-9]+[\'-]?[a-zA-Z]+[a-zA-Z0-...
    times_re = re.compile(r'\d{1,2}:\d{2}(?::\d{2})?\s*(?:AM|PM|am...
    phonenumbers_re = re.compile(r'(?:\+?[01][-\s\.]*)?(?:\(?\d{3}...
    numbers_re = re.compile(r'(?:[\+-]?\$?\d+(?:\.\d+)?(?:[eE]-?\d...
    other_re = '(?:[^#\\s\\.]|\\.(?!\\.))+'
    tokenize_re = re.compile(r'@\w{1,15}|(?:(?:https?://[A-Za-z0-9...
    html_entities_re = re.compile(r'&#?\w+;')
    repeating_re = re.compile(r'([a-zA-Z])\1\1+')
    punctuation = u'!$%()*+,-/:;<=>?[\]^_.`{|}~'“”""‘’""'
    quotes_re = re.compile(r'(\u201c.*?\u201d)|(".*?")|(\u2018.*?\...

Properties
    Inherited from object: __class__
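A quick illustration of the published repeating_re pattern (a standalone
sketch; the tokenizer's own normalization happens inside _cleanword,
whose body is not shown on this page, so the substitution below is an
assumption about how the pattern is applied):

    import re

    repeating_re = re.compile(r'([a-zA-Z])\1\1+')

    # Collapse any run of 3+ copies of one letter down to three,
    # mirroring the default normalize=3 behavior documented below.
    print(repeating_re.sub(r'\1' * 3, 'Heyyyyyy i lovvvvvvve youuuuuuuuu'))
    # Heyyy i lovvve youuu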
Method Details

__init__(self, lowercase=True, allcapskeep=True, normalize=3,
         usernames='USERNAME', urls='URL', hashtags=False,
         phonenumbers='PHONENUMBER', times='TIME', numbers='NUMBER',
         ignorequotes=False, ignorestopwords=False)
(Constructor)

Constructs a new Tokenizer. Can specify custom settings for various
feature normalizations.

Any features with replacement tokens can be removed from the message by
setting the token to the empty string (""), "DELETE", or "REMOVE".

Parameters:
    lowercase (bool) - If True, lowercases words, excluding those with
        all letters capitalized.
    allcapskeep (bool) - If True, maintains capitalization for words with
        all letters in capitals. Otherwise, capitalization for such words
        depends on lowercase.
    normalize (int) - The number of repeating letters kept when
        normalizing arbitrary letter elongations.
        Example:
            Heyyyyyy i lovvvvvvve youuuuuuuuu <3
        becomes:
            Heyyy i lovvve youuu <3
        Not sure why you would want to change this (maybe just for
        fun?? :P)
    usernames - Serves as the replacement token for anything that parses
        as a Twitter username, e.g. @rayj. Setting this to False means no
        usernames will be changed.
    urls - Serves as the replacement token for anything that parses as a
        URL, e.g. bit.ly or http://example.com. Setting this to False
        means no URLs will be changed.
    hashtags - Serves as the replacement token for anything that parses
        as a Twitter hashtag, e.g. #ihititfirst or #onedirection. Setting
        this to False means no hashtags will be changed.
    phonenumbers - Replacement token for phone numbers.
    times - Replacement token for times.
    numbers - Replacement token for any other kinds of numbers.
    ignorequotes (bool) - If True, removes various types of quotes and
        the contents within.
    ignorestopwords (bool) - If True, removes any stopwords. The default
        set includes 'I', 'me', 'itself', 'against', 'should', etc.

Overrides: object.__init__
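For example, the removal behavior can be combined with custom replacement
tokens (a sketch; the short link is hypothetical and the output shown is
an expectation based on the parameter docs above, not a captured run):

    from tweetokenize import Tokenizer

    # Usernames collapse to 'USER', URLs are deleted via the empty-string
    # token, and quoted spans are dropped by ignorequotes.
    gettokens = Tokenizer(usernames='USER', urls='', ignorequotes=True)
    gettokens.tokenize('@rayj "closed captions pls" bit.ly/1dNVPt4 #winning')
    # expected: [u'USER', u'#winning']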
__call__(self, iterable)
(Call operator)

Iterator for the tokenization of given messages.

Parameters:
    iterable - Object capable of iteration, providing strings for
        tokenization.

Returns: list of str
    Iterator of lists representing message tokenizations.
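So an instance can be mapped lazily over a whole corpus (a minimal
sketch using the documented defaults; the commented outputs are
expectations, not captured runs):

    from tweetokenize import Tokenizer

    gettokens = Tokenizer()
    corpus = ['@NASA this is sick!', 'check nasa.gov right NOW']

    # __call__ yields one token list per message.
    for tokens in gettokens(corpus):
        print(tokens)
    # [u'USERNAME', u'this', u'is', u'sick', u'!']
    # [u'check', u'URL', u'right', u'NOW']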
update(self, **kwargs)

Adjust any settings of the Tokenizer.

    >>> gettokens.lowercase
    True
    >>> gettokens.phonenumbers
    'PHONENUMBER'
    >>> gettokens.update(phonenumbers='NUMBER', lowercase=False)
    >>> gettokens.lowercase
    False
    >>> gettokens.phonenumbers
    'NUMBER'
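This makes it cheap to re-tokenize the same corpus under different
settings, e.g. toggling hashtag normalization between passes (a sketch;
the commented outputs are expectations based on the parameter docs):

    from tweetokenize import Tokenizer

    gettokens = Tokenizer()
    gettokens.tokenize('#yolo swag')       # [u'#yolo', u'swag']
    gettokens.update(hashtags='HASHTAG')
    gettokens.tokenize('#yolo swag')       # [u'HASHTAG', u'swag']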
tokenize(self, message)

Tokenize the given string into a list of strings representing the
constituent words of the message.

Parameters:
    message (str) - The string representation of the message.

Returns: list of str
    The tokenization of the message.
emoticons(self, iterable=None, filename=None)

Consumes an iterable of emoticons that the tokenizer will tokenize on.
Allows a user-specified set of emoticons to be recognized.

Parameters:
    iterable - Object capable of iteration, providing emoticon strings.
    filename (str) - Path to a file containing emoticons delimited by
        newlines. Trailing whitespace is stripped and blank lines are
        skipped.

stopwords(self, iterable=None, filename=None)

Consumes an iterable of stopwords that the tokenizer will ignore if the
stopwords setting is True. The default set is taken from NLTK's English
list.

Parameters:
    iterable - Object capable of iteration, providing stopword strings.
    filename (str) - Path to a file containing stopwords delimited by
        newlines. Trailing whitespace is stripped and blank lines are
        skipped.
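Both hooks accept either an in-memory iterable or a newline-delimited
file (a sketch; 'my_stopwords.txt' is a hypothetical path, and whether a
call replaces or extends the default set is not stated on this page):

    from tweetokenize import Tokenizer

    gettokens = Tokenizer(ignorestopwords=True)
    gettokens.emoticons(iterable=[':)', ':(', '<3', 'xD'])  # custom emoticon set
    gettokens.stopwords(filename='my_stopwords.txt')        # hypothetical file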
Class Variable Details

html_entities
    Value:
        {'AElig': u'Æ',
         'Aacute': u'Á',
         'Acirc': u'Â',
         'Agrave': u'À',
         'Alpha': u'Α',
         'Aring': u'Å',
         'Atilde': u'Ã',
         'Auml': u'Ä',
        ...

urls_re
    Value:
        re.compile(r'(?:(?:https?://[A-Za-z0-9\.]+)|(?:(?:www\.)?[A-Za-z0-9]+\
        .(?:museum|travel|aero|arpa|asia|coop|info|jobs|mobi|name|post|biz|cat\
        |com|edu|gov|int|mil|net|org|pro|tel|xxx|ac|ad|ae|af|ag|ai|al|am|an|ao\
        |aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|\
        bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cu|cv|cw|cx|cy|cz|d\
        e|dj|dk|dm|do|dz|ec|ee|eg|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf\
        |gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|\
        im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|l\
        ...

word_re
    Value:
        re.compile(r'(?:[a-zA-Z0-9]+[\'-]?[a-zA-Z]+[a-zA-Z0-9]*)|(?:[a-zA-Z0-9\
        ]*[a-zA-Z]+[\'-]?[a-zA-Z0-9]+)')

times_re
    Value:
        re.compile(r'\d{1,2}:\d{2}(?::\d{2})?\s*(?:AM|PM|am|pm)?')

phonenumbers_re
    Value:
        re.compile(r'(?:\+?[01][-\s\.]*)?(?:\(?\d{3}[-\s\.\)]*)?\d{3}[-\s\.]*\
        d{4}(?:\s*x\s*\d+)?(?=\s+|$)')

numbers_re
    Value:
        re.compile(r'(?:[\+-]?\$?\d+(?:\.\d+)?(?:[eE]-?\d+)?%?)(?![A-Za-z])(?:\
        \s*/\s*(?:[\+-]?\$?\d+(?:\.\d+)?(?:[eE]-?\d+)?%?)(?![A-Za-z]))?')

tokenize_re
    Value:
        re.compile(r'@\w{1,15}|(?:(?:https?://[A-Za-z0-9\.]+)|(?:(?:www\.)?[A-\
        Za-z0-9]+\.(?:museum|travel|aero|arpa|asia|coop|info|jobs|mobi|name|po\
        st|biz|cat|com|edu|gov|int|mil|net|org|pro|tel|xxx|ac|ad|ae|af|ag|ai|a\
        l|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo\
        |br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cu|cv|cw|\
        cx|cy|cz|de|dj|dk|dm|do|dz|ec|ee|eg|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|g\
        b|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu\
        |id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|\
        ...

quotes_re
    Value:
        re.compile(r'(\u201c.*?\u201d)|(".*?")|(\u2018.*?\u2019)|(\uff02.*?\uf\
        f02)|\s(\'.*?\')\s')
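The fully listed patterns above can be exercised directly as class
attributes, which makes for a handy sanity check (a sketch using only the
values shown on this page):

    from tweetokenize import Tokenizer

    assert Tokenizer.times_re.match('12:14pm')
    assert Tokenizer.times_re.match('2:42:10 AM')
    assert Tokenizer.phonenumbers_re.match('+1 (888) 555-1234')
    assert Tokenizer.ellipsis_re.match('.....')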
--------------------------------------------------------------------------------
/documentation/tweetokenize.Tokenizer.TokenizerException-class.html:
--------------------------------------------------------------------------------
39 | 40 | Module tweetokenize :: 41 | Class Tokenizer :: 42 | Class TokenizerException 43 | 44 | 46 | 47 | 48 | 52 |
[frames] | no frames]
53 |
56 | 57 |

Class TokenizerException

source code

58 |
 59 |               object --+    
 60 |                        |    
 61 | exceptions.BaseException --+
 62 |                            |
 63 |                           Tokenizer.TokenizerException
 64 | 
65 | 66 |
67 | 68 | 69 | 71 | 72 | 74 | 75 | 76 | 99 | 100 |
73 | Instance Methods
Inherited from exceptions.BaseException: __delattr__, __getattribute__, __getitem__, __getslice__, __init__, __new__, __reduce__, __repr__, __setattr__, __setstate__, __str__, __unicode__

Inherited from object: __format__, __hash__, __reduce_ex__, __sizeof__, __subclasshook__
Properties
Inherited from exceptions.BaseException: args, message

Inherited from object: __class__
-------------------------------------------------------------------------------- /setup.py: --------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | from distutils.core import setup
3 | import tweetokenize
4 | 
5 | setup(
6 |     name='tweetokenize',
7 |     version=tweetokenize.__version__,
8 |     description='Regular expression based tokenizer for Twitter',
9 |     author='Jared Suttles',
10 |     url='https://github.com/jaredks/tweetokenize',
11 |     packages=['tweetokenize'],
12 |     package_data={'': ['LICENSE'], 'tweetokenize': ['lexicons/*.txt']},
13 |     long_description=open('README.md').read() + '\n\n' + open('CHANGES').read(),
14 |     license='BSD License',
15 |     classifiers=[
16 |         'Development Status :: 5 - Production/Stable',
17 |         'Intended Audience :: Developers',
18 |         'Intended Audience :: Education',
19 |         'Intended Audience :: Science/Research',
20 |         'License :: OSI Approved :: BSD License',
21 |         'Operating System :: MacOS :: MacOS X',
22 |         'Operating System :: Microsoft :: Windows',
23 |         'Operating System :: POSIX',
24 |         'Programming Language :: Python',
25 |         'Topic :: Scientific/Engineering :: Information Analysis',
26 |         'Topic :: Software Development :: Libraries :: Python Modules',
27 |         'Topic :: Text Processing :: Linguistic',
28 |     ],
29 | )
30 | 
-------------------------------------------------------------------------------- /tests/__main__.py: --------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | import unittest
3 | from test_tweetokenize import TokenizeTests
4 | 
5 | suite = unittest.TestSuite()
6 | suite.addTest(unittest.TestLoader().loadTestsFromTestCase(TokenizeTests))
7 | 
8 | unittest.TextTestRunner().run(suite)
9 | 
-------------------------------------------------------------------------------- /tests/test_tweetokenize.py: --------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | #
4 | # tweetokenize: Regular expression based tokenizer for Twitter
5 | # Copyright: (c) 2013, Jared Suttles. All rights reserved.
6 | # License: BSD, see LICENSE for details.
7 | # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
8 | import unittest
9 | from tweetokenize import Tokenizer
10 | 
11 | 
12 | class TokenizeTests(unittest.TestCase):
13 |     def setUp(self):
14 |         self.tokenizer = Tokenizer(lowercase=True)
15 | 
16 |     def test_general_1(self):
17 |         self.tokenizer.normalize = 2
18 |         msg = ('omg wow < & > >.< >.< :):)'
19 |                'i CANT believe thatttt haha lol!!1')
20 |         tks = ['omg', 'wow', '<', '&', '>', '>.<', '>.<', ':)', ':)',
21 |                'i', 'CANT', 'believe', 'thatt', 'haha', 'lol', '!', '!', '1']
22 |         self.assertEqual(self.tokenizer.tokenize(msg), tks)
23 | 
24 |     def test_general_2(self):
25 |         msg = "i'm wanting to jump up and down but wouldn't if i couldn't.."
26 |         tks = [u"i'm", u'wanting', u'to', u'jump', u'up', u'and', u'down',
27 |                u'but', u"wouldn't", u'if', u'i', u"couldn't", u'...']
28 |         self.assertEqual(self.tokenizer.tokenize(msg), tks)
29 | 
30 |     def test_urls_1(self):
31 |         msg = ("hey bro chec'k out http://shitstorm.com its fucking sick")
32 |         tks = ['hey', 'bro', "chec'k", 'out', 'URL', 'its', 'fucking', 'sick']
33 |         self.assertEqual(self.tokenizer.tokenize(msg), tks)
34 | 
35 |     def test_urls_2(self):
36 |         msg = ('also see this crazy stuff https://shitstorm.com')
37 |         tks = ['also', 'see', 'this', 'crazy', 'stuff', 'URL']
38 |         self.assertEqual(self.tokenizer.tokenize(msg), tks)
39 | 
40 |     def test_urls_3(self):
41 |         msg = 'hiiiii rayj.com/ihititfirst and other google.com http://hobo.net'
42 |         tks = [u'hiii', u'URL', u'and', u'other', u'URL', u'URL']
43 |         self.assertEqual(self.tokenizer.tokenize(msg), tks)
44 | 
45 |     def test_usernames_1(self):
46 |         msg = ('@justinbeiber yo man!! ! i love you in a totally '
47 |                'straight way <3:p:D')
48 |         tks = [u'USERNAME', u'yo', u'man', u'!', u'!', u'!',
49 |                u'i', u'love', u'you', u'in', u'a', u'totally', u'straight', u'way',
50 |                u'<3', u':p', u':D']
51 |         self.assertEqual(self.tokenizer.tokenize(msg), tks)
52 | 
53 |     def test_usernames_2(self):
54 |         msg = '@heyheymango: what did you SAYYY??? or did you just.. NotHING?'
55 |         tks = [u'USERNAME', u':', u'what', u'did', u'you', u'SAYYY', u'?',
56 |                u'?', u'?', u'or', u'did', u'you', u'just', u'...', u'nothing', u'?']
57 |         self.assertEqual(self.tokenizer.tokenize(msg), tks)
58 | 
59 |     def test_numbers_1(self):
60 |         self.tokenizer.numbers = None
61 |         msg = ('i have this much money -2.42 in my bank acct.,friend! but you '
62 |                'have mucho +88e44 and its about 1000% more than $400.')
63 |         tks = [u'i', u'have', u'this', u'much', u'money', u'-2.42', u'in',
64 |                u'my', u'bank', u'acct', u'.', u',', u'friend', u'!', u'but', u'you',
65 |                u'have', u'mucho', u'+88e44', u'and', u'its', u'about', u'1000%',
66 |                u'more', u'than', u'$400', u'.']
67 |         self.assertEqual(self.tokenizer.tokenize(msg), tks)
68 | 
69 |     def test_numbers_2(self):
70 |         msg = ('i have this much money -2.42 in my bank acct.,friend! but you '
71 |                'have mucho +88e44 and its about 1000% more than $400.')
72 |         tks = [u'i', u'have', u'this', u'much', u'money', u'NUMBER', u'in',
73 |                u'my', u'bank', u'acct', u'.', u',', u'friend', u'!', u'but', u'you',
74 |                u'have', u'mucho', u'NUMBER', u'and', u'its', u'about', u'NUMBER',
75 |                u'more', u'than', u'NUMBER', u'.']
76 |         self.assertEqual(self.tokenizer.tokenize(msg), tks)
77 | 
78 |     def test_numbers_3(self):
79 |         self.tokenizer.lowercase = False  # keep cases the same everywhere
80 |         msg = ('I JUST want To Test FRACTIONZZZ 22432.41414/ 55894385e-341 also'
81 |                ' lowercase etc.etc.etc. hope that last part doesn\'t parse as a url '
82 |                'i would be kinda sad PANda!zsss..... .. . .... 4/5 5.1/4.0e0 3.14 -2')
83 |         tks = [u'I', u'JUST', u'want', u'To', u'Test', u'FRACTIONZZZ',
84 |                u'NUMBER', u'also', u'lowercase', u'etc', u'.', u'etc', u'.', u'etc',
85 |                u'.', u'hope', u'that', u'last', u'part', u"doesn't", u'parse', u'as',
86 |                u'a', u'url', u'i', u'would', u'be', u'kinda', u'sad', u'PANda', u'!',
87 |                u'zsss', u'...', u'...', u'.', u'...', u'NUMBER', u'NUMBER', u'NUMBER',
88 |                u'NUMBER']
89 |         self.assertEqual(self.tokenizer.tokenize(msg), tks)
90 | 
91 |     def test_time_1(self):
92 |         msg = 'is the time now 12:14pm? or is it like 2:42AM??'
93 |         tks = [u'is', u'the', u'time', u'now', u'TIME', u'?', u'or', u'is',
94 |                u'it', u'like', u'TIME', u'?', u'?']
95 |         self.assertEqual(self.tokenizer.tokenize(msg), tks)
96 | 
97 |     def test_time_2(self):
98 |         msg = 'new time is 2:42:09 PM!!'
99 |         tks = [u'new', u'time', u'is', u'TIME', u'!', u'!']
100 |         self.assertEqual(self.tokenizer.tokenize(msg), tks)
101 | 
102 |     def test_phonenumber_1(self):
103 |         msg = ('my number is 18002432242 and 241.413.5584 also 1-242-156-6724'
104 |                ' and (958)555-4875 or (999) 415 5542 is 422-5555 a 131-121-1441')
105 |         tks = [u'my', u'number', u'is', u'PHONENUMBER', u'and', u'PHONENUMBER',
106 |                u'also', u'PHONENUMBER', u'and', u'PHONENUMBER', u'or', u'PHONENUMBER',
107 |                u'is', u'PHONENUMBER', u'a', u'PHONENUMBER']
108 |         self.assertEqual(self.tokenizer.tokenize(msg), tks)
109 | 
110 |     def test_phonenumber_2(self):
111 |         msg = 'numbers with extension: (201)-340-4915 x112 or 1 800.341.1311x99'
112 |         tks = [u'numbers', u'with', u'extension', u':', u'PHONENUMBER', u'or',
113 |                u'PHONENUMBER']
114 |         self.assertEqual(self.tokenizer.tokenize(msg), tks)
115 | 
116 |     def test_quotes_1(self):
117 |         self.tokenizer.ignorequotes = True
118 |         msg = 'this is just a tweet with "someone said something funny" lol'
119 |         tks = ['this', 'is', 'just', 'a', 'tweet', 'with', 'lol']
120 |         self.assertEqual(self.tokenizer.tokenize(msg), tks)
121 | 
122 |     def test_quotes_2(self):
123 |         self.tokenizer.ignorequotes = False
124 |         msg = 'this is just a tweet with "someone said something funny" lol'
125 |         tks = ['this', 'is', 'just', 'a', 'tweet', 'with', '"', 'someone',
126 |                'said', 'something', 'funny', '"', 'lol']
127 |         self.assertEqual(self.tokenizer.tokenize(msg), tks)
128 | 
129 |     def test_quotes_3(self):
130 |         self.tokenizer.ignorequotes = True
131 |         msg = ('some stuff but he said “yea i know its crazy”other '
132 |                'stuff...!!! ')
133 |         tks = [u'some', u'stuff', u'but', u'he', u'said', u'other', u'stuff',
134 |                u'...', u'!', u'!', u'!']
135 |         self.assertEqual(self.tokenizer.tokenize(msg), tks)
136 | 
137 |     def test_quotes_4(self):
138 |         self.tokenizer.ignorequotes = True
139 |         msg = ('some stuff but he said “yea i know its crazy”other '
140 |                'stuff...!!! ')
141 |         tks = [u'some', u'stuff', u'but', u'he', u'said', u'other', u'stuff',
142 |                u'...', u'!', u'!', u'!']
143 |         self.assertEqual(self.tokenizer.tokenize(msg), tks)
144 | 
145 |     def test_quotes_5(self):
146 |         self.tokenizer.ignorequotes = False
147 |         msg = 'heyy buddyyyyy boy \'do you the lady\'s kitty like that??\''
148 |         tks = [u'heyy', u'buddyyy', u'boy', u"'", u'do', u'you', u'the',
149 |                u"lady's", u'kitty', u'like', u'that', u'?', u'?', u"'"]
150 |         self.assertEqual(self.tokenizer.tokenize(msg), tks)
151 | 
152 |     def test_hashtags_1(self):
153 |         msg = 'omg i love#dog#cat#food#other#things#so#fucking#much!!!11LOLOLOL'
154 |         tks = ['omg', 'i', 'love', '#dog', '#cat', '#food', '#other',
155 |                '#things', '#so', '#fucking', '#much', '!', '!', '!', '11LOLOLOL']
156 |         self.assertEqual(self.tokenizer.tokenize(msg), tks)
157 | 
158 |     def test_hashtags_2(self):
159 |         self.tokenizer.hashtags = 'HASHTAG'
160 |         msg = 'omg i love#dog#cat#food#other#things#so#fucking#much!!!11LOLOLOL'
161 |         tks = ['omg', 'i', 'love', 'HASHTAG', 'HASHTAG', 'HASHTAG',
162 |                'HASHTAG', 'HASHTAG', 'HASHTAG', 'HASHTAG', 'HASHTAG', '!', '!', '!',
163 |                '11LOLOLOL']
164 |         self.assertEqual(self.tokenizer.tokenize(msg), tks)
165 | 
166 |     def test_emoticons_1(self):
167 |         msg = 'heyyyyyy:):):(>.<.<', u'<', u'v.v', u'whats',
169 |                u'up', u'man', u'LOL', u'T.T', u'tomcat', u'.', u'tomcat', u':$',
170 |                u';)', u'.', u'!', u'!', u'!']
171 |         self.assertEqual(self.tokenizer.tokenize(msg), tks)
172 | 
173 |     def test_removefeatures_1(self):
174 |         self.tokenizer.usernames = ""  # don't want any usernames to show
175 |         msg = ('hey @arnold @nickelodeon #90s#ilove90s#allthat#amandashow'
176 |                '@rocko http://en.wikipedia.org/wiki/The_Angry_Beavers ^.^>>><<<^.^')
177 |         tks = [u'hey', u'#90s', u'#ilove90s', u'#allthat', u'#amandashow',
178 |                u'URL', u'^.^', u'>', u'>', u'>', u'<', u'<', u'<', u'^.^']
179 |         self.assertEqual(self.tokenizer.tokenize(msg), tks)
180 | 
181 |     def test_removefeatures_2(self):
182 |         self.tokenizer.usernames = ""  # don't want any usernames to show
183 |         self.tokenizer.hashtags = ""  # or hashtags
184 |         msg = ('hey @arnold @nickelodeon #90s#ilove90s#allthat#amandashow'
185 |                '@rocko http://en.wikipedia.org/wiki/The_Angry_Beavers ^.^>>><<<^.^')
186 |         tks = [u'hey', u'URL', u'^.^', u'>', u'>', u'>', u'<', u'<', u'<',
187 |                u'^.^']
188 |         self.assertEqual(self.tokenizer.tokenize(msg), tks)
189 | 
190 |     def test_removefeatures_3(self):
191 |         self.tokenizer.usernames = False  # keep usernames
192 |         self.tokenizer.urls = ""  # URLs should be removed
193 |         self.tokenizer.hashtags = "$$$"  # hashtags should be $$$
194 |         msg = ('hey @arnold @nickelodeon #90s#ilove90s#allthat#amandashow'
195 |                '@rocko http://en.wikipedia.org/wiki/The_Angry_Beavers ^.^>>><<<^.^')
196 |         tks = [u'hey', u'@arnold', u'@nickelodeon', u'$$$', u'$$$', u'$$$',
197 |                u'$$$', u'@rocko', u'^.^', u'>', u'>', u'>', u'<', u'<', u'<', u'^.^']
198 |         self.assertEqual(self.tokenizer.tokenize(msg), tks)
199 | 
200 |     def test_emoji_1(self):
201 |         msg = ('hey mate!:):3.....@and🇨🇳ONE+ BRO#love😘😵💚💛💜💙 '
202 |                '💋😂😂LOLLLL.')
203 |         tks = [u'hey', u'mate', u'!', u':)', u':3', u'...',
204 |                u'USERNAME', u'\U0001f1e8\U0001f1f3', u'ONE', u'+', u'BRO', u'#love',
205 |                u'\U0001f618', u'\U0001f635', u'\U0001f49a', u'\U0001f49b',
206 |                u'\U0001f49c', u'\U0001f499', u'\U0001f48b', u'\U0001f602',
207 |                u'\U0001f602', u'LOLLL', u'.']
208 |         self.assertEqual(self.tokenizer.tokenize(msg), tks)
209 | 
210 |     def test_emoji_2(self):
211 |         msg = ('hey mate!:):3.....@andONE+🇬🇧 BRO#love😘😵💚💛💜💙 '
212 |                '💋😂😂LOLLLL.')
213 |         tks = [u'hey', u'mate', u'!', u':)', u':3', u'...',
214 |                u'USERNAME', u'+', u'\U0001f1ec\U0001f1e7', u'BRO', u'#love', u'😘',
215 |                u'😵', u'\U0001f49a', u'\U0001f49b', u'\U0001f49c',
216 |                u'\U0001f499', u'💋', u'\U0001f602', u'\U0001f602',
217 |                u'LOLLL', u'.']
218 |         self.assertEqual(self.tokenizer.tokenize(msg), tks)
219 | 
220 |     def test_emoji_3(self):
221 |         msg = ('🚀=):o
-------------------------------------------------------------------------------- /tweetokenize/lexicons/emoticons.txt: --------------------------------------------------------------------------------
4 | >:0
5 | D:<
6 | D:
7 | D8
8 | D;
9 | D=
10 | Dx
11 | >.<
12 | >_<
13 | d:<
14 | d:
15 | d8
16 | d;
17 | d=
18 | dx
19 | v.v
20 | :/
21 | :\
22 | =/
23 | =\
24 | >:/
25 | >:\
26 | :-/
27 | :-\
28 | :)
29 | (:
30 | ;)
31 | ;(
32 | (;
33 | );
34 | :-)
35 | :3
36 | :d
37 | :D
38 | xd
39 | :')
40 | ^_^
41 | ^.^
42 | :]
43 | :}
44 | :p
45 | :b
46 | =p
47 | =b
48 | :-p
49 | :-b
50 | =)
51 | :(
52 | ):
53 | :'(
54 | :c
55 | :-(
-------------------------------------------------------------------------------- /tweetokenize/tokenizer.py: --------------------------------------------------------------------------------
57 |     >>> from tweetokenize import Tokenizer
58 |     >>> gettokens = Tokenizer(usernames='USER', urls='')
59 |     >>> gettokens.tokenize('@justinbeiber yo man!love you#inlove#wantyou in a totally straight way #brotime <3:p:D www.justinbeiber.com')
60 |     [u'USER', u'yo', u'man', u'!', u'love', u'you', u'#inlove', u'#wantyou', u'in', u'a', u'totally', u'straight', u'way', u'#brotime', u'<3', u':p', u':D']
61 |     """
62 |     _default_args = dict(
63 |         lowercase=True, allcapskeep=True, normalize=3, usernames='USERNAME', urls='URL', hashtags=False,
64 |         phonenumbers='PHONENUMBER', times='TIME', numbers='NUMBER', ignorequotes=False, ignorestopwords=False
65 |     )
66 |     _lexicons = path.join(path.dirname(path.realpath(__file__)), 'lexicons/{}.txt')
67 | 
68 |     # Regular expressions
69 |     usernames_re = re.compile(r"@\w{1,15}")
70 |     with open(_lexicons.format('domains'), 'r') as f:
71 |         domains = f.read().strip().replace('\n', '|')
72 |     urls_re = re.compile(r"(?:(?:https?\://[A-Za-z0-9\.]+)|(?:(?:www\.)?[A-Za-z0-9]+\.(?:{})))(?:\/\S+)?"
73 |                          "(?=\s+|$)".format(domains))
74 |     del domains
75 |     hashtags_re = re.compile(r"#\w+[\w'-]*\w+")
76 |     ellipsis_re = re.compile(r"\.\.+")
77 |     word_re = re.compile(r"(?:[a-zA-Z0-9]+['-]?[a-zA-Z]+[a-zA-Z0-9]*)|(?:[a-zA-Z0-9]*[a-zA-Z]+['-]?[a-zA-Z0-9]+)")
78 |     times_re = re.compile(r"\d{1,2}:\d{2}(?::\d{2})?\s*(?:AM|PM|am|pm)?")
79 |     phonenumbers_re = re.compile(r"(?:\+?[01][\-\s\.]*)?(?:\(?\d{3}[\-\s\.\)]*)?\d{3}[\-\s\.]*\d{4}(?:\s*x\s*\d+)?"
80 |                                  "(?=\s+|$)")
81 |     number_re = r"(?:[+-]?\$?\d+(?:\.\d+)?(?:[eE]-?\d+)?%?)(?![A-Za-z])"
82 |     numbers_re = re.compile(r"{0}(?:\s*/\s*{0})?".format(number_re))  # deals with fractions
83 |     del number_re
84 |     other_re = r"(?:[^#\s\.]|\.(?!\.))+"
85 |     _token_regexs = ('usernames', 'urls', 'hashtags', 'times', 'phonenumbers', 'numbers')
86 |     tokenize_re = re.compile(
87 |         ur"|".join(
88 |             imap(lambda x: getattr(x, 'pattern', x),
89 |                  [locals()[regex + '_re'] for regex in _token_regexs] + [word_re, ellipsis_re, other_re])))
90 |     del regex  # otherwise stays in class namespace
91 |     repeating_re = re.compile(r"([a-zA-Z])\1\1+")
92 |     doublequotes = ((u'“',u'”'),(u'"',u'"'),(u'‘',u'’'),(u'＂',u'＂'))
93 |     punctuation = (u'!$%()*+,-/:;<=>?[\\]^_.`{|}~\'' + u''.join(c for t in doublequotes for c in t))
94 |     quotes_re = re.compile(ur"|".join(ur'({}.*?{})'.format(f,s) for f,s in doublequotes) + ur'|\s(\'.*?\')\s')
95 |     del doublequotes
96 | 
97 |     def __init__(self, **kwargs):
98 |         """
99 |         Constructs a new Tokenizer. Can specify custom settings for various
100 |         feature normalizations.
101 | 
102 |         Any features with replacement tokens can be removed from the message by
103 |         setting the token to the empty string (C{""}), C{"DELETE"}, or
104 |         C{"REMOVE"}.
105 | 
106 |         @type lowercase: C{bool}
107 |         @param lowercase: If C{True}, lowercases words, excluding those with
108 |             all letters capitalized.
109 | 
110 |         @type allcapskeep: C{bool}
111 |         @param allcapskeep: If C{True}, maintains capitalization for words with
112 |             all letters in capitals. Otherwise, capitalization for such words
113 |             is dependent on C{lowercase}.
114 | 
115 |         @type normalize: C{int}
116 |         @param normalize: The number of repeating letters when normalizing
117 |             arbitrary letter elongations.
118 | 
119 |             Example::
120 |                 Heyyyyyy i lovvvvvvve youuuuuuuuu <3
121 | 
122 |             Becomes::
123 |                 Heyyy i lovvve youuu <3
124 | 
125 |             Not sure why you would want to change this (maybe just for fun?? :P)
126 | 
127 |         @param usernames: Serves as the replacement token for anything that
128 |             parses as a Twitter username, ie. C{@rayj}. Setting this to
129 |             C{False} means no usernames will be changed.
130 | 
131 |         @param urls: Serves as the replacement token for anything that
132 |             parses as a URL, ie. C{bit.ly} or C{http://example.com}. Setting
133 |             this to C{False} means no URLs will be changed.
134 | 
135 |         @param hashtags: Serves as the replacement token for anything that
136 |             parses as a Twitter hashtag, ie. C{#ihititfirst} or
137 |             C{#onedirection}. Setting this to C{False} means no hashtags will
138 |             be changed.
139 | 
140 |         @param phonenumbers: Replacement token for phone numbers.
141 | 
142 |         @param times: Replacement token for times.
143 | 
144 |         @param numbers: Replacement token for any other kinds of numbers.
145 | 
146 |         @type ignorequotes: C{bool}
147 |         @param ignorequotes: If C{True}, will remove various types of quotes
148 |             and the contents within.
149 | 
150 |         @type ignorestopwords: C{bool}
151 |         @param ignorestopwords: If C{True}, will remove any stopwords. The
152 |             default set includes 'I', 'me', 'itself', 'against', 'should', etc.
153 |         """
154 |         for keyword in self._default_args:
155 |             setattr(self, keyword, kwargs.get(keyword, self._default_args[keyword]))
156 |         self.emoticons(filename=self._lexicons.format('emoticons'))
157 |         self.stopwords(filename=self._lexicons.format('stopwords'))
158 | 
159 |     def __call__(self, iterable):
160 |         """
161 |         Iterator for the tokenization of given messages.
162 | 
163 |         @rtype: C{list} of C{str}
164 |         @return: Iterator of lists representing message tokenizations.
165 | 
166 |         @param iterable: Object capable of iteration, providing strings for
167 |             tokenization.
168 |         """
169 |         for msg in iterable:
170 |             yield self.tokenize(msg)
171 | 
172 |     def update(self, **kwargs):
173 |         """
174 |         Adjust any settings of the Tokenizer.
175 | 
176 |         >>> gettokens = Tokenizer()
177 |         >>> gettokens.lowercase
178 |         True
179 |         >>> gettokens.phonenumbers
180 |         'PHONENUMBER'
181 |         >>> gettokens.update(phonenumbers='NUMBER', lowercase=False)
182 |         >>> gettokens.lowercase
183 |         False
184 |         >>> gettokens.phonenumbers
185 |         'NUMBER'
186 |         """
187 |         for keyword in self._default_args:
188 |             if keyword in kwargs:
189 |                 setattr(self, keyword, kwargs[keyword])
190 | 
191 |     def _replacetokens(self, msg):
192 |         tokens = []
193 |         deletion_tokens = {'', 'REMOVE', 'remove', 'DELETE', 'delete'}
194 |         for word in msg:
195 |             matching = self.word_re.match(word)  # 1st check if normal word
196 |             if matching and len(matching.group(0)) == len(word):
197 |                 tokens.append(self._cleanword(word))
198 |                 continue  # don't check rest of conditions
199 |             for token in self._token_regexs:  # id & possibly replace tokens
200 |                 regex = getattr(self, token + '_re')
201 |                 replacement_token = getattr(self, token)
202 |                 if regex.match(word):
203 |                     if replacement_token:  # decide if we change it
204 |                         word = _unicode(str(replacement_token))
205 |                     if replacement_token not in deletion_tokens:
206 |                         tokens.append(word)
207 |                     break
208 |             else:  # we didn't find a match for any token so far...
209 |                 if self.ellipsis_re.match(word):
210 |                     tokens.append(u"...")
211 |                 else:  # split into tokens based on emoticons or punctuation
212 |                     tokens.extend(self._separate_emoticons_punctuation(word))
213 |         return tokens
214 | 
215 |     def _separate_emoticons_punctuation(self, word):
216 |         newwords, wordbefore = [], []
217 |         i = 0
218 |         def possibly_append_and_reset():
219 |             if wordbefore:
220 |                 newwords.append(self._cleanword(''.join(wordbefore)))
221 |                 wordbefore[:] = []
222 |         while i < len(word):
223 |             # greedily check for emoticons in this word
224 |             for l in range(self._maxlenemo, 0, -1):
225 |                 if word[i:i+l] in self._emoticons or _isemoji(word[i:i+l]):
226 |                     possibly_append_and_reset()
227 |                     newwords.append(word[i:i+l])
228 |                     i += l
229 |                     break
230 |             else:  # it's safe to break up any punctuation not part of emoticons
231 |                 if word[i] in self.punctuation:
232 |                     possibly_append_and_reset()
233 |                     newwords.append(word[i])
234 |                 else:
235 |                     wordbefore.append(word[i])
236 |                 i += 1
237 |         # possible ending of word which wasn't emoticon or punctuation
238 |         possibly_append_and_reset()
239 |         return newwords
240 | 
241 |     def _cleanword(self, word):
242 |         if self.normalize:  # replace characters with >=3 alphabetic repeating
243 |             word = self.repeating_re.sub(r"\1"*self.normalize, word)
244 |         if self.lowercase and (not self.allcapskeep or not word.isupper()):
245 |             return word.lower()
246 |         return word
247 | 
248 |     def tokenize(self, message):
249 |         """
250 |         Tokenize the given string into a list of strings representing the
251 |         constituent words of the message.
252 | 
253 |         @rtype: C{list} of C{str}
254 |         @return: The tokenization of the message.
255 | 
256 |         @type message: C{str}
257 |         @param message: The string representation of the message.
258 | """ 259 | if not isinstance(message, basestring): 260 | raise TypeError('cannot tokenize non-string, {}'.format(repr(type(message).__name__))) 261 | message = _converthtmlentities(_unicode(message)) 262 | if self.ignorequotes: 263 | message = self.quotes_re.sub(" ", message) 264 | message = self._replacetokens(self.tokenize_re.findall(message)) 265 | if self.ignorestopwords: 266 | message = [word for word in message if word not in self._stopwords] 267 | return message 268 | 269 | def emoticons(self, iterable=None, filename=None): 270 | """ 271 | Consumes an iterable of emoticons that the tokenizer will tokenize on. 272 | Allows for user-specified set of emoticons to be recognized. 273 | 274 | @param iterable: Object capable of iteration, providing emoticon 275 | strings. 276 | @type filename: C{str} 277 | @param filename: Path to the file containing emoticons delimited by 278 | new lines. Strips trailing whitespace and skips blank lines. 279 | """ 280 | self._emoticons = self._collectset(iterable, filename) 281 | self._maxlenemo = max(len(max(self._emoticons, key=lambda x: len(x))), 282 | len(u'\U0001f1e8\U0001f1f3'), len(u'\U0001f48b')) 283 | 284 | def stopwords(self, iterable=None, filename=None): 285 | """ 286 | Consumes an iterable of stopwords that the tokenizer will ignore if the 287 | stopwords setting is C{True}. The default set is taken from NLTK's 288 | english list. 289 | 290 | @param iterable: Object capable of iteration, providing stopword 291 | strings. 292 | @type filename: C{str} 293 | @param filename: Path to the file containing stopwords delimited by 294 | new lines. Strips trailing whitespace and skips blank lines. 295 | """ 296 | self._stopwords = self._collectset(iterable, filename) 297 | 298 | @staticmethod 299 | def _collectset(iterable, filename): 300 | if filename: 301 | with open(filename, "r") as f: 302 | iterable = set(l.rstrip() for l in f) 303 | iterable.discard('') 304 | return set(imap(_unicode, iterable)) 305 | --------------------------------------------------------------------------------