├── .gitignore
├── CHANGES
├── LICENSE
├── README.md
├── documentation
│   ├── api-objects.txt
│   ├── class-tree.html
│   ├── crarr.png
│   ├── epydoc.css
│   ├── epydoc.js
│   ├── frames.html
│   ├── help.html
│   ├── identifier-index.html
│   ├── index.html
│   ├── module-tree.html
│   ├── redirect.html
│   ├── toc-everything.html
│   ├── toc-tweetokenize-module.html
│   ├── toc.html
│   ├── tweetokenize-module.html
│   ├── tweetokenize-pysrc.html
│   ├── tweetokenize.Tokenizer-class.html
│   └── tweetokenize.Tokenizer.TokenizerException-class.html
├── setup.py
├── tests
│   ├── __main__.py
│   └── test_tweetokenize.py
└── tweetokenize
    ├── __init__.py
    ├── lexicons
    │   ├── domains.txt
    │   ├── emoticons.txt
    │   └── stopwords.txt
    └── tokenizer.py
/.gitignore:
--------------------------------------------------------------------------------
1 | .DS_Store
2 | *.pyc
3 | .gitignore
4 | build
5 | bench
6 |
--------------------------------------------------------------------------------
/CHANGES:
--------------------------------------------------------------------------------
1 | Changes
2 | =======
3 |
4 | 1.0.1 (2013-08-15)
5 | ------------------
6 |
7 | - Module docstring
8 | - Changes to `setup.py`
9 | - Refactored: gained ~15% speedup in tokenization
10 |
11 |
12 | 1.0.0 (2013-05-11 - 2013-06-25)
13 | -------------------------------
14 |
15 | - First version
16 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Copyright (c) 2013, Jared Suttles.
2 | All rights reserved.
3 |
4 | Redistribution and use in source and binary forms, with or without modification,
5 | are permitted provided that the following conditions are met:
6 |
7 | 1. Redistributions of source code must retain the above copyright notice,
8 | this list of conditions and the following disclaimer.
9 |
10 | 2. Redistributions in binary form must reproduce the above copyright
11 | notice, this list of conditions and the following disclaimer in the
12 | documentation and/or other materials provided with the distribution.
13 |
14 | 3. Neither the name of tweetokenize nor the names of its contributors may be
15 | used to endorse or promote products derived from this software without
16 | specific prior written permission.
17 |
18 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
19 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
20 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
21 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
22 | ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
23 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
24 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
25 | ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
27 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | tweetokenize
2 | ============
3 |
4 | Regular-expression-based tokenizer for Twitter. Focused on tokenization
5 | and pre-processing to train classifiers for sentiment, emotion, or mood.
6 |
7 | Intended as glue between Python wrappers for the Twitter API and the
8 | machine learning algorithms of the Natural Language Toolkit (NLTK), but
9 | probably applicable to tokenizing any short messages of the social
10 | networking variety.
11 |
12 | ```python
13 | from tweetokenize import Tokenizer
14 | gettokens = Tokenizer()
15 | gettokens.tokenize('hey playa!:):3.....@SHAQ can you still dunk?#old🍕🍔😵LOL')
16 | [u'hey', u'playa', u'!', u':)', u':3', u'...', u'USERNAME', u'can', u'you', u'still', u'dunk', u'?', u'#old', u'🍕', u'🍔', u'😵', u'LOL']
17 | ```
18 |
19 | Features
20 | --------
21 |
22 | * Can easily replace tweet features like usernames, URLs, phone numbers,
23 |   times, etc. with placeholder tokens, reducing feature-set complexity
24 |   and improving classifier performance (see the example below)
25 | * Allows user-defined sets of emoticons to be used in tokenization
26 | * Correctly separates emoji, written consecutively, into individual tokens
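
For example, replacement tokens can be customized through the `Tokenizer`
constructor (a minimal sketch; the token values here are choices, not
defaults, and the output comment is illustrative):

```python
from tweetokenize import Tokenizer

# Setting a token to "" (empty string), "DELETE", or "REMOVE" strips
# that feature from the message entirely.
gettokens = Tokenizer(usernames='USER', urls='URL', phonenumbers='PHONE')
gettokens.tokenize('@SHAQ hit me at 555-123-4567 or http://example.com')
# [u'USER', u'hit', u'me', u'at', u'PHONE', u'or', u'URL']
# User-defined emoticon sets are supported as well; see Tokenizer.emoticons
# in the documentation.
```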
27 |
28 | Installation
29 | ------------
30 |
31 | python setup.py install
32 |
33 | After installation, you can make sure everything is working by running the following inside the project root folder:
34 |
35 | python tests
36 |
37 | Documentation
38 | -------------
39 |
40 | http://htmlpreview.github.io/?https://raw.github.com/jaredks/tweetokenize/master/documentation/tweetokenize.Tokenizer-class.html
41 |
42 | License
43 | -------
44 |
45 | "Modified BSD License". See LICENSE for details. Copyright Jared Suttles, 2013.
46 |
--------------------------------------------------------------------------------
/documentation/api-objects.txt:
--------------------------------------------------------------------------------
1 | tweetokenize tweetokenize-module.html
2 | tweetokenize.__package__ tweetokenize-module.html#__package__
3 | tweetokenize.Tokenizer tweetokenize.Tokenizer-class.html
4 | tweetokenize.Tokenizer.repeating_re tweetokenize.Tokenizer-class.html#repeating_re
5 | tweetokenize.Tokenizer._cleanword tweetokenize.Tokenizer-class.html#_cleanword
6 | tweetokenize.Tokenizer.phonenumbers_re tweetokenize.Tokenizer-class.html#phonenumbers_re
7 | tweetokenize.Tokenizer._unicode tweetokenize.Tokenizer-class.html#_unicode
8 | tweetokenize.Tokenizer.usernames_re tweetokenize.Tokenizer-class.html#usernames_re
9 | tweetokenize.Tokenizer._replacetokens tweetokenize.Tokenizer-class.html#_replacetokens
10 | tweetokenize.Tokenizer.quotes_re tweetokenize.Tokenizer-class.html#quotes_re
11 | tweetokenize.Tokenizer._topleveldomains tweetokenize.Tokenizer-class.html#_topleveldomains
12 | tweetokenize.Tokenizer.__init__ tweetokenize.Tokenizer-class.html#__init__
13 | tweetokenize.Tokenizer.emoticons tweetokenize.Tokenizer-class.html#emoticons
14 | tweetokenize.Tokenizer.TokenizerException tweetokenize.Tokenizer.TokenizerException-class.html
15 | tweetokenize.Tokenizer.punctuation tweetokenize.Tokenizer-class.html#punctuation
16 | tweetokenize.Tokenizer._collectset tweetokenize.Tokenizer-class.html#_collectset
17 | tweetokenize.Tokenizer._isemoji tweetokenize.Tokenizer-class.html#_isemoji
18 | tweetokenize.Tokenizer.numbers_re tweetokenize.Tokenizer-class.html#numbers_re
19 | tweetokenize.Tokenizer.times_re tweetokenize.Tokenizer-class.html#times_re
20 | tweetokenize.Tokenizer.tokenize tweetokenize.Tokenizer-class.html#tokenize
21 | tweetokenize.Tokenizer.__call__ tweetokenize.Tokenizer-class.html#__call__
22 | tweetokenize.Tokenizer._converthtmlentities tweetokenize.Tokenizer-class.html#_converthtmlentities
23 | tweetokenize.Tokenizer._number tweetokenize.Tokenizer-class.html#_number
24 | tweetokenize.Tokenizer._separate_emoticons_punctuation tweetokenize.Tokenizer-class.html#_separate_emoticons_punctuation
25 | tweetokenize.Tokenizer.update tweetokenize.Tokenizer-class.html#update
26 | tweetokenize.Tokenizer.word_re tweetokenize.Tokenizer-class.html#word_re
27 | tweetokenize.Tokenizer.tokenize_re tweetokenize.Tokenizer-class.html#tokenize_re
28 | tweetokenize.Tokenizer.html_entities tweetokenize.Tokenizer-class.html#html_entities
29 | tweetokenize.Tokenizer.urls_re tweetokenize.Tokenizer-class.html#urls_re
30 | tweetokenize.Tokenizer._doublequotes tweetokenize.Tokenizer-class.html#_doublequotes
31 | tweetokenize.Tokenizer._token_regexs tweetokenize.Tokenizer-class.html#_token_regexs
32 | tweetokenize.Tokenizer.other_re tweetokenize.Tokenizer-class.html#other_re
33 | tweetokenize.Tokenizer.ellipsis_re tweetokenize.Tokenizer-class.html#ellipsis_re
34 | tweetokenize.Tokenizer.stopwords tweetokenize.Tokenizer-class.html#stopwords
35 | tweetokenize.Tokenizer.html_entities_re tweetokenize.Tokenizer-class.html#html_entities_re
36 | tweetokenize.Tokenizer.hashtags_re tweetokenize.Tokenizer-class.html#hashtags_re
37 | tweetokenize.Tokenizer.__default_args tweetokenize.Tokenizer-class.html#__default_args
38 | tweetokenize.Tokenizer.TokenizerException tweetokenize.Tokenizer.TokenizerException-class.html
39 |
--------------------------------------------------------------------------------
/documentation/class-tree.html:
--------------------------------------------------------------------------------
[Epydoc-generated class hierarchy page. One class is documented:]

tweetokenize.Tokenizer
    Can be used to tokenize a string representation of a message,
    adjusting features based on the given configuration details, to
    enable further processing in feature extraction and training stages.
--------------------------------------------------------------------------------
/documentation/help.html:
--------------------------------------------------------------------------------
[Standard epydoc help page. It describes the layout of the generated API
documentation: the per-package, per-module, and per-class pages; the module
and class trees; the term and identifier indices; the frames-based table of
contents; the navigation bar labels; the "show private"/"hide private"
toggle; and the timestamp below the bottom navigation bar.]
--------------------------------------------------------------------------------
/documentation/identifier-index.html:
--------------------------------------------------------------------------------
[Epydoc identifier index: an A-Z listing linking each package, module,
class, method, function, and variable name to its documentation.]
--------------------------------------------------------------------------------
/documentation/redirect.html:
--------------------------------------------------------------------------------
When JavaScript is enabled, this page redirects URLs of the form
redirect.html#dotted.name to the documentation for the object with the
given fully-qualified dotted name.
--------------------------------------------------------------------------------
/documentation/tweetokenize-module.html:
--------------------------------------------------------------------------------
Module tweetokenize

Tokenization and pre-processing for social media data used to train
classifiers. Focused on classification of sentiment, emotion, or mood.

Intended as glue between Python wrappers for the Twitter API and the
Natural Language Toolkit (NLTK), but probably applicable to tokenizing
any short messages of the social networking variety.

In many cases, reducing feature-set complexity can increase the
performance of classifiers trained to detect sentiment. The available
settings are based on commonly modified and normalized features in
classification research using content from Twitter.

Classes:
    Tokenizer - Can be used to tokenize a string representation of a
        message, adjusting features based on the given configuration
        details, to enable further processing in feature extraction and
        training stages.
--------------------------------------------------------------------------------
/documentation/tweetokenize.Tokenizer-class.html:
--------------------------------------------------------------------------------
Class Tokenizer

Can be used to tokenize a string representation of a message, adjusting
features based on the given configuration details, to enable further
processing in feature extraction and training stages.

An example usage:

>>> from tweetokenize import Tokenizer
>>> gettokens = Tokenizer(usernames='USER', urls='')
>>> gettokens.tokenize('@justinbeiber yo man!love you#inlove#wantyou in a totally straight way #brotime <3:p:D www.justinbeiber.com')
[u'USER', u'yo', u'man', u'!', u'love', u'you', u'#inlove', u'#wantyou', u'in', u'a', u'totally', u'straight', u'way', u'#brotime', u'<3', u':p', u':D']
Method summary:

    stopwords(self, iterable=None, filename=None)
        Consumes an iterable of stopwords that the tokenizer will ignore
        if the stopwords setting is True.
__init__(self, ...)

    Constructs a new Tokenizer. Custom settings can be given for the
    various feature normalizations.

    Any feature with a replacement token can be removed from the message
    by setting the token to the empty string (""), "DELETE", or "REMOVE".

    Parameters:
        lowercase (bool) - If True, lowercases words, excluding those
            with all letters capitalized.
        allcapskeep (bool) - If True, maintains capitalization for words
            with all letters in capitals. Otherwise, capitalization for
            such words depends on lowercase.
        normalize (int) - The number of repeating letters kept when
            normalizing arbitrary letter elongations.

            Example:
                Heyyyyyy i lovvvvvvve youuuuuuuuu <3
            becomes:
                Heyyy i lovvve youuu <3

            Not sure why you would want to change this (maybe just for
            fun?? :P)
        usernames - Replacement token for anything that parses as a
            Twitter username, e.g. @rayj. Setting this to False means
            usernames are left unchanged.
        urls - Replacement token for anything that parses as a URL,
            e.g. bit.ly or http://example.com. Setting this to False
            means URLs are left unchanged.
        hashtags - Replacement token for anything that parses as a
            Twitter hashtag, e.g. #ihititfirst or #onedirection. Setting
            this to False means hashtags are left unchanged.
        phonenumbers - Replacement token for phone numbers.
        times - Replacement token for times.
        numbers - Replacement token for any other kinds of numbers.
        ignorequotes (bool) - If True, removes various types of quotes
            and the contents within.
        ignorestopwords (bool) - If True, removes any stopwords. The
            default set includes 'I', 'me', 'itself', 'against',
            'should', etc.
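
For illustration, a doctest-style sketch combining a few of these settings;
the replacement token values are choices, and the output line is an
assumption based on the parameter descriptions above, not taken from the
source:

>>> from tweetokenize import Tokenizer
>>> gettokens = Tokenizer(phonenumbers='PHONE', times='TIME', normalize=2)
>>> gettokens.tokenize('call me at 555-123-4567 around 9:30pm heyyyyyy')
[u'call', u'me', u'at', u'PHONE', u'around', u'TIME', u'heyy']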
stopwords(self, iterable=None, filename=None)

    Consumes an iterable of stopwords that the tokenizer will ignore if
    the stopwords setting is True. The default set is taken from NLTK's
    English list.

    Parameters:
        iterable - Object capable of iteration, providing stopword
            strings.
        filename (str) - Path to a file containing stopwords delimited
            by newlines. Strips trailing whitespace and skips blank
            lines.
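
A short usage sketch for this method; whether the supplied words replace or
extend the default NLTK set is an assumption here, and the output shown is
illustrative:

>>> gettokens = Tokenizer(ignorestopwords=True)
>>> gettokens.stopwords(['the', 'a', 'an'])  # or stopwords(filename='path/to/stopwords.txt')
>>> gettokens.tokenize('the cat sat on a mat')
[u'cat', u'sat', u'on', u'mat']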