├── .gitignore
├── documentation
│   ├── crarr.png
│   ├── frames.html
│   ├── index.html
│   ├── toc-tweetokenize-module.html
│   ├── redirect.html
│   ├── toc.html
│   ├── toc-everything.html
│   ├── api-objects.txt
│   ├── module-tree.html
│   ├── class-tree.html
│   ├── tweetokenize-module.html
│   ├── tweetokenize.Tokenizer.TokenizerException-class.html
│   ├── help.html
│   ├── epydoc.js
│   ├── identifier-index.html
│   ├── epydoc.css
│   └── tweetokenize.Tokenizer-class.html
├── tests
│   ├── __main__.py
│   └── test_tweetokenize.py
├── CHANGES
├── tweetokenize
│   ├── lexicons
│   │   ├── emoticons.txt
│   │   ├── stopwords.txt
│   │   └── domains.txt
│   ├── __init__.py
│   └── tokenizer.py
├── setup.py
├── LICENSE
└── README.md
/.gitignore:
--------------------------------------------------------------------------------
1 | .DS_Store
2 | *.pyc
3 | .gitignore
4 | build
5 | bench
6 |
--------------------------------------------------------------------------------
/documentation/crarr.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jaredks/tweetokenize/HEAD/documentation/crarr.png
--------------------------------------------------------------------------------
/tests/__main__.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | import unittest
3 | from test_tweetokenize import TokenizeTests
4 |
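# Build the suite explicitly so the tests can be run as "python tests" from the repository root.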
5 | suite = unittest.TestSuite()
6 | suite.addTest(unittest.TestLoader().loadTestsFromTestCase(TokenizeTests))
7 |
8 | unittest.TextTestRunner().run(suite)
9 |
--------------------------------------------------------------------------------
/CHANGES:
--------------------------------------------------------------------------------
1 | Changes
2 | =======
3 |
4 | 1.0.1 (2013-08-15)
5 | ------------------
6 |
7 | - Module docstring
8 | - Changes to `setup.py`
9 | - Refactored: gained ~15% speed up for tokenization
10 |
11 |
12 | 1.0.0 (2013-05-11 - 2013-06-25)
13 | -------------------------------
14 |
15 | - First version
16 |
--------------------------------------------------------------------------------
/tweetokenize/lexicons/emoticons.txt:
--------------------------------------------------------------------------------
1 | :@
2 | :-@
3 | >:o
4 | >:0
5 | D:<
6 | D:
7 | D8
8 | D;
9 | D=
10 | Dx
11 | >.<
12 | >_<
13 | d:<
14 | d:
15 | d8
16 | d;
17 | d=
18 | dx
19 | v.v
20 | :/
21 | :\
22 | =/
23 | =\
24 | >:/
25 | >:\
26 | :-/
27 | :-\
28 | :)
29 | (:
30 | ;)
31 | ;(
32 | (;
33 | );
34 | :-)
35 | :3
36 | :d
37 | :D
38 | xd
39 | :')
40 | ^_^
41 | ^.^
42 | :]
43 | :}
44 | :p
45 | :b
46 | =p
47 | =b
48 | :-p
49 | :-b
50 | =)
51 | :(
52 | ):
53 | :'(
54 | :c
55 | :-(
56 | </3
57 | :[
58 | :{
59 | T.T
60 | o_o
61 | O_O
62 | 0_o
63 | o_0
64 | 0_O
65 | O_0
66 | o.o
67 | O.O
68 | 0.o
69 | o.0
70 | :o
71 | :-o
72 | <3
73 | :p
74 | :b
75 | =p
76 | =b
77 | :-p
78 | :-b
79 | :$
80 |
--------------------------------------------------------------------------------
/documentation/frames.html:
--------------------------------------------------------------------------------
[epydoc-generated page titled "API Documentation"; the HTML markup was stripped in this dump and the page carried no documentation text of its own (in epydoc output this file defines the two-frame layout).]
--------------------------------------------------------------------------------
/documentation/index.html:
--------------------------------------------------------------------------------
[epydoc-generated top-level page titled "API Documentation"; same stripped remnant as frames.html, with no documentation text of its own.]
--------------------------------------------------------------------------------
/tweetokenize/__init__.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | #
4 | # tweetokenize: Regular expression based tokenizer for Twitter
5 | # Copyright: (c) 2013, Jared Suttles. All rights reserved.
6 | # License: BSD, see LICENSE for details.
7 | # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
8 |
9 | """
10 | Tokenization and pre-processing for social media data used to train classifiers.
11 | Focused on classification of sentiment, emotion, or mood.
12 |
13 | Intended as glue between Python wrappers for the Twitter API and machine
14 | learning algorithms of the Natural Language Toolkit (NLTK), but probably
15 | applicable to tokenizing any short messages of the social networking variety.
16 |
17 | In many cases, reducing feature-set complexity can increase performance of
18 | classifiers trained for detecting sentiment. The available settings are based
19 | on commonly modified and normalized features in classification research using
20 | content from Twitter.
21 | """
22 |
23 | __title__ = 'tweetokenize'
24 | __version__ = '1.0.1'
25 | __author__ = 'Jared Suttles'
26 | __license__ = 'Modified BSD'
27 | __copyright__ = 'Copyright 2013 Jared Suttles'
28 |
29 | from .tokenizer import Tokenizer
30 |
--------------------------------------------------------------------------------
/documentation/toc-tweetokenize-module.html:
--------------------------------------------------------------------------------
[epydoc-generated table-of-contents frame; markup stripped in this dump. Recoverable entries: Module tweetokenize — Classes: Tokenizer; Variables: __package__.]
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | from distutils.core import setup
3 | import tweetokenize
4 |
5 | setup(
6 | name='tweetokenize',
7 | version=tweetokenize.__version__,
8 | description='Regular expression based tokenizer for Twitter',
9 | author='Jared Suttles',
10 | url='https://github.com/jaredks/tweetokenize',
11 | packages=['tweetokenize'],
12 | package_data={'': ['LICENSE'], 'tweetokenize': ['lexicons/*.txt']},
13 | long_description=open('README.md').read() + '\n\n' + open('CHANGES').read(),
14 | license='BSD License',
15 | classifiers=[
16 | 'Development Status :: 5 - Production/Stable',
17 | 'Intended Audience :: Developers',
18 | 'Intended Audience :: Education',
19 | 'Intended Audience :: Science/Research',
20 | 'License :: OSI Approved :: BSD License',
21 | 'Operating System :: MacOS :: MacOS X',
22 | 'Operating System :: Microsoft :: Windows',
23 | 'Operating System :: POSIX',
24 | 'Programming Language :: Python',
25 | 'Topic :: Scientific/Engineering :: Information Analysis',
26 | 'Topic :: Software Development :: Libraries :: Python Modules',
27 | 'Topic :: Text Processing :: Linguistic',
28 | ],
29 | )
30 |
--------------------------------------------------------------------------------
/documentation/redirect.html:
--------------------------------------------------------------------------------
[epydoc-generated auto-redirect page; markup stripped in this dump. Recoverable text: "When javascript is enabled, this page will redirect URLs of the form redirect.html#dotted.name to the documentation for the object with the given fully-qualified dotted name."]
--------------------------------------------------------------------------------
/documentation/toc.html:
--------------------------------------------------------------------------------
[epydoc-generated "Table of Contents" frame; markup stripped in this dump. Recoverable entries: Everything; Modules: tweetokenize.]
--------------------------------------------------------------------------------
/tweetokenize/lexicons/stopwords.txt:
--------------------------------------------------------------------------------
1 | i
2 | me
3 | my
4 | myself
5 | we
6 | our
7 | ours
8 | ourselves
9 | you
10 | your
11 | yours
12 | yourself
13 | yourselves
14 | he
15 | him
16 | his
17 | himself
18 | she
19 | her
20 | hers
21 | herself
22 | it
23 | its
24 | itself
25 | they
26 | them
27 | their
28 | theirs
29 | themselves
30 | what
31 | which
32 | who
33 | whom
34 | this
35 | that
36 | these
37 | those
38 | am
39 | is
40 | are
41 | was
42 | were
43 | be
44 | been
45 | being
46 | have
47 | has
48 | had
49 | having
50 | do
51 | does
52 | did
53 | doing
54 | a
55 | an
56 | the
57 | and
58 | but
59 | if
60 | or
61 | because
62 | as
63 | until
64 | while
65 | of
66 | at
67 | by
68 | for
69 | with
70 | about
71 | against
72 | between
73 | into
74 | through
75 | during
76 | before
77 | after
78 | above
79 | below
80 | to
81 | from
82 | up
83 | down
84 | in
85 | out
86 | on
87 | off
88 | over
89 | under
90 | again
91 | further
92 | then
93 | once
94 | here
95 | there
96 | when
97 | where
98 | why
99 | how
100 | all
101 | any
102 | both
103 | each
104 | few
105 | more
106 | most
107 | other
108 | some
109 | such
110 | no
111 | nor
112 | not
113 | only
114 | own
115 | same
116 | so
117 | than
118 | too
119 | very
120 | s
121 | t
122 | can
123 | will
124 | just
125 | don
126 | should
127 | now
128 |
--------------------------------------------------------------------------------
/documentation/toc-everything.html:
--------------------------------------------------------------------------------
[epydoc-generated "Everything" frame; markup stripped in this dump. Recoverable entries: All Classes: tweetokenize.Tokenizer, tweetokenize.Tokenizer.TokenizerException; All Variables: tweetokenize.__package__.]
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Copyright (c) 2013, Jared Suttles.
2 | All rights reserved.
3 |
4 | Redistribution and use in source and binary forms, with or without modification,
5 | are permitted provided that the following conditions are met:
6 |
7 | 1. Redistributions of source code must retain the above copyright notice,
8 | this list of conditions and the following disclaimer.
9 |
10 | 2. Redistributions in binary form must reproduce the above copyright
11 | notice, this list of conditions and the following disclaimer in the
12 | documentation and/or other materials provided with the distribution.
13 |
14 | 3. Neither the name of tweetokenize nor the names of its contributors may be
15 | used to endorse or promote products derived from this software without
16 | specific prior written permission.
17 |
18 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
19 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
20 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
21 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
22 | ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
23 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
24 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
25 | ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
27 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | tweetokenize
2 | ============
3 |
4 | Regular expression based tokenizer for Twitter. Focused on tokenization
5 | and pre-processing to train classifiers for sentiment, emotion, or mood.
6 |
7 | Intended as glue between Python wrappers for the Twitter API and machine
8 | learning algorithms of the Natural Language Toolkit (NLTK), but probably
9 | applicable to tokenizing any short messages of the social networking
10 | variety.
11 |
12 | ```python
13 | from tweetokenize import Tokenizer
14 | gettokens = Tokenizer()
15 | gettokens.tokenize('hey playa!:):3.....@SHAQ can you still dunk?#old🍕🍔😵LOL')
16 | [u'hey', u'playa', u'!', u':)', u':3', u'...', u'USERNAME', u'can', u'you', u'still', u'dunk', u'?', u'#old', u'🍕', u'🍔', u'😵', u'LOL']
17 | ```
18 |
19 | Features
20 | --------
21 |
22 | * Can easily replace tweet features like usernames, urls, phone numbers, times,
23 | etc. with tokens in order to reduce feature set complexity and improve
24 | performance of classifiers
25 | * Allows user-defined sets of emoticons to be used in tokenization
26 | * Correctly separates emoji, written consecutively, into individual tokens
27 |
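For example, replacement tokens and the recognized emoticon set can be
customized per instance. A minimal sketch (the token values and emoticons here
are arbitrary choices for illustration, not library defaults):

```python
from tweetokenize import Tokenizer

gettokens = Tokenizer(usernames='USER', urls='', ignorequotes=True)
gettokens.emoticons(iterable=[':)', ':(', '<3'])  # swap in a custom emoticon set
gettokens.update(numbers=False)  # settings can also be changed after construction
gettokens.tokenize('@rayj check out bit.ly/rofl "have you seen this" <3')
[u'USER', u'check', u'out', u'<3']
```

A Tokenizer instance is also callable on an iterable of messages, yielding one
token list per message.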
28 | Installation
29 | ------------
30 |
31 | python setup.py install
32 |
33 | After installation, you can make sure everything is working by running the following from the project root folder:
34 |
35 | python tests
36 |
37 | Documentation
38 | -------------
39 |
40 | http://htmlpreview.github.io/?https://raw.github.com/jaredks/tweetokenize/master/documentation/tweetokenize.Tokenizer-class.html
41 |
42 | License
43 | -------
44 |
45 | "Modified BSD License". See LICENSE for details. Copyright Jared Suttles, 2013.
46 |
--------------------------------------------------------------------------------
/tweetokenize/lexicons/domains.txt:
--------------------------------------------------------------------------------
1 | museum
2 | travel
3 | aero
4 | arpa
5 | asia
6 | coop
7 | info
8 | jobs
9 | mobi
10 | name
11 | post
12 | biz
13 | cat
14 | com
15 | edu
16 | gov
17 | int
18 | mil
19 | net
20 | org
21 | pro
22 | tel
23 | xxx
24 | ac
25 | ad
26 | ae
27 | af
28 | ag
29 | ai
30 | al
31 | am
32 | an
33 | ao
34 | aq
35 | ar
36 | as
37 | at
38 | au
39 | aw
40 | ax
41 | az
42 | ba
43 | bb
44 | bd
45 | be
46 | bf
47 | bg
48 | bh
49 | bi
50 | bj
51 | bm
52 | bn
53 | bo
54 | br
55 | bs
56 | bt
57 | bv
58 | bw
59 | by
60 | bz
61 | ca
62 | cc
63 | cd
64 | cf
65 | cg
66 | ch
67 | ci
68 | ck
69 | cl
70 | cm
71 | cn
72 | co
73 | cr
74 | cu
75 | cv
76 | cw
77 | cx
78 | cy
79 | cz
80 | de
81 | dj
82 | dk
83 | dm
84 | do
85 | dz
86 | ec
87 | ee
88 | eg
89 | er
90 | es
91 | et
92 | eu
93 | fi
94 | fj
95 | fk
96 | fm
97 | fo
98 | fr
99 | ga
100 | gb
101 | gd
102 | ge
103 | gf
104 | gg
105 | gh
106 | gi
107 | gl
108 | gm
109 | gn
110 | gp
111 | gq
112 | gr
113 | gs
114 | gt
115 | gu
116 | gw
117 | gy
118 | hk
119 | hm
120 | hn
121 | hr
122 | ht
123 | hu
124 | id
125 | ie
126 | il
127 | im
128 | in
129 | io
130 | iq
131 | ir
132 | is
133 | it
134 | je
135 | jm
136 | jo
137 | jp
138 | ke
139 | kg
140 | kh
141 | ki
142 | km
143 | kn
144 | kp
145 | kr
146 | kw
147 | ky
148 | kz
149 | la
150 | lb
151 | lc
152 | li
153 | lk
154 | lr
155 | ls
156 | lt
157 | lu
158 | lv
159 | ly
160 | ma
161 | mc
162 | md
163 | me
164 | mg
165 | mh
166 | mk
167 | ml
168 | mm
169 | mn
170 | mo
171 | mp
172 | mq
173 | mr
174 | ms
175 | mt
176 | mu
177 | mv
178 | mw
179 | mx
180 | my
181 | mz
182 | na
183 | nc
184 | ne
185 | nf
186 | ng
187 | ni
188 | nl
189 | no
190 | np
191 | nr
192 | nu
193 | nz
194 | om
195 | pa
196 | pe
197 | pf
198 | pg
199 | ph
200 | pk
201 | pl
202 | pm
203 | pn
204 | pr
205 | ps
206 | pt
207 | pw
208 | py
209 | qa
210 | re
211 | ro
212 | rs
213 | ru
214 | rw
215 | sa
216 | sb
217 | sc
218 | sd
219 | se
220 | sg
221 | sh
222 | si
223 | sj
224 | sk
225 | sl
226 | sm
227 | sn
228 | so
229 | sr
230 | st
231 | su
232 | sv
233 | sx
234 | sy
235 | sz
236 | tc
237 | td
238 | tf
239 | tg
240 | th
241 | tj
242 | tk
243 | tl
244 | tm
245 | tn
246 | to
247 | tp
248 | tr
249 | tt
250 | tv
251 | tw
252 | tz
253 | ua
254 | ug
255 | uk
256 | us
257 | uy
258 | uz
259 | va
260 | vc
261 | ve
262 | vg
263 | vi
264 | vn
265 | vu
266 | wf
267 | ws
268 | ye
269 | yt
270 | za
271 | zm
272 | zw
273 |
--------------------------------------------------------------------------------
/documentation/api-objects.txt:
--------------------------------------------------------------------------------
1 | tweetokenize tweetokenize-module.html
2 | tweetokenize.__package__ tweetokenize-module.html#__package__
3 | tweetokenize.Tokenizer tweetokenize.Tokenizer-class.html
4 | tweetokenize.Tokenizer.repeating_re tweetokenize.Tokenizer-class.html#repeating_re
5 | tweetokenize.Tokenizer._cleanword tweetokenize.Tokenizer-class.html#_cleanword
6 | tweetokenize.Tokenizer.phonenumbers_re tweetokenize.Tokenizer-class.html#phonenumbers_re
7 | tweetokenize.Tokenizer._unicode tweetokenize.Tokenizer-class.html#_unicode
8 | tweetokenize.Tokenizer.usernames_re tweetokenize.Tokenizer-class.html#usernames_re
9 | tweetokenize.Tokenizer._replacetokens tweetokenize.Tokenizer-class.html#_replacetokens
10 | tweetokenize.Tokenizer.quotes_re tweetokenize.Tokenizer-class.html#quotes_re
11 | tweetokenize.Tokenizer._topleveldomains tweetokenize.Tokenizer-class.html#_topleveldomains
12 | tweetokenize.Tokenizer.__init__ tweetokenize.Tokenizer-class.html#__init__
13 | tweetokenize.Tokenizer.emoticons tweetokenize.Tokenizer-class.html#emoticons
14 | tweetokenize.Tokenizer.TokenizerException tweetokenize.Tokenizer.TokenizerException-class.html
15 | tweetokenize.Tokenizer.punctuation tweetokenize.Tokenizer-class.html#punctuation
16 | tweetokenize.Tokenizer._collectset tweetokenize.Tokenizer-class.html#_collectset
17 | tweetokenize.Tokenizer._isemoji tweetokenize.Tokenizer-class.html#_isemoji
18 | tweetokenize.Tokenizer.numbers_re tweetokenize.Tokenizer-class.html#numbers_re
19 | tweetokenize.Tokenizer.times_re tweetokenize.Tokenizer-class.html#times_re
20 | tweetokenize.Tokenizer.tokenize tweetokenize.Tokenizer-class.html#tokenize
21 | tweetokenize.Tokenizer.__call__ tweetokenize.Tokenizer-class.html#__call__
22 | tweetokenize.Tokenizer._converthtmlentities tweetokenize.Tokenizer-class.html#_converthtmlentities
23 | tweetokenize.Tokenizer._number tweetokenize.Tokenizer-class.html#_number
24 | tweetokenize.Tokenizer._separate_emoticons_punctuation tweetokenize.Tokenizer-class.html#_separate_emoticons_punctuation
25 | tweetokenize.Tokenizer.update tweetokenize.Tokenizer-class.html#update
26 | tweetokenize.Tokenizer.word_re tweetokenize.Tokenizer-class.html#word_re
27 | tweetokenize.Tokenizer.tokenize_re tweetokenize.Tokenizer-class.html#tokenize_re
28 | tweetokenize.Tokenizer.html_entities tweetokenize.Tokenizer-class.html#html_entities
29 | tweetokenize.Tokenizer.urls_re tweetokenize.Tokenizer-class.html#urls_re
30 | tweetokenize.Tokenizer._doublequotes tweetokenize.Tokenizer-class.html#_doublequotes
31 | tweetokenize.Tokenizer._token_regexs tweetokenize.Tokenizer-class.html#_token_regexs
32 | tweetokenize.Tokenizer.other_re tweetokenize.Tokenizer-class.html#other_re
33 | tweetokenize.Tokenizer.ellipsis_re tweetokenize.Tokenizer-class.html#ellipsis_re
34 | tweetokenize.Tokenizer.stopwords tweetokenize.Tokenizer-class.html#stopwords
35 | tweetokenize.Tokenizer.html_entities_re tweetokenize.Tokenizer-class.html#html_entities_re
36 | tweetokenize.Tokenizer.hashtags_re tweetokenize.Tokenizer-class.html#hashtags_re
37 | tweetokenize.Tokenizer.__default_args tweetokenize.Tokenizer-class.html#__default_args
38 | tweetokenize.Tokenizer.TokenizerException tweetokenize.Tokenizer.TokenizerException-class.html
39 |
--------------------------------------------------------------------------------
/documentation/module-tree.html:
--------------------------------------------------------------------------------
[epydoc-generated "Module Hierarchy" page; markup stripped in this dump. Recoverable entry: tweetokenize — "Tokenization and pre-processing for social media data used to train classifiers."]
--------------------------------------------------------------------------------
/documentation/class-tree.html:
--------------------------------------------------------------------------------
[epydoc-generated "Class Hierarchy" page; markup stripped in this dump. Recoverable entries: object ("The most base type"); exceptions.BaseException ("Common base class for all exceptions"), under which Tokenizer.TokenizerException is listed; tweetokenize.Tokenizer ("Can be used to tokenize a string representation of a message, adjusting features based on the given configuration details, to enable further processing in feature extraction and training stages.").]
--------------------------------------------------------------------------------
/documentation/tweetokenize-module.html:
--------------------------------------------------------------------------------
[epydoc-generated module page; markup stripped in this dump. Recoverable content: the tweetokenize module description (same text as the package docstring), a link to the module source, a class summary for Tokenizer ("Can be used to tokenize a string representation of a message, adjusting features based on the given configuration details, to enable further processing in feature extraction and training stages."), and the variable __package__ = None.]
--------------------------------------------------------------------------------
/documentation/tweetokenize.Tokenizer.TokenizerException-class.html:
--------------------------------------------------------------------------------
[epydoc-generated class page; markup stripped in this dump. Recoverable content — "Class TokenizerException" with inheritance diagram:

    object --+
             |
    exceptions.BaseException --+
                               |
                    Tokenizer.TokenizerException

followed by the standard lists of methods and properties inherited from exceptions.BaseException (__delattr__, __getattribute__, __init__, __new__, __repr__, __str__, ...) and from object (__format__, __hash__, __reduce_ex__, __sizeof__, __subclasshook__).]
--------------------------------------------------------------------------------
/documentation/help.html:
--------------------------------------------------------------------------------
[epydoc's stock "Help" page: generic boilerplate describing package/module/class pages, the trees and index pages, the table-of-contents frames, and the navigation bar; no project-specific content survived the markup stripping.]
--------------------------------------------------------------------------------
/documentation/epydoc.js:
--------------------------------------------------------------------------------
[epydoc's stock support script (private-object toggling, collapsible source blocks, doclink popups, anchor/redirect helpers); most of the code was destroyed in this dump because spans beginning with '<' were consumed by the markup stripping.]
--------------------------------------------------------------------------------
/tests/test_tweetokenize.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | #
4 | # tweetokenize: Regular expression based tokenizer for Twitter
5 | # Copyright: (c) 2013, Jared Suttles. All rights reserved.
6 | # License: BSD, see LICENSE for details.
7 | # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
8 | import unittest
9 | from tweetokenize import Tokenizer
10 |
11 |
12 | class TokenizeTests(unittest.TestCase):
13 | def setUp(self):
14 | self.tokenizer = Tokenizer(lowercase=True)
15 |
16 | def test_general_1(self):
17 | self.tokenizer.normalize = 2
18 | msg = ('omg wow < & > >.< >.< :):)'
19 | 'i CANT believe thatttt haha lol!!1')
20 | tks = ['omg', 'wow', '<', '&', '>', '>.<', '>.<', ':)', ':)',
21 | 'i', 'CANT', 'believe', 'thatt', 'haha', 'lol', '!', '!', '1']
22 | self.assertEqual(self.tokenizer.tokenize(msg), tks)
23 |
24 | def test_general_2(self):
25 | msg = "i'm wanting to jump up and down but wouldn't if i couldn't.."
26 | tks = [u"i'm", u'wanting', u'to', u'jump', u'up', u'and', u'down',
27 | u'but', u"wouldn't", u'if', u'i', u"couldn't", u'...']
28 | self.assertEqual(self.tokenizer.tokenize(msg), tks)
29 |
30 | def test_urls_1(self):
31 | msg = ("hey bro chec'k out http://shitstorm.com its fucking sick")
32 | tks = ['hey', 'bro', "chec'k", 'out', 'URL', 'its', 'fucking', 'sick']
33 | self.assertEqual(self.tokenizer.tokenize(msg), tks)
34 |
35 | def test_urls_2(self):
36 | msg = ('also see this crazy stuff https://shitstorm.com')
37 | tks = ['also', 'see', 'this', 'crazy', 'stuff', 'URL']
38 | self.assertEqual(self.tokenizer.tokenize(msg), tks)
39 |
40 | def test_urls_3(self):
41 | msg = 'hiiiii rayj.com/ihititfirst and other google.com http://hobo.net'
42 | tks = [u'hiii', u'URL', u'and', u'other', u'URL', u'URL']
43 | self.assertEqual(self.tokenizer.tokenize(msg), tks)
44 |
45 | def test_usernames_1(self):
46 | msg = ('@justinbeiber yo man!! ! i love you in a totally '
47 | 'straight way <3:p:D')
48 | tks = [u'USERNAME', u'yo', u'man', u'!', u'!', u'!',
49 | u'i', u'love', u'you', u'in', u'a', u'totally', u'straight', u'way',
50 | u'<3', u':p', u':D']
51 | self.assertEqual(self.tokenizer.tokenize(msg), tks)
52 |
53 | def test_usernames_2(self):
54 | msg = '@heyheymango: what did you SAYYY??? or did you just.. NotHING?'
55 | tks = [u'USERNAME', u':', u'what', u'did', u'you', u'SAYYY', u'?',
56 | u'?', u'?', u'or', u'did', u'you', u'just', u'...', u'nothing', u'?']
57 | self.assertEqual(self.tokenizer.tokenize(msg), tks)
58 |
59 | def test_numbers_1(self):
60 | self.tokenizer.numbers = None
61 | msg = ('i have this much money -2.42 in my bank acct.,friend! but you '
62 | 'have mucho +88e44 and its about 1000% more than $400.')
63 | tks = [u'i', u'have', u'this', u'much', u'money', u'-2.42', u'in',
64 | u'my', u'bank', u'acct', u'.', u',', u'friend', u'!', u'but', u'you',
65 | u'have', u'mucho', u'+88e44', u'and', u'its', u'about', u'1000%',
66 | u'more', u'than', u'$400', u'.']
67 | self.assertEqual(self.tokenizer.tokenize(msg), tks)
68 |
69 | def test_numbers_2(self):
70 | msg = ('i have this much money -2.42 in my bank acct.,friend! but you '
71 | 'have mucho +88e44 and its about 1000% more than $400.')
72 | tks = [u'i', u'have', u'this', u'much', u'money', u'NUMBER', u'in',
73 | u'my', u'bank', u'acct', u'.', u',', u'friend', u'!', u'but', u'you',
74 | u'have', u'mucho', u'NUMBER', u'and', u'its', u'about', u'NUMBER',
75 | u'more', u'than', u'NUMBER', u'.']
76 | self.assertEqual(self.tokenizer.tokenize(msg), tks)
77 |
78 | def test_numbers_3(self):
79 | self.tokenizer.lowercase = False # keep cases the same everywhere
80 | msg = ('I JUST want To Test FRACTIONZZZ 22432.41414/ 55894385e-341 also'
81 | ' lowercase etc.etc.etc. hope that last part doesn\'t parse as a url '
82 | 'i would be kinda sad PANda!zsss..... .. . .... 4/5 5.1/4.0e0 3.14 -2')
83 | tks = [u'I', u'JUST', u'want', u'To', u'Test', u'FRACTIONZZZ',
84 | u'NUMBER', u'also', u'lowercase', u'etc', u'.', u'etc', u'.', u'etc',
85 | u'.', u'hope', u'that', u'last', u'part', u"doesn't", u'parse', u'as',
86 | u'a', u'url', u'i', u'would', u'be', u'kinda', u'sad', u'PANda', u'!',
87 | u'zsss', u'...', u'...', u'.', u'...', u'NUMBER', u'NUMBER', u'NUMBER',
88 | u'NUMBER']
89 | self.assertEqual(self.tokenizer.tokenize(msg), tks)
90 |
91 | def test_time_1(self):
92 | msg = 'is the time now 12:14pm? or is it like 2:42AM??'
93 | tks = [u'is', u'the', u'time', u'now', u'TIME', u'?', u'or', u'is',
94 | u'it', u'like', u'TIME', u'?', u'?']
95 | self.assertEqual(self.tokenizer.tokenize(msg), tks)
96 |
97 | def test_time_2(self):
98 | msg = 'new time is 2:42:09 PM!!'
99 | tks = [u'new', u'time', u'is', u'TIME', u'!', u'!']
100 | self.assertEqual(self.tokenizer.tokenize(msg), tks)
101 |
102 | def test_phonenumber_1(self):
103 | msg = ('my number is 18002432242 and 241.413.5584 also 1-242-156-6724'
104 | ' and (958)555-4875 or (999) 415 5542 is 422-5555 a 131-121-1441')
105 | tks = [u'my', u'number', u'is', u'PHONENUMBER', u'and', u'PHONENUMBER',
106 | u'also', u'PHONENUMBER', u'and', u'PHONENUMBER', u'or', u'PHONENUMBER',
107 | u'is', u'PHONENUMBER', u'a', u'PHONENUMBER']
108 | self.assertEqual(self.tokenizer.tokenize(msg), tks)
109 |
110 | def test_phonenumber_2(self):
111 | msg = 'numbers with extension: (201)-340-4915 x112 or 1 800.341.1311x99'
112 | tks = [u'numbers', u'with', u'extension', u':', u'PHONENUMBER', u'or',
113 | u'PHONENUMBER']
114 | self.assertEqual(self.tokenizer.tokenize(msg), tks)
115 |
116 | def test_quotes_1(self):
117 | self.tokenizer.ignorequotes = True
118 | msg = 'this is just a tweet with "someone said something funny" lol'
119 | tks = ['this', 'is', 'just', 'a', 'tweet', 'with', 'lol']
120 | self.assertEqual(self.tokenizer.tokenize(msg), tks)
121 |
122 | def test_quotes_2(self):
123 | self.tokenizer.ignorequotes = False
124 | msg = 'this is just a tweet with "someone said something funny" lol'
125 | tks = ['this', 'is', 'just', 'a', 'tweet', 'with', '"', 'someone',
126 | 'said', 'something', 'funny', '"', 'lol']
127 | self.assertEqual(self.tokenizer.tokenize(msg), tks)
128 |
129 | def test_quotes_3(self):
130 | self.tokenizer.ignorequotes = True
131 | msg = ('some stuff but he said “yea i know its crazy”other '
132 | 'stuff...!!! ')
133 | tks = [u'some', u'stuff', u'but', u'he', u'said', u'other', u'stuff',
134 | u'...', u'!', u'!', u'!']
135 | self.assertEqual(self.tokenizer.tokenize(msg), tks)
136 |
137 | def test_quotes_4(self):
138 | self.tokenizer.ignorequotes = True
139 | msg = ('some stuff but he said “yea i know its crazy”other '
140 | 'stuff...!!! ')
141 | tks = [u'some', u'stuff', u'but', u'he', u'said', u'other', u'stuff',
142 | u'...', u'!', u'!', u'!']
143 | self.assertEqual(self.tokenizer.tokenize(msg), tks)
144 |
145 | def test_quotes_5(self):
146 | self.tokenizer.ignorequotes = False
147 | msg = 'heyy buddyyyyy boy \'do you the lady\'s kitty like that??\''
148 | tks = [u'heyy', u'buddyyy', u'boy', u"'", u'do', u'you', u'the',
149 | u"lady's", u'kitty', u'like', u'that', u'?', u'?', u"'"]
150 | self.assertEqual(self.tokenizer.tokenize(msg), tks)
151 |
152 | def test_hashtags_1(self):
153 | msg = 'omg i love#dog#cat#food#other#things#so#fucking#much!!!11LOLOLOL'
154 | tks = ['omg', 'i', 'love', '#dog', '#cat', '#food', '#other',
155 | '#things', '#so', '#fucking', '#much', '!', '!', '!', '11LOLOLOL']
156 | self.assertEqual(self.tokenizer.tokenize(msg), tks)
157 |
158 | def test_hashtags_2(self):
159 | self.tokenizer.hashtags = 'HASHTAG'
160 | msg = 'omg i love#dog#cat#food#other#things#so#fucking#much!!!11LOLOLOL'
161 | tks = ['omg', 'i', 'love', 'HASHTAG', 'HASHTAG', 'HASHTAG',
162 | 'HASHTAG', 'HASHTAG', 'HASHTAG', 'HASHTAG', 'HASHTAG', '!', '!', '!',
163 | '11LOLOLOL']
164 | self.assertEqual(self.tokenizer.tokenize(msg), tks)
165 |
166 | def test_emoticons_1(self):
167 | msg = 'heyyyyyy:):):(>.<<v.v whats up man LOL T.T tomcat.tomcat:$;).!!!'
168 | tks = [u'heyyy', u':)', u':)', u':(', u'>.<', u'<', u'v.v', u'whats',
169 | u'up', u'man', u'LOL', u'T.T', u'tomcat', u'.', u'tomcat', u':$',
170 | u';)', u'.', u'!', u'!', u'!']
171 | self.assertEqual(self.tokenizer.tokenize(msg), tks)
172 |
173 | def test_removefeatures_1(self):
174 | self.tokenizer.usernames = "" # don't want any usernames to show
175 | msg = ('hey @arnold @nickelodeon #90s#ilove90s#allthat#amandashow'
176 | '@rocko http://en.wikipedia.org/wiki/The_Angry_Beavers ^.^>>><<<^.^')
177 | tks = [u'hey', u'#90s', u'#ilove90s', u'#allthat', u'#amandashow',
178 | u'URL', u'^.^', u'>', u'>', u'>', u'<', u'<', u'<', u'^.^']
179 | self.assertEqual(self.tokenizer.tokenize(msg), tks)
180 |
181 | def test_removefeatures_2(self):
182 | self.tokenizer.usernames = "" # don't want any usernames to show
183 | self.tokenizer.hashtags = "" # or hashtags
184 | msg = ('hey @arnold @nickelodeon #90s#ilove90s#allthat#amandashow'
185 | '@rocko http://en.wikipedia.org/wiki/The_Angry_Beavers ^.^>>><<<^.^')
186 | tks = [u'hey', u'URL', u'^.^', u'>', u'>', u'>', u'<', u'<', u'<',
187 | u'^.^']
188 | self.assertEqual(self.tokenizer.tokenize(msg), tks)
189 |
190 | def test_removefeatures_3(self):
191 | self.tokenizer.usernames = False # keep usernames
192 | self.tokenizer.urls = "" # URLs should be removed
193 | self.tokenizer.hashtags = "$$$" # hashtags should be $$$
194 | msg = ('hey @arnold @nickelodeon #90s#ilove90s#allthat#amandashow'
195 | '@rocko http://en.wikipedia.org/wiki/The_Angry_Beavers ^.^>>><<<^.^')
196 | tks = [u'hey', u'@arnold', u'@nickelodeon', u'$$$', u'$$$', u'$$$',
197 | u'$$$', u'@rocko', u'^.^', u'>', u'>', u'>', u'<', u'<', u'<', u'^.^']
198 | self.assertEqual(self.tokenizer.tokenize(msg), tks)
199 |
200 | def test_emoji_1(self):
201 | msg = ('hey mate!:):3.....@and🇨🇳ONE+ BRO#love😘😵💚💛💜💙 '
202 | '💋😂😂LOLLLL.')
203 | tks = [u'hey', u'mate', u'!', u':)', u':3', u'...',
204 | u'USERNAME', u'\U0001f1e8\U0001f1f3', u'ONE', u'+', u'BRO', u'#love',
205 | u'\U0001f618', u'\U0001f635', u'\U0001f49a', u'\U0001f49b',
206 | u'\U0001f49c', u'\U0001f499', u'\U0001f48b', u'\U0001f602',
207 | u'\U0001f602', u'LOLLL', u'.']
208 | self.assertEqual(self.tokenizer.tokenize(msg), tks)
209 |
210 | def test_emoji_2(self):
211 | msg = ('hey mate!:):3.....@andONE+🇬🇧 BRO#love😘😵💚💛💜💙 '
212 | '💋😂😂LOLLLL.')
213 | tks = [u'hey', u'mate', u'!', u':)', u':3', u'...',
214 | u'USERNAME', u'+', u'\U0001f1ec\U0001f1e7', u'BRO', u'#love', u'😘',
215 | u'😵', u'\U0001f49a', u'\U0001f49b', u'\U0001f49c',
216 | u'\U0001f499', u'💋', u'\U0001f602', u'\U0001f602',
217 | u'LOLLL', u'.']
218 | self.assertEqual(self.tokenizer.tokenize(msg), tks)
219 |
220 | def test_emoji_3(self):
221 | msg = ('🚀=)3O_O:$D:<:-@\xf0\x9f\x98\xb7🔥💩💅 outdated:💽 ancient:💾 '
222 | '#getwiththecloud:💻 and it looks like 💭')
223 | tks = [u'\U0001f680', u'=)', u'3', u'O_O', u':$', u'D:<', u':-@',
224 | u'\U0001f637', u'\U0001f525', u'\U0001f4a9', u'\U0001f485',
225 | u'outdated', u':', u'\U0001f4bd', u'ancient', u':',
226 | u'\U0001f4be', u'#getwiththecloud',
227 | u':', u'\U0001f4bb', u'and', u'it', u'looks', u'like', u'\U0001f4ad']
228 | self.assertEqual(self.tokenizer.tokenize(msg), tks)
229 |
230 | def test_error_1(self):
231 | msg = []
232 | with self.assertRaises(TypeError):
233 | self.tokenizer.tokenize(msg)
234 |
235 | def test_error_2(self):
236 | msg = lambda x: x
237 | with self.assertRaises(TypeError):
238 | self.tokenizer.tokenize(msg)
239 |
240 | def test_actual_tweets_1(self):
241 | "Number as part of name"
242 | msg = '@LoganTillman not 2pac and floyd mayweather'
243 | tks = [u'USERNAME', u'not', u'2pac', u'and', u'floyd', u'mayweather']
244 | self.assertEqual(self.tokenizer.tokenize(msg), tks)
245 |
246 | def test_actual_tweets_2(self):
247 | "Colon no space in hashtag"
248 | msg = '#MentionSomeoneYoureGladYouMet: @LarryWorld_Wide of course.'
249 | tks = [u'#MentionSomeoneYoureGladYouMet', u':', u'USERNAME', u'of',
250 | u'course', u'.']
251 | self.assertEqual(self.tokenizer.tokenize(msg), tks)
252 |
253 | def test_stopwords_1(self):
254 | self.tokenizer.ignorestopwords = True
255 | msg = 'i like myself and my so not much and our something he:)'
256 | tks = [u'like', u'much', u'something', u':)']
257 | self.assertEqual(self.tokenizer.tokenize(msg), tks)
258 |
259 | if __name__ == "__main__":
260 | unittest.main()
261 |
--------------------------------------------------------------------------------
/tweetokenize/tokenizer.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | #
4 | # tweetokenize: Regular expression based tokenizer for Twitter
5 | # Copyright: (c) 2013, Jared Suttles. All rights reserved.
6 | # License: BSD, see LICENSE for details.
7 | # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
8 | import re
9 | from os import path
10 | from itertools import imap
11 | from htmlentitydefs import name2codepoint
12 |
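# Entity name -> unicode character map (e.g. 'amp' -> u'&'); the regex below matches named and numeric entities such as &amp; or &#38;.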
13 | html_entities = {k: unichr(v) for k, v in name2codepoint.iteritems()}
14 | html_entities_re = re.compile(r"&#?\w+;")
15 | emoji_ranges = ((u'\U0001f300', u'\U0001f5ff'), (u'\U0001f600', u'\U0001f64f'), (u'\U0001f680', u'\U0001f6c5'),
16 | (u'\u2600', u'\u26ff'), (u'\U0001f170', u'\U0001f19a'))
17 | emoji_flags = {u'\U0001f1ef\U0001f1f5', u'\U0001f1f0\U0001f1f7', u'\U0001f1e9\U0001f1ea',
18 | u'\U0001f1e8\U0001f1f3', u'\U0001f1fa\U0001f1f8', u'\U0001f1eb\U0001f1f7',
19 | u'\U0001f1ea\U0001f1f8', u'\U0001f1ee\U0001f1f9', u'\U0001f1f7\U0001f1fa',
20 | u'\U0001f1ec\U0001f1e7'}
21 |
22 |
23 | def _converthtmlentities(msg):
24 | def replace_entities(s):
25 | s = s.group(0)[1:-1] # remove & and ;
26 | if s[0] == '#':
27 | try:
28 | return unichr(int(s[2:],16) if s[1] in 'xX' else int(s[1:]))
29 | except ValueError:
30 | return '&' + s + ';'
31 | else:
32 | try:
33 | return html_entities[s]
34 | except KeyError:
35 | return '&' + s + ';'
36 | return html_entities_re.sub(replace_entities, msg)
37 |
38 |
39 | def _unicode(word):
40 | if isinstance(word, unicode):
41 | return word
42 | return unicode(word, encoding='utf-8')
43 |
44 |
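# A token counts as emoji if it is a single astral-plane character (same length as u'\U0001f4a9'; a surrogate pair on narrow Python 2 builds) inside one of the ranges above, or one of the two-codepoint regional-indicator flags.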
45 | def _isemoji(s):
46 | return len(s) == len(u'\U0001f4a9') and any(l <= s <= u for l, u in emoji_ranges) or s in emoji_flags
47 |
48 |
49 | class Tokenizer(object):
50 | """
51 | Can be used to tokenize a string representation of a message, adjusting
52 | features based on the given configuration details, to enable further
53 | processing in feature extraction and training stages.
54 |
55 | An example usage::
56 |
57 | >>> from tweetokenize import Tokenizer
58 | >>> gettokens = Tokenizer(usernames='USER', urls='')
59 | >>> gettokens.tokenize('@justinbeiber yo man!love you#inlove#wantyou in a totally straight way #brotime <3:p:D www.justinbeiber.com')
60 | [u'USER', u'yo', u'man', u'!', u'love', u'you', u'#inlove', u'#wantyou', u'in', u'a', u'totally', u'straight', u'way', u'#brotime', u'<3', u':p', u':D']
61 | """
62 | _default_args = dict(
63 | lowercase=True, allcapskeep=True, normalize=3, usernames='USERNAME', urls='URL', hashtags=False,
64 | phonenumbers='PHONENUMBER', times='TIME', numbers='NUMBER', ignorequotes=False, ignorestopwords=False
65 | )
66 | _lexicons = path.join(path.dirname(path.realpath(__file__)), 'lexicons/{}.txt')
67 |
68 | # Regular expressions
69 | usernames_re = re.compile(r"@\w{1,15}")
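# The URL pattern's TLD alternation is built from the bundled lexicons/domains.txt, one TLD per line.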
70 | with open(_lexicons.format('domains'), 'r') as f:
71 | domains = f.read().strip().replace('\n', '|')
72 | urls_re = re.compile(r"(?:(?:https?\://[A-Za-z0-9\.]+)|(?:(?:www\.)?[A-Za-z0-9]+\.(?:{})))(?:\/\S+)?"
73 | "(?=\s+|$)".format(domains))
74 | del domains
75 | hashtags_re = re.compile(r"#\w+[\w'-]*\w+")
76 | ellipsis_re = re.compile(r"\.\.+")
77 | word_re = re.compile(r"(?:[a-zA-Z0-9]+['-]?[a-zA-Z]+[a-zA-Z0-9]*)|(?:[a-zA-Z0-9]*[a-zA-Z]+['-]?[a-zA-Z0-9]+)")
78 | times_re = re.compile(r"\d{1,2}:\d{2}(?::\d{2})?\s*(?:AM|PM|am|pm)?")
79 | phonenumbers_re = re.compile(r"(?:\+?[01][\-\s\.]*)?(?:\(?\d{3}[\-\s\.\)]*)?\d{3}[\-\s\.]*\d{4}(?:\s*x\s*\d+)?"
80 | "(?=\s+|$)")
81 | number_re = r"(?:[+-]?\$?\d+(?:\.\d+)?(?:[eE]-?\d+)?%?)(?![A-Za-z])"
82 | numbers_re = re.compile(r"{0}(?:\s*/\s*{0})?".format(number_re)) # deals with fractions
83 | del number_re
84 | other_re = r"(?:[^#\s\.]|\.(?!\.))+"
85 | _token_regexs = ('usernames', 'urls', 'hashtags', 'times', 'phonenumbers', 'numbers')
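# Master pattern: alternation of the feature regexes above, then ordinary words, ellipses, and a catch-all for anything else.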
86 | tokenize_re = re.compile(
87 | ur"|".join(
88 | imap(lambda x: getattr(x, 'pattern', x),
89 | [locals()[regex + '_re'] for regex in _token_regexs] + [word_re, ellipsis_re, other_re])))
90 | del regex # otherwise stays in class namespace
91 | repeating_re = re.compile(r"([a-zA-Z])\1\1+")
92 | doublequotes = ((u'“',u'”'),(u'"',u'"'),(u'‘',u'’'),(u'"',u'"'))
93 | punctuation = (u'!$%()*+,-/:;<=>?[\\]^_.`{|}~\'' + u''.join(c for t in doublequotes for c in t))
94 | quotes_re = re.compile(ur"|".join(ur'({}.*?{})'.format(f,s) for f,s in doublequotes) + ur'|\s(\'.*?\')\s')
95 | del doublequotes
96 |
97 | def __init__(self, **kwargs):
98 | """
99 | Constructs a new Tokenizer. Can specify custom settings for various
100 | feature normalizations.
101 |
102 | Any features with replacement tokens can be removed from the message by
103 | setting the token to the empty string (C{""}), C{"DELETE"}, or
104 | C{"REMOVE"}.
105 |
106 | @type lowercase: C{bool}
107 | @param lowercase: If C{True}, lowercases words, excluding those with
108 | all letters capitalized.
109 |
110 | @type allcapskeep: C{bool}
111 | @param allcapskeep: If C{True}, maintains capitalization for words with
112 | all letters in capitals. Otherwise, capitalization for such words
113 | is dependent on C{lowercase}.
114 |
115 | @type normalize: C{int}
116 | @param normalize: The number of repeating letters when normalizing
117 | arbitrary letter elongations.
118 |
119 | Example::
120 | Heyyyyyy i lovvvvvvve youuuuuuuuu <3
121 |
122 | Becomes::
123 | Heyyy i lovvve youuu <3
124 |
125 | Not sure why you would want to change this (maybe just for fun?? :P)
126 |
127 | @param usernames: Serves as the replacement token for anything that
128 | parses as a Twitter username, ie. C{@rayj}. Setting this to
129 | C{False} means no usernames will be changed.
130 |
131 | @param urls: Serves as the replacement token for anything that
132 | parses as a URL, ie. C{bit.ly} or C{http://example.com}. Setting
133 | this to C{False} means no URLs will be changed.
134 |
135 | @param hashtags: Serves as the replacement token for anything that
136 | parses as a Twitter hashtag, ie. C{#ihititfirst} or
137 | C{#onedirection}. Setting this to C{False} means no hashtags will
138 | be changed.
139 |
140 | @param phonenumbers: Replacement token for phone numbers.
141 |
142 | @param times: Replacement token for times.
143 |
144 | @param numbers: Replacement token for any other kinds of numbers.
145 |
146 | @type ignorequotes: C{bool}
147 | @param ignorequotes: If C{True}, will remove various types of quotes
148 | and the contents within.
149 |
150 | @type ignorestopwords: C{bool}
151 | @param ignorestopwords: If C{True}, will remove any stopwords. The
152 | default set includes 'I', 'me', 'itself', 'against', 'should', etc.
153 | """
154 | for keyword in self._default_args:
155 | setattr(self, keyword, kwargs.get(keyword, self._default_args[keyword]))
156 | self.emoticons(filename=self._lexicons.format('emoticons'))
157 | self.stopwords(filename=self._lexicons.format('stopwords'))
158 |
159 | def __call__(self, iterable):
160 | """
161 | Iterator for the tokenization of given messages.
162 |
163 | @rtype: C{list} of C{str}
164 | @return: Iterator of lists representing message tokenizations.
165 |
166 | @param iterable: Object capable of iteration, providing strings for
167 | tokenization.
168 | """
169 | for msg in iterable:
170 | yield self.tokenize(msg)
171 |
172 | def update(self, **kwargs):
173 | """
174 | Adjust any settings of the Tokenizer.
175 |
176 | >>> gettokens = Tokenizer()
177 | >>> gettokens.lowercase
178 | True
179 | >>> gettokens.phonenumbers
180 | 'PHONENUMBER'
181 | >>> gettokens.update(phonenumbers='NUMBER', lowercase=False)
182 | >>> gettokens.lowercase
183 | False
184 | >>> gettokens.phonenumbers
185 | 'NUMBER'
186 | """
187 | for keyword in self._default_args:
188 | if keyword in kwargs:
189 | setattr(self, keyword, kwargs[keyword])
190 |
191 | def _replacetokens(self, msg):
192 | tokens = []
193 | deletion_tokens = {'', 'REMOVE', 'remove', 'DELETE', 'delete'}
194 | for word in msg:
195 | matching = self.word_re.match(word) # 1st check if normal word
196 | if matching and len(matching.group(0)) == len(word):
197 | tokens.append(self._cleanword(word))
198 | continue # don't check rest of conditions
199 | for token in self._token_regexs: # id & possibly replace tokens
200 | regex = getattr(self, token + '_re')
201 | replacement_token = getattr(self, token)
202 | if regex.match(word):
203 | if replacement_token: # decide if we change it
204 | word = _unicode(str(replacement_token))
205 | if replacement_token not in deletion_tokens:
206 | tokens.append(word)
207 | break
208 | else: # we didn't find a match for any token so far...
209 | if self.ellipsis_re.match(word):
210 | tokens.append(u"...")
211 | else: # split into tokens based on emoticons or punctuation
212 | tokens.extend(self._separate_emoticons_punctuation(word))
213 | return tokens
214 |
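# The deletion_tokens set above is what lets a feature be removed outright by
# setting its replacement token to '', 'DELETE', or 'REMOVE'. A hedged sketch
# of the two modes (expected output inferred from the docstrings, not a
# verified doctest):
#     >>> Tokenizer(usernames='USER').tokenize('@rayj hi')    # replaced
#     [u'USER', u'hi']
#     >>> Tokenizer(usernames='DELETE').tokenize('@rayj hi')  # dropped
#     [u'hi']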
215 | def _separate_emoticons_punctuation(self, word):
216 | newwords, wordbefore = [], []
217 | i = 0
218 | def possibly_append_and_reset():
219 | if wordbefore:
220 | newwords.append(self._cleanword(''.join(wordbefore)))
221 | wordbefore[:] = []
222 | while i < len(word):
223 | # greedily check for emoticons in this word
224 | for l in range(self._maxlenemo, 0, -1):
225 | if word[i:i+l] in self._emoticons or _isemoji(word[i:i+l]):
226 | possibly_append_and_reset()
227 | newwords.append(word[i:i+l])
228 |                     i += l
229 | break
230 |             else:  # it's safe to break up any punctuation not part of emoticons
231 | if word[i] in self.punctuation:
232 | possibly_append_and_reset()
233 | newwords.append(word[i])
234 | else:
235 | wordbefore.append(word[i])
236 |                 i += 1
237 | # possible ending of word which wasn't emoticon or punctuation
238 | possibly_append_and_reset()
239 | return newwords
240 |
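# The longest-match-first scan above is why unseparated emoticon runs still
# split cleanly; per the example in the class documentation (illustrative,
# not a verified doctest):
#     >>> Tokenizer().tokenize('<3:p:D')
#     [u'<3', u':p', u':D']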
241 | def _cleanword(self, word):
242 | if self.normalize: # replace characters with >=3 alphabetic repeating
243 | word = self.repeating_re.sub(r"\1"*self.normalize, word)
244 | if self.lowercase and (not self.allcapskeep or not word.isupper()):
245 | return word.lower()
246 | return word
247 |
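# repeating_re matches any letter repeated three or more times, and the
# substitution keeps exactly self.normalize copies. With the default
# normalize=3 this reproduces the docstring example (illustrative output,
# assuming the default lowercase/allcapskeep settings):
#     >>> Tokenizer().tokenize('Heyyyyyy i lovvvvvvve youuuuuuuuu <3')
#     [u'heyyy', u'i', u'lovvve', u'youuu', u'<3']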
248 | def tokenize(self, message):
249 | """
250 | Tokenize the given string into a list of strings representing the
251 | constituent words of the message.
252 |
253 | @rtype: C{list} of C{str}
254 | @return: The tokenization of the message.
255 |
256 | @type message: C{str}
257 | @param message: The string representation of the message.
258 | """
259 | if not isinstance(message, basestring):
260 | raise TypeError('cannot tokenize non-string, {}'.format(repr(type(message).__name__)))
261 | message = _converthtmlentities(_unicode(message))
262 | if self.ignorequotes:
263 | message = self.quotes_re.sub(" ", message)
264 | message = self._replacetokens(self.tokenize_re.findall(message))
265 | if self.ignorestopwords:
266 | message = [word for word in message if word not in self._stopwords]
267 | return message
268 |
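# End to end, tokenize() converts HTML entities, optionally strips quoted
# spans, splits the message with tokenize_re, runs the replacement and
# normalization pass, and finally filters stopwords if requested. A hedged
# sketch of the stopword path (output inferred from the docstrings, not a
# verified doctest):
#     >>> gettokens = Tokenizer(ignorestopwords=True)
#     >>> gettokens.tokenize('I love pizza')  # 'I' is in the default stopword set
#     [u'love', u'pizza']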
269 | def emoticons(self, iterable=None, filename=None):
270 | """
271 |         Consumes an iterable of emoticons that the tokenizer will recognize
272 |         as single tokens. Allows a user-specified set of emoticons to be used.
273 |
274 | @param iterable: Object capable of iteration, providing emoticon
275 | strings.
276 | @type filename: C{str}
277 | @param filename: Path to the file containing emoticons delimited by
278 | new lines. Strips trailing whitespace and skips blank lines.
279 | """
280 | self._emoticons = self._collectset(iterable, filename)
281 | self._maxlenemo = max(len(max(self._emoticons, key=lambda x: len(x))),
282 | len(u'\U0001f1e8\U0001f1f3'), len(u'\U0001f48b'))
283 |
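# Because emoticons() also recomputes _maxlenemo, a custom lexicon can be
# swapped in safely at runtime; a hedged sketch (the emoticon strings here
# are arbitrary):
#     >>> gettokens = Tokenizer()
#     >>> gettokens.emoticons(iterable=[':)', ':(', '<3'])  # replaces the default set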
284 | def stopwords(self, iterable=None, filename=None):
285 | """
286 |         Consumes an iterable of stopwords that the tokenizer will ignore if the
287 |         C{ignorestopwords} setting is C{True}. The default set is taken from
288 |         NLTK's English stopword list.
289 |
290 | @param iterable: Object capable of iteration, providing stopword
291 | strings.
292 | @type filename: C{str}
293 | @param filename: Path to the file containing stopwords delimited by
294 | new lines. Strips trailing whitespace and skips blank lines.
295 | """
296 | self._stopwords = self._collectset(iterable, filename)
297 |
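# stopwords() accepts the same iterable/filename pair as emoticons(); a
# hedged sketch using a hypothetical file path:
#     >>> gettokens = Tokenizer(ignorestopwords=True)
#     >>> gettokens.stopwords(filename='my_stopwords.txt')  # hypothetical path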
298 | @staticmethod
299 | def _collectset(iterable, filename):
300 | if filename:
301 | with open(filename, "r") as f:
302 | iterable = set(l.rstrip() for l in f)
303 | iterable.discard('')
304 | return set(imap(_unicode, iterable))
305 |
--------------------------------------------------------------------------------
/documentation/identifier-index.html:
--------------------------------------------------------------------------------
[epydoc-generated HTML identifier index; the markup was lost in extraction.
The page contained only the standard epydoc navigation bar (Home / Trees /
Indices / Help) and an A-Z list of links into the other documentation pages;
no substantive content survives.]
--------------------------------------------------------------------------------
/documentation/epydoc.css:
--------------------------------------------------------------------------------
1 |
2 |
3 | /* Epydoc CSS Stylesheet
4 | *
5 | * This stylesheet can be used to customize the appearance of epydoc's
6 | * HTML output.
7 | *
8 | */
9 |
10 | /* Default Colors & Styles
11 | * - Set the default foreground & background color with 'body'; and
12 | * link colors with 'a:link' and 'a:visited'.
13 | * - Use bold for decision list terms.
14 | * - The heading styles defined here are used for headings *within*
15 | * docstring descriptions. All headings used by epydoc itself use
16 | * either class='epydoc' or class='toc' (CSS styles for both
17 | * defined below).
18 | */
19 | body { background: #ffffff; color: #000000; }
20 | p { margin-top: 0.5em; margin-bottom: 0.5em; }
21 | a:link { color: #0000ff; }
22 | a:visited { color: #204080; }
23 | dt { font-weight: bold; }
24 | h1 { font-size: +140%; font-style: italic;
25 | font-weight: bold; }
26 | h2 { font-size: +125%; font-style: italic;
27 | font-weight: bold; }
28 | h3 { font-size: +110%; font-style: italic;
29 | font-weight: normal; }
30 | code { font-size: 100%; }
31 | /* N.B.: class, not pseudoclass */
32 | a.link { font-family: monospace; }
33 |
34 | /* Page Header & Footer
35 | * - The standard page header consists of a navigation bar (with
36 | * pointers to standard pages such as 'home' and 'trees'); a
37 | * breadcrumbs list, which can be used to navigate to containing
38 | * classes or modules; options links, to show/hide private
39 | * variables and to show/hide frames; and a page title (using
40 | * ). The page title may be followed by a link to the
41 | * corresponding source code (using 'span.codelink').
42 | * - The footer consists of a navigation bar, a timestamp, and a
43 | * pointer to epydoc's homepage.
44 | */
45 | h1.epydoc { margin: 0; font-size: +140%; font-weight: bold; }
46 | h2.epydoc { font-size: +130%; font-weight: bold; }
47 | h3.epydoc { font-size: +115%; font-weight: bold;
48 | margin-top: 0.2em; }
49 | td h3.epydoc { font-size: +115%; font-weight: bold;
50 | margin-bottom: 0; }
51 | table.navbar { background: #a0c0ff; color: #000000;
52 | border: 2px groove #c0d0d0; }
53 | table.navbar table { color: #000000; }
54 | th.navbar-select { background: #70b0ff;
55 | color: #000000; }
56 | table.navbar a { text-decoration: none; }
57 | table.navbar a:link { color: #0000ff; }
58 | table.navbar a:visited { color: #204080; }
59 | span.breadcrumbs { font-size: 85%; font-weight: bold; }
60 | span.options { font-size: 70%; }
61 | span.codelink { font-size: 85%; }
62 | td.footer { font-size: 85%; }
63 |
64 | /* Table Headers
65 | * - Each summary table and details section begins with a 'header'
66 | * row. This row contains a section title (marked by
67 | * 'span.table-header') as well as a show/hide private link
68 | * (marked by 'span.options', defined above).
69 | * - Summary tables that contain user-defined groups mark those
70 | * groups using 'group header' rows.
71 | */
72 | td.table-header { background: #70b0ff; color: #000000;
73 | border: 1px solid #608090; }
74 | td.table-header table { color: #000000; }
75 | td.table-header table a:link { color: #0000ff; }
76 | td.table-header table a:visited { color: #204080; }
77 | span.table-header { font-size: 120%; font-weight: bold; }
78 | th.group-header { background: #c0e0f8; color: #000000;
79 | text-align: left; font-style: italic;
80 | font-size: 115%;
81 | border: 1px solid #608090; }
82 |
83 | /* Summary Tables (functions, variables, etc)
84 | * - Each object is described by a single row of the table with
85 | * two cells. The left cell gives the object's type, and is
86 | * marked with 'code.summary-type'. The right cell gives the
87 | * object's name and a summary description.
88 | * - CSS styles for the table's header and group headers are
89 | * defined above, under 'Table Headers'
90 | */
91 | table.summary { border-collapse: collapse;
92 | background: #e8f0f8; color: #000000;
93 | border: 1px solid #608090;
94 | margin-bottom: 0.5em; }
95 | td.summary { border: 1px solid #608090; }
96 | code.summary-type { font-size: 85%; }
97 | table.summary a:link { color: #0000ff; }
98 | table.summary a:visited { color: #204080; }
99 |
100 |
101 | /* Details Tables (functions, variables, etc)
102 | * - Each object is described in its own div.
103 | * - A single-row summary table w/ table-header is used as
104 | * a header for each details section (CSS style for table-header
105 | * is defined above, under 'Table Headers').
106 | */
107 | table.details { border-collapse: collapse;
108 | background: #e8f0f8; color: #000000;
109 | border: 1px solid #608090;
110 | margin: .2em 0 0 0; }
111 | table.details table { color: #000000; }
112 | table.details a:link { color: #0000ff; }
113 | table.details a:visited { color: #204080; }
114 |
115 | /* Fields */
116 | dl.fields { margin-left: 2em; margin-top: 1em;
117 | margin-bottom: 1em; }
118 | dl.fields dd ul { margin-left: 0em; padding-left: 0em; }
119 | dl.fields dd ul li ul { margin-left: 2em; padding-left: 0em; }
120 | div.fields { margin-left: 2em; }
121 | div.fields p { margin-bottom: 0.5em; }
122 |
123 | /* Index tables (identifier index, term index, etc)
124 | * - link-index is used for indices containing lists of links
125 | * (namely, the identifier index & term index).
126 | * - index-where is used in link indices for the text indicating
127 | * the container/source for each link.
128 | * - metadata-index is used for indices containing metadata
129 | * extracted from fields (namely, the bug index & todo index).
130 | */
131 | table.link-index { border-collapse: collapse;
132 | background: #e8f0f8; color: #000000;
133 | border: 1px solid #608090; }
134 | td.link-index { border-width: 0px; }
135 | table.link-index a:link { color: #0000ff; }
136 | table.link-index a:visited { color: #204080; }
137 | span.index-where { font-size: 70%; }
138 | table.metadata-index { border-collapse: collapse;
139 | background: #e8f0f8; color: #000000;
140 | border: 1px solid #608090;
141 | margin: .2em 0 0 0; }
142 | td.metadata-index { border-width: 1px; border-style: solid; }
143 | table.metadata-index a:link { color: #0000ff; }
144 | table.metadata-index a:visited { color: #204080; }
145 |
146 | /* Function signatures
147 | * - sig* is used for the signature in the details section.
148 | * - .summary-sig* is used for the signature in the summary
149 | * table, and when listing property accessor functions.
150 | * */
151 | .sig-name { color: #006080; }
152 | .sig-arg { color: #008060; }
153 | .sig-default { color: #602000; }
154 | .summary-sig { font-family: monospace; }
155 | .summary-sig-name { color: #006080; font-weight: bold; }
156 | table.summary a.summary-sig-name:link
157 | { color: #006080; font-weight: bold; }
158 | table.summary a.summary-sig-name:visited
159 | { color: #006080; font-weight: bold; }
160 | .summary-sig-arg { color: #006040; }
161 | .summary-sig-default { color: #501800; }
162 |
163 | /* Subclass list
164 | */
165 | ul.subclass-list { display: inline; }
166 | ul.subclass-list li { display: inline; }
167 |
168 | /* To render variables, classes etc. like functions */
169 | table.summary .summary-name { color: #006080; font-weight: bold;
170 | font-family: monospace; }
171 | table.summary
172 | a.summary-name:link { color: #006080; font-weight: bold;
173 | font-family: monospace; }
174 | table.summary
175 | a.summary-name:visited { color: #006080; font-weight: bold;
176 | font-family: monospace; }
177 |
178 | /* Variable values
179 |  * - In the 'variable details' sections, each variable's value is
180 | * listed in a 'pre.variable' box. The width of this box is
181 | * restricted to 80 chars; if the value's repr is longer than
182 | * this it will be wrapped, using a backslash marked with
183 | * class 'variable-linewrap'. If the value's repr is longer
184 |  *   than 3 lines, the rest will be elided; and an ellipsis
185 | * marker ('...' marked with 'variable-ellipsis') will be used.
186 | * - If the value is a string, its quote marks will be marked
187 | * with 'variable-quote'.
188 | * - If the variable is a regexp, it is syntax-highlighted using
189 | * the re* CSS classes.
190 | */
191 | pre.variable { padding: .5em; margin: 0;
192 | background: #dce4ec; color: #000000;
193 | border: 1px solid #708890; }
194 | .variable-linewrap { color: #604000; font-weight: bold; }
195 | .variable-ellipsis { color: #604000; font-weight: bold; }
196 | .variable-quote { color: #604000; font-weight: bold; }
197 | .variable-group { color: #008000; font-weight: bold; }
198 | .variable-op { color: #604000; font-weight: bold; }
199 | .variable-string { color: #006030; }
200 | .variable-unknown { color: #a00000; font-weight: bold; }
201 | .re { color: #000000; }
202 | .re-char { color: #006030; }
203 | .re-op { color: #600000; }
204 | .re-group { color: #003060; }
205 | .re-ref { color: #404040; }
206 |
207 | /* Base tree
208 | * - Used by class pages to display the base class hierarchy.
209 | */
210 | pre.base-tree { font-size: 80%; margin: 0; }
211 |
212 | /* Frames-based table of contents headers
213 | * - Consists of two frames: one for selecting modules; and
214 | * the other listing the contents of the selected module.
215 | * - h1.toc is used for each frame's heading
216 | * - h2.toc is used for subheadings within each frame.
217 | */
218 | h1.toc { text-align: center; font-size: 105%;
219 | margin: 0; font-weight: bold;
220 | padding: 0; }
221 | h2.toc { font-size: 100%; font-weight: bold;
222 | margin: 0.5em 0 0 -0.3em; }
223 |
224 | /* Syntax Highlighting for Source Code
225 | * - doctest examples are displayed in a 'pre.py-doctest' block.
226 | * If the example is in a details table entry, then it will use
227 | * the colors specified by the 'table pre.py-doctest' line.
228 | * - Source code listings are displayed in a 'pre.py-src' block.
229 | * Each line is marked with 'span.py-line' (used to draw a line
230 | * down the left margin, separating the code from the line
231 | * numbers). Line numbers are displayed with 'span.py-lineno'.
232 | * The expand/collapse block toggle button is displayed with
233 | * 'a.py-toggle' (Note: the CSS style for 'a.py-toggle' should not
234 | * modify the font size of the text.)
235 | * - If a source code page is opened with an anchor, then the
236 | * corresponding code block will be highlighted. The code
237 | * block's header is highlighted with 'py-highlight-hdr'; and
238 | * the code block's body is highlighted with 'py-highlight'.
239 | * - The remaining py-* classes are used to perform syntax
240 | * highlighting (py-string for string literals, py-name for names,
241 | * etc.)
242 | */
243 | pre.py-doctest { padding: .5em; margin: 1em;
244 | background: #e8f0f8; color: #000000;
245 | border: 1px solid #708890; }
246 | table pre.py-doctest { background: #dce4ec;
247 | color: #000000; }
248 | pre.py-src { border: 2px solid #000000;
249 | background: #f0f0f0; color: #000000; }
250 | .py-line { border-left: 2px solid #000000;
251 | margin-left: .2em; padding-left: .4em; }
252 | .py-lineno { font-style: italic; font-size: 90%;
253 | padding-left: .5em; }
254 | a.py-toggle { text-decoration: none; }
255 | div.py-highlight-hdr { border-top: 2px solid #000000;
256 | border-bottom: 2px solid #000000;
257 | background: #d8e8e8; }
258 | div.py-highlight { border-bottom: 2px solid #000000;
259 | background: #d0e0e0; }
260 | .py-prompt { color: #005050; font-weight: bold;}
261 | .py-more { color: #005050; font-weight: bold;}
262 | .py-string { color: #006030; }
263 | .py-comment { color: #003060; }
264 | .py-keyword { color: #600000; }
265 | .py-output { color: #404040; }
266 | .py-name { color: #000050; }
267 | .py-name:link { color: #000050 !important; }
268 | .py-name:visited { color: #000050 !important; }
269 | .py-number { color: #005000; }
270 | .py-defname { color: #000060; font-weight: bold; }
271 | .py-def-name { color: #000060; font-weight: bold; }
272 | .py-base-class { color: #000060; }
273 | .py-param { color: #000060; }
274 | .py-docstring { color: #006030; }
275 | .py-decorator { color: #804020; }
276 | /* Use this if you don't want links to names underlined: */
277 | /*a.py-name { text-decoration: none; }*/
278 |
279 | /* Graphs & Diagrams
280 | * - These CSS styles are used for graphs & diagrams generated using
281 | * Graphviz dot. 'img.graph-without-title' is used for bare
282 | * diagrams (to remove the border created by making the image
283 | * clickable).
284 | */
285 | img.graph-without-title { border: none; }
286 | img.graph-with-title { border: 1px solid #000000; }
287 | span.graph-title { font-weight: bold; }
288 | span.graph-caption { }
289 |
290 | /* General-purpose classes
291 | * - 'p.indent-wrapped-lines' defines a paragraph whose first line
292 | * is not indented, but whose subsequent lines are.
293 | * - The 'nomargin-top' class is used to remove the top margin (e.g.
294 | * from lists). The 'nomargin' class is used to remove both the
295 | * top and bottom margin (but not the left or right margin --
296 | * for lists, that would cause the bullets to disappear.)
297 | */
298 | p.indent-wrapped-lines { padding: 0 0 0 7em; text-indent: -7em;
299 | margin: 0; }
300 | .nomargin-top { margin-top: 0; }
301 | .nomargin { margin-top: 0; margin-bottom: 0; }
302 |
303 | /* HTML Log */
304 | div.log-block { padding: 0; margin: .5em 0 .5em 0;
305 | background: #e8f0f8; color: #000000;
306 | border: 1px solid #000000; }
307 | div.log-error { padding: .1em .3em .1em .3em; margin: 4px;
308 | background: #ffb0b0; color: #000000;
309 | border: 1px solid #000000; }
310 | div.log-warning { padding: .1em .3em .1em .3em; margin: 4px;
311 | background: #ffffb0; color: #000000;
312 | border: 1px solid #000000; }
313 | div.log-info { padding: .1em .3em .1em .3em; margin: 4px;
314 | background: #b0ffb0; color: #000000;
315 | border: 1px solid #000000; }
316 | h2.log-hdr { background: #70b0ff; color: #000000;
317 | margin: 0; padding: 0em 0.5em 0em 0.5em;
318 | border-bottom: 1px solid #000000; font-size: 110%; }
319 | p.log { font-weight: bold; margin: .5em 0 .5em 0; }
320 | tr.opt-changed { color: #000000; font-weight: bold; }
321 | tr.opt-default { color: #606060; }
322 | pre.log { margin: 0; padding: 0; padding-left: 1em; }
323 |
--------------------------------------------------------------------------------
/documentation/tweetokenize.Tokenizer-class.html:
--------------------------------------------------------------------------------
[epydoc-generated HTML class page for tweetokenize.Tokenizer; the markup was
lost in extraction. Condensed to the recoverable content:

Class Tokenizer(object) -- "Can be used to tokenize a string representation
of a message, adjusting features based on the given configuration details,
to enable further processing in feature extraction and training stages."

Example usage:

>>> from tweetokenize import Tokenizer
>>> gettokens = Tokenizer(usernames='USER', urls='')
>>> gettokens.tokenize('@justinbeiber yo man!love you#inlove#wantyou in a totally straight way #brotime <3:p:D www.justinbeiber.com')
[u'USER', u'yo', u'man', u'!', u'love', u'you', u'#inlove', u'#wantyou', u'in', u'a', u'totally', u'straight', u'way', u'#brotime', u'<3', u':p', u':D']

Documented methods: __init__(self, lowercase=True, allcapskeep=True,
normalize=3, usernames='USERNAME', urls='URL', hashtags=False,
phonenumbers='PHONENUMBER', times='TIME', numbers='NUMBER',
ignorequotes=False, ignorestopwords=False), __call__(self, iterable),
update(self, **kwargs), tokenize(self, message), emoticons(self,
iterable=None, filename=None), stopwords(self, iterable=None,
filename=None); plus the methods inherited from object. The parameter
documentation on this page duplicates the docstrings in
tweetokenize/tokenizer.py.

Class variables (the regular expressions driving tokenization):

usernames_re = re.compile(r'@\w{1,15}')
hashtags_re = re.compile(r'#\w+[\w\'-]*\w+')
ellipsis_re = re.compile(r'\.\.+')
word_re = re.compile(r'(?:[a-zA-Z0-9]+[\'-]?[a-zA-Z]+[a-zA-Z0-9]*)|(?:[a-zA-Z0-9]*[a-zA-Z]+[\'-]?[a-zA-Z0-9]+)')
times_re = re.compile(r'\d{1,2}:\d{2}(?::\d{2})?\s*(?:AM|PM|am|pm)?')
phonenumbers_re = re.compile(r'(?:\+?[01][-\s\.]*)?(?:\(?\d{3}[-\s\.\)]*)?\d{3}[-\s\.]*\d{4}(?:\s*x\s*\d+)?(?=\s+|$)')
numbers_re = re.compile(r'(?:[\+-]?\$?\d+(?:\.\d+)?(?:[eE]-?\d+)?%?)(?![A-Za-z])(?:\s*/\s*(?:[\+-]?\$?\d+(?:\.\d+)?(?:[eE]-?\d+)?%?)(?![A-Za-z]))?')
other_re = '(?:[^#\\s\\.]|\\.(?!\\.))+'
html_entities_re = re.compile(r'&#?\w+;')
repeating_re = re.compile(r'([a-zA-Z])\1\1+')
punctuation = u'!$%()*+,-/:;<=>?[\]^_.`{|}~'“”""‘’""'
quotes_re = re.compile(r'(\u201c.*?\u201d)|(".*?")|(\u2018.*?\u2019)|(\uff02.*?\uff02)|\s(\'.*?\')\s')
html_entities = {'AElig': u'Æ', 'Aacute': u'Á', 'Acirc': u'Â', 'Agrave': u'À', ...} (HTML entity name to unicode character; truncated on the page)

The values of urls_re and tokenize_re are truncated on the page and are not
reproduced here.]
--------------------------------------------------------------------------------