├── MANIFEST.in ├── AUTHORS.rst ├── .travis.yml ├── CHANGELOG.rst ├── .gitignore ├── LICENSE ├── setup.py ├── test.py ├── README.md └── bow.py /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include LICENSE 2 | include README.md 3 | include CHANGELOG.rst 4 | include AUTHORS.rst 5 | -------------------------------------------------------------------------------- /AUTHORS.rst: -------------------------------------------------------------------------------- 1 | ================== 2 | bagofwords authors 3 | ================== 4 | 5 | * `David Miró `_ 6 | * `Ivan `_ 7 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | python: 3 | - "2.7" 4 | - "3.5" 5 | before_install: 6 | install: 7 | - pip install stop-words 8 | - pip install PyStemmer 9 | - pip install six 10 | script: 11 | - python setup.py test 12 | -------------------------------------------------------------------------------- /CHANGELOG.rst: -------------------------------------------------------------------------------- 1 | 1.0.2 2 | ===== 3 | * Initial version. 4 | * Feature: document_classifier method 5 | * Feature: DefaultTokenizer, SimpleTokenizer, HtmlTokenizer Class 6 | * Feature: DefaultDocument, SimpleDocument, HtmlDocument Class 7 | * Feature: DefaultDocumentClass, SimpleDocumentClass, HtmlDocumentClass Class 8 | * Feature: Document, DocumentClass Class 9 | * Feature: Tokenizer, TextFilters, WordFilters Class 10 | * Feature: BagOfWords Class 11 | 12 | 13 | 14 | 15 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | 5 | # C extensions 6 | *.so 7 | 8 | # Distribution / packaging 9 | .Python 10 | env/ 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | lib/ 17 | lib64/ 18 | parts/ 19 | sdist/ 20 | var/ 21 | *.egg-info/ 22 | .installed.cfg 23 | *.egg 24 | 25 | # PyInstaller 26 | # Usually these files are written by a python script from a template 27 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
28 | *.manifest 29 | *.spec 30 | 31 | # Installer logs 32 | pip-log.txt 33 | pip-delete-this-directory.txt 34 | 35 | # Unit test / coverage reports 36 | htmlcov/ 37 | .tox/ 38 | .coverage 39 | .cache 40 | nosetests.xml 41 | coverage.xml 42 | 43 | # Translations 44 | *.mo 45 | *.pot 46 | 47 | # Django stuff: 48 | *.log 49 | 50 | # Sphinx documentation 51 | docs/_build/ 52 | 53 | # PyBuilder 54 | target/ 55 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2015 David Miró 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | 23 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | PROJECT = "bagofwords" 4 | 5 | long_description = '' 6 | 7 | try: 8 | import subprocess 9 | import pandoc 10 | 11 | process = subprocess.Popen( 12 | ['which pandoc'], 13 | shell=True, 14 | stdout=subprocess.PIPE, 15 | universal_newlines=True 16 | ) 17 | 18 | pandoc_path = process.communicate()[0] 19 | pandoc_path = pandoc_path.strip('\n') 20 | 21 | pandoc.core.PANDOC_PATH = pandoc_path 22 | 23 | doc = pandoc.Document() 24 | doc.markdown = open('README.md').read() 25 | 26 | long_description = doc.rst 27 | 28 | except Exception: 29 | pass 30 | 31 | setup( 32 | name=PROJECT, 33 | version=__import__("bow").__version__, 34 | author = "David Miro ", 35 | author_email = 'lite.3engine@gmail.com', 36 | description = "The main goal of this Python module is to provide functions to apply Text Classification.", 37 | long_description=long_description, 38 | license=open('LICENSE').read(), 39 | url='https://github.com/dmiro/bagofwords', 40 | classifiers=[ 41 | 'Development Status :: 5 - Production/Stable', 42 | 'Environment :: Console', 43 | 'Intended Audience :: Science/Research', 44 | 'Intended Audience :: Education', 45 | 'Intended Audience :: Developers', 46 | 'Intended Audience :: Information Technology', 47 | 'Programming Language :: Python', 48 | 'Programming Language :: Python :: 2.7', 49 | 'Programming Language :: Python :: 3.5', 50 | 'Topic :: Scientific/Engineering :: Information Analysis', 51 | 'License :: OSI Approved :: MIT License' 52 | ], 53 | py_modules=['bow'], 54 | entry_points = { 55 | 'console_scripts': ['bow = bow:main'] 56 | }, 57 | 
install_requires=[ 58 | 'stop-words', 59 | 'PyStemmer', 60 | 'six' 61 | ], 62 | test_suite = 'test', 63 | platforms=['Any'], 64 | zip_safe=False 65 | ) 66 | -------------------------------------------------------------------------------- /test.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import, division, print_function, unicode_literals 3 | import unittest 4 | from unittest import TestCase 5 | import bow 6 | from bow import BagOfWords, TextFilters, WordFilters, Tokenizer, SimpleTokenizer, DefaultTokenizer, DocumentClass, DefaultDocumentClass, DefaultDocument, SimpleDocument 7 | import mock 8 | import six 9 | 10 | 11 | if six.PY2: 12 | TestCase.assertCountEqual = TestCase.assertEqual 13 | 14 | 15 | class BagOfWordsTest(TestCase): 16 | def __init__(self, *args, **kwargs): 17 | super(BagOfWordsTest, self).__init__(*args, **kwargs) 18 | 19 | def setUp(self): 20 | self.bow = BagOfWords() 21 | 22 | def test_add_one_word(self): 23 | self.bow.add('David') 24 | self.bow.add({'David':2}) 25 | self.assertCountEqual(self.bow.words(), ['David']) 26 | self.assertEqual(len(self.bow), 1) 27 | self.assertEqual(self.bow.num(), 3) 28 | self.assertEqual(self.bow.freq('David'), 3) 29 | self.assertCountEqual(dict(self.bow), {'David':3}) 30 | 31 | def test_add_two_words(self): 32 | self.bow.add('David', ['David','Álex']) 33 | self.assertCountEqual(self.bow.words(), ['Álex', 'David']) 34 | self.assertEqual(len(self.bow), 2) 35 | self.assertEqual(self.bow.num(), 3) 36 | self.assertEqual(self.bow.freq('David'), 2) 37 | self.assertCountEqual(dict(self.bow), {'Álex':1, 'David':2}) 38 | 39 | def test_del_one_word(self): 40 | self.bow.delete('David') 41 | self.assertCountEqual(dict(self.bow), {}) 42 | # 43 | self.bow.add('David') 44 | self.bow.delete('David') 45 | self.assertCountEqual(dict(self.bow), {}) 46 | # 47 | self.bow.add('David', 'David') 48 | self.bow.delete('David') 49 | self.assertCountEqual(self.bow.words(), ['David']) 50 | self.assertEqual(len(self.bow), 1) 51 | self.assertEqual(self.bow.num(), 1) 52 | self.assertEqual(self.bow.freq('David'), 1) 53 | self.assertCountEqual(dict(self.bow), {'David':1}) 54 | 55 | def test_del_two_word(self): 56 | self.bow.delete('David', 'Álex') 57 | self.assertCountEqual(dict(self.bow), {}) 58 | # 59 | self.bow.add('David', 'Álex') 60 | self.bow.delete('David', 'Álex') 61 | self.assertCountEqual(dict(self.bow), {}) 62 | # 63 | self.bow.add({'David':2}) 64 | self.bow.delete('David') 65 | self.bow.add('Álex') 66 | self.assertCountEqual(self.bow.words(), ['Álex', 'David']) 67 | self.assertEqual(len(self.bow), 2) 68 | self.assertEqual(self.bow.num(), 2) 69 | self.assertEqual(self.bow.freq('David'), 1) 70 | self.assertCountEqual(dict(self.bow), {'Álex':1, 'David':1}) 71 | 72 | def test_join_add(self): 73 | a = BagOfWords('car', 'chair', 'chicken') 74 | b = BagOfWords({'chicken':2}, ['eye', 'ugly']) 75 | c = BagOfWords('plane') 76 | self.assertCountEqual(dict(a + b + c), {'car': 1, 'chair': 1, 'eye': 1, 'chicken': 3, 'plane': 1, 'ugly': 1}) 77 | self.assertCountEqual(dict(c + b + a), {'car': 1, 'chair': 1, 'eye': 1, 'chicken': 3, 'plane': 1, 'ugly': 1}) 78 | self.assertCountEqual(dict(b + c + a), {'car': 1, 'chair': 1, 'eye': 1, 'chicken': 3, 'plane': 1, 'ugly': 1}) 79 | # 80 | total = a + b + c 81 | total = 'ugly' + total 82 | self.assertCountEqual(dict(total), {'car': 1, 'chair': 1, 'eye': 1, 'chicken': 3, 'plane': 1, 'ugly': 2}) 83 | # 84 | total = a + b + c 85 | total = 
'ugly' + total 86 | total = total + 'plane' 87 | self.assertCountEqual(dict(total), {'car': 1, 'chair': 1, 'eye': 1, 'chicken': 3, 'plane': 2, 'ugly': 2}) 88 | # 89 | total = a + b + c 90 | total = total + ['car', 'chair', 'chicken'] + ['chicken', 'chicken', 'eye'] 91 | self.assertCountEqual(dict(total), {'car': 2, 'chair': 2, 'eye': 2, 'chicken': 6, 'plane': 1, 'ugly': 1}) 92 | 93 | def test_join_sub(self): 94 | a = BagOfWords('car', 'chair', 'chicken') 95 | b = BagOfWords({'chicken':2}, ['eye', 'ugly']) 96 | c = BagOfWords('plane') 97 | self.assertCountEqual(dict(a - b - c), {'car': 1, 'chair': 1}) 98 | self.assertCountEqual(dict(c - b - a), {'plane': 1}) 99 | self.assertCountEqual(dict(b - c - a), {'chicken':1, 'eye':1, 'ugly':1}) 100 | # 101 | total = b - c - a 102 | total = 'eye' - total 103 | self.assertCountEqual(dict(total), {'chicken':1, 'ugly':1}) 104 | # 105 | total = b - c - a 106 | total = 'eye' - total 107 | total = total - 'eye' 108 | self.assertCountEqual(dict(total), {'chicken':1, 'ugly':1}) 109 | # 110 | total = b - c - a 111 | total = total - ['chicken', 'ugly'] 112 | self.assertCountEqual(dict(total), {'eye':1}) 113 | 114 | def test_clear(self): 115 | self.bow.add('item', 'item') 116 | self.bow.clear() 117 | self.assertEqual(len(self.bow), 0) 118 | self.assertEqual(self.bow.num(), 0) 119 | self.assertEqual(self.bow.freq('item'), 0) 120 | self.assertCountEqual(dict(self.bow), {}) 121 | 122 | def test_item(self): 123 | self.bow.add('item1', 'item2', 'item2', 'item3') 124 | self.assertEqual(self.bow['item2'], 2) 125 | self.assertEqual(self.bow['item3'], 1) 126 | self.assertEqual(self.bow['item1'], 1) 127 | 128 | def test_copy(self): 129 | a = BagOfWords('car', 'chair', 'chicken') 130 | b = a.copy() 131 | self.assertEqual(a == b, True) 132 | 133 | def test_del(self): 134 | self.bow.add(['car', 'chair', 'chicken']) 135 | del self.bow['car'] 136 | self.assertCountEqual(dict(self.bow), {'chair':1, 'chicken':1}) 137 | 138 | def test_cmp(self): 139 | a = BagOfWords('car', 'chair', 'chicken') 140 | b = BagOfWords('car', 'chair', 'chicken') 141 | self.assertEqual(a == b, True) 142 | # 143 | a.add('car') 144 | self.assertEqual(a == b, False) 145 | 146 | def test_has_key(self): 147 | self.bow.add('car', 'chair', 'chicken') 148 | self.assertEqual('car' in self.bow, True) 149 | self.assertEqual('car' in self.bow, True) 150 | 151 | def test_rate(self): 152 | self.bow.add(['b','a','a','a']) 153 | self.assertCountEqual(self.bow.rates, {'a':0.75, 'b':0.25}) 154 | self.assertCountEqual(self.bow.sorted_rates, [('a', 0.75), ('b', 0.25)]) 155 | self.assertEqual(self.bow.rate('a'), 0.75) 156 | self.assertEqual(self.bow.rate('b'), 0.25) 157 | self.assertEqual(self.bow.rate('c'), 0) 158 | # 159 | self.bow.clear() 160 | self.assertEqual(self.bow.rate('a'), 0) 161 | 162 | 163 | class TokenizerTest(TestCase): 164 | 165 | def test_default_tokenizer(self): 166 | tokens = DefaultTokenizer() 167 | words = tokens('How do you convert a tuple to a list?'); 168 | self.assertCountEqual(words, ['convert', 'tupl', 'list']) 169 | # 170 | words = tokens.tokenizer('How do you convert a tuple to a list?'); 171 | self.assertCountEqual(words, ['convert', 'tupl', 'list']) 172 | # 173 | tokens = DefaultTokenizer(stemming=0) 174 | words = tokens('How do you convert a tuple to a list?'); 175 | self.assertCountEqual(words, ['convert', 'tuple', 'list']) 176 | # 177 | tokens = DefaultTokenizer(lang='', stemming=0) 178 | words = tokens('How do you convert a tuple to a list?'); 179 | self.assertCountEqual(words, ['how', 
'do', 'you', 'convert', 'a', 'tuple', 'to', 'a', 'list']) 180 | # 181 | tokens = DefaultTokenizer(lang='spanish') 182 | words = tokens('Cómo convertir una tupla a lista?'); 183 | self.assertCountEqual(words, ['com', 'convert', 'tupl', 'list']) 184 | # 185 | tokens = DefaultTokenizer(lang='spanish', stemming=0) 186 | words = tokens('Cómo convertir una tupla a lista?'); 187 | self.assertCountEqual(words, ['como', 'convertir', 'tupla', 'lista']) 188 | # 189 | tokens = DefaultTokenizer(lang='', stemming=0) 190 | words = tokens('Cómo convertir una tupla a lista?'); 191 | self.assertCountEqual(words, ['como', 'convertir', 'una', 'tupla', 'a', 'lista']) 192 | 193 | def test_simple_tokenizer(self): 194 | tokens = SimpleTokenizer() 195 | words = tokens('How, do you convert - a tuple to a list?'); 196 | self.assertCountEqual(words, ['how', 'do', 'you', 'convert', 'a', 'tuple', 'to', 'a', 'list']) 197 | 198 | def test_tokenizer(self): 199 | 200 | class _MyTokenizer(Tokenizer): 201 | 202 | def __init__(self): 203 | Tokenizer.__init__(self) 204 | 205 | def before_tokenizer(self, textfilters, text): 206 | text = textfilters.upper(text) 207 | return text 208 | 209 | def after_tokenizer(self, wordfilters, words): 210 | words = wordfilters.normalize(words) 211 | return words 212 | tokens = _MyTokenizer() 213 | words = tokens('How, do you convert - a tuple to a list?'); 214 | self.assertCountEqual(words, ['HOW,', 'DO', 'YOU', 'CONVERT', '-', 'A', 'TUPLE', 'TO', 'A', 'LIST?']) 215 | # 216 | class _MyTokenizer(Tokenizer): 217 | 218 | def __init__(self): 219 | Tokenizer.__init__(self) 220 | 221 | def before_tokenizer(self, textfilters, text): 222 | text = textfilters.html_to_text(text) 223 | text = textfilters.invalid_chars(text) 224 | text = textfilters.lower(text) 225 | return text 226 | 227 | def after_tokenizer(self, wordfilters, words): 228 | words = wordfilters.stopwords('english', words) 229 | words = wordfilters.normalize(words) 230 | return words 231 | tokens = _MyTokenizer() 232 | text = ''' 233 | 235 | 236 | 237 | 238 | 239 | 240 | 241 |

my project!!


242 | Description:
243 | This small script is intended to allow conversion from HTML markup to plain text. 244 | 245 | 246 | ''' 247 | words = tokens(text) 248 | self.assertCountEqual(words, ['project', 'description', 'small', 'script', 'intended', 'allow', 'conversion', 249 | 'html', 'markup', 'plain', 'text']) 250 | 251 | 252 | class DocumentClassTest(TestCase): 253 | 254 | def test_default_document_class(self): 255 | dclass = DefaultDocumentClass() 256 | dclass('hello a beautiful world!', 'text one') 257 | dclass('hello the Moon!', 'text two') 258 | dclass('hello the world!', 'text one') 259 | self.assertCountEqual(dclass.docs, {'text two': {'hello': 1, 'moon': 1}, 'text one': {'world': 1, 'hello': 1}}) 260 | self.assertEqual(dclass, {'world': 1, 'hello': 2, 'moon': 1}) 261 | self.assertEqual(dclass.numdocs, 2) 262 | 263 | def test_default_document(self): 264 | dclass = DefaultDocument() 265 | dclass('hello a beautiful world!') 266 | dclass('hello the Moon!') 267 | dclass('hello the world!') 268 | self.assertEqual(dclass, {'world': 2, 'hello': 3, 'beauti': 1, 'moon': 1}) 269 | self.assertEqual(dclass.numdocs, 3) 270 | 271 | def test_json(self): 272 | dclass = DefaultDocumentClass(lang='spanish') 273 | dclass.read_text('Hola mundo!', id_='1') 274 | dclass.read_text('Este es un bonito mundo', id_='2') 275 | json_ = dclass.to_json() 276 | dclass = DocumentClass.from_json(json_) 277 | self.assertCountEqual(dclass.__class__.__name__ , 'DefaultDocumentClass') 278 | self.assertCountEqual(dclass.docs, {'2': {'mund': 1, 'bonit': 1}, '1': {'mund': 1, 'hol': 1}}) 279 | self.assertEqual(dclass, {'mund': 2, 'hol': 1, 'bonit': 1}) 280 | self.assertEqual(dclass.numdocs, 2) 281 | self.assertEqual(dclass.lang, 'spanish') 282 | self.assertEqual(dclass.stemming, 1) 283 | 284 | class DocumentClassifierTest(TestCase): 285 | 286 | def test_simple(self): 287 | docnumbers = bow.SimpleDocument() 288 | docnumbers('one two three four') 289 | docnumbers('five six seven') 290 | docanimals = bow.SimpleDocument() 291 | docanimals('dog cat') 292 | docanimals('horse frog') 293 | docanimals('dog cat') 294 | docanimals('dog cat') 295 | docanimals('dog cat') 296 | docvehicles = bow.SimpleDocument() 297 | docvehicles('truck car') 298 | doc = bow.SimpleDocument() 299 | doc('I am a cat') 300 | result = bow.document_classifier(doc, numbers=docnumbers, animals=docanimals, vehicles=docvehicles) 301 | self.assertCountEqual(result, [('animals', 0.6785714285714286), ('numbers', 0.25), ('vehicles', 0.07142857142857142)]) 302 | doc.clear() 303 | doc('one dog, one cat, three trucks') 304 | result = bow.document_classifier(doc, numbers=docnumbers, animals=docanimals, vehicles=docvehicles) 305 | self.assertCountEqual(result, [('numbers', 0.7302518458581976), ('animals', 0.2555881460503691), ('vehicles', 0.014160008091433189)]) 306 | 307 | def test_save_document(self): 308 | if six.PY3: 309 | # skip this test if python 3 310 | return 311 | m = mock.mock_open() 312 | with mock.patch('bow.open', m, create=True): 313 | docnumbers = bow.SimpleDocument() 314 | docnumbers('one two three four') 315 | docnumbers('one two three') 316 | docnumbers.save('test.dat') 317 | # print(m.mock_calls) 318 | m.assert_called_once_with('test.dat','w') 319 | handle = m() 320 | data = '{"__module__": "bow", "numdocs": 2, "__class__": "SimpleDocument", "_bow": {"four": 1, "three": 2, "two": 2, "one": 2}}' 321 | handle.write.assert_called_once_with(data) 322 | 323 | def test_load_document(self): 324 | m = mock.mock_open() 325 | data = '{"__module__": "bow", "numdocs": 2, 
"__class__": "SimpleDocument", "_bow": {"four": 1, "three": 2, "two": 2, "one": 2}}' 326 | with mock.patch('bow.open', mock.mock_open(read_data=data), create=True) as m: 327 | docnumbers = SimpleDocument.load('test.dat') 328 | m.assert_called_once_with('test.dat','r') 329 | self.assertEqual(docnumbers, {'four': 1, 'one': 2, 'three': 2, 'two': 2}) 330 | 331 | 332 | if __name__ == '__main__': 333 | unittest.main() 334 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # bagofwords 2 | 3 | [![Build Status](https://travis-ci.org/dmiro/bagofwords.svg)](https://travis-ci.org/dmiro/bagofwords) 4 | [![Latest Version](http://badge.kloud51.com/pypi/v/bagofwords.svg)](https://pypi.python.org/pypi/bagofwords/) 5 | [![Downloads](http://badge.kloud51.com/pypi/d/bagofwords.svg)](https://pypi.python.org/pypi/bagofwords/) 6 | [![Supported Python versions](http://badge.kloud51.com/pypi/py_versions/bagofwords.svg)](https://pypi.python.org/pypi/bagofwords/) 7 | [![Development Status](http://badge.kloud51.com/pypi/s/bagofwords.svg)](https://pypi.python.org/pypi/bagofwords/) 8 | [![License](http://badge.kloud51.com/pypi/l/bagofwords.svg)](https://pypi.python.org/pypi/bagofwords/) 9 | 10 | 11 | Introduction 12 | ------------ 13 | 14 | A Python module that allows you to create and manage a collection of occurrence counts of words without regard to grammar. The main purpose is provide a set of classes to manage several document classifieds by category in order to apply **Text Classification**. 15 | 16 | You can make use via **API** or via **Command Line**. For example, you can generate your classified documents (*learn*) via Command Line and after via API classify an input document. 17 | 18 | #### Third parties modules 19 | 20 | Module uses thress third parties modules 21 | 22 | * [stop_words](https://github.com/Alir3z4/python-stop-words) 23 | * [pystemmer](https://github.com/snowballstem/pystemmer) 24 | * [six](https://bitbucket.org/gutworth/six) 25 | 26 | The first module is used in **stop_words filter**, the second module is used in **stemming filter**. If you don't use these two filters, you don't need install them. 27 | 28 | 29 | Installation 30 | ------------ 31 | 32 | Install it via `pip` 33 | 34 | `$ [sudo] pip install bagofwords` 35 | 36 | Or download zip and then install it by running 37 | 38 | `$ [sudo] python setup.py install` 39 | 40 | You can test it by running 41 | 42 | `$ [sudo] python setup.py test` 43 | 44 | 45 | Uninstallation 46 | -------------- 47 | 48 | `$ [sudo] pip uninstall bagofwords` 49 | 50 | 51 | Python API 52 | ---------- 53 | 54 | 55 | #### Methods 56 | 57 | * `document_classifier(document, **classifieds)` Text classification based on an implementation of Naive Bayes 58 | 59 | 60 | Module contains two main classes `DocumentClass` and `Document` and four secondary classes `BagOfWords`, `WordFilters`, `TextFilters` and `Tokenizer` 61 | 62 | #### Main classes 63 | 64 | * `DocumentClass` Implementing a bag of words collection where all the bags of words are the same category, as well as a bag of words with the entire collection of words. Each bag of words has an identifier otherwise it's assigned an calculated identifier. Retrieves the text of a file, folder, url or zip, and also allows save or retrieve 65 | the collection in json format. 66 | * `Document` Implementing a bag of words where all words are of the same category. 
Retrieves the text of a file, folder, url or zip, and also allows saving or retrieving the Document in json format. 67 | 68 | 69 | #### Secondary classes 70 | 71 | * `BagOfWords` Implements a bag of words with their frequency of usage. 72 | * `TextFilters` Filters for transforming a text. It's used in the Tokenizer class. Includes the filters `upper`, `lower`, `invalid_chars` and `html_to_text` 73 | * `WordFilters` Filters for transforming a set of words. It's used in the Tokenizer class. Includes the filters `stemming`, `stopwords` and `normalize` 74 | * `Tokenizer` Breaks a string into tokens (a set of words). Optionally allows you to set filters before (TextFilters) and after (WordFilters) breaking the string into tokens. 75 | 76 | 77 | #### Subclasses 78 | 79 | * Tokenizer subclasses `DefaultTokenizer`, `SimpleTokenizer` and `HtmlTokenizer`, which implement the most common filters by overriding the **before_tokenizer** and **after_tokenizer** methods 80 | * Document subclasses `DefaultDocument`, `SimpleDocument` and `HtmlDocument` 81 | * DocumentClass subclasses `DefaultDocumentClass`, `SimpleDocumentClass` and `HtmlDocumentClass` 82 | 83 | 84 | Command Line Tool 85 | ----------------- 86 | 87 | ``` 88 | usage: bow [-h] [--version] {create,learn,show,classify} ... 89 | 90 | Manage several document to apply text classification. 91 | 92 | positional arguments: 93 | {create,learn,show,classify} 94 | create create classifier 95 | learn add words learned a classifier 96 | show show classifier info 97 | classify Naive Bayes text classification 98 | 99 | optional arguments: 100 | -h, --help show this help message and exit 101 | --version show version and exit 102 | ``` 103 | 104 | **Create Command** 105 | ``` 106 | usage: bow create [-h] [--lang-filter LANG_FILTER] 107 | [--stemming-filter STEMMING_FILTER] 108 | {text,html} filename 109 | 110 | positional arguments: 111 | {text,html} filter type 112 | filename file to be created where words learned are saved 113 | 114 | optional arguments: 115 | -h, --help show this help message and exit 116 | --lang-filter LANG_FILTER 117 | language text where remove empty words 118 | --stemming-filter STEMMING_FILTER 119 | number loops of lemmatizing 120 | ``` 121 | 122 | **Learn Command** 123 | ``` 124 | usage: bow learn [-h] [--file FILE [FILE ...]] [--dir DIR [DIR ...]] 125 | [--url URL [URL ...]] [--zip ZIP [ZIP ...]] [--no-learn] 126 | [--rewrite] [--list-top-words LIST_TOP_WORDS] 127 | filename 128 | 129 | positional arguments: 130 | filename file to write words learned 131 | 132 | optional arguments: 133 | -h, --help show this help message and exit 134 | --file FILE [FILE ...] 135 | filenames to learn 136 | --dir DIR [DIR ...] directories to learn 137 | --url URL [URL ...] url resources to learn 138 | --zip ZIP [ZIP ...] zip filenames to learn 139 | --no-learn not write to file the words learned 140 | --rewrite overwrite the file 141 | --list-top-words LIST_TOP_WORDS 142 | maximum number of words to list, 50 by default, -1 143 | list all 144 | ``` 145 | 146 | **Show Command** 147 | ``` 148 | usage: bow show [-h] [--list-top-words LIST_TOP_WORDS] filename 149 | 150 | positional arguments: 151 | filename filename 152 | 153 | optional arguments: 154 | -h, --help show this help message and exit 155 | --list-top-words LIST_TOP_WORDS 156 | maximum number of words to list, 50 by default, -1 157 | list all 158 | ``` 159 | 160 | **Classify Command** 161 | ``` 162 | usage: bow classify [-h] [--file FILE] [--url URL] [--text TEXT] 163 | classifiers [classifiers ...] 
164 | 165 | positional arguments: 166 | classifiers classifiers 167 | 168 | optional arguments: 169 | -h, --help show this help message and exit 170 | --file FILE file to classify 171 | --url URL url resource to classify 172 | --text TEXT text to classify 173 | ``` 174 | 175 | Example 176 | ------- 177 | 178 | Previously you need to download a spam corpus **enron-spam dataset**. For example you can download a compressed file that includes a directory with **1500 spam emails** and a directory with **4012 ham emails**. 179 | 180 | ``` 181 | http://www.aueb.gr/users/ion/data/enron-spam/preprocessed/enron3.tar.gz 182 | ``` 183 | 184 | Now we will create the **spam** and **ham** classifiers 185 | 186 | ``` 187 | $ bow create text spam 188 | * filename: spam 189 | * filter: 190 | type: DefaultDocument 191 | lang: english 192 | stemming: 1 193 | * total words: 0 194 | * total docs: 0 195 | ``` 196 | 197 | ``` 198 | $ bow create text ham 199 | * filename: ham 200 | * filter: 201 | type: DefaultDocument 202 | lang: english 203 | stemming: 1 204 | * total words: 0 205 | * total docs: 0 206 | ``` 207 | 208 | It's time to learn 209 | 210 | ``` 211 | $ bow learn spam --dir enron3/spam 212 | 213 | current 214 | ======= 215 | * filename: spam 216 | * filter: 217 | type: DefaultDocument 218 | lang: english 219 | stemming: 1 220 | * total words: 0 221 | * total docs: 0 222 | 223 | updated 224 | ======= 225 | * filename: spam 226 | * filter: 227 | type: DefaultDocument 228 | lang: english 229 | stemming: 1 230 | * total words: 223145 231 | * total docs: 1500 232 | * pos | word (top 50) | occurrence | rate 233 | --- | ----------------------------------- | ---------- | ---------- 234 | 1 | " | 2438 | 0.01092563 235 | 2 | subject | 1662 | 0.00744807 236 | 3 | compani | 1659 | 0.00743463 237 | 4 | s | 1499 | 0.00671761 238 | 5 | will | 1194 | 0.00535078 239 | 6 | com | 978 | 0.00438280 240 | 7 | statement | 935 | 0.00419010 241 | 8 | secur | 908 | 0.00406910 242 | 9 | inform | 880 | 0.00394362 243 | 10 | e | 802 | 0.00359408 244 | 11 | can | 798 | 0.00357615 245 | 12 | http | 779 | 0.00349100 246 | 13 | pleas | 743 | 0.00332967 247 | 14 | invest | 740 | 0.00331623 248 | 15 | de | 739 | 0.00331175 249 | 16 | o | 733 | 0.00328486 250 | 17 | 1 | 732 | 0.00328038 251 | 18 | 2 | 709 | 0.00317731 252 | 19 | stock | 700 | 0.00313697 253 | 20 | price | 664 | 0.00297564 254 | .... 
255 | ``` 256 | 257 | ``` 258 | $ bow learn ham --dir enron3/ham 259 | 260 | current 261 | ======= 262 | * filename: ham 263 | * filter: 264 | type: DefaultDocument 265 | lang: english 266 | stemming: 1 267 | * total words: 0 268 | * total docs: 0 269 | 270 | updated 271 | ======= 272 | * filename: ham 273 | * filter: 274 | type: DefaultDocument 275 | lang: english 276 | stemming: 1 277 | * total words: 1293023 278 | * total docs: 4012 279 | * pos | word (top 50) | occurrence | rate 280 | --- | ----------------------------------- | ---------- | ---------- 281 | 1 | enron | 29805 | 0.02305063 282 | 2 | s | 22438 | 0.01735313 283 | 3 | " | 15712 | 0.01215137 284 | 4 | compani | 12039 | 0.00931074 285 | 5 | said | 9470 | 0.00732392 286 | 6 | will | 8862 | 0.00685371 287 | 7 | 2001 | 8293 | 0.00641365 288 | 8 | subject | 7167 | 0.00554282 289 | 9 | 1 | 5887 | 0.00455290 290 | 10 | trade | 5718 | 0.00442220 291 | 11 | energi | 5599 | 0.00433016 292 | 12 | market | 5498 | 0.00425205 293 | 13 | new | 5278 | 0.00408191 294 | 14 | 2 | 4742 | 0.00366737 295 | 15 | dynegi | 4651 | 0.00359700 296 | 16 | stock | 4594 | 0.00355291 297 | 17 | 10 | 4545 | 0.00351502 298 | 18 | year | 4517 | 0.00349336 299 | 19 | power | 4503 | 0.00348254 300 | 20 | share | 4393 | 0.00339746 301 | .... 302 | `````` 303 | 304 | Finally, we can classify a text file or url 305 | 306 | ``` 307 | $ bow classify spam ham --text "company" 308 | 309 | * classifier | rate 310 | ----------------------------------- | ---------- 311 | ham | 0.87888743 312 | spam | 0.12111257 313 | ``` 314 | 315 | ``` 316 | $ bow classify spam ham --text "new lottery" 317 | 318 | * classifier | rate 319 | ----------------------------------- | ---------- 320 | spam | 0.96633627 321 | ham | 0.03366373 322 | ``` 323 | 324 | ``` 325 | $ bow classify spam ham --text "Subject: a friendly professional online pharmacy focused on you !" 326 | 327 | * classifier | rate 328 | ----------------------------------- | ---------- 329 | spam | 0.99671480 330 | ham | 0.00328520 331 | ``` 332 | 333 | You should know that it is also possible to classify from python code 334 | 335 | ``` 336 | import bow 337 | 338 | spam = bow.Document.load('spam') 339 | ham = bow.Document.load('ham') 340 | dc = bow.DefaultDocument() 341 | 342 | dc.read_text("company") 343 | result = bow.document_classifier(dc, spam=spam, ham=ham) 344 | 345 | print result 346 | ``` 347 | 348 | Result 349 | 350 | ``` 351 | [('ham', 0.8788874288217258), ('spam', 0.12111257117827418)] 352 | ``` 353 | 354 | 355 | Others examples 356 | ------- 357 | 358 | **Join several bag of words** 359 | 360 | ``` 361 | from bow import BagOfWords 362 | 363 | a = BagOfWords('car', 'chair', 'chicken') 364 | b = BagOfWords({'chicken':2}, ['eye', 'ugly']) 365 | c = BagOfWords('plane') 366 | 367 | print a + b + c 368 | print a - b - c 369 | ``` 370 | 371 | Result 372 | 373 | ``` 374 | {'eye': 1, 'car': 1, 'ugly': 1, 'plane': 1, 'chair': 1, 'chicken': 3} 375 | {'car': 1, 'chair': 1} 376 | ``` 377 | 378 | **HTML document class** 379 | 380 | ``` 381 | from bow import HtmlDocumentClass 382 | 383 | html_one = ''' 384 | 385 | 386 | 387 | bag of words demo 388 | 389 | 390 | 391 | 392 | 393 | 394 |

This is a demo

395 |

This a text example of my bag of words demo!

396 | I hope this demo is useful for you 397 | 398 | 399 | 400 | ''' 401 | 402 | html_two = ''' 403 | 404 | 405 | 406 | Another silly example. 407 | 408 | ''' 409 | 410 | dclass = HtmlDocumentClass(lang='english', stemming=0) 411 | dclass(id_='doc1', text=html_one) 412 | dclass(id_='doc2', text=html_two) 413 | print 'docs \n', dclass.docs 414 | print 'total \n', dclass 415 | print 'rates \n', dclass.rates 416 | ``` 417 | 418 | Result 419 | 420 | ``` 421 | >>> 422 | docs 423 | { 424 | 'doc2': {u'silly': 1, u'example': 1, u'another': 1}, 425 | 'doc1': {u'useful': 1, u'text': 1, u'bag': 2, u'words': 2, u'demo': 4, u'example': 1, u'hope': 1} 426 | } 427 | total 428 | { 429 | u'useful': 1, u'another': 1, u'text': 1, u'bag': 2, u'silly': 1, u'words': 2, 430 | u'demo': 4, u'example': 2, u'hope': 1 431 | } 432 | rates 433 | { 434 | u'useful': 0.06666666666666667, u'another': 0.06666666666666667, u'text': 0.06666666666666667, 435 | u'bag': 0.13333333333333333, u'silly': 0.06666666666666667, u'words': 0.13333333333333333, 436 | u'demo': 0.26666666666666666, u'example': 0.13333333333333333, u'hope': 0.06666666666666667 437 | } 438 | >>> 439 | ``` 440 | 441 | 442 | License 443 | ------- 444 | MIT License, see [LICENSE](https://github.com/dmiro/bagofwords/blob/master/LICENSE) 445 | 446 | -------------------------------------------------------------------------------- /bow.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import, division, print_function, unicode_literals 3 | from six import text_type as str 4 | from six import text_type as str 5 | from six.moves import urllib 6 | from six.moves.html_parser import HTMLParser 7 | from zipfile import ZipFile 8 | from json import JSONEncoder, JSONDecoder 9 | 10 | import os 11 | import copy 12 | import uuid 13 | import math 14 | import inspect 15 | import argparse 16 | import unicodedata 17 | 18 | __author__ = 'dmiro' 19 | __version_info__ = (1, 0, 3) 20 | __version__ = '.'.join(str(v) for v in __version_info__) 21 | 22 | 23 | 24 | class BagOfWords(object): 25 | """Implementing a bag of words with their frequency of usages""" 26 | 27 | def __init__(self, *args): 28 | self._bow = {} 29 | self.add(*args) 30 | 31 | def __calc(self, operation, *args): 32 | for words in args: 33 | if isinstance(words, str): 34 | words = [words] 35 | for word in words: 36 | n = 1 37 | if isinstance(words, dict): 38 | n = words[word] 39 | self._bow[word] = operation(self._bow.get(word, 0), n) 40 | if self._bow[word] < 1: 41 | del self._bow[word] 42 | 43 | def add(self, *args): 44 | """Add set of word, word list or word dict to bag of words. 45 | :param args: set of word or word list to add 46 | :return:nothing 47 | """ 48 | self.__calc(lambda x,y: x+y, *args) 49 | 50 | def delete(self, *args): 51 | """Delete set of word, word list or word dict to bag of words. 
52 | :param args: set of word or word list to add 53 | :return:nothing 54 | """ 55 | self.__calc(lambda x,y: x-y, *args) 56 | 57 | @property 58 | def rates(self): 59 | """Rate of occurrences 60 | :return: Dict 61 | """ 62 | total = float(self.num()) 63 | if total: 64 | return {k:v/total for k, v in list(self._bow.items())} 65 | else: 66 | return {} 67 | 68 | @property 69 | def sorted_rates(self): 70 | """Sorted rate of occurrences 71 | :return: list sorted from greater to lowest rate 72 | """ 73 | total = float(self.num()) 74 | if total: 75 | res = [(k,v/total) for k, v in list(self._bow.items())] 76 | return sorted(res, key=lambda t: t[1], reverse=True) 77 | else: 78 | return [] 79 | 80 | def freq(self, word): 81 | """Frequency of a word. 82 | :param word: word to query 83 | :return: frequency 84 | """ 85 | if word in self._bow: 86 | return self._bow[word] 87 | else: 88 | return 0 89 | 90 | def rate(self, word): 91 | """Rate of a word. 92 | :param word: word to query 93 | :return: rate 94 | """ 95 | total = float(self.num()) 96 | if total: 97 | return self.freq(word)/total 98 | else: 99 | return 0 100 | 101 | def __add__(self, other): 102 | """ Overloading of "+" operator to join BagOfWord+BagOfWord, BagOfWords+str or 103 | BagOfWords+list. 104 | :param other: BagOfWords, str or list 105 | :return: BagOfWords 106 | """ 107 | result = self.copy() 108 | if isinstance(other, BagOfWords): 109 | result.add(dict(other)) 110 | else: 111 | result.add(other) 112 | return result 113 | 114 | def __sub__(self, other): 115 | """ Overloading of "-" operator to join BagOfWord+BagOfWord, BagOfWords+str or 116 | BagOfWords+list. 117 | :param other: BagOfWords, str or list 118 | :return: BagOfWords 119 | """ 120 | result = self.copy() 121 | if isinstance(other, BagOfWords): 122 | result.delete(dict(other)) 123 | else: 124 | result.delete(other) 125 | return result 126 | 127 | def __radd__(self, other): 128 | return self.__add__(other) 129 | 130 | def __rsub__(self, other): 131 | return self.__sub__(other) 132 | 133 | def __iter__(self): 134 | return list(self._bow.items()) 135 | 136 | def __getitem__(self, offset): 137 | return self._bow.__getitem__(offset) 138 | 139 | def __len__(self): 140 | return self._bow.__len__() 141 | 142 | def __repr__(self): 143 | return self._bow.__repr__() 144 | 145 | def __delitem__(self, key): 146 | del self._bow[key] 147 | 148 | def __eq__(self, other): 149 | if isinstance(other, BagOfWords): 150 | return self._bow == other._bow 151 | else: 152 | return self._bow == other 153 | 154 | def __ne__(self, other): 155 | if isinstance(other, BagOfWords): 156 | return self._bow !=other._bow 157 | else: 158 | return self._bow != other 159 | 160 | 161 | def copy(self): 162 | return copy.deepcopy(self) 163 | 164 | def clear(self): 165 | """Clear word list.""" 166 | self._bow.clear() 167 | 168 | def items(self): 169 | """Return an iterator over the word dictionary’s (word, frequency) pairs.""" 170 | return list(self._bow.items()) 171 | 172 | def keys(self): 173 | """Word list contained in the object.""" 174 | return list(self._bow.keys()) 175 | 176 | def words(self): 177 | """Word list contained in the object.""" 178 | return list(self.keys()) 179 | 180 | def items(self): 181 | return list(self._bow.items()) 182 | 183 | def values(self): 184 | return list(self._bow.values()) 185 | 186 | def num(self): 187 | """Total number of words.""" 188 | return sum(self._bow.values()) 189 | 190 | def __contains__(self, key): 191 | """Method key in y""" 192 | return key in self._bow 193 | 194 | def 
__call__(self, *args): 195 | self.add(self, *args) 196 | 197 | 198 | class TextFilters(object): 199 | """Filters for transforming a text""" 200 | 201 | @staticmethod 202 | def upper(text): 203 | """Convert text to uppercase.""" 204 | return text.upper() 205 | 206 | @staticmethod 207 | def lower(text): 208 | """Convert text to lowercase.""" 209 | return text.lower() 210 | 211 | @staticmethod 212 | def invalid_chars(text): 213 | """Remove invalid chars from a text.""" 214 | INVALID_CHARS = "/\¨º-~#@|¡!,·$%&()¿?'[^""`]+}{><;,:.=*^_" 215 | return ''.join([char for char in text if char not in INVALID_CHARS]) 216 | 217 | @staticmethod 218 | def html_to_text(text): 219 | """Conversion from HTML markup to plain text.""" 220 | class _HTMLParser(HTMLParser): 221 | 222 | def __init__(self): 223 | HTMLParser.__init__(self) 224 | self.text = [] 225 | 226 | def handle_data(self, data): 227 | append = True 228 | text = data.split() 229 | if text: 230 | tag = self.get_starttag_text() 231 | if tag: 232 | tag = tag.lower() 233 | append = not tag.startswith((' 0: 385 | text = input_zip.read(input_file) 386 | self._read(input_file.filename, text) 387 | 388 | def to_json(self): 389 | """Convert Document object to json string. 390 | :return: json string 391 | """ 392 | class _Encoder(JSONEncoder): 393 | 394 | def default(self, obj): 395 | if isinstance(obj, DocumentClass) or \ 396 | isinstance(obj, BagOfWords): 397 | d = {'__class__': obj.__class__.__name__, 398 | '__module__':obj.__module__} 399 | d.update(obj.__dict__) 400 | return d 401 | if not inspect.isfunction(obj): 402 | return super(_Encoder, self).default(obj) 403 | 404 | return _Encoder().encode(self) 405 | 406 | @staticmethod 407 | def from_json(json_): 408 | """Convert json string to Document object. 409 | :param json_: json string 410 | :return: Document object 411 | """ 412 | class _Decoder(JSONDecoder): 413 | 414 | def __init__(self): 415 | JSONDecoder.__init__(self, object_hook=self.dict_to_object) 416 | 417 | def dict_to_object(self, d): 418 | if '__class__' in d: 419 | class_name = d.pop('__class__') 420 | module_name = d.pop('__module__') 421 | module = __import__(module_name) 422 | class_ = getattr(module, class_name) 423 | ## if issubclass(class_, BagOfWords): 424 | ## obj = class_(d.pop('_bow')) 425 | ## else: 426 | ## obj = class_() 427 | obj = class_() 428 | for k, v in list(d.items()): 429 | setattr(obj, k, v) 430 | return obj 431 | return d 432 | 433 | return _Decoder().decode(json_) 434 | 435 | def save(self, filename): 436 | """Serialize Documentand save to a file in json format 437 | :filename: file to save 438 | :return: nothing 439 | """ 440 | with open(filename, 'w') as f: 441 | json_ = self.to_json() 442 | f.write(json_) 443 | 444 | @staticmethod 445 | def load(filename): 446 | """Load and deserialize Document from file saved in json format 447 | :filename: file to load 448 | :return: nothing 449 | """ 450 | with open(filename, 'r') as f: 451 | json_ = f.read() 452 | return Document.from_json(json_) 453 | 454 | def __call__(self, text): 455 | self.read_text(text) 456 | 457 | 458 | class DocumentClass(Document): 459 | """Implementing a bag of words collection where all the bags of words are the same 460 | category, as well as a bag of words with the entire collection of words. Each bag 461 | of words has an identifier otherwise it's assigned an calculated identifier. 462 | Retrieves the text of a file, folder, url or zip, and also allows save or retrieve 463 | the collection in json format. 
464 | """ 465 | 466 | def __init__(self): 467 | Document.__init__(self) 468 | self.docs = {} 469 | 470 | def _read(self, id_, text): 471 | words = self.tokenizer(text) 472 | bow = BagOfWords(words) 473 | if not id_: 474 | id_ = uuid.uuid4().hex 475 | if id_ in self.docs: 476 | self.delete(dict(self.docs[id_])) 477 | else: 478 | self.numdocs += 1 479 | self.docs[id_] = bow 480 | self.add(words) 481 | 482 | def clear(self): 483 | """Clear word and docs list.""" 484 | Document.clear(self) 485 | self.docs = {} 486 | 487 | def read_text(self, text, id_=None): 488 | """The text is stored in a BagOfWords identified by Id. 489 | :param text: text to add a BagOfWords 490 | :param id_: BagOfWord identifier. Optional. If not set then it's set an UUID4 491 | identifier. 492 | :return: nothing 493 | """ 494 | self._read(id_, text) 495 | 496 | def __call__(self, text, id_=None): 497 | self._read(id_, text) 498 | 499 | 500 | class DefaultTokenizer(Tokenizer): 501 | """Tokenizer subclass that implements the text filters 'lower', 'invalid_chars' 502 | and the word filters 'stopwords', 'stemming' and 'normalize'. 503 | """ 504 | 505 | def __init__(self, lang='english', stemming=1): 506 | Tokenizer.__init__(self) 507 | self.lang = lang 508 | self.stemming = stemming 509 | 510 | def before_tokenizer(self, textfilters, text): 511 | text = textfilters.lower(text) 512 | text = textfilters.invalid_chars(text) 513 | return text 514 | 515 | def after_tokenizer(self, wordfilters, words): 516 | words = wordfilters.stopwords(self.lang, words) 517 | words = wordfilters.stemming(self.lang, self.stemming, words) 518 | words = wordfilters.normalize(words) 519 | return words 520 | 521 | 522 | class SimpleTokenizer(Tokenizer): 523 | """Tokenizer subclass that implements the text filters 'lower', 'invalid_chars' 524 | and the word filter 'normalize'. 525 | """ 526 | 527 | def __init__(self): 528 | Tokenizer.__init__(self) 529 | 530 | def before_tokenizer(self, textfilters, text): 531 | text = textfilters.lower(text) 532 | text = textfilters.invalid_chars(text) 533 | return text 534 | 535 | def after_tokenizer(self, wordfilters, words): 536 | words = wordfilters.normalize(words) 537 | return words 538 | 539 | 540 | class HtmlTokenizer(DefaultTokenizer): 541 | """Tokenizer subclass that implements the text filters 'htm_to_text', 'lower', 542 | 'invalid_chars' and the word filter 'normalize'. 
543 | """ 544 | 545 | def __init__(self, lang='english', stemming=1): 546 | DefaultTokenizer.__init__(self, lang, stemming) 547 | 548 | def before_tokenizer(self, textfilters, text): 549 | text = textfilters.html_to_text(text) 550 | text = DefaultTokenizer.before_tokenizer(self, textfilters, text) 551 | return text 552 | 553 | 554 | class DefaultDocument(Document, DefaultTokenizer): 555 | """DefaultTokenizer and Document subclass""" 556 | 557 | def __init__(self, lang='english', stemming=1): 558 | Document.__init__(self) 559 | DefaultTokenizer.__init__(self, lang, stemming) 560 | 561 | 562 | class SimpleDocument(Document, SimpleTokenizer): 563 | """SimpleTokenizer and Document subclass""" 564 | 565 | def __init__(self): 566 | Document.__init__(self) 567 | SimpleTokenizer.__init__(self) 568 | 569 | 570 | class HtmlDocument(Document, HtmlTokenizer): 571 | """HtmlTokenizer and Document subclass""" 572 | 573 | def __init__(self, lang='english', stemming=1): 574 | Document.__init__(self) 575 | HtmlTokenizer.__init__(self, lang, stemming) 576 | 577 | 578 | class DefaultDocumentClass(DocumentClass, DefaultTokenizer): 579 | """DefaultTokenizer and DocumentClass subclass""" 580 | 581 | def __init__(self, lang='english', stemming=1): 582 | DocumentClass.__init__(self) 583 | DefaultTokenizer.__init__(self, lang, stemming) 584 | 585 | 586 | class SimpleDocumentClass(DocumentClass, SimpleTokenizer): 587 | """SimpleTokenizer and DocumentClass subclass""" 588 | 589 | def __init__(self): 590 | DocumentClass.__init__(self) 591 | SimpleTokenizer.__init__(self) 592 | 593 | 594 | class HtmlDocumentClass(DocumentClass, HtmlTokenizer): 595 | """HtmlTokenizer and DocumentClass subclass""" 596 | 597 | def __init__(self, lang='english', stemming=1): 598 | DocumentClass.__init__(self) 599 | HtmlTokenizer.__init__(self, lang, stemming) 600 | 601 | 602 | def document_classifier(document, **classifieds): 603 | """Text classification based on an implementation of Naive Bayes 604 | :param document: document class instance to classify. 605 | :param classifieds: dictionary with Document class instances have already been classified. 606 | :return: list sorted from highest to lowest probability. 
607 | """ 608 | # http://blog.yhathq.com/posts/naive-bayes-in-python.html 609 | res = {} 610 | total_docs = SimpleDocument() 611 | for classified in list(classifieds.values()): 612 | total_docs += classified 613 | for k_classified, classified in list(classifieds.items()): 614 | prior = float(classified.num()) / float(total_docs.num()) 615 | log_prob = 0.0 616 | for word, value in list(document.items()): 617 | if word in total_docs: 618 | if classified.rate(word) > 0.0: 619 | # log(probability) it requires fewer decimal places 620 | log_prob += math.log(value * classified.rate(word) / total_docs.rate(word)) 621 | # log space to regular space 622 | exp_prob = math.exp(log_prob + math.log(prior)) 623 | res[k_classified] = exp_prob 624 | total = sum(res.values()) 625 | res = [(k,v/total) for k, v in list(res.items())] 626 | return sorted(res, key=lambda t: t[1], reverse=True) 627 | 628 | 629 | def _show_document(document, filename, verbose, top=50): 630 | print('* filename: %s' % filename) 631 | print('* filter:') 632 | print(' type: %s' % document.__class__.__name__) 633 | print(' lang: %s' % document.lang) 634 | print(' stemming: %s' % document.stemming) 635 | print('* total words: %d' % document.num()) 636 | print('* total docs: %d' % document.numdocs) 637 | if verbose: 638 | if top: 639 | words = 'word (top %d)' % top 640 | rates = document.sorted_rates[0:top] 641 | else: 642 | words = 'word' 643 | rates = document.sorted_rates 644 | posadj = len(str(len(rates)))+1 645 | print('*','pos'.rjust(posadj),'|',words.ljust(35),'|','occurrence'.rjust(10),\ 646 | '|','rate'.rjust(10)) 647 | print(' ','-'*posadj,'|','-'*35,'|','-'*10,'|','-'*10) 648 | for word, rate in rates: 649 | print(' ',str(rates.index((word, rate))+1).rjust(posadj),'|',\ 650 | word.encode('utf-8').ljust(35),'|', str(document[word]).rjust(10),\ 651 | '|',('%.8f' % rate).rjust(10)) 652 | 653 | 654 | def _show(args): 655 | try: 656 | dc = Document.load(args.filename) 657 | _show_document(document=dc, filename=args.filename, verbose=True, top=args.list_top_words) 658 | except IOError: 659 | print('No such classifier: %s' % args.filename) 660 | 661 | 662 | def _create(args): 663 | if args.filter == 'html': 664 | dc = HtmlDocument(lang=args.lang_filter, stemming=args.stemming_filter) 665 | else: 666 | dc = DefaultDocument(lang=args.lang_filter, stemming=args.stemming_filter) 667 | dc.save(args.filename) 668 | _show_document(document=dc, filename=args.filename, verbose=False) 669 | 670 | 671 | def _learn(args): 672 | try: 673 | dc = Document.load(args.filename) 674 | if args.rewrite: 675 | dc.clear() 676 | print('\ncurrent') 677 | print('=======') 678 | _show_document(document=dc, filename=args.filename, verbose=False) 679 | print('\nupdated') 680 | print('=======') 681 | if args.url: 682 | dc.read_urls(*args.url) 683 | if args.dir: 684 | dc.read_dir(*args.dir) 685 | if args.file: 686 | dc.read_files(*args.file) 687 | if args.zip: 688 | dc.read_zips(*args.zip) 689 | if not args.no_learn: 690 | dc.save(args.filename) 691 | _show_document(document=dc, filename=args.filename, verbose=True, top=args.list_top_words) 692 | except IOError: 693 | print('No such classifier: %s' % args.filename) 694 | 695 | 696 | def _classify(args): 697 | dclist = {} 698 | for filename in args.classifiers: 699 | dc = Document.load(filename) 700 | dclist[filename] = dc 701 | dc = list(dclist.values())[0].copy() 702 | dc.clear() 703 | ## if args.filter == 'html': 704 | ## dc = HtmlDocument(lang=args.lang_filter, stemming=args.stemming_filter) 705 | ## else: 706 
| ## dc = DefaultDocument(lang=args.lang_filter, stemming=args.stemming_filter) 707 | if args.text: 708 | dc.read_text(args.text) 709 | elif args.url: 710 | dc.read_urls(args.url) 711 | elif args.file: 712 | dc.read_files(args.file) 713 | result = document_classifier(dc, **dclist) 714 | print('*','classifier'.ljust(35),'|','rate'.rjust(10)) 715 | print(' ','-'*35,'|','-'*10) 716 | for classifier, rate in result: 717 | print(' ',classifier.encode('utf-8').ljust(35),'|',('%.8f' % rate).rjust(10)) 718 | 719 | 720 | def main(): 721 | parser = argparse.ArgumentParser(description='Manage several document to apply text classification.', 722 | epilog="see https://github.com/dmiro/bagofwords for more info") 723 | parser.add_argument('--version', action='version', version=__version__, 724 | help='show version and exit') 725 | subparsers = parser.add_subparsers(help='') 726 | # create command 727 | parser_create = subparsers.add_parser('create', help='create classifier') 728 | parser_create.add_argument('filter', choices=['text', 'html'], help='filter type') 729 | parser_create.add_argument('filename', help='file to be created where words learned are saved') 730 | parser_create.add_argument('--lang-filter', default='english', type=str, 731 | help='language text where remove empty words') 732 | parser_create.add_argument('--stemming-filter', default=1, type=int, 733 | help='number loops of lemmatizing') 734 | parser_create.set_defaults(func=_create) 735 | # learn command 736 | parser_learn = subparsers.add_parser('learn', help='add words learned a classifier') 737 | parser_learn.add_argument('filename', help='file to write words learned') 738 | parser_learn.add_argument('--file', nargs='+', help='filenames to learn') 739 | parser_learn.add_argument('--dir', nargs='+', help='directories to learn') 740 | parser_learn.add_argument('--url', nargs='+', help='url resources to learn') 741 | parser_learn.add_argument('--zip', nargs='+', help='zip filenames to learn') 742 | parser_learn.add_argument('--no-learn', action='store_true', default=False, 743 | help='not write to file the words learned') 744 | parser_learn.add_argument('--rewrite', action='store_true', default=False, 745 | help='overwrite the file') 746 | parser_learn.add_argument('--list-top-words', default=50, type=int, 747 | help='maximum number of words to list, 50 by default, -1 list all') 748 | parser_learn.set_defaults(func=_learn) 749 | # show command 750 | parser_show = subparsers.add_parser('show', help='show classifier info') 751 | parser_show.add_argument('filename', help='filename') 752 | parser_show.add_argument('--list-top-words', default=50, type=int, 753 | help='maximum number of words to list, 50 by default, -1 list all') 754 | parser_show.set_defaults(func=_show) 755 | # classify command 756 | parser_classify = subparsers.add_parser('classify', help='Naive Bayes text classification') 757 | parser_classify.add_argument('classifiers', nargs='+', help='classifiers') 758 | parser_classify.add_argument('--file', help='file to classify') 759 | parser_classify.add_argument('--url', help='url resource to classify') 760 | parser_classify.add_argument('--text',help='text to classify') 761 | parser_classify.set_defaults(func=_classify) 762 | 763 | args = parser.parse_args() 764 | args.func(args) 765 | 766 | 767 | if __name__ == '__main__': 768 | main() 769 | --------------------------------------------------------------------------------
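
Appendix: worked example. As a quick end-to-end illustration of the classification API described in the README above, here is a minimal sketch modelled on the module's own tests in test.py. It uses only `SimpleDocument` and `document_classifier` from bow.py; the category names and sample texts are invented for illustration, and `SimpleDocument` applies no stop-word or stemming filters, so the optional stop-words and PyStemmer packages are not required.

```
from bow import SimpleDocument, document_classifier

# Learn two tiny categories; each call adds one text to the category's bag of words.
animals = SimpleDocument()
animals('dog cat')
animals('horse frog')

vehicles = SimpleDocument()
vehicles('truck car')

# Classify an unseen text against the learned categories.
doc = SimpleDocument()
doc('I am a cat')
print(document_classifier(doc, animals=animals, vehicles=vehicles))
# -> a list of (category, probability) tuples sorted from most to least likely
```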
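
The README's Secondary classes section also describes customizing tokenization by subclassing `Tokenizer` and overriding `before_tokenizer` and `after_tokenizer`. A minimal sketch following the pattern exercised in test.py; the class name is made up for illustration:

```
from bow import Tokenizer

class UppercaseTokenizer(Tokenizer):
    """Uppercase the text before splitting, then normalize the resulting words."""

    def before_tokenizer(self, textfilters, text):
        # textfilters exposes the text-level filters (upper, lower, invalid_chars, html_to_text)
        return textfilters.upper(text)

    def after_tokenizer(self, wordfilters, words):
        # wordfilters exposes the word-level filters (stemming, stopwords, normalize)
        return wordfilters.normalize(words)

tokens = UppercaseTokenizer()
print(tokens('How do you convert a tuple to a list?'))
```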