├── MANIFEST.in ├── AUTHORS.rst ├── .travis.yml ├── CHANGELOG.rst ├── .gitignore ├── LICENSE ├── setup.py ├── test.py ├── README.md └── bow.py /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include LICENSE 2 | include README.md 3 | include CHANGELOG.rst 4 | include AUTHORS.rst 5 | -------------------------------------------------------------------------------- /AUTHORS.rst: -------------------------------------------------------------------------------- 1 | ================== 2 | bagofwords authors 3 | ================== 4 | 5 | * `David Miró `_ 6 | * `Ivan `_ 7 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | python: 3 | - "2.7" 4 | - "3.5" 5 | before_install: 6 | install: 7 | - pip install stop-words 8 | - pip install PyStemmer 9 | - pip install six 10 | script: 11 | - python setup.py test 12 | -------------------------------------------------------------------------------- /CHANGELOG.rst: -------------------------------------------------------------------------------- 1 | 1.0.2 2 | ===== 3 | * Initial version. 4 | * Feature: document_classifier method 5 | * Feature: DefaultTokenizer, SimpleTokenizer, HtmlTokenizer Class 6 | * Feature: DefaultDocument, SimpleDocument, HtmlDocument Class 7 | * Feature: DefaultDocumentClass, SimpleDocumentClass, HtmlDocumentClass Class 8 | * Feature: Document, DocumentClass Class 9 | * Feature: Tokenizer, TextFilters, WordFilters Class 10 | * Feature: BagOfWords Class 11 | 12 | 13 | 14 | 15 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | 5 | # C extensions 6 | *.so 7 | 8 | # Distribution / packaging 9 | .Python 10 | env/ 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | lib/ 17 | lib64/ 18 | parts/ 19 | sdist/ 20 | var/ 21 | *.egg-info/ 22 | .installed.cfg 23 | *.egg 24 | 25 | # PyInstaller 26 | # Usually these files are written by a python script from a template 27 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
28 | *.manifest 29 | *.spec 30 | 31 | # Installer logs 32 | pip-log.txt 33 | pip-delete-this-directory.txt 34 | 35 | # Unit test / coverage reports 36 | htmlcov/ 37 | .tox/ 38 | .coverage 39 | .cache 40 | nosetests.xml 41 | coverage.xml 42 | 43 | # Translations 44 | *.mo 45 | *.pot 46 | 47 | # Django stuff: 48 | *.log 49 | 50 | # Sphinx documentation 51 | docs/_build/ 52 | 53 | # PyBuilder 54 | target/ 55 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2015 David Miró 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | 23 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | PROJECT = "bagofwords" 4 | 5 | long_description = '' 6 | 7 | try: 8 | import subprocess 9 | import pandoc 10 | 11 | process = subprocess.Popen( 12 | ['which pandoc'], 13 | shell=True, 14 | stdout=subprocess.PIPE, 15 | universal_newlines=True 16 | ) 17 | 18 | pandoc_path = process.communicate()[0] 19 | pandoc_path = pandoc_path.strip('\n') 20 | 21 | pandoc.core.PANDOC_PATH = pandoc_path 22 | 23 | doc = pandoc.Document() 24 | doc.markdown = open('README.md').read() 25 | 26 | long_description = doc.rst 27 | 28 | except Exception: 29 | pass 30 | 31 | setup( 32 | name=PROJECT, 33 | version=__import__("bow").__version__, 34 | author = "David Miro ", 35 | author_email = 'lite.3engine@gmail.com', 36 | description = "The main goal of this Python module is to provide functions to apply Text Classification.", 37 | long_description=long_description, 38 | license=open('LICENSE').read(), 39 | url='https://github.com/dmiro/bagofwords', 40 | classifiers=[ 41 | 'Development Status :: 5 - Production/Stable', 42 | 'Environment :: Console', 43 | 'Intended Audience :: Science/Research', 44 | 'Intended Audience :: Education', 45 | 'Intended Audience :: Developers', 46 | 'Intended Audience :: Information Technology', 47 | 'Programming Language :: Python', 48 | 'Programming Language :: Python :: 2.7', 49 | 'Programming Language :: Python :: 3.5', 50 | 'Topic :: Scientific/Engineering :: Information Analysis', 51 | 'License :: OSI Approved :: MIT License' 52 | ], 53 | py_modules=['bow'], 54 | entry_points = { 55 | 'console_scripts': ['bow = bow:main'] 56 | }, 57 | 
install_requires=[ 58 | 'stop-words', 59 | 'PyStemmer', 60 | 'six' 61 | ], 62 | test_suite = 'test', 63 | platforms=['Any'], 64 | zip_safe=False 65 | ) 66 | -------------------------------------------------------------------------------- /test.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import, division, print_function, unicode_literals 3 | import unittest 4 | from unittest import TestCase 5 | import bow 6 | from bow import BagOfWords, TextFilters, WordFilters, Tokenizer, SimpleTokenizer, DefaultTokenizer, DocumentClass, DefaultDocumentClass, DefaultDocument, SimpleDocument 7 | import mock 8 | import six 9 | 10 | 11 | if six.PY2: 12 | TestCase.assertCountEqual = TestCase.assertEqual 13 | 14 | 15 | class BagOfWordsTest(TestCase): 16 | def __init__(self, *args, **kwargs): 17 | super(BagOfWordsTest, self).__init__(*args, **kwargs) 18 | 19 | def setUp(self): 20 | self.bow = BagOfWords() 21 | 22 | def test_add_one_word(self): 23 | self.bow.add('David') 24 | self.bow.add({'David':2}) 25 | self.assertCountEqual(self.bow.words(), ['David']) 26 | self.assertEqual(len(self.bow), 1) 27 | self.assertEqual(self.bow.num(), 3) 28 | self.assertEqual(self.bow.freq('David'), 3) 29 | self.assertCountEqual(dict(self.bow), {'David':3}) 30 | 31 | def test_add_two_words(self): 32 | self.bow.add('David', ['David','Álex']) 33 | self.assertCountEqual(self.bow.words(), ['Álex', 'David']) 34 | self.assertEqual(len(self.bow), 2) 35 | self.assertEqual(self.bow.num(), 3) 36 | self.assertEqual(self.bow.freq('David'), 2) 37 | self.assertCountEqual(dict(self.bow), {'Álex':1, 'David':2}) 38 | 39 | def test_del_one_word(self): 40 | self.bow.delete('David') 41 | self.assertCountEqual(dict(self.bow), {}) 42 | # 43 | self.bow.add('David') 44 | self.bow.delete('David') 45 | self.assertCountEqual(dict(self.bow), {}) 46 | # 47 | self.bow.add('David', 'David') 48 | self.bow.delete('David') 49 | self.assertCountEqual(self.bow.words(), ['David']) 50 | self.assertEqual(len(self.bow), 1) 51 | self.assertEqual(self.bow.num(), 1) 52 | self.assertEqual(self.bow.freq('David'), 1) 53 | self.assertCountEqual(dict(self.bow), {'David':1}) 54 | 55 | def test_del_two_word(self): 56 | self.bow.delete('David', 'Álex') 57 | self.assertCountEqual(dict(self.bow), {}) 58 | # 59 | self.bow.add('David', 'Álex') 60 | self.bow.delete('David', 'Álex') 61 | self.assertCountEqual(dict(self.bow), {}) 62 | # 63 | self.bow.add({'David':2}) 64 | self.bow.delete('David') 65 | self.bow.add('Álex') 66 | self.assertCountEqual(self.bow.words(), ['Álex', 'David']) 67 | self.assertEqual(len(self.bow), 2) 68 | self.assertEqual(self.bow.num(), 2) 69 | self.assertEqual(self.bow.freq('David'), 1) 70 | self.assertCountEqual(dict(self.bow), {'Álex':1, 'David':1}) 71 | 72 | def test_join_add(self): 73 | a = BagOfWords('car', 'chair', 'chicken') 74 | b = BagOfWords({'chicken':2}, ['eye', 'ugly']) 75 | c = BagOfWords('plane') 76 | self.assertCountEqual(dict(a + b + c), {'car': 1, 'chair': 1, 'eye': 1, 'chicken': 3, 'plane': 1, 'ugly': 1}) 77 | self.assertCountEqual(dict(c + b + a), {'car': 1, 'chair': 1, 'eye': 1, 'chicken': 3, 'plane': 1, 'ugly': 1}) 78 | self.assertCountEqual(dict(b + c + a), {'car': 1, 'chair': 1, 'eye': 1, 'chicken': 3, 'plane': 1, 'ugly': 1}) 79 | # 80 | total = a + b + c 81 | total = 'ugly' + total 82 | self.assertCountEqual(dict(total), {'car': 1, 'chair': 1, 'eye': 1, 'chicken': 3, 'plane': 1, 'ugly': 2}) 83 | # 84 | total = a + b + c 85 | total = 
'ugly' + total 86 | total = total + 'plane' 87 | self.assertCountEqual(dict(total), {'car': 1, 'chair': 1, 'eye': 1, 'chicken': 3, 'plane': 2, 'ugly': 2}) 88 | # 89 | total = a + b + c 90 | total = total + ['car', 'chair', 'chicken'] + ['chicken', 'chicken', 'eye'] 91 | self.assertCountEqual(dict(total), {'car': 2, 'chair': 2, 'eye': 2, 'chicken': 6, 'plane': 1, 'ugly': 1}) 92 | 93 | def test_join_sub(self): 94 | a = BagOfWords('car', 'chair', 'chicken') 95 | b = BagOfWords({'chicken':2}, ['eye', 'ugly']) 96 | c = BagOfWords('plane') 97 | self.assertCountEqual(dict(a - b - c), {'car': 1, 'chair': 1}) 98 | self.assertCountEqual(dict(c - b - a), {'plane': 1}) 99 | self.assertCountEqual(dict(b - c - a), {'chicken':1, 'eye':1, 'ugly':1}) 100 | # 101 | total = b - c - a 102 | total = 'eye' - total 103 | self.assertCountEqual(dict(total), {'chicken':1, 'ugly':1}) 104 | # 105 | total = b - c - a 106 | total = 'eye' - total 107 | total = total - 'eye' 108 | self.assertCountEqual(dict(total), {'chicken':1, 'ugly':1}) 109 | # 110 | total = b - c - a 111 | total = total - ['chicken', 'ugly'] 112 | self.assertCountEqual(dict(total), {'eye':1}) 113 | 114 | def test_clear(self): 115 | self.bow.add('item', 'item') 116 | self.bow.clear() 117 | self.assertEqual(len(self.bow), 0) 118 | self.assertEqual(self.bow.num(), 0) 119 | self.assertEqual(self.bow.freq('item'), 0) 120 | self.assertCountEqual(dict(self.bow), {}) 121 | 122 | def test_item(self): 123 | self.bow.add('item1', 'item2', 'item2', 'item3') 124 | self.assertEqual(self.bow['item2'], 2) 125 | self.assertEqual(self.bow['item3'], 1) 126 | self.assertEqual(self.bow['item1'], 1) 127 | 128 | def test_copy(self): 129 | a = BagOfWords('car', 'chair', 'chicken') 130 | b = a.copy() 131 | self.assertEqual(a == b, True) 132 | 133 | def test_del(self): 134 | self.bow.add(['car', 'chair', 'chicken']) 135 | del self.bow['car'] 136 | self.assertCountEqual(dict(self.bow), {'chair':1, 'chicken':1}) 137 | 138 | def test_cmp(self): 139 | a = BagOfWords('car', 'chair', 'chicken') 140 | b = BagOfWords('car', 'chair', 'chicken') 141 | self.assertEqual(a == b, True) 142 | # 143 | a.add('car') 144 | self.assertEqual(a == b, False) 145 | 146 | def test_has_key(self): 147 | self.bow.add('car', 'chair', 'chicken') 148 | self.assertEqual('car' in self.bow, True) 149 | self.assertEqual('car' in self.bow, True) 150 | 151 | def test_rate(self): 152 | self.bow.add(['b','a','a','a']) 153 | self.assertCountEqual(self.bow.rates, {'a':0.75, 'b':0.25}) 154 | self.assertCountEqual(self.bow.sorted_rates, [('a', 0.75), ('b', 0.25)]) 155 | self.assertEqual(self.bow.rate('a'), 0.75) 156 | self.assertEqual(self.bow.rate('b'), 0.25) 157 | self.assertEqual(self.bow.rate('c'), 0) 158 | # 159 | self.bow.clear() 160 | self.assertEqual(self.bow.rate('a'), 0) 161 | 162 | 163 | class TokenizerTest(TestCase): 164 | 165 | def test_default_tokenizer(self): 166 | tokens = DefaultTokenizer() 167 | words = tokens('How do you convert a tuple to a list?'); 168 | self.assertCountEqual(words, ['convert', 'tupl', 'list']) 169 | # 170 | words = tokens.tokenizer('How do you convert a tuple to a list?'); 171 | self.assertCountEqual(words, ['convert', 'tupl', 'list']) 172 | # 173 | tokens = DefaultTokenizer(stemming=0) 174 | words = tokens('How do you convert a tuple to a list?'); 175 | self.assertCountEqual(words, ['convert', 'tuple', 'list']) 176 | # 177 | tokens = DefaultTokenizer(lang='', stemming=0) 178 | words = tokens('How do you convert a tuple to a list?'); 179 | self.assertCountEqual(words, ['how', 
'do', 'you', 'convert', 'a', 'tuple', 'to', 'a', 'list']) 180 | # 181 | tokens = DefaultTokenizer(lang='spanish') 182 | words = tokens('Cómo convertir una tupla a lista?'); 183 | self.assertCountEqual(words, ['com', 'convert', 'tupl', 'list']) 184 | # 185 | tokens = DefaultTokenizer(lang='spanish', stemming=0) 186 | words = tokens('Cómo convertir una tupla a lista?'); 187 | self.assertCountEqual(words, ['como', 'convertir', 'tupla', 'lista']) 188 | # 189 | tokens = DefaultTokenizer(lang='', stemming=0) 190 | words = tokens('Cómo convertir una tupla a lista?'); 191 | self.assertCountEqual(words, ['como', 'convertir', 'una', 'tupla', 'a', 'lista']) 192 | 193 | def test_simple_tokenizer(self): 194 | tokens = SimpleTokenizer() 195 | words = tokens('How, do you convert - a tuple to a list?'); 196 | self.assertCountEqual(words, ['how', 'do', 'you', 'convert', 'a', 'tuple', 'to', 'a', 'list']) 197 | 198 | def test_tokenizer(self): 199 | 200 | class _MyTokenizer(Tokenizer): 201 | 202 | def __init__(self): 203 | Tokenizer.__init__(self) 204 | 205 | def before_tokenizer(self, textfilters, text): 206 | text = textfilters.upper(text) 207 | return text 208 | 209 | def after_tokenizer(self, wordfilters, words): 210 | words = wordfilters.normalize(words) 211 | return words 212 | tokens = _MyTokenizer() 213 | words = tokens('How, do you convert - a tuple to a list?'); 214 | self.assertCountEqual(words, ['HOW,', 'DO', 'YOU', 'CONVERT', '-', 'A', 'TUPLE', 'TO', 'A', 'LIST?']) 215 | # 216 | class _MyTokenizer(Tokenizer): 217 | 218 | def __init__(self): 219 | Tokenizer.__init__(self) 220 | 221 | def before_tokenizer(self, textfilters, text): 222 | text = textfilters.html_to_text(text) 223 | text = textfilters.invalid_chars(text) 224 | text = textfilters.lower(text) 225 | return text 226 | 227 | def after_tokenizer(self, wordfilters, words): 228 | words = wordfilters.stopwords('english', words) 229 | words = wordfilters.normalize(words) 230 | return words 231 | tokens = _MyTokenizer() 232 | text = ''' 233 | 235 | 236 | 237 | 238 | 239 | 240 | 241 |

my project!!


242 | Description:
243 | This small script is intended to allow conversion from HTML markup to plain text. 244 | 245 | 246 | ''' 247 | words = tokens(text) 248 | self.assertCountEqual(words, ['project', 'description', 'small', 'script', 'intended', 'allow', 'conversion', 249 | 'html', 'markup', 'plain', 'text']) 250 | 251 | 252 | class DocumentClassTest(TestCase): 253 | 254 | def test_default_document_class(self): 255 | dclass = DefaultDocumentClass() 256 | dclass('hello a beautiful world!', 'text one') 257 | dclass('hello the Moon!', 'text two') 258 | dclass('hello the world!', 'text one') 259 | self.assertCountEqual(dclass.docs, {'text two': {'hello': 1, 'moon': 1}, 'text one': {'world': 1, 'hello': 1}}) 260 | self.assertEqual(dclass, {'world': 1, 'hello': 2, 'moon': 1}) 261 | self.assertEqual(dclass.numdocs, 2) 262 | 263 | def test_default_document(self): 264 | dclass = DefaultDocument() 265 | dclass('hello a beautiful world!') 266 | dclass('hello the Moon!') 267 | dclass('hello the world!') 268 | self.assertEqual(dclass, {'world': 2, 'hello': 3, 'beauti': 1, 'moon': 1}) 269 | self.assertEqual(dclass.numdocs, 3) 270 | 271 | def test_json(self): 272 | dclass = DefaultDocumentClass(lang='spanish') 273 | dclass.read_text('Hola mundo!', id_='1') 274 | dclass.read_text('Este es un bonito mundo', id_='2') 275 | json_ = dclass.to_json() 276 | dclass = DocumentClass.from_json(json_) 277 | self.assertCountEqual(dclass.__class__.__name__ , 'DefaultDocumentClass') 278 | self.assertCountEqual(dclass.docs, {'2': {'mund': 1, 'bonit': 1}, '1': {'mund': 1, 'hol': 1}}) 279 | self.assertEqual(dclass, {'mund': 2, 'hol': 1, 'bonit': 1}) 280 | self.assertEqual(dclass.numdocs, 2) 281 | self.assertEqual(dclass.lang, 'spanish') 282 | self.assertEqual(dclass.stemming, 1) 283 | 284 | class DocumentClassifierTest(TestCase): 285 | 286 | def test_simple(self): 287 | docnumbers = bow.SimpleDocument() 288 | docnumbers('one two three four') 289 | docnumbers('five six seven') 290 | docanimals = bow.SimpleDocument() 291 | docanimals('dog cat') 292 | docanimals('horse frog') 293 | docanimals('dog cat') 294 | docanimals('dog cat') 295 | docanimals('dog cat') 296 | docvehicles = bow.SimpleDocument() 297 | docvehicles('truck car') 298 | doc = bow.SimpleDocument() 299 | doc('I am a cat') 300 | result = bow.document_classifier(doc, numbers=docnumbers, animals=docanimals, vehicles=docvehicles) 301 | self.assertCountEqual(result, [('animals', 0.6785714285714286), ('numbers', 0.25), ('vehicles', 0.07142857142857142)]) 302 | doc.clear() 303 | doc('one dog, one cat, three trucks') 304 | result = bow.document_classifier(doc, numbers=docnumbers, animals=docanimals, vehicles=docvehicles) 305 | self.assertCountEqual(result, [('numbers', 0.7302518458581976), ('animals', 0.2555881460503691), ('vehicles', 0.014160008091433189)]) 306 | 307 | def test_save_document(self): 308 | if six.PY3: 309 | # skip this test if python 3 310 | return 311 | m = mock.mock_open() 312 | with mock.patch('bow.open', m, create=True): 313 | docnumbers = bow.SimpleDocument() 314 | docnumbers('one two three four') 315 | docnumbers('one two three') 316 | docnumbers.save('test.dat') 317 | # print(m.mock_calls) 318 | m.assert_called_once_with('test.dat','w') 319 | handle = m() 320 | data = '{"__module__": "bow", "numdocs": 2, "__class__": "SimpleDocument", "_bow": {"four": 1, "three": 2, "two": 2, "one": 2}}' 321 | handle.write.assert_called_once_with(data) 322 | 323 | def test_load_document(self): 324 | m = mock.mock_open() 325 | data = '{"__module__": "bow", "numdocs": 2, 
"__class__": "SimpleDocument", "_bow": {"four": 1, "three": 2, "two": 2, "one": 2}}' 326 | with mock.patch('bow.open', mock.mock_open(read_data=data), create=True) as m: 327 | docnumbers = SimpleDocument.load('test.dat') 328 | m.assert_called_once_with('test.dat','r') 329 | self.assertEqual(docnumbers, {'four': 1, 'one': 2, 'three': 2, 'two': 2}) 330 | 331 | 332 | if __name__ == '__main__': 333 | unittest.main() 334 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # bagofwords 2 | 3 | [![Build Status](https://travis-ci.org/dmiro/bagofwords.svg)](https://travis-ci.org/dmiro/bagofwords) 4 | [![Latest Version](http://badge.kloud51.com/pypi/v/bagofwords.svg)](https://pypi.python.org/pypi/bagofwords/) 5 | [![Downloads](http://badge.kloud51.com/pypi/d/bagofwords.svg)](https://pypi.python.org/pypi/bagofwords/) 6 | [![Supported Python versions](http://badge.kloud51.com/pypi/py_versions/bagofwords.svg)](https://pypi.python.org/pypi/bagofwords/) 7 | [![Development Status](http://badge.kloud51.com/pypi/s/bagofwords.svg)](https://pypi.python.org/pypi/bagofwords/) 8 | [![License](http://badge.kloud51.com/pypi/l/bagofwords.svg)](https://pypi.python.org/pypi/bagofwords/) 9 | 10 | 11 | Introduction 12 | ------------ 13 | 14 | A Python module that allows you to create and manage a collection of occurrence counts of words without regard to grammar. The main purpose is provide a set of classes to manage several document classifieds by category in order to apply **Text Classification**. 15 | 16 | You can make use via **API** or via **Command Line**. For example, you can generate your classified documents (*learn*) via Command Line and after via API classify an input document. 17 | 18 | #### Third parties modules 19 | 20 | Module uses thress third parties modules 21 | 22 | * [stop_words](https://github.com/Alir3z4/python-stop-words) 23 | * [pystemmer](https://github.com/snowballstem/pystemmer) 24 | * [six](https://bitbucket.org/gutworth/six) 25 | 26 | The first module is used in **stop_words filter**, the second module is used in **stemming filter**. If you don't use these two filters, you don't need install them. 27 | 28 | 29 | Installation 30 | ------------ 31 | 32 | Install it via `pip` 33 | 34 | `$ [sudo] pip install bagofwords` 35 | 36 | Or download zip and then install it by running 37 | 38 | `$ [sudo] python setup.py install` 39 | 40 | You can test it by running 41 | 42 | `$ [sudo] python setup.py test` 43 | 44 | 45 | Uninstallation 46 | -------------- 47 | 48 | `$ [sudo] pip uninstall bagofwords` 49 | 50 | 51 | Python API 52 | ---------- 53 | 54 | 55 | #### Methods 56 | 57 | * `document_classifier(document, **classifieds)` Text classification based on an implementation of Naive Bayes 58 | 59 | 60 | Module contains two main classes `DocumentClass` and `Document` and four secondary classes `BagOfWords`, `WordFilters`, `TextFilters` and `Tokenizer` 61 | 62 | #### Main classes 63 | 64 | * `DocumentClass` Implementing a bag of words collection where all the bags of words are the same category, as well as a bag of words with the entire collection of words. Each bag of words has an identifier otherwise it's assigned an calculated identifier. Retrieves the text of a file, folder, url or zip, and also allows save or retrieve 65 | the collection in json format. 66 | * `Document` Implementing a bag of words where all words are of the same category. 
Retrieves the text of a file, folder, url or zip, and also allows saving or retrieving the Document in json format. 67 | 68 | 69 | #### Secondary classes 70 | 71 | * `BagOfWords` Implements a bag of words with their frequency of usage. 72 | * `TextFilters` Filters for transforming a text. It's used in the Tokenizer class. Includes the filters `upper`, `lower`, `invalid_chars` and `html_to_text` 73 | * `WordFilters` Filters for transforming a set of words. It's used in the Tokenizer class. Includes the filters `stemming`, `stopwords` and `normalize` 74 | * `Tokenizer` Breaks a string into tokens (a set of words). Optionally allows you to set filters before (TextFilters) and after (WordFilters) breaking the string into tokens. 75 | 76 | 77 | #### Subclasses 78 | 79 | * Tokenizer subclasses `DefaultTokenizer`, `SimpleTokenizer` and `HtmlTokenizer`, which implement the most common filters by overriding the **before_tokenizer** and **after_tokenizer** methods 80 | * Document subclasses `DefaultDocument`, `SimpleDocument` and `HtmlDocument` 81 | * DocumentClass subclasses `DefaultDocumentClass`, `SimpleDocumentClass` and `HtmlDocumentClass` 82 | 83 | 84 | Command Line Tool 85 | ----------------- 86 | 87 | ``` 88 | usage: bow [-h] [--version] {create,learn,show,classify} ... 89 | 90 | Manage several document to apply text classification. 91 | 92 | positional arguments: 93 | {create,learn,show,classify} 94 | create create classifier 95 | learn add words learned a classifier 96 | show show classifier info 97 | classify Naive Bayes text classification 98 | 99 | optional arguments: 100 | -h, --help show this help message and exit 101 | --version show version and exit 102 | ``` 103 | 104 | **Create Command** 105 | ``` 106 | usage: bow create [-h] [--lang-filter LANG_FILTER] 107 | [--stemming-filter STEMMING_FILTER] 108 | {text,html} filename 109 | 110 | positional arguments: 111 | {text,html} filter type 112 | filename file to be created where words learned are saved 113 | 114 | optional arguments: 115 | -h, --help show this help message and exit 116 | --lang-filter LANG_FILTER 117 | language text where remove empty words 118 | --stemming-filter STEMMING_FILTER 119 | number loops of lemmatizing 120 | ``` 121 | 122 | **Learn Command** 123 | ``` 124 | usage: bow learn [-h] [--file FILE [FILE ...]] [--dir DIR [DIR ...]] 125 | [--url URL [URL ...]] [--zip ZIP [ZIP ...]] [--no-learn] 126 | [--rewrite] [--list-top-words LIST_TOP_WORDS] 127 | filename 128 | 129 | positional arguments: 130 | filename file to write words learned 131 | 132 | optional arguments: 133 | -h, --help show this help message and exit 134 | --file FILE [FILE ...] 135 | filenames to learn 136 | --dir DIR [DIR ...] directories to learn 137 | --url URL [URL ...] url resources to learn 138 | --zip ZIP [ZIP ...] zip filenames to learn 139 | --no-learn not write to file the words learned 140 | --rewrite overwrite the file 141 | --list-top-words LIST_TOP_WORDS 142 | maximum number of words to list, 50 by default, -1 143 | list all 144 | ``` 145 | 146 | **Show Command** 147 | ``` 148 | usage: bow show [-h] [--list-top-words LIST_TOP_WORDS] filename 149 | 150 | positional arguments: 151 | filename filename 152 | 153 | optional arguments: 154 | -h, --help show this help message and exit 155 | --list-top-words LIST_TOP_WORDS 156 | maximum number of words to list, 50 by default, -1 157 | list all 158 | ``` 159 | 160 | **Classify Command** 161 | ``` 162 | usage: bow classify [-h] [--file FILE] [--url URL] [--text TEXT] 163 | classifiers [classifiers ...] 
164 | 165 | positional arguments: 166 | classifiers classifiers 167 | 168 | optional arguments: 169 | -h, --help show this help message and exit 170 | --file FILE file to classify 171 | --url URL url resource to classify 172 | --text TEXT text to classify 173 | ``` 174 | 175 | Example 176 | ------- 177 | 178 | Previously you need to download a spam corpus **enron-spam dataset**. For example you can download a compressed file that includes a directory with **1500 spam emails** and a directory with **4012 ham emails**. 179 | 180 | ``` 181 | http://www.aueb.gr/users/ion/data/enron-spam/preprocessed/enron3.tar.gz 182 | ``` 183 | 184 | Now we will create the **spam** and **ham** classifiers 185 | 186 | ``` 187 | $ bow create text spam 188 | * filename: spam 189 | * filter: 190 | type: DefaultDocument 191 | lang: english 192 | stemming: 1 193 | * total words: 0 194 | * total docs: 0 195 | ``` 196 | 197 | ``` 198 | $ bow create text ham 199 | * filename: ham 200 | * filter: 201 | type: DefaultDocument 202 | lang: english 203 | stemming: 1 204 | * total words: 0 205 | * total docs: 0 206 | ``` 207 | 208 | It's time to learn 209 | 210 | ``` 211 | $ bow learn spam --dir enron3/spam 212 | 213 | current 214 | ======= 215 | * filename: spam 216 | * filter: 217 | type: DefaultDocument 218 | lang: english 219 | stemming: 1 220 | * total words: 0 221 | * total docs: 0 222 | 223 | updated 224 | ======= 225 | * filename: spam 226 | * filter: 227 | type: DefaultDocument 228 | lang: english 229 | stemming: 1 230 | * total words: 223145 231 | * total docs: 1500 232 | * pos | word (top 50) | occurrence | rate 233 | --- | ----------------------------------- | ---------- | ---------- 234 | 1 | " | 2438 | 0.01092563 235 | 2 | subject | 1662 | 0.00744807 236 | 3 | compani | 1659 | 0.00743463 237 | 4 | s | 1499 | 0.00671761 238 | 5 | will | 1194 | 0.00535078 239 | 6 | com | 978 | 0.00438280 240 | 7 | statement | 935 | 0.00419010 241 | 8 | secur | 908 | 0.00406910 242 | 9 | inform | 880 | 0.00394362 243 | 10 | e | 802 | 0.00359408 244 | 11 | can | 798 | 0.00357615 245 | 12 | http | 779 | 0.00349100 246 | 13 | pleas | 743 | 0.00332967 247 | 14 | invest | 740 | 0.00331623 248 | 15 | de | 739 | 0.00331175 249 | 16 | o | 733 | 0.00328486 250 | 17 | 1 | 732 | 0.00328038 251 | 18 | 2 | 709 | 0.00317731 252 | 19 | stock | 700 | 0.00313697 253 | 20 | price | 664 | 0.00297564 254 | .... 
255 | ``` 256 | 257 | ``` 258 | $ bow learn ham --dir enron3/ham 259 | 260 | current 261 | ======= 262 | * filename: ham 263 | * filter: 264 | type: DefaultDocument 265 | lang: english 266 | stemming: 1 267 | * total words: 0 268 | * total docs: 0 269 | 270 | updated 271 | ======= 272 | * filename: ham 273 | * filter: 274 | type: DefaultDocument 275 | lang: english 276 | stemming: 1 277 | * total words: 1293023 278 | * total docs: 4012 279 | * pos | word (top 50) | occurrence | rate 280 | --- | ----------------------------------- | ---------- | ---------- 281 | 1 | enron | 29805 | 0.02305063 282 | 2 | s | 22438 | 0.01735313 283 | 3 | " | 15712 | 0.01215137 284 | 4 | compani | 12039 | 0.00931074 285 | 5 | said | 9470 | 0.00732392 286 | 6 | will | 8862 | 0.00685371 287 | 7 | 2001 | 8293 | 0.00641365 288 | 8 | subject | 7167 | 0.00554282 289 | 9 | 1 | 5887 | 0.00455290 290 | 10 | trade | 5718 | 0.00442220 291 | 11 | energi | 5599 | 0.00433016 292 | 12 | market | 5498 | 0.00425205 293 | 13 | new | 5278 | 0.00408191 294 | 14 | 2 | 4742 | 0.00366737 295 | 15 | dynegi | 4651 | 0.00359700 296 | 16 | stock | 4594 | 0.00355291 297 | 17 | 10 | 4545 | 0.00351502 298 | 18 | year | 4517 | 0.00349336 299 | 19 | power | 4503 | 0.00348254 300 | 20 | share | 4393 | 0.00339746 301 | .... 302 | `````` 303 | 304 | Finally, we can classify a text file or url 305 | 306 | ``` 307 | $ bow classify spam ham --text "company" 308 | 309 | * classifier | rate 310 | ----------------------------------- | ---------- 311 | ham | 0.87888743 312 | spam | 0.12111257 313 | ``` 314 | 315 | ``` 316 | $ bow classify spam ham --text "new lottery" 317 | 318 | * classifier | rate 319 | ----------------------------------- | ---------- 320 | spam | 0.96633627 321 | ham | 0.03366373 322 | ``` 323 | 324 | ``` 325 | $ bow classify spam ham --text "Subject: a friendly professional online pharmacy focused on you !" 326 | 327 | * classifier | rate 328 | ----------------------------------- | ---------- 329 | spam | 0.99671480 330 | ham | 0.00328520 331 | ``` 332 | 333 | You should know that it is also possible to classify from python code 334 | 335 | ``` 336 | import bow 337 | 338 | spam = bow.Document.load('spam') 339 | ham = bow.Document.load('ham') 340 | dc = bow.DefaultDocument() 341 | 342 | dc.read_text("company") 343 | result = bow.document_classifier(dc, spam=spam, ham=ham) 344 | 345 | print result 346 | ``` 347 | 348 | Result 349 | 350 | ``` 351 | [('ham', 0.8788874288217258), ('spam', 0.12111257117827418)] 352 | ``` 353 | 354 | 355 | Others examples 356 | ------- 357 | 358 | **Join several bag of words** 359 | 360 | ``` 361 | from bow import BagOfWords 362 | 363 | a = BagOfWords('car', 'chair', 'chicken') 364 | b = BagOfWords({'chicken':2}, ['eye', 'ugly']) 365 | c = BagOfWords('plane') 366 | 367 | print a + b + c 368 | print a - b - c 369 | ``` 370 | 371 | Result 372 | 373 | ``` 374 | {'eye': 1, 'car': 1, 'ugly': 1, 'plane': 1, 'chair': 1, 'chicken': 3} 375 | {'car': 1, 'chair': 1} 376 | ``` 377 | 378 | **HTML document class** 379 | 380 | ``` 381 | from bow import HtmlDocumentClass 382 | 383 | html_one = ''' 384 | 385 | 386 | 387 | bag of words demo 388 | 389 | 390 | 391 | 392 | 393 | 394 |

This is a demo

395 |

This a text example of my bag of words demo!

396 | I hope this demo is useful for you 397 | 398 | 399 | 400 | ''' 401 | 402 | html_two = ''' 403 | 404 | 405 | 406 | Another silly example. 407 | 408 | ''' 409 | 410 | dclass = HtmlDocumentClass(lang='english', stemming=0) 411 | dclass(id_='doc1', text=html_one) 412 | dclass(id_='doc2', text=html_two) 413 | print 'docs \n', dclass.docs 414 | print 'total \n', dclass 415 | print 'rates \n', dclass.rates 416 | ``` 417 | 418 | Result 419 | 420 | ``` 421 | >>> 422 | docs 423 | { 424 | 'doc2': {u'silly': 1, u'example': 1, u'another': 1}, 425 | 'doc1': {u'useful': 1, u'text': 1, u'bag': 2, u'words': 2, u'demo': 4, u'example': 1, u'hope': 1} 426 | } 427 | total 428 | { 429 | u'useful': 1, u'another': 1, u'text': 1, u'bag': 2, u'silly': 1, u'words': 2, 430 | u'demo': 4, u'example': 2, u'hope': 1 431 | } 432 | rates 433 | { 434 | u'useful': 0.06666666666666667, u'another': 0.06666666666666667, u'text': 0.06666666666666667, 435 | u'bag': 0.13333333333333333, u'silly': 0.06666666666666667, u'words': 0.13333333333333333, 436 | u'demo': 0.26666666666666666, u'example': 0.13333333333333333, u'hope': 0.06666666666666667 437 | } 438 | >>> 439 | ``` 440 | 441 | 442 | License 443 | ------- 444 | MIT License, see [LICENSE](https://github.com/dmiro/bagofwords/blob/master/LICENSE) 445 | 446 | -------------------------------------------------------------------------------- /bow.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import, division, print_function, unicode_literals 3 | from six import text_type as str 4 | from six import text_type as str 5 | from six.moves import urllib 6 | from six.moves.html_parser import HTMLParser 7 | from zipfile import ZipFile 8 | from json import JSONEncoder, JSONDecoder 9 | 10 | import os 11 | import copy 12 | import uuid 13 | import math 14 | import inspect 15 | import argparse 16 | import unicodedata 17 | 18 | __author__ = 'dmiro' 19 | __version_info__ = (1, 0, 3) 20 | __version__ = '.'.join(str(v) for v in __version_info__) 21 | 22 | 23 | 24 | class BagOfWords(object): 25 | """Implementing a bag of words with their frequency of usages""" 26 | 27 | def __init__(self, *args): 28 | self._bow = {} 29 | self.add(*args) 30 | 31 | def __calc(self, operation, *args): 32 | for words in args: 33 | if isinstance(words, str): 34 | words = [words] 35 | for word in words: 36 | n = 1 37 | if isinstance(words, dict): 38 | n = words[word] 39 | self._bow[word] = operation(self._bow.get(word, 0), n) 40 | if self._bow[word] < 1: 41 | del self._bow[word] 42 | 43 | def add(self, *args): 44 | """Add set of word, word list or word dict to bag of words. 45 | :param args: set of word or word list to add 46 | :return:nothing 47 | """ 48 | self.__calc(lambda x,y: x+y, *args) 49 | 50 | def delete(self, *args): 51 | """Delete set of word, word list or word dict to bag of words. 
52 | :param args: set of word or word list to add 53 | :return:nothing 54 | """ 55 | self.__calc(lambda x,y: x-y, *args) 56 | 57 | @property 58 | def rates(self): 59 | """Rate of occurrences 60 | :return: Dict 61 | """ 62 | total = float(self.num()) 63 | if total: 64 | return {k:v/total for k, v in list(self._bow.items())} 65 | else: 66 | return {} 67 | 68 | @property 69 | def sorted_rates(self): 70 | """Sorted rate of occurrences 71 | :return: list sorted from greater to lowest rate 72 | """ 73 | total = float(self.num()) 74 | if total: 75 | res = [(k,v/total) for k, v in list(self._bow.items())] 76 | return sorted(res, key=lambda t: t[1], reverse=True) 77 | else: 78 | return [] 79 | 80 | def freq(self, word): 81 | """Frequency of a word. 82 | :param word: word to query 83 | :return: frequency 84 | """ 85 | if word in self._bow: 86 | return self._bow[word] 87 | else: 88 | return 0 89 | 90 | def rate(self, word): 91 | """Rate of a word. 92 | :param word: word to query 93 | :return: rate 94 | """ 95 | total = float(self.num()) 96 | if total: 97 | return self.freq(word)/total 98 | else: 99 | return 0 100 | 101 | def __add__(self, other): 102 | """ Overloading of "+" operator to join BagOfWord+BagOfWord, BagOfWords+str or 103 | BagOfWords+list. 104 | :param other: BagOfWords, str or list 105 | :return: BagOfWords 106 | """ 107 | result = self.copy() 108 | if isinstance(other, BagOfWords): 109 | result.add(dict(other)) 110 | else: 111 | result.add(other) 112 | return result 113 | 114 | def __sub__(self, other): 115 | """ Overloading of "-" operator to join BagOfWord+BagOfWord, BagOfWords+str or 116 | BagOfWords+list. 117 | :param other: BagOfWords, str or list 118 | :return: BagOfWords 119 | """ 120 | result = self.copy() 121 | if isinstance(other, BagOfWords): 122 | result.delete(dict(other)) 123 | else: 124 | result.delete(other) 125 | return result 126 | 127 | def __radd__(self, other): 128 | return self.__add__(other) 129 | 130 | def __rsub__(self, other): 131 | return self.__sub__(other) 132 | 133 | def __iter__(self): 134 | return list(self._bow.items()) 135 | 136 | def __getitem__(self, offset): 137 | return self._bow.__getitem__(offset) 138 | 139 | def __len__(self): 140 | return self._bow.__len__() 141 | 142 | def __repr__(self): 143 | return self._bow.__repr__() 144 | 145 | def __delitem__(self, key): 146 | del self._bow[key] 147 | 148 | def __eq__(self, other): 149 | if isinstance(other, BagOfWords): 150 | return self._bow == other._bow 151 | else: 152 | return self._bow == other 153 | 154 | def __ne__(self, other): 155 | if isinstance(other, BagOfWords): 156 | return self._bow !=other._bow 157 | else: 158 | return self._bow != other 159 | 160 | 161 | def copy(self): 162 | return copy.deepcopy(self) 163 | 164 | def clear(self): 165 | """Clear word list.""" 166 | self._bow.clear() 167 | 168 | def items(self): 169 | """Return an iterator over the word dictionary’s (word, frequency) pairs.""" 170 | return list(self._bow.items()) 171 | 172 | def keys(self): 173 | """Word list contained in the object.""" 174 | return list(self._bow.keys()) 175 | 176 | def words(self): 177 | """Word list contained in the object.""" 178 | return list(self.keys()) 179 | 180 | def items(self): 181 | return list(self._bow.items()) 182 | 183 | def values(self): 184 | return list(self._bow.values()) 185 | 186 | def num(self): 187 | """Total number of words.""" 188 | return sum(self._bow.values()) 189 | 190 | def __contains__(self, key): 191 | """Method key in y""" 192 | return key in self._bow 193 | 194 | def 
__call__(self, *args): 195 | self.add(self, *args) 196 | 197 | 198 | class TextFilters(object): 199 | """Filters for transforming a text""" 200 | 201 | @staticmethod 202 | def upper(text): 203 | """Convert text to uppercase.""" 204 | return text.upper() 205 | 206 | @staticmethod 207 | def lower(text): 208 | """Convert text to lowercase.""" 209 | return text.lower() 210 | 211 | @staticmethod 212 | def invalid_chars(text): 213 | """Remove invalid chars from a text.""" 214 | INVALID_CHARS = "/\¨º-~#@|¡!,·$%&()¿?'[^""`]+}{><;,:.=*^_" 215 | return ''.join([char for char in text if char not in INVALID_CHARS]) 216 | 217 | @staticmethod 218 | def html_to_text(text): 219 | """Conversion from HTML markup to plain text.""" 220 | class _HTMLParser(HTMLParser): 221 | 222 | def __init__(self): 223 | HTMLParser.__init__(self) 224 | self.text = [] 225 | 226 | def handle_data(self, data): 227 | append = True 228 | text = data.split() 229 | if text: 230 | tag = self.get_starttag_text() 231 | if tag: 232 | tag = tag.lower() 233 | append = not tag.startswith((' 0: 385 | text = input_zip.read(input_file) 386 | self._read(input_file.filename, text) 387 | 388 | def to_json(self): 389 | """Convert Document object to json string. 390 | :return: json string 391 | """ 392 | class _Encoder(JSONEncoder): 393 | 394 | def default(self, obj): 395 | if isinstance(obj, DocumentClass) or \ 396 | isinstance(obj, BagOfWords): 397 | d = {'__class__': obj.__class__.__name__, 398 | '__module__':obj.__module__} 399 | d.update(obj.__dict__) 400 | return d 401 | if not inspect.isfunction(obj): 402 | return super(_Encoder, self).default(obj) 403 | 404 | return _Encoder().encode(self) 405 | 406 | @staticmethod 407 | def from_json(json_): 408 | """Convert json string to Document object. 409 | :param json_: json string 410 | :return: Document object 411 | """ 412 | class _Decoder(JSONDecoder): 413 | 414 | def __init__(self): 415 | JSONDecoder.__init__(self, object_hook=self.dict_to_object) 416 | 417 | def dict_to_object(self, d): 418 | if '__class__' in d: 419 | class_name = d.pop('__class__') 420 | module_name = d.pop('__module__') 421 | module = __import__(module_name) 422 | class_ = getattr(module, class_name) 423 | ## if issubclass(class_, BagOfWords): 424 | ## obj = class_(d.pop('_bow')) 425 | ## else: 426 | ## obj = class_() 427 | obj = class_() 428 | for k, v in list(d.items()): 429 | setattr(obj, k, v) 430 | return obj 431 | return d 432 | 433 | return _Decoder().decode(json_) 434 | 435 | def save(self, filename): 436 | """Serialize Documentand save to a file in json format 437 | :filename: file to save 438 | :return: nothing 439 | """ 440 | with open(filename, 'w') as f: 441 | json_ = self.to_json() 442 | f.write(json_) 443 | 444 | @staticmethod 445 | def load(filename): 446 | """Load and deserialize Document from file saved in json format 447 | :filename: file to load 448 | :return: nothing 449 | """ 450 | with open(filename, 'r') as f: 451 | json_ = f.read() 452 | return Document.from_json(json_) 453 | 454 | def __call__(self, text): 455 | self.read_text(text) 456 | 457 | 458 | class DocumentClass(Document): 459 | """Implementing a bag of words collection where all the bags of words are the same 460 | category, as well as a bag of words with the entire collection of words. Each bag 461 | of words has an identifier otherwise it's assigned an calculated identifier. 462 | Retrieves the text of a file, folder, url or zip, and also allows save or retrieve 463 | the collection in json format. 
464 | """ 465 | 466 | def __init__(self): 467 | Document.__init__(self) 468 | self.docs = {} 469 | 470 | def _read(self, id_, text): 471 | words = self.tokenizer(text) 472 | bow = BagOfWords(words) 473 | if not id_: 474 | id_ = uuid.uuid4().hex 475 | if id_ in self.docs: 476 | self.delete(dict(self.docs[id_])) 477 | else: 478 | self.numdocs += 1 479 | self.docs[id_] = bow 480 | self.add(words) 481 | 482 | def clear(self): 483 | """Clear word and docs list.""" 484 | Document.clear(self) 485 | self.docs = {} 486 | 487 | def read_text(self, text, id_=None): 488 | """The text is stored in a BagOfWords identified by Id. 489 | :param text: text to add a BagOfWords 490 | :param id_: BagOfWord identifier. Optional. If not set then it's set an UUID4 491 | identifier. 492 | :return: nothing 493 | """ 494 | self._read(id_, text) 495 | 496 | def __call__(self, text, id_=None): 497 | self._read(id_, text) 498 | 499 | 500 | class DefaultTokenizer(Tokenizer): 501 | """Tokenizer subclass that implements the text filters 'lower', 'invalid_chars' 502 | and the word filters 'stopwords', 'stemming' and 'normalize'. 503 | """ 504 | 505 | def __init__(self, lang='english', stemming=1): 506 | Tokenizer.__init__(self) 507 | self.lang = lang 508 | self.stemming = stemming 509 | 510 | def before_tokenizer(self, textfilters, text): 511 | text = textfilters.lower(text) 512 | text = textfilters.invalid_chars(text) 513 | return text 514 | 515 | def after_tokenizer(self, wordfilters, words): 516 | words = wordfilters.stopwords(self.lang, words) 517 | words = wordfilters.stemming(self.lang, self.stemming, words) 518 | words = wordfilters.normalize(words) 519 | return words 520 | 521 | 522 | class SimpleTokenizer(Tokenizer): 523 | """Tokenizer subclass that implements the text filters 'lower', 'invalid_chars' 524 | and the word filter 'normalize'. 525 | """ 526 | 527 | def __init__(self): 528 | Tokenizer.__init__(self) 529 | 530 | def before_tokenizer(self, textfilters, text): 531 | text = textfilters.lower(text) 532 | text = textfilters.invalid_chars(text) 533 | return text 534 | 535 | def after_tokenizer(self, wordfilters, words): 536 | words = wordfilters.normalize(words) 537 | return words 538 | 539 | 540 | class HtmlTokenizer(DefaultTokenizer): 541 | """Tokenizer subclass that implements the text filters 'htm_to_text', 'lower', 542 | 'invalid_chars' and the word filter 'normalize'. 
543 | """ 544 | 545 | def __init__(self, lang='english', stemming=1): 546 | DefaultTokenizer.__init__(self, lang, stemming) 547 | 548 | def before_tokenizer(self, textfilters, text): 549 | text = textfilters.html_to_text(text) 550 | text = DefaultTokenizer.before_tokenizer(self, textfilters, text) 551 | return text 552 | 553 | 554 | class DefaultDocument(Document, DefaultTokenizer): 555 | """DefaultTokenizer and Document subclass""" 556 | 557 | def __init__(self, lang='english', stemming=1): 558 | Document.__init__(self) 559 | DefaultTokenizer.__init__(self, lang, stemming) 560 | 561 | 562 | class SimpleDocument(Document, SimpleTokenizer): 563 | """SimpleTokenizer and Document subclass""" 564 | 565 | def __init__(self): 566 | Document.__init__(self) 567 | SimpleTokenizer.__init__(self) 568 | 569 | 570 | class HtmlDocument(Document, HtmlTokenizer): 571 | """HtmlTokenizer and Document subclass""" 572 | 573 | def __init__(self, lang='english', stemming=1): 574 | Document.__init__(self) 575 | HtmlTokenizer.__init__(self, lang, stemming) 576 | 577 | 578 | class DefaultDocumentClass(DocumentClass, DefaultTokenizer): 579 | """DefaultTokenizer and DocumentClass subclass""" 580 | 581 | def __init__(self, lang='english', stemming=1): 582 | DocumentClass.__init__(self) 583 | DefaultTokenizer.__init__(self, lang, stemming) 584 | 585 | 586 | class SimpleDocumentClass(DocumentClass, SimpleTokenizer): 587 | """SimpleTokenizer and DocumentClass subclass""" 588 | 589 | def __init__(self): 590 | DocumentClass.__init__(self) 591 | SimpleTokenizer.__init__(self) 592 | 593 | 594 | class HtmlDocumentClass(DocumentClass, HtmlTokenizer): 595 | """HtmlTokenizer and DocumentClass subclass""" 596 | 597 | def __init__(self, lang='english', stemming=1): 598 | DocumentClass.__init__(self) 599 | HtmlTokenizer.__init__(self, lang, stemming) 600 | 601 | 602 | def document_classifier(document, **classifieds): 603 | """Text classification based on an implementation of Naive Bayes 604 | :param document: document class instance to classify. 605 | :param classifieds: dictionary with Document class instances have already been classified. 606 | :return: list sorted from highest to lowest probability. 
607 | """ 608 | # http://blog.yhathq.com/posts/naive-bayes-in-python.html 609 | res = {} 610 | total_docs = SimpleDocument() 611 | for classified in list(classifieds.values()): 612 | total_docs += classified 613 | for k_classified, classified in list(classifieds.items()): 614 | prior = float(classified.num()) / float(total_docs.num()) 615 | log_prob = 0.0 616 | for word, value in list(document.items()): 617 | if word in total_docs: 618 | if classified.rate(word) > 0.0: 619 | # log(probability) it requires fewer decimal places 620 | log_prob += math.log(value * classified.rate(word) / total_docs.rate(word)) 621 | # log space to regular space 622 | exp_prob = math.exp(log_prob + math.log(prior)) 623 | res[k_classified] = exp_prob 624 | total = sum(res.values()) 625 | res = [(k,v/total) for k, v in list(res.items())] 626 | return sorted(res, key=lambda t: t[1], reverse=True) 627 | 628 | 629 | def _show_document(document, filename, verbose, top=50): 630 | print('* filename: %s' % filename) 631 | print('* filter:') 632 | print(' type: %s' % document.__class__.__name__) 633 | print(' lang: %s' % document.lang) 634 | print(' stemming: %s' % document.stemming) 635 | print('* total words: %d' % document.num()) 636 | print('* total docs: %d' % document.numdocs) 637 | if verbose: 638 | if top: 639 | words = 'word (top %d)' % top 640 | rates = document.sorted_rates[0:top] 641 | else: 642 | words = 'word' 643 | rates = document.sorted_rates 644 | posadj = len(str(len(rates)))+1 645 | print('*','pos'.rjust(posadj),'|',words.ljust(35),'|','occurrence'.rjust(10),\ 646 | '|','rate'.rjust(10)) 647 | print(' ','-'*posadj,'|','-'*35,'|','-'*10,'|','-'*10) 648 | for word, rate in rates: 649 | print(' ',str(rates.index((word, rate))+1).rjust(posadj),'|',\ 650 | word.encode('utf-8').ljust(35),'|', str(document[word]).rjust(10),\ 651 | '|',('%.8f' % rate).rjust(10)) 652 | 653 | 654 | def _show(args): 655 | try: 656 | dc = Document.load(args.filename) 657 | _show_document(document=dc, filename=args.filename, verbose=True, top=args.list_top_words) 658 | except IOError: 659 | print('No such classifier: %s' % args.filename) 660 | 661 | 662 | def _create(args): 663 | if args.filter == 'html': 664 | dc = HtmlDocument(lang=args.lang_filter, stemming=args.stemming_filter) 665 | else: 666 | dc = DefaultDocument(lang=args.lang_filter, stemming=args.stemming_filter) 667 | dc.save(args.filename) 668 | _show_document(document=dc, filename=args.filename, verbose=False) 669 | 670 | 671 | def _learn(args): 672 | try: 673 | dc = Document.load(args.filename) 674 | if args.rewrite: 675 | dc.clear() 676 | print('\ncurrent') 677 | print('=======') 678 | _show_document(document=dc, filename=args.filename, verbose=False) 679 | print('\nupdated') 680 | print('=======') 681 | if args.url: 682 | dc.read_urls(*args.url) 683 | if args.dir: 684 | dc.read_dir(*args.dir) 685 | if args.file: 686 | dc.read_files(*args.file) 687 | if args.zip: 688 | dc.read_zips(*args.zip) 689 | if not args.no_learn: 690 | dc.save(args.filename) 691 | _show_document(document=dc, filename=args.filename, verbose=True, top=args.list_top_words) 692 | except IOError: 693 | print('No such classifier: %s' % args.filename) 694 | 695 | 696 | def _classify(args): 697 | dclist = {} 698 | for filename in args.classifiers: 699 | dc = Document.load(filename) 700 | dclist[filename] = dc 701 | dc = list(dclist.values())[0].copy() 702 | dc.clear() 703 | ## if args.filter == 'html': 704 | ## dc = HtmlDocument(lang=args.lang_filter, stemming=args.stemming_filter) 705 | ## else: 706 
| ## dc = DefaultDocument(lang=args.lang_filter, stemming=args.stemming_filter) 707 | if args.text: 708 | dc.read_text(args.text) 709 | elif args.url: 710 | dc.read_urls(args.url) 711 | elif args.file: 712 | dc.read_files(args.file) 713 | result = document_classifier(dc, **dclist) 714 | print('*','classifier'.ljust(35),'|','rate'.rjust(10)) 715 | print(' ','-'*35,'|','-'*10) 716 | for classifier, rate in result: 717 | print(' ',classifier.encode('utf-8').ljust(35),'|',('%.8f' % rate).rjust(10)) 718 | 719 | 720 | def main(): 721 | parser = argparse.ArgumentParser(description='Manage several document to apply text classification.', 722 | epilog="see https://github.com/dmiro/bagofwords for more info") 723 | parser.add_argument('--version', action='version', version=__version__, 724 | help='show version and exit') 725 | subparsers = parser.add_subparsers(help='') 726 | # create command 727 | parser_create = subparsers.add_parser('create', help='create classifier') 728 | parser_create.add_argument('filter', choices=['text', 'html'], help='filter type') 729 | parser_create.add_argument('filename', help='file to be created where words learned are saved') 730 | parser_create.add_argument('--lang-filter', default='english', type=str, 731 | help='language text where remove empty words') 732 | parser_create.add_argument('--stemming-filter', default=1, type=int, 733 | help='number loops of lemmatizing') 734 | parser_create.set_defaults(func=_create) 735 | # learn command 736 | parser_learn = subparsers.add_parser('learn', help='add words learned a classifier') 737 | parser_learn.add_argument('filename', help='file to write words learned') 738 | parser_learn.add_argument('--file', nargs='+', help='filenames to learn') 739 | parser_learn.add_argument('--dir', nargs='+', help='directories to learn') 740 | parser_learn.add_argument('--url', nargs='+', help='url resources to learn') 741 | parser_learn.add_argument('--zip', nargs='+', help='zip filenames to learn') 742 | parser_learn.add_argument('--no-learn', action='store_true', default=False, 743 | help='not write to file the words learned') 744 | parser_learn.add_argument('--rewrite', action='store_true', default=False, 745 | help='overwrite the file') 746 | parser_learn.add_argument('--list-top-words', default=50, type=int, 747 | help='maximum number of words to list, 50 by default, -1 list all') 748 | parser_learn.set_defaults(func=_learn) 749 | # show command 750 | parser_show = subparsers.add_parser('show', help='show classifier info') 751 | parser_show.add_argument('filename', help='filename') 752 | parser_show.add_argument('--list-top-words', default=50, type=int, 753 | help='maximum number of words to list, 50 by default, -1 list all') 754 | parser_show.set_defaults(func=_show) 755 | # classify command 756 | parser_classify = subparsers.add_parser('classify', help='Naive Bayes text classification') 757 | parser_classify.add_argument('classifiers', nargs='+', help='classifiers') 758 | parser_classify.add_argument('--file', help='file to classify') 759 | parser_classify.add_argument('--url', help='url resource to classify') 760 | parser_classify.add_argument('--text',help='text to classify') 761 | parser_classify.set_defaults(func=_classify) 762 | 763 | args = parser.parse_args() 764 | args.func(args) 765 | 766 | 767 | if __name__ == '__main__': 768 | main() 769 | --------------------------------------------------------------------------------
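
Appendix: worked example. As a quick end-to-end illustration of the classification API described in the README above, here is a minimal sketch modelled on the module's own tests in test.py. It uses only `SimpleDocument` and `document_classifier` from bow.py; the category names and sample texts are invented for illustration, and `SimpleDocument` applies no stop-word or stemming filters, so the optional stop-words and PyStemmer packages are not required.

```
from bow import SimpleDocument, document_classifier

# Learn two tiny categories; each call adds one text to the category's bag of words.
animals = SimpleDocument()
animals('dog cat')
animals('horse frog')

vehicles = SimpleDocument()
vehicles('truck car')

# Classify an unseen text against the learned categories.
doc = SimpleDocument()
doc('I am a cat')
print(document_classifier(doc, animals=animals, vehicles=vehicles))
# -> a list of (category, probability) tuples sorted from most to least likely
```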
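
The README's Secondary classes section also describes customizing tokenization by subclassing `Tokenizer` and overriding `before_tokenizer` and `after_tokenizer`. A minimal sketch following the pattern exercised in test.py; the class name is made up for illustration:

```
from bow import Tokenizer

class UppercaseTokenizer(Tokenizer):
    """Uppercase the text before splitting, then normalize the resulting words."""

    def before_tokenizer(self, textfilters, text):
        # textfilters exposes the text-level filters (upper, lower, invalid_chars, html_to_text)
        return textfilters.upper(text)

    def after_tokenizer(self, wordfilters, words):
        # wordfilters exposes the word-level filters (stemming, stopwords, normalize)
        return wordfilters.normalize(words)

tokens = UppercaseTokenizer()
print(tokens('How do you convert a tuple to a list?'))
```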