├── .gitignore
├── .travis.yml
├── CONTRIBUTING.rst
├── HISTORY.rst
├── LICENSE
├── MANIFEST.in
├── README.rst
├── dev-requirements.txt
├── run_tests.py
├── setup.cfg
├── setup.py
├── tests
│   ├── __init__.py
│   └── test_taggers.py
├── textblob_aptagger
│   ├── __init__.py
│   ├── _perceptron.py
│   ├── compat.py
│   ├── taggers.py
│   └── trontagger-0.1.0.pickle
└── tox.ini

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
########## Generated by gig 0.1.0 ###########

### Python ###
*.py[cod]

# C extensions
*.so

# Packages
*.egg
*.egg-info
dist
build
eggs
parts
bin
var
sdist
develop-eggs
.installed.cfg
lib
lib64
__pycache__

# Installer logs
pip-log.txt

# Unit test / coverage reports
.coverage
.tox
nosetests.xml

# Translations
*.mo

# Mr Developer
.mr.developer.cfg
.project
.pydevproject

--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
# Config file for automatic testing at travis-ci.org

language: python

python:
  - "3.3"
  - "2.7"
  - "2.6"
  - "pypy"

before_install:
  - "wget https://s3.amazonaws.com/textblob/nltk_data.tar.gz"
  - "tar -xzvf nltk_data.tar.gz -C ~"

install:
  - pip install -U .
  - curl https://raw.github.com/sloria/TextBlob/master/download_corpora_lite.py | python

script: python run_tests.py

--------------------------------------------------------------------------------
/CONTRIBUTING.rst:
--------------------------------------------------------------------------------
Contributing guidelines
=======================

In General
----------

- `PEP 8`_, when sensible.
- Test ruthlessly. Write docs for new features.
- Even more important than Test-Driven Development: *Human-Driven Development*.

.. _`PEP 8`: http://www.python.org/dev/peps/pep-0008/

--------------------------------------------------------------------------------
/HISTORY.rst:
--------------------------------------------------------------------------------
Changelog
---------

0.3.0 (unreleased)
++++++++++++++++++

* Compatibility with TextBlob>=0.9.0.

0.2.0 (10/21/2013)
++++++++++++++++++

* Compatibility with TextBlob>=0.8.0.

0.1.0 (09/25/2013)
++++++++++++++++++

* First stable release.
* Ports the ``PerceptronTagger`` from TextBlob 0.6.3.


0.0.1 (09/22/2013)
++++++++++++++++++

* Experimental release.
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
Copyright 2013 Matthew Honnibal

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.

--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
include *.rst LICENSE *.txt *.ini setup.cfg
include textblob_aptagger/*.pickle

--------------------------------------------------------------------------------
/README.rst:
--------------------------------------------------------------------------------
=================
textblob-aptagger
=================

**As of TextBlob 0.11.0, TextBlob uses NLTK's averaged perceptron tagger by default. This package is no longer necessary.**

.. image:: https://badge.fury.io/py/textblob-aptagger.png
    :target: http://badge.fury.io/py/textblob-aptagger
    :alt: Latest version

.. image:: https://travis-ci.org/sloria/textblob-aptagger.png?branch=master
    :target: https://travis-ci.org/sloria/textblob-aptagger
    :alt: Travis-CI

A fast and accurate part-of-speech tagger based on the Averaged Perceptron. For use with `TextBlob`_.

Implementation by Matthew Honnibal, a.k.a. `syllog1sm <https://github.com/syllog1sm>`_. Read more about it `here <http://honnibal.wordpress.com/2013/09/11/a-good-part-of-speechpos-tagger-in-about-200-lines-of-python/>`_.

Install
-------

If you have pip installed (you should), run ::

    $ pip install -U textblob-aptagger

Usage
-----
.. code-block:: python

    >>> from textblob import TextBlob
    >>> from textblob_aptagger import PerceptronTagger
    >>> blob = TextBlob("Simple is better than complex.", pos_tagger=PerceptronTagger())
    >>> blob.tags
    [('Simple', u'NN'), ('is', u'VBZ'), ('better', u'JJR'), ('than', u'IN'), ('complex', u'JJ')]
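
You can also train the tagger on your own corpus via ``PerceptronTagger.train``.
A minimal sketch; ``train_sentences`` is a placeholder for your own list of
``(words, tags)`` pairs (see ``tests/test_taggers.py`` for the expected format),
and ``my_tagger.pickle`` is just an example filename:

.. code-block:: python

    >>> from textblob_aptagger import PerceptronTagger
    >>> tagger = PerceptronTagger(load=False)  # start from an empty model
    >>> tagger.train(train_sentences, save_loc='my_tagger.pickle', nr_iter=5)
    >>> tagger.tag("Simple is better than complex.")

The model pickled to ``save_loc`` can be restored later with
``PerceptronTagger(load=False).load('my_tagger.pickle')``.
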
Requirements
------------

- Python >= 2.6 or >= 3.3

License
-------

MIT licensed. See the bundled `LICENSE <https://github.com/sloria/textblob-aptagger/blob/master/LICENSE>`_ file for more details.

.. _TextBlob: https://textblob.readthedocs.org/

--------------------------------------------------------------------------------
/dev-requirements.txt:
--------------------------------------------------------------------------------
nose>=1.3.0
tox>=1.5.0
sphinx
wheel

--------------------------------------------------------------------------------
/run_tests.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
# -*- coding: utf-8 -*-
'''
The main test runner script, adapted from TextBlob.

Usage: ::

    python run_tests.py

Skip slow tests: ::

    python run_tests.py fast
'''
from __future__ import unicode_literals
import nose
import sys
from textblob_aptagger.compat import PY2, PY26


def main():
    args = get_argv()
    success = nose.run(argv=args)
    sys.exit(0) if success else sys.exit(1)


def get_argv():
    args = [sys.argv[0], ]
    attr_conditions = []  # Use nose's attribselect plugin to filter tests
    if "force-all" in sys.argv:
        # Don't exclude any tests
        return args
    if PY26:
        # Exclude tests that don't work on python2.6
        attr_conditions.append("not py27_only")
    if not PY2:
        # Exclude tests that only work on python2
        attr_conditions.append("not py2_only")
    if "fast" in sys.argv:
        attr_conditions.append("not slow")

    attr_expression = " and ".join(attr_conditions)
    if attr_expression:
        args.extend(["-A", attr_expression])
    return args
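
# For example (illustrative): on Python 3, running ``python run_tests.py fast``
# builds the argv ['run_tests.py', '-A', 'not py2_only and not slow'], so nose
# skips any test attributed with ``py2_only`` or ``slow``.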
if __name__ == '__main__':
    main()

--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
[wheel]
universal = 1

--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
import re
import sys
import subprocess
from setuptools import setup

packages = ['textblob_aptagger']
requires = ["textblob>=0.9.0"]

PUBLISH_CMD = "python setup.py register sdist bdist_wheel upload"
TEST_PUBLISH_CMD = 'python setup.py register -r test sdist bdist_wheel upload -r test'
TEST_CMD = 'python run_tests.py'


def find_version(fname):
    '''Attempts to find the version number in the file named fname.
    Raises RuntimeError if not found.
    '''
    version = ''
    with open(fname, 'r') as fp:
        reg = re.compile(r'__version__ = [\'"]([^\'"]*)[\'"]')
        for line in fp:
            m = reg.match(line)
            if m:
                version = m.group(1)
                break
    if not version:
        raise RuntimeError('Cannot find version information')
    return version

__version__ = find_version("textblob_aptagger/__init__.py")

if 'publish' in sys.argv:
    try:
        __import__('wheel')
    except ImportError:
        print("wheel required. Run `pip install wheel`.")
        sys.exit(1)
    status = subprocess.call(PUBLISH_CMD, shell=True)
    sys.exit(status)

if 'publish_test' in sys.argv:
    try:
        __import__('wheel')
    except ImportError:
        print("wheel required. Run `pip install wheel`.")
        sys.exit(1)
    status = subprocess.call(TEST_PUBLISH_CMD, shell=True)
    sys.exit(status)

if 'run_tests' in sys.argv:
    try:
        __import__('nose')
    except ImportError:
        print('nose required. Run `pip install nose`.')
        sys.exit(1)

    status = subprocess.call(TEST_CMD, shell=True)
    sys.exit(status)


def read(fname):
    with open(fname) as fp:
        content = fp.read()
    return content

setup(
    name='textblob-aptagger',
    version=__version__,
    description='A fast and accurate part-of-speech tagger for TextBlob.',
    long_description=(read("README.rst") + '\n\n' +
                      read("HISTORY.rst")),
    author='Steven Loria',
    author_email='sloria1@gmail.com',
    url='https://github.com/sloria/textblob-aptagger',
    packages=packages,
    package_dir={'textblob_aptagger': 'textblob_aptagger'},
    include_package_data=True,
    package_data={
        "textblob_aptagger": ["*.pickle"]
    },
    install_requires=requires,
    license=read("LICENSE"),
    zip_safe=False,
    keywords='textblob_aptagger',
    classifiers=[
        'Development Status :: 2 - Pre-Alpha',
        'Intended Audience :: Developers',
        'License :: OSI Approved :: MIT License',
        'Natural Language :: English',
        "Programming Language :: Python :: 2",
        'Programming Language :: Python :: 2.6',
        'Programming Language :: Python :: 2.7',
        'Programming Language :: Python :: 3',
        'Programming Language :: Python :: 3.3',
    ],
    test_suite='tests',
    tests_require=['nose'],
)

--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

--------------------------------------------------------------------------------
/tests/test_taggers.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
from __future__ import unicode_literals
import unittest
from nose.tools import *  # PEP8 asserts
from nose.plugins.attrib import attr

from textblob.base import BaseTagger
from textblob.blob import TextBlob
from textblob.exceptions import MissingCorpusError
from textblob_aptagger import PerceptronTagger

class TestPerceptronTagger(unittest.TestCase):

    def setUp(self):
        self.text = ("Simple is better than complex. "
                     "Complex is better than complicated.")
        self.tagger = PerceptronTagger(load=False)

    def test_init(self):
        tagger = PerceptronTagger(load=False)
        assert_true(isinstance(tagger, BaseTagger))

    def test_train(self):
        sentences = _read_tagged(_wsj_train)
        nr_iter = 5
        self.tagger.train(sentences, nr_iter=nr_iter)
        nr_words = sum(len(words) for words, tags in sentences)
        # Check that the model has 'ticked over' once per instance
        assert_equal(nr_words * nr_iter, self.tagger.model.i)
        # Check that the tagger has a class for every seen tag
        tag_set = set()
        for _, tags in sentences:
            tag_set.update(tags)
        assert_equal(len(tag_set), len(self.tagger.model.classes))
        for tag in tag_set:
            assert_true(tag in self.tagger.model.classes)

    @attr("slow")
    def test_tag(self):
        trained_tagger = PerceptronTagger()
        tokens = trained_tagger.tag(self.text)
        assert_equal([w for w, t in tokens],
            ['Simple', 'is', 'better', 'than', 'complex', '.', 'Complex', 'is',
             'better', 'than', 'complicated', '.'])

    @attr("slow")
    def test_tag_textblob(self):
        trained_tagger = PerceptronTagger()
        blob = TextBlob(self.text, pos_tagger=trained_tagger)
        # Punctuation is excluded
        assert_equal([w for w, t in blob.tags],
            ['Simple', 'is', 'better', 'than', 'complex', 'Complex', 'is',
             'better', 'than', 'complicated'])

    def test_loading_missing_file_raises_missing_corpus_exception(self):
        tagger = PerceptronTagger(load=False)
        assert_raises(MissingCorpusError, tagger.load, 'missing.pickle')


def _read_tagged(text, sep='|'):
    sentences = []
    for sent in text.split('\n'):
        tokens = []
        tags = []
        for token in sent.split():
            word, pos = token.split(sep)
            tokens.append(word)
            tags.append(pos)
        sentences.append((tokens, tags))
    return sentences

_wsj_train = ("Pierre|NNP Vinken|NNP ,|, 61|CD years|NNS old|JJ ,|, will|MD "
              "join|VB the|DT board|NN as|IN a|DT nonexecutive|JJ director|NN "
              "Nov.|NNP 29|CD .|.\nMr.|NNP Vinken|NNP is|VBZ chairman|NN of|IN "
              "Elsevier|NNP N.V.|NNP ,|, the|DT Dutch|NNP publishing|VBG "
              "group|NN .|. Rudolph|NNP Agnew|NNP ,|, 55|CD years|NNS old|JJ "
              "and|CC former|JJ chairman|NN of|IN Consolidated|NNP Gold|NNP "
              "Fields|NNP PLC|NNP ,|, was|VBD named|VBN a|DT nonexecutive|JJ "
              "director|NN of|IN this|DT British|JJ industrial|JJ conglomerate|NN "
              ".|.\nA|DT form|NN of|IN asbestos|NN once|RB used|VBN to|TO make|VB "
              "Kent|NNP cigarette|NN filters|NNS has|VBZ caused|VBN a|DT high|JJ "
              "percentage|NN of|IN cancer|NN deaths|NNS among|IN a|DT group|NN "
              "of|IN workers|NNS exposed|VBN to|TO it|PRP more|RBR than|IN "
              "30|CD years|NNS ago|IN ,|, researchers|NNS reported|VBD .|.")


if __name__ == '__main__':
    unittest.main()

--------------------------------------------------------------------------------
/textblob_aptagger/__init__.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
'''textblob-aptagger

A TextBlob extension that adds the `PerceptronTagger`, a fast and accurate
part-of-speech tagger based on the Averaged Perceptron algorithm.
'''
from __future__ import absolute_import
from textblob_aptagger.taggers import PerceptronTagger

__version__ = '0.3.0-dev'
__license__ = "MIT"

--------------------------------------------------------------------------------
/textblob_aptagger/_perceptron.py:
--------------------------------------------------------------------------------
"""
Averaged perceptron classifier. Implementation geared for simplicity rather
than efficiency.
"""
from collections import defaultdict
import pickle
import random


class AveragedPerceptron(object):

    '''An averaged perceptron, as implemented by Matthew Honnibal.

    See more implementation details here:
    http://honnibal.wordpress.com/2013/09/11/a-good-part-of-speechpos-tagger-in-about-200-lines-of-python/
    '''

    def __init__(self):
        # Each feature gets its own weight vector, so weights is a dict-of-dicts
        self.weights = {}
        self.classes = set()
        # The accumulated values, for the averaging. These will be keyed by
        # (feature, class) tuples
        self._totals = defaultdict(int)
        # The last time the feature was changed, for the averaging. Also
        # keyed by (feature, class) tuples
        # (tstamps is short for timestamps)
        self._tstamps = defaultdict(int)
        # Number of instances seen
        self.i = 0

    def predict(self, features):
        '''Dot-product the features and current weights and return the best label.'''
        scores = defaultdict(float)
        for feat, value in features.items():
            if feat not in self.weights or value == 0:
                continue
            weights = self.weights[feat]
            for label, weight in weights.items():
                scores[label] += value * weight
        # Do a secondary alphabetic sort, for stability
        return max(self.classes, key=lambda label: (scores[label], label))

    def update(self, truth, guess, features):
        '''Update the feature weights.'''
        def upd_feat(c, f, w, v):
            param = (f, c)
            self._totals[param] += (self.i - self._tstamps[param]) * w
            self._tstamps[param] = self.i
            self.weights[f][c] = w + v

        self.i += 1
        if truth == guess:
            return None
        for f in features:
            weights = self.weights.setdefault(f, {})
            upd_feat(truth, f, weights.get(truth, 0.0), 1.0)
            upd_feat(guess, f, weights.get(guess, 0.0), -1.0)
        return None

    def average_weights(self):
        '''Average weights from all iterations.'''
        for feat, weights in self.weights.items():
            new_feat_weights = {}
            for clas, weight in weights.items():
                param = (feat, clas)
                total = self._totals[param]
                total += (self.i - self._tstamps[param]) * weight
                averaged = round(total / float(self.i), 3)
                if averaged:
                    new_feat_weights[clas] = averaged
            self.weights[feat] = new_feat_weights
        return None

    def save(self, path):
        '''Save the pickled model weights.'''
        # Pickle requires a binary-mode file handle
        with open(path, 'wb') as f:
            pickle.dump(dict(self.weights), f)
        return None

    def load(self, path):
        '''Load the pickled model weights.'''
        with open(path, 'rb') as f:
            self.weights = pickle.load(f)
        return None


def train(nr_iter, examples):
    '''Return an averaged perceptron model trained on ``examples`` for
    ``nr_iter`` iterations.
    '''
    model = AveragedPerceptron()
    # predict() chooses among model.classes, so register the labels first
    model.classes = set(class_ for _, class_ in examples)
    for i in range(nr_iter):
        random.shuffle(examples)
        for features, class_ in examples:
            # predict() already returns the best-scoring label
            guess = model.predict(features)
            if guess != class_:
                model.update(class_, guess, features)
    model.average_weights()
    return model
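
# Illustrative usage sketch (an addition for documentation, not part of the
# original module): train on two toy feature dicts and query the result. The
# feature names and labels here are invented for the example.
if __name__ == '__main__':
    toy_examples = [
        ({'bias': 1, 'suffix og': 1}, 'NOUN'),
        ({'bias': 1, 'suffix ks': 1}, 'VERB'),
    ]
    model = train(5, toy_examples)
    print(model.predict({'bias': 1, 'suffix og': 1}))  # -> 'NOUN'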

--------------------------------------------------------------------------------
/textblob_aptagger/compat.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
import sys

PY2 = int(sys.version[0]) == 2
PY26 = PY2 and int(sys.version_info[1]) < 7

if PY2:
    text_type = unicode
    binary_type = str
    string_types = (str, unicode)
    unicode = unicode
    basestring = basestring
else:
    text_type = str
    binary_type = bytes
    string_types = (str,)
    unicode = str
    basestring = (str, bytes)

--------------------------------------------------------------------------------
/textblob_aptagger/taggers.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
from __future__ import absolute_import
import os
import random
from collections import defaultdict
import pickle
import logging

from textblob.base import BaseTagger
from textblob.tokenizers import WordTokenizer, SentenceTokenizer
from textblob.exceptions import MissingCorpusError
from textblob_aptagger._perceptron import AveragedPerceptron

PICKLE = "trontagger-0.1.0.pickle"


class PerceptronTagger(BaseTagger):

    '''Greedy Averaged Perceptron tagger, as implemented by Matthew Honnibal.

    See more implementation details here:
    http://honnibal.wordpress.com/2013/09/11/a-good-part-of-speechpos-tagger-in-about-200-lines-of-python/

    :param load: Load the pickled model upon instantiation.
    '''

    START = ['-START-', '-START2-']
    END = ['-END-', '-END2-']
    AP_MODEL_LOC = os.path.join(os.path.dirname(__file__), PICKLE)

    def __init__(self, load=True):
        self.model = AveragedPerceptron()
        self.tagdict = {}
        self.classes = set()
        if load:
            self.load(self.AP_MODEL_LOC)

    def tag(self, corpus, tokenize=True):
        '''Tags a string `corpus`.'''
        # Assume untokenized corpus has \n between sentences and ' ' between words
        s_split = SentenceTokenizer().tokenize if tokenize else lambda t: t.split('\n')
        w_split = WordTokenizer().tokenize if tokenize else lambda s: s.split()

        def split_sents(corpus):
            for s in s_split(corpus):
                yield w_split(s)

        prev, prev2 = self.START
        tokens = []
        for words in split_sents(corpus):
            context = self.START + [self._normalize(w) for w in words] + self.END
            for i, word in enumerate(words):
                tag = self.tagdict.get(word)
                if not tag:
                    features = self._get_features(i, word, context, prev, prev2)
                    tag = self.model.predict(features)
                tokens.append((word, tag))
                prev2 = prev
                prev = tag
        return tokens
    def train(self, sentences, save_loc=None, nr_iter=5):
        '''Train a model from sentences, and save it at ``save_loc``. ``nr_iter``
        controls the number of Perceptron training iterations.

        :param sentences: A list of (words, tags) tuples.
        :param save_loc: If not ``None``, saves a pickled model in this location.
        :param nr_iter: Number of training iterations.
        '''
        self._make_tagdict(sentences)
        self.model.classes = self.classes
        for iter_ in range(nr_iter):
            c = 0
            n = 0
            for words, tags in sentences:
                prev, prev2 = self.START
                context = self.START + [self._normalize(w) for w in words] \
                    + self.END
                for i, word in enumerate(words):
                    guess = self.tagdict.get(word)
                    if not guess:
                        feats = self._get_features(i, word, context, prev, prev2)
                        guess = self.model.predict(feats)
                        self.model.update(tags[i], guess, feats)
                    prev2 = prev
                    prev = guess
                    c += guess == tags[i]
                    n += 1
            random.shuffle(sentences)
            logging.info("Iter {0}: {1}/{2}={3}".format(iter_, c, n, _pc(c, n)))
        self.model.average_weights()
        # Pickle as a binary file
        if save_loc is not None:
            with open(save_loc, 'wb') as f:
                pickle.dump((self.model.weights, self.tagdict, self.classes),
                            f, -1)
        return None

    def load(self, loc):
        '''Load a pickled model.'''
        try:
            with open(loc, 'rb') as f:
                w_td_c = pickle.load(f)
        except IOError:
            msg = "Missing {0} file.".format(PICKLE)
            raise MissingCorpusError(msg)
        self.model.weights, self.tagdict, self.classes = w_td_c
        self.model.classes = self.classes
        return None

    def _normalize(self, word):
        '''Normalization used in pre-processing.

        - All words are lower cased
        - Internally hyphenated words are represented as !HYPHEN
        - Four-digit numbers (e.g. years) are represented as !YEAR
        - Other words starting with a digit are represented as !DIGITS

        :rtype: str
        '''
        if '-' in word and word[0] != '-':
            return '!HYPHEN'
        elif word.isdigit() and len(word) == 4:
            return '!YEAR'
        elif word[0].isdigit():
            return '!DIGITS'
        else:
            return word.lower()

    def _get_features(self, i, word, context, prev, prev2):
        '''Map tokens into a feature representation, implemented as a
        {hashable: float} dict. If the features change, a new model must be
        trained.
        '''
        def add(name, *args):
            features[' '.join((name,) + tuple(args))] += 1

        i += len(self.START)
        features = defaultdict(int)
        # It's useful to have a constant feature, which acts sort of like a prior
        add('bias')
        add('i suffix', word[-3:])
        add('i pref1', word[0])
        add('i-1 tag', prev)
        add('i-2 tag', prev2)
        add('i tag+i-2 tag', prev, prev2)
        add('i word', context[i])
        add('i-1 tag+i word', prev, context[i])
        add('i-1 word', context[i-1])
        add('i-1 suffix', context[i-1][-3:])
        add('i-2 word', context[i-2])
        add('i+1 word', context[i+1])
        add('i+1 suffix', context[i+1][-3:])
        add('i+2 word', context[i+2])
        return features

    def _make_tagdict(self, sentences):
        '''Make a tag dictionary for single-tag words.'''
        counts = defaultdict(lambda: defaultdict(int))
        for words, tags in sentences:
            for word, tag in zip(words, tags):
                counts[word][tag] += 1
                self.classes.add(tag)
        freq_thresh = 20
        ambiguity_thresh = 0.97
        for word, tag_freqs in counts.items():
            tag, mode = max(tag_freqs.items(), key=lambda item: item[1])
            n = sum(tag_freqs.values())
            # Don't add rare words to the tag dictionary
            # Only add quite unambiguous words
            if n >= freq_thresh and (float(mode) / n) >= ambiguity_thresh:
                self.tagdict[word] = tag


def _pc(n, d):
    return (float(n) / d) * 100
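
# Illustrative behaviour of the normalization above (an addition for
# documentation, not part of the original module):
#
#     >>> tagger = PerceptronTagger(load=False)
#     >>> tagger._normalize('co-operate')
#     '!HYPHEN'
#     >>> tagger._normalize('1984')
#     '!YEAR'
#     >>> tagger._normalize('12')
#     '!DIGITS'
#
# Minimal smoke test, assuming the bundled pickle and the NLTK tokenizer data
# used by TextBlob are installed:
if __name__ == '__main__':
    tagger = PerceptronTagger()
    print(tagger.tag("Simple is better than complex."))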

--------------------------------------------------------------------------------
/textblob_aptagger/trontagger-0.1.0.pickle:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sloria/textblob-aptagger/fb98bbd16a83650cab4819c4b89f0973e60fb3fe/textblob_aptagger/trontagger-0.1.0.pickle

--------------------------------------------------------------------------------
/tox.ini:
--------------------------------------------------------------------------------
[tox]
envlist = py26,py27,py33

[testenv]
deps = nose
commands =
    python run_tests.py
--------------------------------------------------------------------------------