├── README.rst ├── polyglot ├── tag │ ├── tests │ │ ├── __init__.py │ │ └── test_base.py │ ├── __init__.py │ └── base.py ├── mapping │ ├── tests │ │ ├── __init__.py │ │ ├── test_embeddings.py │ │ └── test_expansion.py │ ├── __init__.py │ ├── expansion.py │ ├── base.py │ └── embeddings.py ├── tokenize │ ├── tests │ │ ├── __init__.py │ │ └── test_base.py │ ├── __init__.py │ └── base.py ├── transliteration │ ├── tests │ │ ├── __init__.py │ │ └── test_base.py │ ├── __init__.py │ └── base.py ├── detect │ ├── __init__.py │ └── base.py ├── __init__.py ├── decorators.py ├── utils.py ├── load.py ├── base.py └── mixins.py ├── setup.cfg ├── docs ├── authors.rst ├── history.rst ├── readme.rst ├── contributing.rst ├── modules.rst ├── usage.rst ├── Embeddings_files │ └── Embeddings_12_0.png ├── polyglot.tag.rst ├── polyglot.tokenize.rst ├── index_latex.rst ├── polyglot.transliteration.rst ├── polyglot.detect.rst ├── index.rst ├── polyglot.mapping.rst ├── TODO.rst ├── Installation.rst ├── polyglot.rst ├── sphinxext │ └── github_link.py ├── Tokenization.rst ├── Transliteration.rst ├── README.rst ├── POS.rst ├── NamedEntityRecognition.rst ├── Embeddings.rst ├── Makefile ├── make.bat ├── Sentiment.rst ├── CLI.rst └── MorphologicalAnalysis.rst ├── tests ├── __init__.py └── test_polyglot.py ├── requirements.txt ├── rtd_requirements.txt ├── tox.ini ├── AUTHORS.rst ├── nb2rst.sh ├── MANIFEST.in ├── .travis.yml ├── HISTORY.rst ├── .gitignore ├── notebooks ├── testdata │ └── cricket.txt ├── TODO.ipynb ├── Installation.ipynb ├── Transliteration.ipynb ├── Tokenization.ipynb ├── POS.ipynb └── README.ipynb ├── Makefile ├── CONTRIBUTING.rst └── setup.py /README.rst: -------------------------------------------------------------------------------- 1 | docs/README.rst -------------------------------------------------------------------------------- /polyglot/tag/tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /polyglot/mapping/tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /polyglot/tokenize/tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [wheel] 2 | universal = 1 -------------------------------------------------------------------------------- /docs/authors.rst: -------------------------------------------------------------------------------- 1 | .. include:: ../AUTHORS.rst -------------------------------------------------------------------------------- /docs/history.rst: -------------------------------------------------------------------------------- 1 | .. include:: ../HISTORY.rst -------------------------------------------------------------------------------- /docs/readme.rst: -------------------------------------------------------------------------------- 1 | .. 
include:: ../README.rst -------------------------------------------------------------------------------- /polyglot/transliteration/tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- -------------------------------------------------------------------------------- /docs/contributing.rst: -------------------------------------------------------------------------------- 1 | .. include:: ../CONTRIBUTING.rst -------------------------------------------------------------------------------- /docs/modules.rst: -------------------------------------------------------------------------------- 1 | polyglot 2 | ======== 3 | 4 | .. toctree:: 5 | :maxdepth: 4 6 | 7 | polyglot 8 | -------------------------------------------------------------------------------- /docs/usage.rst: -------------------------------------------------------------------------------- 1 | ======== 2 | Usage 3 | ======== 4 | 5 | To use polyglot in a project:: 6 | 7 | import polyglot -------------------------------------------------------------------------------- /polyglot/detect/__init__.py: -------------------------------------------------------------------------------- 1 | from .base import Detector, Language 2 | 3 | __all__ = ['Detector', 'Language'] 4 | -------------------------------------------------------------------------------- /polyglot/transliteration/__init__.py: -------------------------------------------------------------------------------- 1 | from .base import Transliterator 2 | 3 | __all__ = ["Transliterator"] 4 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | wheel>=0.23.0 2 | PyICU>=1.8 3 | pycld2>=0.3 4 | six>=1.7.3 5 | futures>=2.1.6 6 | morfessor>=2.0.2a1 7 | -------------------------------------------------------------------------------- /docs/Embeddings_files/Embeddings_12_0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Todo/polyglot/master/docs/Embeddings_files/Embeddings_12_0.png -------------------------------------------------------------------------------- /polyglot/tag/__init__.py: -------------------------------------------------------------------------------- 1 | from .base import NEChunker, POSTagger, get_pos_tagger, get_ner_tagger 2 | 3 | __all__ = ['NEChunker', "POSTagger", "get_pos_tagger", "get_ner_tagger"] 4 | -------------------------------------------------------------------------------- /rtd_requirements.txt: -------------------------------------------------------------------------------- 1 | wheel>=0.23.0 2 | pycld2>=0.20 3 | six>=1.7.3 4 | futures>=2.1.6 5 | sphinxcontrib-napoleon>=0.2.8 6 | mock>=1.0.1 7 | sphinx-bootstrap-theme>=0.4.5 8 | alabaster>=0.7.1 9 | -------------------------------------------------------------------------------- /polyglot/tokenize/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | from .base import WordTokenizer, SentenceTokenizer 5 | 6 | 7 | __all__ = ['WordTokenizer', 8 | 'SentenceTokenizer'] 9 | -------------------------------------------------------------------------------- /tox.ini: 
-------------------------------------------------------------------------------- 1 | [tox] 2 | envlist = py26, py27, py33, py34 3 | 4 | [testenv] 5 | setenv = 6 | PYTHONPATH = {toxinidir}:{toxinidir}/polyglot 7 | commands = python setup.py test 8 | deps = 9 | -r{toxinidir}/requirements.txt -------------------------------------------------------------------------------- /AUTHORS.rst: -------------------------------------------------------------------------------- 1 | ======= 2 | Credits 3 | ======= 4 | 5 | Development Lead 6 | ---------------- 7 | 8 | * Rami Al-Rfou 9 | 10 | Contributors 11 | ------------ 12 | 13 | * Yingtao Tian 14 | -------------------------------------------------------------------------------- /nb2rst.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | 4 | files=`ls notebooks/*ipynb` 5 | cd docs 6 | for f in $files 7 | do 8 | b=`basename -s .ipynb $f` 9 | ipython nbconvert ../notebooks/${b}.ipynb --to rst --output ${b}.rst 10 | done 11 | cd - 12 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include *.rst 2 | include notebooks/*.ipynb 3 | include *.txt 4 | include Makefile 5 | 6 | recursive-include tests * 7 | recursive-exclude * __pycache__ 8 | recursive-exclude * *.py[co] 9 | 10 | recursive-include docs *.rst conf.py Makefile make.bat 11 | -------------------------------------------------------------------------------- /polyglot/transliteration/tests/test_base.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """Test basic Transliterators facilities.""" 5 | 6 | import unittest 7 | from .. import Transliterator 8 | 9 | class TransliteratorTest(unittest.TestCase): 10 | def __init__(self): 11 | pass 12 | 13 | if __name__ == "__main__": 14 | unittest.main() 15 | -------------------------------------------------------------------------------- /polyglot/tag/tests/test_base.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """Test basic Taggers.""" 5 | 6 | import unittest 7 | from .. import NEChunker, POSTagger 8 | 9 | from io import StringIO 10 | 11 | class NERChunkerTest(unittest.TestCase): 12 | def __init__(self): 13 | pass 14 | 15 | if __name__ == "__main__": 16 | unittest.main() 17 | -------------------------------------------------------------------------------- /polyglot/mapping/__init__.py: -------------------------------------------------------------------------------- 1 | from .base import CountedVocabulary, OrderedVocabulary, VocabularyBase 2 | from .embeddings import Embedding 3 | from .expansion import CaseExpander, DigitExpander 4 | 5 | __all__ = ['CountedVocabulary', 6 | 'OrderedVocabulary', 7 | 'VocabularyBase', 8 | 'Embedding', 9 | 'CaseExpander', 10 | 'DigitExpander'] 11 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | # Config file for automatic testing at travis-ci.org 2 | 3 | language: python 4 | 5 | python: 6 | - "3.4" 7 | - "2.7" 8 | # - "pypy" 9 | 10 | # command to install dependencies, e.g. 
pip install -r requirements.txt --use-mirrors 11 | install: 12 | - sudo apt-get install python-numpy libicu-dev 13 | - pip install -r requirements.txt 14 | 15 | # command to run tests, e.g. python setup.py test 16 | script: nosetests 17 | -------------------------------------------------------------------------------- /HISTORY.rst: -------------------------------------------------------------------------------- 1 | .. :changelog: 2 | 3 | History 4 | ------- 5 | 6 | "14.11" (2014-01-11) 7 | --------------------- 8 | 9 | * First release on PyPI. 10 | 11 | 12 | "15.5.2" (2015-05-02) 13 | --------------------- 14 | 15 | * Polyglot is feature complete. 16 | 17 | 18 | "15.10.03" (2015-10-03) 19 | --------------------------- 20 | 21 | * Change the polyglot models mirror to Stony Brook University DSL lab instead 22 | of Google cloud storage. 23 | -------------------------------------------------------------------------------- /polyglot/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | __author__ = 'Rami Al-Rfou' 4 | __email__ = 'rmyeid@gmail.com' 5 | __version__ = '15.10.03' 6 | 7 | import types 8 | 9 | from six.moves import copyreg 10 | from .base import Sequence, TokenSequence 11 | from .utils import _pickle_method, _unpickle_method 12 | 13 | __all__ = ['Sequence', 'TokenSequence'] 14 | 15 | data_path = '~/' 16 | 17 | copyreg.pickle(types.MethodType, _pickle_method, _unpickle_method) 18 | -------------------------------------------------------------------------------- /tests/test_polyglot.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | test_polyglot 6 | ---------------------------------- 7 | 8 | Tests for `polyglot` module. 9 | """ 10 | 11 | import unittest 12 | 13 | from polyglot import polyglot 14 | 15 | 16 | class TestPolyglot(unittest.TestCase): 17 | 18 | def setUp(self): 19 | pass 20 | 21 | def test_something(self): 22 | pass 23 | 24 | def tearDown(self): 25 | pass 26 | 27 | if __name__ == '__main__': 28 | unittest.main() -------------------------------------------------------------------------------- /docs/polyglot.tag.rst: -------------------------------------------------------------------------------- 1 | polyglot.tag package 2 | ==================== 3 | 4 | Subpackages 5 | ----------- 6 | 7 | .. toctree:: 8 | 9 | polyglot.tag.tests 10 | 11 | Submodules 12 | ---------- 13 | 14 | polyglot.tag.base module 15 | ------------------------ 16 | 17 | .. automodule:: polyglot.tag.base 18 | :members: 19 | :undoc-members: 20 | :show-inheritance: 21 | 22 | 23 | Module contents 24 | --------------- 25 | 26 | .. 
automodule:: polyglot.tag 27 | :members: 28 | :undoc-members: 29 | :show-inheritance: 30 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.py[cod] 2 | 3 | # C extensions 4 | *.so 5 | 6 | # Packages 7 | *.egg 8 | *.egg-info 9 | dist 10 | build 11 | eggs 12 | parts 13 | bin 14 | var 15 | sdist 16 | develop-eggs 17 | .installed.cfg 18 | lib 19 | lib64 20 | 21 | # Installer logs 22 | pip-log.txt 23 | 24 | # Unit test / coverage reports 25 | .coverage 26 | .tox 27 | nosetests.xml 28 | htmlcov 29 | 30 | # Translations 31 | *.mo 32 | 33 | # Mr Developer 34 | .mr.developer.cfg 35 | .project 36 | .pydevproject 37 | 38 | # Complexity 39 | output/*.html 40 | output/*/index.html 41 | 42 | # Sphinx 43 | docs/_build -------------------------------------------------------------------------------- /docs/polyglot.tokenize.rst: -------------------------------------------------------------------------------- 1 | polyglot.tokenize package 2 | ========================= 3 | 4 | Subpackages 5 | ----------- 6 | 7 | .. toctree:: 8 | 9 | polyglot.tokenize.tests 10 | 11 | Submodules 12 | ---------- 13 | 14 | polyglot.tokenize.base module 15 | ----------------------------- 16 | 17 | .. automodule:: polyglot.tokenize.base 18 | :members: 19 | :undoc-members: 20 | :show-inheritance: 21 | 22 | 23 | Module contents 24 | --------------- 25 | 26 | .. automodule:: polyglot.tokenize 27 | :members: 28 | :undoc-members: 29 | :show-inheritance: 30 | -------------------------------------------------------------------------------- /docs/index_latex.rst: -------------------------------------------------------------------------------- 1 | .. complexity documentation master file, created by 2 | sphinx-quickstart on Tue Jul 9 22:26:36 2013. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | Welcome to polyglot's documentation! 7 | ====================================== 8 | 9 | .. toctree:: 10 | :maxdepth: 3 11 | 12 | Installation 13 | Detection 14 | Tokenization 15 | CLI 16 | Download 17 | Embeddings 18 | POS 19 | NamedEntityRecognition 20 | MorphologicalAnalysis 21 | Transliteration 22 | Sentiment 23 | -------------------------------------------------------------------------------- /docs/polyglot.transliteration.rst: -------------------------------------------------------------------------------- 1 | polyglot.transliteration package 2 | ================================ 3 | 4 | Subpackages 5 | ----------- 6 | 7 | .. toctree:: 8 | 9 | polyglot.transliteration.tests 10 | 11 | Submodules 12 | ---------- 13 | 14 | polyglot.transliteration.base module 15 | ------------------------------------ 16 | 17 | .. automodule:: polyglot.transliteration.base 18 | :members: 19 | :undoc-members: 20 | :show-inheritance: 21 | 22 | 23 | Module contents 24 | --------------- 25 | 26 | .. automodule:: polyglot.transliteration 27 | :members: 28 | :undoc-members: 29 | :show-inheritance: 30 | -------------------------------------------------------------------------------- /docs/polyglot.detect.rst: -------------------------------------------------------------------------------- 1 | polyglot.detect package 2 | ======================= 3 | 4 | Submodules 5 | ---------- 6 | 7 | polyglot.detect.base module 8 | --------------------------- 9 | 10 | .. 
automodule:: polyglot.detect.base 11 | :members: 12 | :undoc-members: 13 | :show-inheritance: 14 | 15 | polyglot.detect.langids module 16 | ------------------------------ 17 | 18 | .. automodule:: polyglot.detect.langids 19 | :members: 20 | :undoc-members: 21 | :show-inheritance: 22 | 23 | 24 | Module contents 25 | --------------- 26 | 27 | .. automodule:: polyglot.detect 28 | :members: 29 | :undoc-members: 30 | :show-inheritance: 31 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | .. complexity documentation master file, created by 2 | sphinx-quickstart on Tue Jul 9 22:26:36 2013. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | Welcome to polyglot's documentation! 7 | ====================================== 8 | 9 | .. include:: 10 | README.rst 11 | 12 | Contents: 13 | ========= 14 | 15 | .. toctree:: 16 | :maxdepth: 2 17 | 18 | Installation 19 | Detection 20 | Tokenization 21 | CLI 22 | Download 23 | Embeddings 24 | POS 25 | NamedEntityRecognition 26 | MorphologicalAnalysis 27 | Transliteration 28 | Sentiment 29 | modules 30 | 31 | -------------------------------------------------------------------------------- /notebooks/testdata/cricket.txt: -------------------------------------------------------------------------------- 1 | Australia posted a World Cup record total of 417-6 as they beat Afghanistan by 275 runs. 2 | David Warner hit 178 off 133 balls, Steve Smith scored 95 while Glenn Maxwell struck 88 in 39 deliveries in the Pool A encounter in Perth. 3 | Afghanistan were then dismissed for 142, with Mitchell Johnson and Mitchell Starc taking six wickets between them. 4 | Australia's score surpassed the 413-5 India made against Bermuda in 2007. 5 | It continues the pattern of bat dominating ball in this tournament as the third 400 plus score achieved in the pool stages, following South Africa's 408-5 and 411-4 against West Indies and Ireland respectively. 6 | The winning margin beats the 257-run amount by which India beat Bermuda in Port of Spain in 2007, which was equalled five days ago by South Africa in their victory over West Indies in Sydney. 7 | -------------------------------------------------------------------------------- /polyglot/decorators.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import functools 5 | 6 | class cached_property(object): 7 | """A property that is only computed once per instance and then replaces 8 | itself with an ordinary attribute. Deleting the attribute resets the 9 | property. 10 | Credit to Marcel Hellkamp, author of bottle.py. 
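A minimal usage sketch (the Circle class below is hypothetical; it only
    illustrates the caching behaviour):

        class Circle(object):
            def __init__(self, radius):
                self.radius = radius

            @cached_property
            def area(self):
                # Computed once on the first attribute access, then stored in
                # the instance __dict__ and reused on later accesses.
                return 3.14159 * self.radius ** 2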
11 | """ 12 | 13 | def __init__(self, func): 14 | self.__doc__ = getattr(func, '__doc__') 15 | self.func = func 16 | 17 | def __get__(self, obj, cls): 18 | if obj is None: 19 | return self 20 | value = obj.__dict__[self.func.__name__] = self.func(obj) 21 | return value 22 | 23 | def memoize(obj): 24 | cache = obj.cache = {} 25 | 26 | @functools.wraps(obj) 27 | def memoizer(*args, **kwargs): 28 | key = tuple(list(args) + sorted(kwargs.items())) 29 | if key not in cache: 30 | cache[key] = obj(*args, **kwargs) 31 | return cache[key] 32 | return memoizer 33 | -------------------------------------------------------------------------------- /docs/polyglot.mapping.rst: -------------------------------------------------------------------------------- 1 | polyglot.mapping package 2 | ======================== 3 | 4 | Subpackages 5 | ----------- 6 | 7 | .. toctree:: 8 | 9 | polyglot.mapping.tests 10 | 11 | Submodules 12 | ---------- 13 | 14 | polyglot.mapping.base module 15 | ---------------------------- 16 | 17 | .. automodule:: polyglot.mapping.base 18 | :members: 19 | :undoc-members: 20 | :show-inheritance: 21 | 22 | polyglot.mapping.embeddings module 23 | ---------------------------------- 24 | 25 | .. automodule:: polyglot.mapping.embeddings 26 | :members: 27 | :undoc-members: 28 | :show-inheritance: 29 | 30 | polyglot.mapping.expansion module 31 | --------------------------------- 32 | 33 | .. automodule:: polyglot.mapping.expansion 34 | :members: 35 | :undoc-members: 36 | :show-inheritance: 37 | 38 | 39 | Module contents 40 | --------------- 41 | 42 | .. automodule:: polyglot.mapping 43 | :members: 44 | :undoc-members: 45 | :show-inheritance: 46 | -------------------------------------------------------------------------------- /docs/TODO.rst: -------------------------------------------------------------------------------- 1 | 2 | Tasks 3 | ===== 4 | 5 | - [STRIKEOUT:POS] 6 | - [STRIKEOUT:morphological analysis] 7 | - [STRIKEOUT:transliteration] 8 | 9 | Library Interface 10 | ================= 11 | 12 | - [STRIKEOUT:Sentiment] 13 | - [STRIKEOUT:NER] 14 | - Frequency based comparison 15 | 16 | Command Line interface 17 | ====================== 18 | 19 | - Sentiment 20 | - Reading stdin column format 21 | 22 | Infrastructure 23 | ============== 24 | 25 | - [STRIKEOUT:Cache models] 26 | - [STRIKEOUT:Add normalization to the embeddings] 27 | - [STRIKEOUT:Detect supported languages] 28 | - [STRIKEOUT:added task/lang as part of the identifier, what is left is 29 | to iterate over the collections.] 30 | - [STRIKEOUT:Throw different exception for missing package than 31 | undownloaded one] 32 | - [STRIKEOUT:Define NotSupportedLanguage/Task Exception for the 33 | downloader] 34 | - [STRIKEOUT:Remove noun phrases support.] 35 | - [STRIKEOUT:Train more/new POS taggers] 36 | 37 | Documentation 38 | ============= 39 | 40 | - Add a quick tutorial 41 | - Embed demos in our documentation 42 | - [STRIKEOUT:pycld2 README] 43 | - [STRIKEOUT:Update rtdcs with the new submodules.] 
44 | -------------------------------------------------------------------------------- /polyglot/tokenize/base.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ Basic text segmenters.""" 5 | 6 | from icu import Locale, BreakIterator 7 | from polyglot.base import Sequence 8 | 9 | 10 | class Breaker(object): 11 | """ Base class to segment text.""" 12 | 13 | def __init__(self, locale): 14 | self.locale = Locale(locale) 15 | self.breaker = None 16 | 17 | def transform(self, sequence): 18 | seq = Sequence(sequence.text) 19 | seq.idx = [0] 20 | for segment in sequence: 21 | offset = seq.idx[-1] 22 | self.breaker.setText(segment) 23 | seq.idx.extend([offset+x for x in self.breaker]) 24 | return seq 25 | 26 | 27 | class SentenceTokenizer(Breaker): 28 | """ Segment text to sentences. """ 29 | 30 | def __init__(self, locale='en'): 31 | super(SentenceTokenizer, self).__init__(locale) 32 | self.breaker = BreakIterator.createSentenceInstance(self.locale) 33 | 34 | 35 | class WordTokenizer(Breaker): 36 | """ Segment text to words or tokens.""" 37 | 38 | def __init__(self, locale='en'): 39 | super(WordTokenizer, self).__init__(locale) 40 | self.breaker = BreakIterator.createWordInstance(self.locale) 41 | -------------------------------------------------------------------------------- /docs/Installation.rst: -------------------------------------------------------------------------------- 1 | 2 | Installation 3 | ============ 4 | 5 | Installing/Upgrading From the PyPI 6 | ---------------------------------- 7 | 8 | :: 9 | 10 | $ pip install polyglot 11 | 12 | Dependencies 13 | ~~~~~~~~~~~~ 14 | 15 | polyglot depends on `numpy <http://www.numpy.org/>`__ and 16 | `libicu-dev <https://packages.debian.org/sid/libicu-dev>`__. On 17 | Ubuntu/Debian Linux distributions you can install these packages by 18 | executing the following command: 19 | 20 | .. code:: bash 21 | 22 | sudo apt-get install python-numpy libicu-dev 23 | 24 | From Source 25 | ----------- 26 | 27 | polyglot is actively developed on 28 | `Github <https://github.com/aboSamoor/polyglot>`__. 29 | 30 | You can clone the public repo: 31 | 32 | .. code:: bash 33 | 34 | git clone https://github.com/aboSamoor/polyglot 35 | 36 | Or download one of the following: 37 | 38 | - `tarball <https://github.com/aboSamoor/polyglot/tarball/master>`__ 39 | - `zipball <https://github.com/aboSamoor/polyglot/zipball/master>`__ 40 | 41 | Once you have the source, you can install it into your site-packages 42 | with: 43 | 44 | .. code:: bash 45 | 46 | python setup.py install 47 | 48 | Get the bleeding edge version 49 | ----------------------------- 50 | 51 | To get the latest development version of polyglot, run: 52 | 53 | :: 54 | 55 | $ pip install -U git+https://github.com/aboSamoor/polyglot.git@master 56 | 57 | Python 58 | ~~~~~~ 59 | 60 | polyglot supports Python >=2.7 or >=3.4.
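To verify that the installation worked, a minimal sanity check is to import
the package and print its version:

.. code:: python

    import polyglot
    print(polyglot.__version__)

Models used by the individual tasks (embeddings, taggers, transliteration)
are downloaded separately through the polyglot downloader covered in the
following sections.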
61 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: clean-pyc clean-build docs clean 2 | 3 | help: 4 | @echo "clean-build - remove build artifacts" 5 | @echo "clean-pyc - remove Python file artifacts" 6 | @echo "lint - check style with flake8" 7 | @echo "test - run tests quickly with the default Python" 8 | @echo "test-all - run tests on every Python version with tox" 9 | @echo "coverage - check code coverage quickly with the default Python" 10 | @echo "docs - generate Sphinx HTML documentation, including API docs" 11 | @echo "release - package and upload a release" 12 | @echo "dist - package" 13 | 14 | clean: clean-build clean-pyc 15 | rm -fr htmlcov/ 16 | 17 | clean-build: 18 | rm -fr build/ 19 | rm -fr dist/ 20 | rm -fr *.egg-info 21 | 22 | clean-pyc: 23 | find . -name '*.pyc' -exec rm -f {} + 24 | find . -name '*.pyo' -exec rm -f {} + 25 | find . -name '*~' -exec rm -f {} + 26 | 27 | lint: 28 | flake8 polyglot tests 29 | 30 | test: 31 | python setup.py test 32 | 33 | test-all: 34 | tox 35 | 36 | coverage: 37 | coverage run --source polyglot setup.py test 38 | coverage report -m 39 | coverage html 40 | open htmlcov/index.html 41 | 42 | docs: 43 | ./nb2rst.sh 44 | rm -f docs/polyglot.rst 45 | rm -f docs/modules.rst 46 | sphinx-apidoc -o docs/ polyglot 47 | $(MAKE) -C docs clean 48 | $(MAKE) -C docs html 49 | rm -f docs/*tests*rst 50 | xdg-open docs/_build/html/index.html 51 | 52 | release: clean 53 | python setup.py sdist upload 54 | python setup.py bdist_wheel upload 55 | 56 | dist: clean 57 | python setup.py sdist 58 | python setup.py bdist_wheel 59 | ls -l dist 60 | -------------------------------------------------------------------------------- /docs/polyglot.rst: -------------------------------------------------------------------------------- 1 | polyglot package 2 | ================ 3 | 4 | Subpackages 5 | ----------- 6 | 7 | .. toctree:: 8 | 9 | polyglot.detect 10 | polyglot.mapping 11 | polyglot.tag 12 | polyglot.tokenize 13 | polyglot.transliteration 14 | 15 | Submodules 16 | ---------- 17 | 18 | polyglot.base module 19 | -------------------- 20 | 21 | .. automodule:: polyglot.base 22 | :members: 23 | :undoc-members: 24 | :show-inheritance: 25 | 26 | polyglot.decorators module 27 | -------------------------- 28 | 29 | .. automodule:: polyglot.decorators 30 | :members: 31 | :undoc-members: 32 | :show-inheritance: 33 | 34 | polyglot.downloader module 35 | -------------------------- 36 | 37 | .. automodule:: polyglot.downloader 38 | :members: 39 | :undoc-members: 40 | :show-inheritance: 41 | 42 | polyglot.load module 43 | -------------------- 44 | 45 | .. automodule:: polyglot.load 46 | :members: 47 | :undoc-members: 48 | :show-inheritance: 49 | 50 | polyglot.mixins module 51 | ---------------------- 52 | 53 | .. automodule:: polyglot.mixins 54 | :members: 55 | :undoc-members: 56 | :show-inheritance: 57 | 58 | polyglot.text module 59 | -------------------- 60 | 61 | .. automodule:: polyglot.text 62 | :members: 63 | :undoc-members: 64 | :show-inheritance: 65 | 66 | polyglot.utils module 67 | --------------------- 68 | 69 | .. automodule:: polyglot.utils 70 | :members: 71 | :undoc-members: 72 | :show-inheritance: 73 | 74 | 75 | Module contents 76 | --------------- 77 | 78 | .. 
automodule:: polyglot 79 | :members: 80 | :undoc-members: 81 | :show-inheritance: 82 | -------------------------------------------------------------------------------- /notebooks/TODO.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Tasks\n", 8 | "\n", 9 | "- ~~POS~~\n", 10 | "- ~~morphological analysis~~\n", 11 | "- ~~transliteration~~\n", 12 | "\n", 13 | "# Library Interface\n", 14 | "- ~~Sentiment~~\n", 15 | "- ~~NER~~\n", 16 | "- Frequency based comparison\n", 17 | "\n", 18 | "\n", 19 | "# Command Line interface\n", 20 | "\n", 21 | "- Sentiment\n", 22 | "- Reading stdin column format\n", 23 | "\n", 24 | "\n", 25 | "# Infrastructure\n", 26 | "- ~~Cache models~~\n", 27 | "- ~~Add normalization to the embeddings~~\n", 28 | "- ~~Detect supported languages~~\n", 29 | " - ~~added task/lang as part of the identifier, what is left is to iterate over the collections.~~\n", 30 | "- ~~Throw different exception for missing package than undownloaded one~~\n", 31 | "- ~~Define NotSupportedLanguage/Task Exception for the downloader~~\n", 32 | "- ~~Remove noun phrases support.~~\n", 33 | "- ~~Train more/new POS taggers~~\n", 34 | "\n", 35 | "\n", 36 | "# Documentation\n", 37 | "\n", 38 | "- Add a quick tutorial\n", 39 | "- Embed demos in our documentation\n", 40 | "- ~~pycld2 README~~\n", 41 | "- ~~Update rtdcs with the new submodules.~~" 42 | ] 43 | } 44 | ], 45 | "metadata": { 46 | "kernelspec": { 47 | "display_name": "Python 2", 48 | "language": "python", 49 | "name": "python2" 50 | }, 51 | "language_info": { 52 | "codemirror_mode": { 53 | "name": "ipython", 54 | "version": 2 55 | }, 56 | "file_extension": ".py", 57 | "mimetype": "text/x-python", 58 | "name": "python", 59 | "nbconvert_exporter": "python", 60 | "pygments_lexer": "ipython2", 61 | "version": "2.7.6" 62 | } 63 | }, 64 | "nbformat": 4, 65 | "nbformat_minor": 0 66 | } 67 | -------------------------------------------------------------------------------- /polyglot/mapping/tests/test_embeddings.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """Test basic embedding utilities.""" 5 | 6 | import unittest 7 | from ..embeddings import Embedding 8 | 9 | from io import StringIO 10 | 11 | word2vec_dump = u""" 12 | 9 5 13 | 0.001329 -0.000965 -0.001856 -0.000425 -0.000381 14 | the -0.144928 0.074345 -0.069327 -0.017698 0.090774 15 | , -0.022361 -0.033252 -0.000350 -0.027688 -0.025736 16 | . 
0.006878 0.064503 0.074926 -0.048397 -0.041165 17 | of 0.182565 0.125933 0.065001 -0.004585 0.164688 18 | and 0.013473 0.012923 0.027855 0.046051 -0.043293 19 | in -0.003114 -0.126757 0.099654 0.059442 0.003293 20 | to 0.223011 -0.080497 -0.083754 -0.182311 0.057853 21 | a -0.136669 0.161203 0.192028 0.068527 0.292363 22 | """.strip() 23 | 24 | 25 | class EmbeddingTest(unittest.TestCase): 26 | def setUp(self): 27 | self.fname = StringIO(word2vec_dump) 28 | self.model = Embedding.from_word2vec(self.fname, binary=0, fvocab=None) 29 | self.words = ["", "the", ",", ".", "of", "and", "in", "to", "a"] 30 | 31 | def tearDown(self): 32 | pass 33 | 34 | def test_model_words(self): 35 | self.assertEqual(self.model.words, self.words) 36 | self.assertAlmostEqual(self.model[self.words[-1]][-1], 0.292363) 37 | 38 | def test_most_frequent(self): 39 | model = self.model.most_frequent(3) 40 | self.assertEqual(model.words, self.words[:3]) 41 | self.assertEqual(model.shape, (3, 5)) 42 | 43 | def test_model_shape(self): 44 | self.assertEqual(self.model.shape, (9, 5)) 45 | 46 | def test_deletion(self): 47 | del self.model[self.words[5]] 48 | self.assertEqual(self.model.shape, (8, 5)) 49 | self.assertEqual(self.model.words, self.words[:5]+self.words[6:]) 50 | self.assertFalse(self.words[5] in self.model) 51 | 52 | def test_word_with_space(self): 53 | new_dump = word2vec_dump.replace("9", "10") + u"\na b 1.0 2.0 3.0 4.0 5.0" 54 | fname = StringIO(new_dump) 55 | model = Embedding.from_word2vec(fname, binary=0, fvocab=None) 56 | self.assertEqual(model.words[-1], u"a b") 57 | 58 | def test_norm(self): 59 | model = self.model.normalize_words() 60 | norms = (model.vectors ** 2).sum(axis=1) 61 | _ = [self.assertAlmostEqual(x,y, places=6) for x,y in zip(norms, [1.]*model.shape[0])] 62 | 63 | 64 | if __name__ == "__main__": 65 | unittest.main() 66 | -------------------------------------------------------------------------------- /polyglot/utils.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """Collection of general utilities.""" 5 | 6 | from __future__ import print_function 7 | from os import path 8 | import os 9 | import tarfile 10 | 11 | import six 12 | from six import text_type as unicode 13 | from six import string_types 14 | 15 | 16 | def _open(file_, mode='r'): 17 | """Open file object given filenames, open files or even archives.""" 18 | if isinstance(file_, string_types): 19 | _, ext = path.splitext(file_) 20 | if ext in {'.bz2', '.gz'}: 21 | s = tarfile.open(file_) 22 | return s.extractfile(s.next()) 23 | else: 24 | return open(file_, mode) 25 | return file_ 26 | 27 | 28 | def _print(text): 29 | """Handle the differences between Pytho2,3 print functions. 30 | Args: 31 | text (string): Should be in unicode. 
32 | """ 33 | if six.PY3: 34 | print(text) 35 | else: 36 | print(text.encode("utf8")) 37 | 38 | def _pickle_method(method): 39 | """Pickle methods properly, including class methods.""" 40 | func_name = method.im_func.__name__ 41 | obj = method.im_self 42 | cls = method.im_class 43 | if isinstance(cls, type): 44 | # handle classmethods differently 45 | cls = obj 46 | obj = None 47 | if func_name.startswith('__') and not func_name.endswith('__'): 48 | #deal with mangled names 49 | cls_name = cls.__name__.lstrip('_') 50 | func_name = '_%s%s' % (cls_name, func_name) 51 | return _unpickle_method, (func_name, obj, cls) 52 | 53 | def _unpickle_method(func_name, obj, cls): 54 | """Unpickle methods properly, including class methods.""" 55 | 56 | if obj is None: 57 | return cls.__dict__[func_name].__get__(obj, cls) 58 | for cls in cls.__mro__: 59 | try: 60 | func = cls.__dict__[func_name] 61 | except KeyError: 62 | pass 63 | else: 64 | break 65 | return func.__get__(obj, cls) 66 | 67 | def pretty_list(items, cols=3): 68 | text = [] 69 | width = 24 70 | col_width = u"{" + u":<" + str(width) + u"} " 71 | for i, lang in enumerate(items): 72 | if not six.PY3: 73 | lang = lang.decode(u"utf-8") 74 | if len(lang) > width: 75 | lang = lang[:width-3] + "..." 76 | text.append(u"{:>3}. ".format(i+1)) 77 | text.append(col_width.format(lang)) 78 | if (i+1) % cols == 0: 79 | text.append(u"\n") 80 | return u"".join(text) 81 | -------------------------------------------------------------------------------- /polyglot/mapping/tests/test_expansion.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """Test Expanding vocbulary.""" 5 | 6 | import unittest 7 | from io import StringIO 8 | 9 | from ..base import OrderedVocabulary 10 | from ..expansion import DigitExpander, CaseExpander 11 | 12 | 13 | vocab = u""" 14 | the 15 | book 16 | Book 17 | 3 18 | upper 19 | lower 20 | 5 21 | cool 22 | McCain 23 | """.strip() 24 | 25 | 26 | class DigitExpanderTest(unittest.TestCase): 27 | def setUp(self): 28 | self.v = OrderedVocabulary.from_vocabfile(StringIO(vocab)) 29 | 30 | def test_load(self): 31 | self.assertEqual(len(self.v), 9) 32 | 33 | def test_digit_expansion(self): 34 | v = DigitExpander(vocabulary=self.v, strategy='most_frequent') 35 | self.assertEqual(len(v), 10) 36 | 37 | def test_digit_membership(self): 38 | v = DigitExpander(vocabulary=self.v, strategy='most_frequent') 39 | self.assertTrue(u"8" in v) 40 | self.assertTrue(u"3" in v) 41 | self.assertFalse(u"71" in v) 42 | 43 | def test_digit_ids(self): 44 | v = DigitExpander(vocabulary=self.v, strategy='most_frequent') 45 | self.assertEqual(v["6"], 3) 46 | self.assertEqual(v["7"], v["2"]) 47 | self.assertNotEqual(v["3"], v["5"]) 48 | 49 | class CaseExpanderTest(unittest.TestCase): 50 | def setUp(self): 51 | self.v = OrderedVocabulary.from_vocabfile(StringIO(vocab)) 52 | 53 | def test_load(self): 54 | self.assertEqual(len(self.v), 9) 55 | 56 | def test_case_expansion(self): 57 | v = CaseExpander(vocabulary=self.v, strategy='most_frequent') 58 | self.assertEqual(len(v), 21) 59 | 60 | def test_digit_membership(self): 61 | v = CaseExpander(vocabulary=self.v, strategy='most_frequent') 62 | self.assertTrue(u"3" in v) 63 | self.assertTrue(u"BOOK" in v) 64 | self.assertTrue(u"mccain" in v) 65 | 66 | def test_digit_ids(self): 67 | v = CaseExpander(vocabulary=self.v, strategy='most_frequent') 68 | self.assertEqual(v["THE"], 0) 69 | self.assertEqual(v["UPPER"], v["upper"]) 70 | 71 | 
class MixedExpansionTest(unittest.TestCase): 72 | def setUp(self): 73 | self.v = OrderedVocabulary.from_vocabfile(StringIO(vocab)) 74 | self.v1 = CaseExpander(vocabulary=self.v, strategy='most_frequent') 75 | self.v2 = DigitExpander(vocabulary=self.v1, strategy='most_frequent') 76 | 77 | def test_expansion(self): 78 | self.assertEqual(len(self.v2), 22) 79 | 80 | def test_membership(self): 81 | self.assertTrue(u"3" in self.v2) 82 | self.assertTrue(u"9" in self.v2) 83 | self.assertTrue(u"#" in self.v2) 84 | self.assertTrue(u"BOOK" in self.v2) 85 | self.assertTrue(u"mccain" in self.v2) 86 | 87 | def test_ids(self): 88 | self.assertEqual(self.v2["THE"], 0) 89 | self.assertEqual(self.v2["UPPER"], self.v2["upper"]) 90 | self.assertEqual(self.v2["3"], self.v2["7"]) 91 | 92 | 93 | if __name__ == "__main__": 94 | unittest.main() 95 | -------------------------------------------------------------------------------- /docs/sphinxext/github_link.py: -------------------------------------------------------------------------------- 1 | from operator import attrgetter 2 | import inspect 3 | import subprocess 4 | import os 5 | import sys 6 | from functools import partial 7 | 8 | REVISION_CMD = 'git rev-parse --short HEAD' 9 | 10 | 11 | def _get_git_revision(): 12 | try: 13 | revision = subprocess.check_output(REVISION_CMD.split()).strip() 14 | except subprocess.CalledProcessError: 15 | print('Failed to execute git to get revision') 16 | return None 17 | return revision.decode('utf-8') 18 | 19 | 20 | def _linkcode_resolve(domain, info, package, url_fmt, revision): 21 | """Determine a link to online source for a class/method/function 22 | 23 | This is called by sphinx.ext.linkcode 24 | 25 | An example with a long-untouched module that everyone has 26 | >>> _linkcode_resolve('py', {'module': 'tty', 27 | ... 'fullname': 'setraw'}, 28 | ... package='tty', 29 | ... url_fmt='http://hg.python.org/cpython/file/' 30 | ... '{revision}/Lib/{package}/{path}#L{lineno}', 31 | ... 
revision='xxxx') 32 | 'http://hg.python.org/cpython/file/xxxx/Lib/tty/tty.py#L18' 33 | """ 34 | 35 | if revision is None: 36 | return 37 | if domain not in ('py', 'pyx'): 38 | return 39 | if not info.get('module') or not info.get('fullname'): 40 | return 41 | 42 | class_name = info['fullname'].split('.')[0] 43 | if type(class_name) != str: 44 | # Python 2 only 45 | class_name = class_name.encode('utf-8') 46 | module = __import__(info['module'], fromlist=[class_name]) 47 | try: 48 | obj = attrgetter(info['fullname'])(module) 49 | except AttributeError: 50 | return 51 | 52 | try: 53 | fn = inspect.getsourcefile(obj) 54 | except Exception: 55 | fn = None 56 | if not fn: 57 | try: 58 | fn = inspect.getsourcefile(sys.modules[obj.__module__]) 59 | except Exception: 60 | fn = None 61 | if not fn: 62 | return 63 | 64 | fn = os.path.relpath(fn, 65 | start=os.path.dirname(__import__(package).__file__)) 66 | try: 67 | lineno = inspect.getsourcelines(obj)[1] 68 | except Exception: 69 | lineno = '' 70 | return url_fmt.format(revision=revision, package=package, 71 | path=fn, lineno=lineno) 72 | 73 | 74 | def make_linkcode_resolve(package, url_fmt): 75 | """Returns a linkcode_resolve function for the given URL format 76 | 77 | revision is a git commit reference (hash or name) 78 | 79 | package is the name of the root module of the package 80 | 81 | url_fmt is along the lines of ('https://github.com/USER/PROJECT/' 82 | 'blob/{revision}/{package}/' 83 | '{path}#L{lineno}') 84 | """ 85 | revision = _get_git_revision() 86 | return partial(_linkcode_resolve, revision=revision, package=package, 87 | url_fmt=url_fmt) 88 | -------------------------------------------------------------------------------- /polyglot/detect/base.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | 5 | """Detecting languages""" 6 | 7 | 8 | import logging 9 | 10 | 11 | from icu import Locale 12 | import pycld2 as cld2 13 | 14 | logger = logging.getLogger(__name__) 15 | 16 | 17 | class Error(Exception): 18 | """Base exception class for this class.""" 19 | 20 | 21 | class UnknownLanguage(Error): 22 | """Raised if we can not detect the language of a text snippet.""" 23 | 24 | 25 | class Language(object): 26 | def __init__(self, choice): 27 | basic_name, code, confidence, bytesize = choice 28 | self.locale = Locale(code) 29 | self.confidence = float(confidence) 30 | self.read_bytes = int(bytesize) 31 | 32 | @property 33 | def name(self): 34 | return self.locale.getDisplayLanguage() 35 | 36 | @property 37 | def code(self): 38 | return self.locale.getName() 39 | 40 | def __str__(self): 41 | return ("name: {:<12}code: {:<9}confidence: {:>5.1f} " 42 | "read bytes:{:>6}".format(self.name, self.code, 43 | self.confidence, self.read_bytes)) 44 | 45 | @staticmethod 46 | def from_code(code): 47 | return Language(("", code, 100, 0)) 48 | 49 | 50 | class Detector(object): 51 | """ Detect the language used in a snippet of text. 52 | """ 53 | 54 | def __init__(self, text, quiet=False): 55 | """ Detector of the language used in `text`. 56 | 57 | Args: 58 | text (string): unicode string. 
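Example (an illustrative sketch; the actual prediction depends on the
        pycld2 models backing the detector):

            detector = Detector(u"Bonjour tout le monde")
            print(detector.language.name)        # expected: French
            print(detector.language.confidence)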
59 | """ 60 | self.__text = text 61 | self.reliable = True 62 | """False if the detector used Best Effort strategy in detection.""" 63 | self.quiet = quiet 64 | """If true, exceptions will be silenced.""" 65 | self.detect(text) 66 | 67 | @staticmethod 68 | def supported_languages(): 69 | """Returns a list of the languages that can be detected by pycld2.""" 70 | return [name.capitalize() for name,code in cld2.LANGUAGES if not name.startswith("X_")] 71 | 72 | def detect(self, text): 73 | """Decide which language is used to write the text. 74 | 75 | The method tries first to detect the language with high reliability. If 76 | that is not possible, the method switches to best effort strategy. 77 | 78 | 79 | Args: 80 | text (string): A snippet of text, the longer it is the more reliable we 81 | can detect the language used to write the text. 82 | """ 83 | t = text.encode("utf-8") 84 | reliable, index, top_3_choices = cld2.detect(t, bestEffort=False) 85 | 86 | if not reliable: 87 | self.reliable = False 88 | reliable, index, top_3_choices = cld2.detect(t, bestEffort=True) 89 | 90 | if not reliable and not self.quiet: 91 | raise UnknownLanguage("Try passing a longer snippet of text") 92 | else: 93 | logger.warning("Detector is not able to detect the language reliably.") 94 | 95 | self.languages = [Language(x) for x in top_3_choices] 96 | self.language = self.languages[0] 97 | return self.language 98 | 99 | def __str__(self): 100 | text = "Prediction is reliable: {}\n".format(self.reliable) 101 | text += u"\n".join(["Language {}: {}".format(i+1, str(l)) 102 | for i,l in enumerate(self.languages)]) 103 | return text 104 | -------------------------------------------------------------------------------- /notebooks/Installation.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "#Installation" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "## Installing/Upgrading From the PyPI\n", 15 | "\n", 16 | "\n", 17 | " $ pip install polyglot" 18 | ] 19 | }, 20 | { 21 | "cell_type": "markdown", 22 | "metadata": {}, 23 | "source": [ 24 | "### Dependencies\n", 25 | "\n", 26 | "polyglot depends on [numpy](http://www.numpy.org/) and [libicu-dev](https://packages.debian.org/sid/libicu-dev), on ubuntu/debian linux distribution you can install such packages by executing the following command:" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": null, 32 | "metadata": { 33 | "collapsed": false 34 | }, 35 | "outputs": [], 36 | "source": [ 37 | "sudo apt-get install python-numpy libicu-dev" 38 | ] 39 | }, 40 | { 41 | "cell_type": "markdown", 42 | "metadata": {}, 43 | "source": [ 44 | "## From Source\n", 45 | "\n", 46 | "\n", 47 | "polyglot is actively developed on\n", 48 | "[Github](https://github.com/aboSamoor/polyglot).\n", 49 | "\n", 50 | "You can clone the public repo:" 51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": null, 56 | "metadata": { 57 | "collapsed": true 58 | }, 59 | "outputs": [], 60 | "source": [ 61 | "git clone https://github.com/aboSamoor/polyglot" 62 | ] 63 | }, 64 | { 65 | "cell_type": "markdown", 66 | "metadata": {}, 67 | "source": [ 68 | "Or download one of the following:\n", 69 | "\n", 70 | "- [tarball](https://github.com/aboSamoor/polyglot/tarball/master)\n", 71 | "- [zipball](https://github.com/aboSamoor/polyglot/zipball/master)" 72 | ] 73 | }, 74 | { 75 | "cell_type": 
"markdown", 76 | "metadata": {}, 77 | "source": [ 78 | "Once you have the source, you can install it into your site-packages\n", 79 | "with:" 80 | ] 81 | }, 82 | { 83 | "cell_type": "code", 84 | "execution_count": null, 85 | "metadata": { 86 | "collapsed": true 87 | }, 88 | "outputs": [], 89 | "source": [ 90 | "python setup.py install" 91 | ] 92 | }, 93 | { 94 | "cell_type": "markdown", 95 | "metadata": {}, 96 | "source": [ 97 | "## Get the bleeding edge version\n", 98 | "\n", 99 | "To get the latest development version of polyglot, run :\n", 100 | "\n", 101 | " $ pip install -U git+https://github.com/aboSamoor/polyglot.git@master" 102 | ] 103 | }, 104 | { 105 | "cell_type": "markdown", 106 | "metadata": {}, 107 | "source": [ 108 | "### Python\n", 109 | "\n", 110 | "polyglot supports Python \\>=2.7 or \\>=3.4." 111 | ] 112 | } 113 | ], 114 | "metadata": { 115 | "kernelspec": { 116 | "display_name": "Python 2", 117 | "language": "python", 118 | "name": "python2" 119 | }, 120 | "language_info": { 121 | "codemirror_mode": { 122 | "name": "ipython", 123 | "version": 2 124 | }, 125 | "file_extension": ".py", 126 | "mimetype": "text/x-python", 127 | "name": "python", 128 | "nbconvert_exporter": "python", 129 | "pygments_lexer": "ipython2", 130 | "version": "2.7.6" 131 | } 132 | }, 133 | "nbformat": 4, 134 | "nbformat_minor": 0 135 | } 136 | -------------------------------------------------------------------------------- /polyglot/mapping/expansion.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | from .base import OrderedVocabulary 5 | from collections import defaultdict 6 | from six import iteritems 7 | import re 8 | import logging 9 | 10 | logger = logging.getLogger(__name__) 11 | 12 | class VocabExpander(OrderedVocabulary): 13 | def __init__(self, vocabulary, formatters, strategy): 14 | super(VocabExpander, self).__init__(vocabulary.words) 15 | self.strategy = strategy 16 | self._vocab = vocabulary 17 | self.aux_word_id = defaultdict(lambda: []) 18 | self.formatters = formatters 19 | self.expand(formatters) 20 | self.aux_id_word = {id_:w for w, id_ in iteritems(self.aux_word_id)} 21 | 22 | def __getitem__(self, key): 23 | try: 24 | return self._vocab[key] 25 | except KeyError as e: 26 | try: 27 | return self.aux_word_id[key] 28 | except KeyError as e: 29 | return self.approximate_ids(key) 30 | 31 | def __contains__(self, key): 32 | return ((key in self._vocab) or 33 | (key in self.aux_word_id) or 34 | self.approximate(key)) 35 | 36 | def __len__(self): 37 | return len(self._vocab) + len(self.aux_word_id) 38 | 39 | def __delitem__(self): 40 | raise NotImplementedError("It is quite complex, let us do it in the future") 41 | 42 | def format(self, w): 43 | return [f(w) for f in self.formatters] 44 | 45 | def approximate(self, w): 46 | f = lambda key: (key in self._vocab) or (key in self.aux_word_id) 47 | return {w_:self[w_] for w_ in self.format(w) if f(w_)} 48 | 49 | def approximate_ids(self, key): 50 | ids = [id_ for w, id_ in self.approximate(key).items()] 51 | if not ids: 52 | raise KeyError(u"{} not found".format(key)) 53 | else: 54 | if self.strategy == 'most_frequent': 55 | return min(ids) 56 | else: 57 | return tuple(sorted(ids)) 58 | 59 | def _expand(self, formatter): 60 | for w in self.word_id: 61 | w_ = formatter(w) 62 | if w_ not in self._vocab: 63 | id_ = self.word_id[w] 64 | self.aux_word_id[w_].append(id_) 65 | 66 | def expand(self, formatters): 67 | for formatter in formatters: 68 | 
self._expand(formatter) 69 | if self.strategy == 'average': 70 | self.aux_word_id = {w: tuple(sorted(ids)) for w, ids in iteritems(self.aux_word_id)} 71 | elif self.strategy == 'most_frequent': 72 | self.aux_word_id = {w: min(ids) for w, ids in iteritems(self.aux_word_id)} 73 | else: 74 | raise ValueError("A strategy is needed") 75 | 76 | words_added = self.aux_word_id.keys() 77 | old_no = len(self._vocab) 78 | new_no = len(self.aux_word_id) 79 | logger.info("We have {} original words.".format(old_no)) 80 | logger.info("Added {} new words.".format(new_no)) 81 | logger.info("The new total number of words is {}".format(len(self))) 82 | logger.debug(u"Words added\n{}\n".format(u" ".join(words_added))) 83 | 84 | 85 | class CaseExpander(VocabExpander): 86 | def __init__(self, vocabulary, strategy='most_frequent'): 87 | formatters = [lambda x: x.lower(), 88 | lambda x: x.title(), 89 | lambda x: x.upper()] 90 | super(CaseExpander, self).__init__(vocabulary=vocabulary, formatters=formatters, strategy=strategy) 91 | 92 | 93 | class DigitExpander(VocabExpander): 94 | def __init__(self, vocabulary, strategy='most_frequent'): 95 | pattern = re.compile("[0-9]", flags=re.UNICODE) 96 | formatters = [lambda x: pattern.sub("#", x)] 97 | super(DigitExpander, self).__init__(vocabulary=vocabulary, formatters=formatters, strategy=strategy) 98 | -------------------------------------------------------------------------------- /CONTRIBUTING.rst: -------------------------------------------------------------------------------- 1 | ============ 2 | Contributing 3 | ============ 4 | 5 | Contributions are welcome, and they are greatly appreciated! Every 6 | little bit helps, and credit will always be given. 7 | 8 | You can contribute in many ways: 9 | 10 | Types of Contributions 11 | ---------------------- 12 | 13 | Report Bugs 14 | ~~~~~~~~~~~ 15 | 16 | Report bugs at https://github.com/aboSamoor/polyglot/issues. 17 | 18 | If you are reporting a bug, please include: 19 | 20 | * Your operating system name and version. 21 | * Any details about your local setup that might be helpful in troubleshooting. 22 | * Detailed steps to reproduce the bug. 23 | 24 | Fix Bugs 25 | ~~~~~~~~ 26 | 27 | Look through the GitHub issues for bugs. Anything tagged with "bug" 28 | is open to whoever wants to implement it. 29 | 30 | Implement Features 31 | ~~~~~~~~~~~~~~~~~~ 32 | 33 | Look through the GitHub issues for features. Anything tagged with "feature" 34 | is open to whoever wants to implement it. 35 | 36 | Write Documentation 37 | ~~~~~~~~~~~~~~~~~~~ 38 | 39 | polyglot could always use more documentation, whether as part of the 40 | official polyglot docs, in docstrings, or even on the web in blog posts, 41 | articles, and such. 42 | 43 | Submit Feedback 44 | ~~~~~~~~~~~~~~~ 45 | 46 | The best way to send feedback is to file an issue at https://github.com/aboSamoor/polyglot/issues. 47 | 48 | If you are proposing a feature: 49 | 50 | * Explain in detail how it would work. 51 | * Keep the scope as narrow as possible, to make it easier to implement. 52 | * Remember that this is a volunteer-driven project, and that contributions 53 | are welcome :) 54 | 55 | Get Started! 56 | ------------ 57 | 58 | Ready to contribute? Here's how to set up `polyglot` for local development. 59 | 60 | 1. Fork the `polyglot` repo on GitHub. 61 | 2. Clone your fork locally:: 62 | 63 | $ git clone git@github.com:your_name_here/polyglot.git 64 | 65 | 3. Install your local copy into a virtualenv. 
Assuming you have virtualenvwrapper installed, this is how you set up your fork for local development:: 66 | 67 | $ mkvirtualenv polyglot 68 | $ cd polyglot/ 69 | $ python setup.py develop 70 | 71 | 4. Create a branch for local development:: 72 | 73 | $ git checkout -b name-of-your-bugfix-or-feature 74 | 75 | Now you can make your changes locally. 76 | 77 | 5. When you're done making changes, check that your changes pass flake8 and the tests, including testing other Python versions with tox:: 78 | 79 | $ flake8 polyglot tests 80 | $ python setup.py test 81 | $ tox 82 | 83 | To get flake8 and tox, just pip install them into your virtualenv. 84 | 85 | 6. Commit your changes and push your branch to GitHub:: 86 | 87 | $ git add . 88 | $ git commit -m "Your detailed description of your changes." 89 | $ git push origin name-of-your-bugfix-or-feature 90 | 91 | 7. Submit a pull request through the GitHub website. 92 | 93 | Pull Request Guidelines 94 | ----------------------- 95 | 96 | Before you submit a pull request, check that it meets these guidelines: 97 | 98 | 1. The pull request should include tests. 99 | 2. If the pull request adds functionality, the docs should be updated. Put 100 | your new functionality into a function with a docstring, and add the 101 | feature to the list in README.rst. 102 | 3. The pull request should work for Python 2.6, 2.7, and 3.3, 3.4, and for PyPy. Check 103 | https://travis-ci.org/aboSamoor/polyglot/pull_requests 104 | and make sure that the tests pass for all supported Python versions. 105 | 106 | Tips 107 | ---- 108 | 109 | To run a subset of tests:: 110 | 111 | $ python -m unittest tests.test_polyglot -------------------------------------------------------------------------------- /polyglot/tokenize/tests/test_base.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ Test basic tokenization utilities.""" 5 | 6 | import unittest 7 | from ..base import SentenceTokenizer, WordTokenizer 8 | from ...base import Sequence 9 | 10 | en_text = u"""A Ukrainian separatist leader is calling on Russia to "absorb" the eastern region of Donetsk after Sunday's referendum on self rule. Self-declared Donetsk People's Republic leader Denis Pushilin urged Moscow to listen to the "will of the people". In neighbouring Luhansk, where a vote was also held, rebels declared independence. Ukraine, the EU and US have declared the referendums illegal but Russia says the results should be "implemented". Moscow has so far not commented on the call for Donetsk to become part of Russia but has appealed for dialogue between the militants and Kiev, with the participation of the Organisation for Security and Co-operation in Europe. 11 | """ 12 | 13 | ar_text = u"""عبر أحد قادة المتمردين الموالين لروسيا في أوكرانيا عن مساندته لفكرة الوحدة مع روسيا في أعقاب الإعلان عن نتائج الاستفتاء المثير للجدل في شرق البلاد. وقال رومان لياجين، رئيس لجنة المتمردين للانتخابات في دونيتسك إن الانضمام لروسيا "قد يكون خطوة مناسبة". 
14 | """ 15 | 16 | ja_text = u"""やった!""" 17 | 18 | 19 | class BaseTest(unittest.TestCase): 20 | def setUp(self): 21 | self.en_seq = Sequence(en_text) 22 | self.ar_seq = Sequence(ar_text) 23 | self.ja_seq = Sequence(ja_text) 24 | 25 | self.en_sent = SentenceTokenizer(locale='en') 26 | self.ar_sent = SentenceTokenizer(locale='ar') 27 | self.ja_sent = SentenceTokenizer(locale='ja') 28 | 29 | self.en_word = WordTokenizer(locale='en') 30 | self.ar_word = WordTokenizer(locale='ar') 31 | self.ja_word = WordTokenizer(locale='ja') 32 | 33 | self.en_sents = self.en_sent.transform(self.en_seq) 34 | self.ar_sents = self.ar_sent.transform(self.ar_seq) 35 | self.ja_sents = self.ja_sent.transform(self.ja_seq) 36 | 37 | self.en_words = self.en_word.transform(self.en_seq) 38 | self.ar_words = self.ar_word.transform(self.ar_seq) 39 | self.ja_words = self.ja_word.transform(self.ja_seq) 40 | 41 | def tearDown(self): 42 | pass 43 | 44 | def test_sentences_count(self): 45 | """ Sentence segmentation produces correct number of sentences.""" 46 | 47 | self.assertEqual(5, len(self.en_sents)) 48 | self.assertEqual(2, len(self.ar_sents)) 49 | self.assertEqual(1, len(self.ja_sents)) 50 | 51 | def test_redundant_idx(self): 52 | """ Test if there are redundant indices.""" 53 | 54 | self.assertEqual(len(self.en_sents.idx), len(set(self.en_sents.idx))) 55 | self.assertEqual(len(self.ar_sents.idx), len(set(self.ar_sents.idx))) 56 | self.assertEqual(len(self.ja_sents.idx), len(set(self.ja_sents.idx))) 57 | 58 | self.assertEqual(len(self.en_words.idx), len(set(self.en_words.idx))) 59 | self.assertEqual(len(self.ar_words.idx), len(set(self.ar_words.idx))) 60 | self.assertEqual(len(self.ja_words.idx), len(set(self.ja_words.idx))) 61 | 62 | def test_boundaries(self): 63 | """ Sentence boundaries should be also word boundaries.""" 64 | 65 | self.assertTrue(set(self.en_sents.idx).issubset(set(self.en_words.idx))) 66 | self.assertTrue(set(self.ar_sents.idx).issubset(set(self.ar_words.idx))) 67 | self.assertTrue(set(self.ja_sents.idx).issubset(set(self.ja_words.idx))) 68 | 69 | def test_transformations_equal(self): 70 | """ Word toeknization over text is equal to over sentences.""" 71 | 72 | idx1 = self.en_words.idx 73 | idx2 = self.en_word.transform(self.en_sents).idx 74 | self.assertListEqual(idx1, idx2) 75 | 76 | idx1 = self.ar_words.idx 77 | idx2 = self.ar_word.transform(self.ar_sents).idx 78 | self.assertListEqual(idx1, idx2) 79 | 80 | idx1 = self.ja_words.idx 81 | idx2 = self.ja_word.transform(self.ja_sents).idx 82 | self.assertListEqual(idx1, idx2) 83 | 84 | 85 | if __name__ == "__main__": 86 | unittest.main() 87 | -------------------------------------------------------------------------------- /polyglot/transliteration/base.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """Transliteration. 5 | 6 | Transliteration across pair of languages. 7 | 8 | """ 9 | 10 | from math import log 11 | 12 | from ..load import load_transliteration_table 13 | from ..decorators import cached_property 14 | 15 | 16 | class Transliterator(object): 17 | """Transliterator between pair of languages. """ 18 | 19 | def __init__(self, source_lang="en", target_lang="en"): 20 | """ 21 | Args: 22 | source_lang (string): language code of the input langauge. 23 | target_lang (string): language code of the generated output langauge. 
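Example (a minimal sketch; it assumes the transliteration table for the
        target language has already been downloaded):

            translit = Transliterator(source_lang="en", target_lang="ru")
            print(translit.transliterate(u"hello"))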
24 | """ 25 | self.source_lang = source_lang 26 | self.target_lang = target_lang 27 | 28 | self.decoder = self._decoder() 29 | """Transliterate a string from English to the target language.""" 30 | self.encoder = self._encoder() 31 | """Transliterate a string from the input language to English.""" 32 | 33 | def _decoder(self): 34 | """Transliterate a string from English to the target language.""" 35 | if self.target_lang == 'en': 36 | return Transliterator._dummy_coder 37 | else: 38 | weights = load_transliteration_table(self.target_lang) 39 | decoder_weights = weights["decoder"] 40 | return Transliterator._transliterate_string(decoder_weights) 41 | 42 | def _encoder(self): 43 | """Transliterate a string from the input language to English.""" 44 | if self.source_lang == 'en': 45 | return Transliterator._dummy_coder 46 | else: 47 | weights = load_transliteration_table(self.source_lang) 48 | encoder_weights = weights["encoder"] 49 | return Transliterator._transliterate_string(encoder_weights) 50 | 51 | @staticmethod 52 | def _dummy_coder(word): 53 | """Returns the string as it is, no transliteration is done.""" 54 | return word 55 | 56 | def transliterate(self, word): 57 | """Transliterate the word from its source language to the target one. 58 | 59 | The method works by encoding the word into English then decoding the new 60 | Enlgish word to the target language. 61 | """ 62 | encoded_word = self.encoder(word) 63 | decoded_word = self.decoder(encoded_word) 64 | return decoded_word 65 | 66 | @staticmethod 67 | def _transliterate_string(weight, ngram1=6, ngram2=6): 68 | def translate_string(word): 69 | unlimited5 = 99999 70 | # Convert input to lower case 71 | word = word.lower().strip() 72 | # Initialize bestk results 73 | best_source_string = [] 74 | best_target_string = [] 75 | best_string_cost = [] 76 | for i in range(len(word)+1): 77 | best_source_string.append('') 78 | best_target_string.append('') 79 | best_string_cost.append(unlimited5) 80 | # Only 1 initial state 81 | best_string_cost[0] = 0 82 | # Start DP to generate bestk results 83 | for i in range(1, len(word)+1): 84 | for j in range(1, ngram1+1): 85 | if i >= j: 86 | piece = word[i-j:i] 87 | for item in weight: 88 | if item[0].strip() == piece: 89 | vfinal = -log(weight[item]) 90 | if best_string_cost[i - j] < unlimited5: 91 | tmp_string_cost = best_string_cost[i - j] 92 | # Final cost value. 
93 | # Things need to be considered: 94 | # 1) Individual cost of tranliterating from piece to tar 95 | # 2) Length of piece and tar 96 | # 3) Prefix of piece 97 | # 4) Prefix of tar 98 | tmp_string_cost += vfinal 99 | if tmp_string_cost < best_string_cost[i]: 100 | tmp_source_string = best_source_string[i - j] + piece 101 | tmp_target_string = best_target_string[i - j] + item[1].strip() 102 | best_source_string[i] = tmp_source_string 103 | best_target_string[i] = tmp_target_string 104 | best_string_cost[i] = tmp_string_cost 105 | return best_target_string[len(word)] 106 | return translate_string 107 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import os 5 | import sys 6 | 7 | 8 | try: 9 | from setuptools import setup 10 | except ImportError: 11 | from distutils.core import setup 12 | 13 | 14 | with open('README.rst') as readme_file: 15 | readme = readme_file.read() 16 | 17 | with open('HISTORY.rst') as history_file: 18 | history = history_file.read().replace('.. :changelog:', '') 19 | 20 | packages = set(open("requirements.txt", "r").read().splitlines()) 21 | 22 | requirements = filter(lambda x: "http" not in x, packages) 23 | 24 | 25 | test_requirements = [ 26 | # TODO: put package test requirements here 27 | ] 28 | 29 | setup( 30 | name='polyglot', 31 | version='15.10.03', 32 | description='Polyglot is a natural language pipeline that supports massive multilingual applications.', 33 | long_description=readme + '\n\n' + history, 34 | author='Rami Al-Rfou', 35 | author_email='rmyeid@gmail.com', 36 | url='https://github.com/aboSamoor/polyglot', 37 | packages = ['polyglot', 38 | 'polyglot.detect', 39 | 'polyglot.tokenize', 40 | 'polyglot.mapping', 41 | 'polyglot.tag', 42 | 'polyglot.transliteration'], 43 | entry_points={ 44 | 'console_scripts': [ 45 | 'polyglot = polyglot.__main__:main', 46 | ], 47 | }, 48 | include_package_data=True, 49 | install_requires=requirements, 50 | license="GPLv3", 51 | zip_safe=False, 52 | keywords='polyglot', 53 | classifiers=[ 54 | 'Development Status :: 4 - Beta', 55 | 'Environment :: Console', 56 | 'Intended Audience :: Science/Research', 57 | 'Intended Audience :: Education', 58 | 'License :: OSI Approved :: GNU General Public License v3 or later (GPLv3+)', 59 | 'Natural Language :: Afrikaans', 60 | 'Natural Language :: Arabic', 61 | 'Natural Language :: Bengali', 62 | 'Natural Language :: Bosnian', 63 | 'Natural Language :: Bulgarian', 64 | 'Natural Language :: Catalan', 65 | 'Natural Language :: Chinese (Simplified)', 66 | 'Natural Language :: Chinese (Traditional)', 67 | 'Natural Language :: Croatian', 68 | 'Natural Language :: Czech', 69 | 'Natural Language :: Danish', 70 | 'Natural Language :: Dutch', 71 | 'Natural Language :: English', 72 | 'Natural Language :: Esperanto', 73 | 'Natural Language :: Finnish', 74 | 'Natural Language :: French', 75 | 'Natural Language :: Galician', 76 | 'Natural Language :: German', 77 | 'Natural Language :: Greek', 78 | 'Natural Language :: Hebrew', 79 | 'Natural Language :: Hindi', 80 | 'Natural Language :: Hungarian', 81 | 'Natural Language :: Icelandic', 82 | 'Natural Language :: Indonesian', 83 | 'Natural Language :: Italian', 84 | 'Natural Language :: Japanese', 85 | 'Natural Language :: Javanese', 86 | 'Natural Language :: Korean', 87 | 'Natural Language :: Latin', 88 | 'Natural Language :: Latvian', 89 | 'Natural Language :: 
Macedonian', 90 | 'Natural Language :: Malay', 91 | 'Natural Language :: Marathi', 92 | 'Natural Language :: Norwegian', 93 | 'Natural Language :: Panjabi', 94 | 'Natural Language :: Persian', 95 | 'Natural Language :: Polish', 96 | 'Natural Language :: Portuguese', 97 | 'Natural Language :: Portuguese (Brazilian)', 98 | 'Natural Language :: Romanian', 99 | 'Natural Language :: Russian', 100 | 'Natural Language :: Serbian', 101 | 'Natural Language :: Slovak', 102 | 'Natural Language :: Slovenian', 103 | 'Natural Language :: Spanish', 104 | 'Natural Language :: Swedish', 105 | 'Natural Language :: Tamil', 106 | 'Natural Language :: Telugu', 107 | 'Natural Language :: Thai', 108 | 'Natural Language :: Turkish', 109 | 'Natural Language :: Ukranian', 110 | 'Natural Language :: Urdu', 111 | 'Natural Language :: Vietnamese', 112 | "Programming Language :: Python :: 2", 113 | 'Programming Language :: Python :: 2.7', 114 | 'Programming Language :: Python :: 3', 115 | 'Programming Language :: Python :: 3.4', 116 | 'Topic :: Scientific/Engineering :: Artificial Intelligence', 117 | 'Topic :: Text Processing :: Linguistic', 118 | ], 119 | test_suite='tests', 120 | tests_require=test_requirements, 121 | ) 122 | -------------------------------------------------------------------------------- /polyglot/load.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | from os import path 5 | import os 6 | from tempfile import NamedTemporaryFile 7 | 8 | import numpy as np 9 | import morfessor 10 | 11 | from six import PY2 12 | from six.moves import cPickle as pickle 13 | 14 | from . import data_path 15 | from .decorators import memoize 16 | from .downloader import downloader 17 | from .mapping import Embedding, CountedVocabulary, CaseExpander, DigitExpander 18 | 19 | from .utils import _open 20 | 21 | if "~" in data_path: 22 | data_path = path.expanduser(data_path) 23 | 24 | polyglot_path = path.join(path.abspath(data_path), "polyglot_data") 25 | 26 | 27 | resource_dir = { 28 | "cw_embeddings":"embeddings2", 29 | "sgns_embeddings":"sgns2", 30 | "visualization": "tsne2", 31 | "wiki_vocab": "counts2", 32 | "sentiment": "sentiment2", 33 | } 34 | 35 | 36 | def locate_resource(name, lang, filter=None): 37 | """Return filename that contains specific language resource name. 38 | 39 | Args: 40 | name (string): Name of the resource. 41 | lang (string): language code to be loaded. 42 | """ 43 | task_dir = resource_dir.get(name, name) 44 | package_id = u"{}.{}".format(task_dir, lang) 45 | p = path.join(polyglot_path, task_dir, lang) 46 | if not path.isdir(p): 47 | if downloader.status(package_id) != downloader.INSTALLED: 48 | raise ValueError("This resource is available in the index " 49 | "but not downloaded, yet. Try to run\n\n" 50 | "polyglot download {}".format(package_id)) 51 | return path.join(p, os.listdir(p)[0]) 52 | 53 | 54 | @memoize 55 | def load_embeddings(lang="en", task="embeddings", type="cw"): 56 | """Return a word embeddings object for `lang` and of type `type` 57 | 58 | Args: 59 | lang (string): language code. 60 | task (string): parameters that define task. 61 | type (string): skipgram, cw, cbow ... 
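  Example:
    A usage sketch; this mirrors how the taggers in ``polyglot.tag`` obtain
    their lookup table and assumes ``embeddings2.en`` has been downloaded::

      embeddings = load_embeddings(lang="en", type="cw")
      vector = embeddings["green"]            # 256-dimensional word vector
      neighbors = embeddings.nearest_neighbors("green")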
62 | """ 63 | src_dir = "_".join((type, task)) if type else task 64 | p = locate_resource(src_dir, lang) 65 | e = Embedding.load(p) 66 | if type == "cw": 67 | e.apply_expansion(CaseExpander) 68 | e.apply_expansion(DigitExpander) 69 | if type == "sgns": 70 | e.apply_expansion(CaseExpander) 71 | return e 72 | 73 | 74 | @memoize 75 | def load_vocabulary(lang="en", type="wiki"): 76 | """Return a CountedVocabulary object. 77 | 78 | Args: 79 | lang (string): language code. 80 | type (string): wiki,... 81 | """ 82 | src_dir = "{}_vocab".format(type) 83 | p = locate_resource(src_dir, lang) 84 | return CountedVocabulary.from_vocabfile(p) 85 | 86 | 87 | @memoize 88 | def load_ner_model(lang="en", version="2"): 89 | """Return a named entity extractor parameters for `lang` and of version `version` 90 | 91 | Args: 92 | lang (string): language code. 93 | version (string): version of the parameters to be used. 94 | """ 95 | src_dir = "ner{}".format(version) 96 | p = locate_resource(src_dir, lang) 97 | fh = _open(p) 98 | try: 99 | return pickle.load(fh) 100 | except UnicodeDecodeError: 101 | fh.seek(0) 102 | return pickle.load(fh, encoding='latin1') 103 | 104 | 105 | @memoize 106 | def load_pos_model(lang="en", version="2"): 107 | """Return a part of speech tagger parameters for `lang` and of version `version` 108 | 109 | Args: 110 | lang (string): language code. 111 | version (string): version of the parameters to be used. 112 | """ 113 | src_dir = "pos{}".format(version) 114 | p = locate_resource(src_dir, lang) 115 | fh = _open(p) 116 | return dict(np.load(fh)) 117 | 118 | 119 | @memoize 120 | def load_morfessor_model(lang="en", version="2"): 121 | """Return a morfessor model for `lang` and of version `version` 122 | 123 | Args: 124 | lang (string): language code. 125 | version (string): version of the parameters to be used. 126 | """ 127 | src_dir = "morph{}".format(version) 128 | p = locate_resource(src_dir, lang) 129 | file_handler = _open(p) 130 | tmp_file_ = NamedTemporaryFile(delete=False) 131 | tmp_file_.write(file_handler.read()) 132 | tmp_file_.close() 133 | io = morfessor.MorfessorIO() 134 | model = io.read_any_model(tmp_file_.name) 135 | os.remove(tmp_file_.name) 136 | return model 137 | 138 | 139 | @memoize 140 | def load_transliteration_table(lang="en", version="2"): 141 | """Return a morfessor model for `lang` and of version `version` 142 | 143 | Args: 144 | lang (string): language code. 145 | version (string): version of the parameters to be used. 146 | """ 147 | src_dir = "transliteration{}".format(version) 148 | p = locate_resource(src_dir, lang) 149 | file_handler = _open(p) 150 | return pickle.load(file_handler) 151 | -------------------------------------------------------------------------------- /polyglot/tag/base.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """POS and NER Taggers. 5 | 6 | Part of speech taggers (POS) classifies words into 17 syntactic category. 
7 | Named entity Recognition extractors (NER) Detect three types of entities: {Person, Location, Organization.} 8 | 9 | """ 10 | 11 | import numpy as np 12 | from six.moves import range 13 | 14 | from ..decorators import memoize 15 | from ..load import load_embeddings, load_ner_model, load_pos_model 16 | 17 | 18 | NER_ID_TAG = {0: u'O', 1: u'I-PER', 2: u'I-LOC', 3: u'I-ORG'} 19 | 20 | POS_TAG_ID = {u'ADJ': 0, u'ADP': 1, u'ADV': 2, u'AUX': 3, u'CONJ': 4, 21 | u'DET': 5, u'INTJ': 6, u'NOUN': 7, u'NUM': 8, u'PART': 9, 22 | u'PRON': 10, u'PROPN': 11, u'PUNCT': 12, u'SCONJ': 13, 23 | u'SYM': 14, u'VERB': 15, u'X': 16} 24 | 25 | POS_ID_TAG = {v:k for k,v in POS_TAG_ID.items()} 26 | 27 | class TaggerBase(object): 28 | """Tagger base class that defines the interface. """ 29 | PAD = u'' 30 | START = u'' 31 | END = u'' 32 | UNK = u'' 33 | 34 | def __init__(self, lang='en'): 35 | """ 36 | Args: 37 | lang: language code to decide which chunker to use. 38 | """ 39 | self.lang = lang 40 | self.predictor = self._load_network() 41 | self.ID_TAG = {} 42 | self.add_bias = True 43 | self.context = 2 44 | 45 | @staticmethod 46 | def ngrams(sequence, n): 47 | ngrams_ = [] 48 | seq = ((n-1) * [TaggerBase.PAD] + [TaggerBase.START] + 49 | sequence + 50 | [TaggerBase.END] + (n-1) * [TaggerBase.PAD]) 51 | for i in range(n, n+len(sequence)): 52 | yield seq[i-n: i+n+1] 53 | 54 | def _load_network(self): 55 | raise NotImplementedError() 56 | 57 | def annotate(self, sent): 58 | """Annotate a squence of words with entity tags. 59 | 60 | Args: 61 | sent: sequence of strings/words. 62 | """ 63 | preds = [] 64 | words = [] 65 | for word, fv in self.sent2examples(sent): 66 | probs = self.predictor(fv) 67 | tags = probs.argsort() 68 | tag = self.ID_TAG[tags[-1]] 69 | 70 | words.append(word) 71 | preds.append(tag) 72 | 73 | # fix_chunks(preds) 74 | annotations = zip(words, preds) 75 | return annotations 76 | 77 | def sent2examples(self, sent): 78 | """ Convert ngrams into feature vectors.""" 79 | 80 | # TODO(rmyeid): use expanders. 81 | words = [w if w in self.embeddings else TaggerBase.UNK for w in sent] 82 | ngrams = TaggerBase.ngrams(words, self.context) 83 | fvs = [] 84 | for word, ngram in zip(sent, ngrams): 85 | fv = np.array([self.embeddings[w] for w in ngram]).flatten() 86 | if self.add_bias: 87 | fv = np.hstack((fv, np.array(1))) 88 | yield word, fv 89 | 90 | 91 | class NEChunker(TaggerBase): 92 | """Named entity extractor.""" 93 | 94 | def __init__(self, lang='en'): 95 | """ 96 | Args: 97 | lang: language code to decide which chunker to use. 98 | """ 99 | super(NEChunker, self).__init__(lang=lang) 100 | self.ID_TAG = NER_ID_TAG 101 | 102 | def _load_network(self): 103 | """ Building the predictor out of the model.""" 104 | self.embeddings = load_embeddings(self.lang, type='cw') 105 | self.embeddings.normalize_words(inplace=True) 106 | self.model = load_ner_model(lang=self.lang, version=2) 107 | first_layer, second_layer = self.model 108 | def predict_proba(input_): 109 | hidden = np.tanh(np.dot(first_layer, input_)) 110 | hidden = np.hstack((hidden, np.ones((hidden.shape[0], 1)))) 111 | output = (second_layer * hidden).sum(axis=1) 112 | output_ = 1.0/(1.0 + np.exp(-output)) 113 | probs = output_/output_.sum() 114 | return probs 115 | return predict_proba 116 | 117 | 118 | class POSTagger(TaggerBase): 119 | """Universal Part of Speech Tagger.""" 120 | 121 | def __init__(self, lang='en'): 122 | """ 123 | Args: 124 | lang: language code to decide which chunker to use. 
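    Example:
      A sketch of direct usage; ``annotate`` expects an already tokenized
      sequence of words, ``embeddings2.en`` and ``pos2.en`` must be
      downloaded, and the expected tags below follow docs/POS.rst::

        tagger = POSTagger(lang='en')
        list(tagger.annotate([u'We', u'will', u'meet', u'.']))
        # [(u'We', u'PRON'), (u'will', u'AUX'), (u'meet', u'VERB'), (u'.', u'PUNCT')]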
125 | """ 126 | super(POSTagger, self).__init__(lang=lang) 127 | self.ID_TAG = POS_ID_TAG 128 | self.add_bias = False 129 | 130 | def _load_network(self): 131 | """ Building the predictor out of the model.""" 132 | self.embeddings = load_embeddings(self.lang, type='cw') 133 | #self.embeddings.normalize_words(inplace=True) 134 | self.model = load_pos_model(lang=self.lang, version=2) 135 | 136 | def predict_proba(input_): 137 | hidden = np.tanh(np.dot(input_, self.model["W1"]) + self.model["b1"]) 138 | output = np.dot(hidden, self.model["W2"]) + self.model["b2"] 139 | scores = np.exp(output) 140 | probs = scores/scores.sum() 141 | return probs 142 | return predict_proba 143 | 144 | @memoize 145 | def get_pos_tagger(lang='en'): 146 | """Return a POS tagger from the models cache.""" 147 | return POSTagger(lang=lang) 148 | 149 | @memoize 150 | def get_ner_tagger(lang='en'): 151 | """Return a NER tagger from the models cache.""" 152 | return NEChunker(lang=lang) 153 | -------------------------------------------------------------------------------- /docs/Tokenization.rst: -------------------------------------------------------------------------------- 1 | 2 | Tokenization 3 | ============ 4 | 5 | Toeknization is the process that identifies the text boundaries of words 6 | and sentences. We can identify the boundaries of sentences first then 7 | tokenize each sentence to identify the words that compose the sentence. 8 | Of course, we can do word tokenization first and then segment the token 9 | sequence into sentneces. Tokenization in polyglot relies on the `Unicode 10 | Text Segmentation `__ algorithm as 11 | implemented by the `ICU Project `__. 12 | 13 | You can use C/C++ ICU library by installing the required package 14 | ``libicu-dev``. For example, on ubuntu/debian systems you should use 15 | ``apt-get`` utility as the following: 16 | 17 | .. code:: python 18 | 19 | sudo apt-get install libicu-dev 20 | 21 | .. code:: python 22 | 23 | from polyglot.text import Text 24 | 25 | Word Tokenization 26 | ----------------- 27 | 28 | To call our word tokenizer, first we need to construct a Text object. 29 | 30 | .. code:: python 31 | 32 | blob = u""" 33 | 两个月前遭受恐怖袭击的法国巴黎的犹太超市在装修之后周日重新开放,法国内政部长以及超市的管理者都表示,这显示了生命力要比野蛮行为更强大。 34 | 该超市1月9日遭受枪手袭击,导致4人死亡,据悉这起事件与法国《查理周刊》杂志社恐怖袭击案有关。 35 | """ 36 | text = Text(blob) 37 | 38 | The property words will call the word tokenizer. 39 | 40 | .. code:: python 41 | 42 | text.words 43 | 44 | 45 | 46 | 47 | .. parsed-literal:: 48 | 49 | WordList(['两', '个', '月', '前', '遭受', '恐怖', '袭击', '的', '法国', '巴黎', '的', '犹太', '超市', '在', '装修', '之后', '周日', '重新', '开放', ',', '法国', '内政', '部长', '以及', '超市', '的', '管理者', '都', '表示', ',', '这', '显示', '了', '生命力', '要', '比', '野蛮', '行为', '更', '强大', '。', '该', '超市', '1', '月', '9', '日', '遭受', '枪手', '袭击', ',', '导致', '4', '人', '死亡', ',', '据悉', '这', '起', '事件', '与', '法国', '《', '查理', '周刊', '》', '杂志', '社', '恐怖', '袭击', '案', '有关', '。']) 50 | 51 | 52 | 53 | Since ICU boundary break algorithms are language aware, polyglot will 54 | detect the language used first before calling the tokenizer 55 | 56 | .. code:: python 57 | 58 | print(text.language) 59 | 60 | 61 | .. parsed-literal:: 62 | 63 | name: code: zh confidence: 99.0 read bytes: 1920 64 | 65 | 66 | Sentence Segementation 67 | ---------------------- 68 | 69 | If we are interested in segmenting the text first into sentences, we can 70 | query the ``sentences`` property 71 | 72 | .. code:: python 73 | 74 | text.sentences 75 | 76 | 77 | 78 | 79 | .. 
parsed-literal:: 80 | 81 | [Sentence("两个月前遭受恐怖袭击的法国巴黎的犹太超市在装修之后周日重新开放,法国内政部长以及超市的管理者都表示,这显示了生命力要比野蛮行为更强大。"), 82 | Sentence("该超市1月9日遭受枪手袭击,导致4人死亡,据悉这起事件与法国《查理周刊》杂志社恐怖袭击案有关。")] 83 | 84 | 85 | 86 | ``Sentence`` class inherits ``Text``, therefore, we can tokenize each 87 | sentence into words using the same property ``words`` 88 | 89 | .. code:: python 90 | 91 | first_sentence = text.sentences[0] 92 | first_sentence.words 93 | 94 | 95 | 96 | 97 | .. parsed-literal:: 98 | 99 | WordList(['两', '个', '月', '前', '遭受', '恐怖', '袭击', '的', '法国', '巴黎', '的', '犹太', '超市', '在', '装修', '之后', '周日', '重新', '开放', ',', '法国', '内政', '部长', '以及', '超市', '的', '管理者', '都', '表示', ',', '这', '显示', '了', '生命力', '要', '比', '野蛮', '行为', '更', '强大', '。']) 100 | 101 | 102 | 103 | Command Line 104 | ------------ 105 | 106 | The subcommand tokenize does by default sentence segmentation and word 107 | tokenization. 108 | 109 | .. code:: python 110 | 111 | ! polyglot tokenize --help 112 | 113 | 114 | .. parsed-literal:: 115 | 116 | usage: polyglot tokenize [-h] [--only-sent | --only-word] [--input [INPUT [INPUT ...]]] 117 | 118 | optional arguments: 119 | -h, --help show this help message and exit 120 | --only-sent Segment sentences without word tokenization 121 | --only-word Tokenize words without sentence segmentation 122 | --input [INPUT [INPUT ...]] 123 | 124 | 125 | Each line represents a sentence where the words are split by spaces. 126 | 127 | .. code:: python 128 | 129 | !polyglot --lang en tokenize --input testdata/cricket.txt 130 | 131 | 132 | .. parsed-literal:: 133 | 134 | Australia posted a World Cup record total of 417 - 6 as they beat Afghanistan by 275 runs . 135 | David Warner hit 178 off 133 balls , Steve Smith scored 95 while Glenn Maxwell struck 88 in 39 deliveries in the Pool A encounter in Perth . 136 | Afghanistan were then dismissed for 142 , with Mitchell Johnson and Mitchell Starc taking six wickets between them . 137 | Australia's score surpassed the 413 - 5 India made against Bermuda in 2007 . 138 | It continues the pattern of bat dominating ball in this tournament as the third 400 plus score achieved in the pool stages , following South Africa's 408 - 5 and 411 - 4 against West Indies and Ireland respectively . 139 | The winning margin beats the 257 - run amount by which India beat Bermuda in Port of Spain in 2007 , which was equalled five days ago by South Africa in their victory over West Indies in Sydney . 140 | 141 | 142 | References 143 | ~~~~~~~~~~ 144 | 145 | - `Unicode Text Segmentation 146 | Algorithm `__ 147 | - `Unicode Line Breaking 148 | Algorithm `__ 149 | - `Boundary 150 | Analysis `__ 151 | - `ICU Homepage `__ 152 | - `Python Wrapper for libicu `__ 153 | -------------------------------------------------------------------------------- /docs/Transliteration.rst: -------------------------------------------------------------------------------- 1 | 2 | Transliteration 3 | =============== 4 | 5 | Transliteration is the conversion of a text from one script to another. 6 | For instance, a Latin transliteration of the Greek phrase "Ελληνική 7 | Δημοκρατία", usually translated as 'Hellenic Republic', is "Ellēnikḗ 8 | Dēmokratía". 9 | 10 | .. code:: python 11 | 12 | from polyglot.transliteration import Transliterator 13 | 14 | Languages Coverage 15 | ------------------ 16 | 17 | .. code:: python 18 | 19 | from polyglot.downloader import downloader 20 | print(downloader.supported_languages_table("transliteration2")) 21 | 22 | 23 | .. parsed-literal:: 24 | 25 | 1. Haitian; Haitian Creole 2. Tamil 3. 
Vietnamese 26 | 4. Telugu 5. Croatian 6. Hungarian 27 | 7. Thai 8. Kannada 9. Tagalog 28 | 10. Armenian 11. Hebrew (modern) 12. Turkish 29 | 13. Portuguese 14. Belarusian 15. Norwegian Nynorsk 30 | 16. Norwegian 17. Dutch 18. Japanese 31 | 19. Albanian 20. Bulgarian 21. Serbian 32 | 22. Swahili 23. Swedish 24. French 33 | 25. Latin 26. Czech 27. Yiddish 34 | 28. Hindi 29. Danish 30. Finnish 35 | 31. German 32. Bosnian-Croatian-Serbian 33. Slovak 36 | 34. Persian 35. Lithuanian 36. Slovene 37 | 37. Latvian 38. Bosnian 39. Gujarati 38 | 40. Italian 41. Icelandic 42. Spanish; Castilian 39 | 43. Ukrainian 44. Georgian 45. Urdu 40 | 46. Indonesian 47. Marathi (Marāṭhī) 48. Korean 41 | 49. Galician 50. Khmer 51. Catalan; Valencian 42 | 52. Romanian, Moldavian, ... 53. Basque 54. Macedonian 43 | 55. Russian 56. Azerbaijani 57. Chinese 44 | 58. Estonian 59. Welsh 60. Arabic 45 | 61. Bengali 62. Amharic 63. Irish 46 | 64. Malay 65. Afrikaans 66. Polish 47 | 67. Greek, Modern 68. Esperanto 69. Maltese 48 | 49 | 50 | 51 | Downloading Necessary Models 52 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 53 | 54 | .. code:: python 55 | 56 | %%bash 57 | polyglot download embeddings2.en pos2.en 58 | 59 | 60 | .. parsed-literal:: 61 | 62 | [polyglot_data] Downloading package embeddings2.en to 63 | [polyglot_data] /home/rmyeid/polyglot_data... 64 | [polyglot_data] Package embeddings2.en is already up-to-date! 65 | [polyglot_data] Downloading package pos2.en to 66 | [polyglot_data] /home/rmyeid/polyglot_data... 67 | [polyglot_data] Package pos2.en is already up-to-date! 68 | 69 | 70 | Example 71 | ------- 72 | 73 | We tag each word in the text with one part of speech. 74 | 75 | .. code:: python 76 | 77 | from polyglot.text import Text 78 | 79 | .. code:: python 80 | 81 | blob = """We will meet at eight o'clock on Thursday morning.""" 82 | text = Text(blob) 83 | 84 | We can query all the tagged words 85 | 86 | .. code:: python 87 | 88 | for x in text.transliterate("ar"): 89 | print(x) 90 | 91 | 92 | .. parsed-literal:: 93 | 94 | وي 95 | ويل 96 | ميت 97 | ات 98 | ييايت 99 | أوكلوك 100 | ون 101 | ثورسداي 102 | مورنينغ 103 | 104 | 105 | 106 | Command Line Interface 107 | ~~~~~~~~~~~~~~~~~~~~~~ 108 | 109 | .. code:: python 110 | 111 | !polyglot --lang en tokenize --input testdata/cricket.txt | polyglot --lang en transliteration --target ar | tail -n 30 112 | 113 | 114 | .. parsed-literal:: 115 | 116 | which ويكه 117 | India ينديا 118 | beat بيت 119 | Bermuda بيرمودا 120 | in ين 121 | Port بورت 122 | of وف 123 | Spain سباين 124 | in ين 125 | 2007 126 | , 127 | which ويكه 128 | was واس 129 | equalled يكالليد 130 | five فيفي 131 | days دايس 132 | ago اغو 133 | by بي 134 | South سووث 135 | Africa افريكا 136 | in ين 137 | their ثير 138 | victory فيكتوري 139 | over وفير 140 | West ويست 141 | Indies يندييس 142 | in ين 143 | Sydney سيدني 144 | . 145 | 146 | 147 | -------------------------------------------------------------------------------- /docs/README.rst: -------------------------------------------------------------------------------- 1 | 2 | polyglot 3 | ======== 4 | 5 | |Downloads| |Latest Version| |Build Status| |Documentation Status| 6 | 7 | .. |Downloads| image:: https://img.shields.io/pypi/dm/polyglot.svg 8 | :target: https://pypi.python.org/pypi/polyglot 9 | .. |Latest Version| image:: https://badge.fury.io/py/polyglot.svg 10 | :target: https://pypi.python.org/pypi/polyglot 11 | .. |Build Status| image:: https://travis-ci.org/aboSamoor/polyglot.png?branch=master 12 | :target: https://travis-ci.org/aboSamoor/polyglot 13 | .. 
|Documentation Status| image:: https://readthedocs.org/projects/polyglot/badge/?version=latest 14 | :target: https://readthedocs.org/builds/polyglot/ 15 | 16 | Polyglot is a natural language pipeline that supports massive 17 | multilingual applications. 18 | 19 | - Free software: GPLv3 license 20 | - Documentation: http://polyglot.readthedocs.org. 21 | 22 | Features 23 | ~~~~~~~~ 24 | 25 | - Tokenization (165 Languages) 26 | - Language detection (196 Languages) 27 | - Named Entity Recognition (40 Languages) 28 | - Part of Speech Tagging (16 Languages) 29 | - Sentiment Analysis (136 Languages) 30 | - Word Embeddings (137 Languages) 31 | - Morphological analysis (135 Languages) 32 | - Transliteration (69 Languages) 33 | 34 | Developer 35 | ~~~~~~~~~ 36 | 37 | - Rami Al-Rfou @ ``rmyeid gmail com`` 38 | 39 | Quick Tutorial 40 | -------------- 41 | 42 | .. code:: python 43 | 44 | import polyglot 45 | from polyglot.text import Text, Word 46 | 47 | Language Detection 48 | ~~~~~~~~~~~~~~~~~~ 49 | 50 | .. code:: python 51 | 52 | text = Text("Bonjour, Mesdames.") 53 | print("Language Detected: Code={}, Name={}\n".format(text.language.code, text.language.name)) 54 | 55 | 56 | .. parsed-literal:: 57 | 58 | Language Detected: Code=fr, Name=French 59 | 60 | 61 | 62 | Tokenization 63 | ~~~~~~~~~~~~ 64 | 65 | .. code:: python 66 | 67 | zen = Text("Beautiful is better than ugly. " 68 | "Explicit is better than implicit. " 69 | "Simple is better than complex.") 70 | print(zen.words) 71 | 72 | 73 | .. parsed-literal:: 74 | 75 | [u'Beautiful', u'is', u'better', u'than', u'ugly', u'.', u'Explicit', u'is', u'better', u'than', u'implicit', u'.', u'Simple', u'is', u'better', u'than', u'complex', u'.'] 76 | 77 | 78 | .. code:: python 79 | 80 | print(zen.sentences) 81 | 82 | 83 | .. parsed-literal:: 84 | 85 | [Sentence("Beautiful is better than ugly."), Sentence("Explicit is better than implicit."), Sentence("Simple is better than complex.")] 86 | 87 | 88 | Part of Speech Tagging 89 | ~~~~~~~~~~~~~~~~~~~~~~ 90 | 91 | .. code:: python 92 | 93 | text = Text(u"O primeiro uso de desobediência civil em massa ocorreu em setembro de 1906.") 94 | 95 | print("{:<16}{}".format("Word", "POS Tag")+"\n"+"-"*30) 96 | for word, tag in text.pos_tags: 97 | print(u"{:<16}{:>2}".format(word, tag)) 98 | 99 | 100 | .. parsed-literal:: 101 | 102 | Word POS Tag 103 | ------------------------------ 104 | O DET 105 | primeiro ADJ 106 | uso NOUN 107 | de ADP 108 | desobediência NOUN 109 | civil ADJ 110 | em ADP 111 | massa NOUN 112 | ocorreu ADJ 113 | em ADP 114 | setembro NOUN 115 | de ADP 116 | 1906 NUM 117 | . PUNCT 118 | 119 | 120 | Named Entity Recognition 121 | ~~~~~~~~~~~~~~~~~~~~~~~~ 122 | 123 | .. code:: python 124 | 125 | text = Text(u"In Großbritannien war Gandhi mit dem westlichen Lebensstil vertraut geworden") 126 | print(text.entities) 127 | 128 | 129 | .. parsed-literal:: 130 | 131 | [I-LOC([u'Gro\xdfbritannien']), I-PER([u'Gandhi'])] 132 | 133 | 134 | Polarity 135 | ~~~~~~~~ 136 | 137 | .. code:: python 138 | 139 | print("{:<16}{}".format("Word", "Polarity")+"\n"+"-"*30) 140 | for w in zen.words[:6]: 141 | print("{:<16}{:>2}".format(w, w.polarity)) 142 | 143 | 144 | .. parsed-literal:: 145 | 146 | Word Polarity 147 | ------------------------------ 148 | Beautiful 0 149 | is 0 150 | better 1 151 | than 0 152 | ugly -1 153 | . 0 154 | 155 | 156 | Embeddings 157 | ~~~~~~~~~~ 158 | 159 | .. 
code:: python 160 | 161 | word = Word("Obama", language="en") 162 | print("Neighbors (Synonms) of {}".format(word)+"\n"+"-"*30) 163 | for w in word.neighbors: 164 | print("{:<16}".format(w)) 165 | print("\n\nThe first 10 dimensions out the {} dimensions\n".format(word.vector.shape[0])) 166 | print(word.vector[:10]) 167 | 168 | 169 | .. parsed-literal:: 170 | 171 | Neighbors (Synonms) of Obama 172 | ------------------------------ 173 | Bush 174 | Reagan 175 | Clinton 176 | Ahmadinejad 177 | Nixon 178 | Karzai 179 | McCain 180 | Biden 181 | Huckabee 182 | Lula 183 | 184 | 185 | The first 10 dimensions out the 256 dimensions 186 | 187 | [-2.57382345 1.52175975 0.51070285 1.08678675 -0.74386948 -1.18616164 188 | 2.92784619 -0.25694436 -1.40958667 -2.39675403] 189 | 190 | 191 | Morphology 192 | ~~~~~~~~~~ 193 | 194 | .. code:: python 195 | 196 | word = Text("Preprocessing is an essential step.").words[0] 197 | print(word.morphemes) 198 | 199 | 200 | .. parsed-literal:: 201 | 202 | [u'Pre', u'process', u'ing'] 203 | 204 | 205 | Transliteration 206 | ~~~~~~~~~~~~~~~ 207 | 208 | .. code:: python 209 | 210 | from polyglot.transliteration import Transliterator 211 | transliterator = Transliterator(source_lang="en", target_lang="ru") 212 | print(transliterator.transliterate(u"preprocessing")) 213 | 214 | 215 | .. parsed-literal:: 216 | 217 | препрокессинг 218 | 219 | -------------------------------------------------------------------------------- /docs/POS.rst: -------------------------------------------------------------------------------- 1 | 2 | Part of Speech Tagging 3 | ====================== 4 | 5 | Part of speech tagging task aims to assign every word/token in plain 6 | text a category that identifies the syntactic functionality of the word 7 | occurrence. 8 | 9 | Polyglot recognizes 17 parts of speech, this set is called the 10 | ``universal part of speech tag set``: 11 | 12 | - **ADJ**: adjective 13 | - **ADP**: adposition 14 | - **ADV**: adverb 15 | - **AUX**: auxiliary verb 16 | - **CONJ**: coordinating conjunction 17 | - **DET**: determiner 18 | - **INTJ**: interjection 19 | - **NOUN**: noun 20 | - **NUM**: numeral 21 | - **PART**: particle 22 | - **PRON**: pronoun 23 | - **PROPN**: proper noun 24 | - **PUNCT**: punctuation 25 | - **SCONJ**: subordinating conjunction 26 | - **SYM**: symbol 27 | - **VERB**: verb 28 | - **X**: other 29 | 30 | Languages Coverage 31 | ------------------ 32 | 33 | The models were trained on a combination of: 34 | 35 | - Original CONLL datasets after the tags were converted using the 36 | `universal POS 37 | tables `__. 38 | 39 | - Universal Dependencies 1.0 corpora whenever they are available. 40 | 41 | .. code:: python 42 | 43 | from polyglot.downloader import downloader 44 | print(downloader.supported_languages_table("pos2")) 45 | 46 | 47 | .. parsed-literal:: 48 | 49 | 1. German 2. Italian 3. Danish 50 | 4. Czech 5. Slovene 6. French 51 | 7. English 8. Swedish 9. Bulgarian 52 | 10. Spanish; Castilian 11. Indonesian 12. Portuguese 53 | 13. Finnish 14. Irish 15. Hungarian 54 | 16. Dutch 55 | 56 | 57 | Download Necessary Models 58 | ^^^^^^^^^^^^^^^^^^^^^^^^^ 59 | 60 | .. code:: python 61 | 62 | %%bash 63 | polyglot download embeddings2.en pos2.en 64 | 65 | 66 | .. parsed-literal:: 67 | 68 | [polyglot_data] Downloading package embeddings2.en to 69 | [polyglot_data] /home/rmyeid/polyglot_data... 70 | [polyglot_data] Package embeddings2.en is already up-to-date! 71 | [polyglot_data] Downloading package pos2.en to 72 | [polyglot_data] /home/rmyeid/polyglot_data... 
73 | [polyglot_data] Package pos2.en is already up-to-date! 74 | 75 | 76 | Example 77 | ------- 78 | 79 | We tag each word in the text with one part of speech. 80 | 81 | .. code:: python 82 | 83 | from polyglot.text import Text 84 | 85 | .. code:: python 86 | 87 | blob = """We will meet at eight o'clock on Thursday morning.""" 88 | text = Text(blob) 89 | 90 | We can query all the tagged words 91 | 92 | .. code:: python 93 | 94 | text.pos_tags 95 | 96 | 97 | 98 | 99 | .. parsed-literal:: 100 | 101 | [(u'We', u'PRON'), 102 | (u'will', u'AUX'), 103 | (u'meet', u'VERB'), 104 | (u'at', u'ADP'), 105 | (u'eight', u'NUM'), 106 | (u"o'clock", u'NOUN'), 107 | (u'on', u'ADP'), 108 | (u'Thursday', u'PROPN'), 109 | (u'morning', u'NOUN'), 110 | (u'.', u'PUNCT')] 111 | 112 | 113 | 114 | After calling the pos\_tags property once, the words objects will carry 115 | the POS tags. 116 | 117 | .. code:: python 118 | 119 | text.words[0].pos_tag 120 | 121 | 122 | 123 | 124 | .. parsed-literal:: 125 | 126 | u'PRON' 127 | 128 | 129 | 130 | Command Line Interface 131 | ~~~~~~~~~~~~~~~~~~~~~~ 132 | 133 | .. code:: python 134 | 135 | !polyglot --lang en tokenize --input testdata/cricket.txt | polyglot --lang en pos | tail -n 30 136 | 137 | 138 | .. parsed-literal:: 139 | 140 | which DET 141 | India PROPN 142 | beat VERB 143 | Bermuda PROPN 144 | in ADP 145 | Port PROPN 146 | of ADP 147 | Spain PROPN 148 | in ADP 149 | 2007 NUM 150 | , PUNCT 151 | which DET 152 | was AUX 153 | equalled VERB 154 | five NUM 155 | days NOUN 156 | ago ADV 157 | by ADP 158 | South PROPN 159 | Africa PROPN 160 | in ADP 161 | their PRON 162 | victory NOUN 163 | over ADP 164 | West PROPN 165 | Indies PROPN 166 | in ADP 167 | Sydney PROPN 168 | . PUNCT 169 | 170 | 171 | 172 | Citation 173 | ~~~~~~~~ 174 | 175 | This work is a direct implementation of the research being described in 176 | the `Polyglot: Distributed Word Representations for Multilingual 177 | NLP `__ paper. The author of 178 | this library strongly encourage you to cite the following paper if you 179 | are using this software. 180 | 181 | :: 182 | 183 | @InProceedings{polyglot:2013:ACL-CoNLL, 184 | author = {Al-Rfou, Rami and Perozzi, Bryan and Skiena, Steven}, 185 | title = {Polyglot: Distributed Word Representations for Multilingual NLP}, 186 | booktitle = {Proceedings of the Seventeenth Conference on Computational Natural Language Learning}, 187 | month = {August}, 188 | year = {2013}, 189 | address = {Sofia, Bulgaria}, 190 | publisher = {Association for Computational Linguistics}, 191 | pages = {183--192}, 192 | url = {http://www.aclweb.org/anthology/W13-3520} 193 | } 194 | 195 | References 196 | ---------- 197 | 198 | - `Universal Part of Speech 199 | Tagging `__ 200 | - `Universal Dependencies 201 | 1.0 `__. 202 | -------------------------------------------------------------------------------- /docs/NamedEntityRecognition.rst: -------------------------------------------------------------------------------- 1 | 2 | Named Entity Extraction 3 | ======================= 4 | 5 | Named entity extraction task aims to extract phrases from plain text 6 | that correpond to entities. Polyglot recognizes 3 categories of 7 | entities: 8 | 9 | - Locations (Tag: ``I-LOC``): cities, countries, regions, continents, 10 | neighborhoods, administrative divisions ... 11 | - Organizations (Tag: ``I-ORG``): sports teams, newspapers, banks, 12 | universities, schools, non-profits, companies, ... 13 | - Persons (Tag: ``I-PER``): politicians, scientists, artists, atheletes 14 | ... 
15 | 16 | Languages Coverage 17 | ------------------ 18 | 19 | The models were trained on datasets extracted automatically from 20 | Wikipedia. Polyglot currently supports 40 major languages. 21 | 22 | .. code:: python 23 | 24 | from polyglot.downloader import downloader 25 | print(downloader.supported_languages_table("ner2", 3)) 26 | 27 | 28 | .. parsed-literal:: 29 | 30 | 1. Polish 2. Turkish 3. Russian 31 | 4. Indonesian 5. Czech 6. Arabic 32 | 7. Korean 8. Catalan; Valencian 9. Italian 33 | 10. Thai 11. Romanian, Moldavian, ... 12. Tagalog 34 | 13. Danish 14. Finnish 15. German 35 | 16. Persian 17. Dutch 18. Chinese 36 | 19. French 20. Portuguese 21. Slovak 37 | 22. Hebrew (modern) 23. Malay 24. Slovene 38 | 25. Bulgarian 26. Hindi 27. Japanese 39 | 28. Hungarian 29. Croatian 30. Ukrainian 40 | 31. Serbian 32. Lithuanian 33. Norwegian 41 | 34. Latvian 35. Swedish 36. English 42 | 37. Greek, Modern 38. Spanish; Castilian 39. Vietnamese 43 | 40. Estonian 44 | 45 | 46 | Download Necessary Models 47 | ^^^^^^^^^^^^^^^^^^^^^^^^^ 48 | 49 | .. code:: python 50 | 51 | %%bash 52 | polyglot download embeddings2.en ner2.en 53 | 54 | 55 | .. parsed-literal:: 56 | 57 | [polyglot_data] Downloading package embeddings2.en to 58 | [polyglot_data] /home/rmyeid/polyglot_data... 59 | [polyglot_data] Package embeddings2.en is already up-to-date! 60 | [polyglot_data] Downloading package ner2.en to 61 | [polyglot_data] /home/rmyeid/polyglot_data... 62 | [polyglot_data] Package ner2.en is already up-to-date! 63 | 64 | 65 | Example 66 | ------- 67 | 68 | Entities inside a text object or a sentence are represented as chunks. 69 | Each chunk identifies the start and the end indices of the word 70 | subsequence within the text. 71 | 72 | .. code:: python 73 | 74 | from polyglot.text import Text 75 | 76 | .. code:: python 77 | 78 | blob = """The Israeli Prime Minister Benjamin Netanyahu has warned that Iran poses a "threat to the entire world".""" 79 | text = Text(blob) 80 | 81 | We can query all entities mentioned in a text. 82 | 83 | .. code:: python 84 | 85 | text.entities 86 | 87 | 88 | 89 | 90 | .. parsed-literal:: 91 | 92 | [I-ORG([u'Israeli']), I-PER([u'Benjamin', u'Netanyahu']), I-LOC([u'Iran'])] 93 | 94 | 95 | 96 | Or, we can query entites per sentence 97 | 98 | .. code:: python 99 | 100 | for sent in text.sentences: 101 | print(sent, "\n") 102 | for entity in sent.entities: 103 | print(entity.tag, entity) 104 | 105 | 106 | .. parsed-literal:: 107 | 108 | The Israeli Prime Minister Benjamin Netanyahu has warned that Iran poses a "threat to the entire world". 109 | 110 | I-ORG [u'Israeli'] 111 | I-PER [u'Benjamin', u'Netanyahu'] 112 | I-LOC [u'Iran'] 113 | 114 | 115 | By doing more careful inspection of the second entity 116 | ``Benjamin Netanyahu``, we can locate the position of the entity within 117 | the sentence. 118 | 119 | .. code:: python 120 | 121 | benjamin = sent.entities[1] 122 | sent.words[benjamin.start: benjamin.end] 123 | 124 | 125 | 126 | 127 | .. parsed-literal:: 128 | 129 | WordList([u'Benjamin', u'Netanyahu']) 130 | 131 | 132 | 133 | Command Line Interface 134 | ~~~~~~~~~~~~~~~~~~~~~~ 135 | 136 | .. code:: python 137 | 138 | !polyglot --lang en tokenize --input testdata/cricket.txt | polyglot --lang en ner | tail -n 20 139 | 140 | 141 | .. 
parsed-literal:: 142 | 143 | , O 144 | which O 145 | was O 146 | equalled O 147 | five O 148 | days O 149 | ago O 150 | by O 151 | South I-LOC 152 | Africa I-LOC 153 | in O 154 | their O 155 | victory O 156 | over O 157 | West I-ORG 158 | Indies I-ORG 159 | in O 160 | Sydney I-LOC 161 | . O 162 | 163 | 164 | 165 | Demo 166 | ---- 167 | 168 | .. raw:: html 169 | 170 | 171 | 172 | 173 | Citation 174 | ~~~~~~~~ 175 | 176 | This work is a direct implementation of the research being described in 177 | the `Polyglot-NER: Multilingual Named Entity 178 | Recognition `__ 179 | paper. The author of this library strongly encourage you to cite the 180 | following paper if you are using this software. 181 | 182 | :: 183 | 184 | @article{polyglotner, 185 | author = {Al-Rfou, Rami and Kulkarni, Vivek and Perozzi, Bryan and Skiena, Steven}, 186 | title = {{Polyglot-NER}: Massive Multilingual Named Entity Recognition}, 187 | journal = {{Proceedings of the 2015 {SIAM} International Conference on Data Mining, Vancouver, British Columbia, Canada, April 30 - May 2, 2015}}, 188 | month = {April}, 189 | year = {2015}, 190 | publisher = {SIAM} 191 | } 192 | 193 | References 194 | ---------- 195 | 196 | - `Polyglot-NER project page. `__ 197 | - `Wikipedia on 198 | NER `__. 199 | -------------------------------------------------------------------------------- /polyglot/base.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """Basic data types.""" 5 | 6 | from io import open, StringIO 7 | from collections import Counter 8 | import os 9 | from concurrent.futures import ProcessPoolExecutor 10 | from itertools import islice 11 | 12 | import six 13 | from six.moves import zip 14 | from six import text_type as unicode 15 | from six import iteritems 16 | from six import string_types 17 | 18 | 19 | class Sequence(object): 20 | """ Text with indices indicates boundaries.""" 21 | 22 | def __init__(self, text): 23 | 24 | if not text: 25 | raise ValueError("This Sequence is Empty") 26 | if not isinstance(text, unicode): 27 | raise ValueError("This is not unicode text instead {}".format(type(text))) 28 | 29 | self.__text = text 30 | self.idx = [0, len(self.text)] 31 | 32 | @property 33 | def text(self): 34 | return self.__text 35 | 36 | def __iter__(self): 37 | for start, end in zip(self.idx[:-1], self.idx[1:]): 38 | yield self.text[start: end] 39 | 40 | def tokens(self): 41 | """ Returns segmented text after stripping whitespace.""" 42 | 43 | return [x.strip() for x in self if x.strip()] 44 | 45 | def __str__(self): 46 | if six.PY3: 47 | return self.__unicode__() 48 | return self.__unicode__().encode("utf-8") 49 | 50 | def __unicode__(self): 51 | return u'\n'.join(self.tokens()) 52 | 53 | def split(self, sequence): 54 | """ Split into subsequences according to `sequence`.""" 55 | 56 | major_idx = sequence.idx 57 | idx2 = 0 58 | for start, end in zip(major_idx[:-1], major_idx[1:]): 59 | idx1 = self.idx.index(start, idx2) 60 | idx2 = self.idx.index(end, idx2) 61 | seq = Sequence(self.text[start:end]) 62 | seq.idx = [x-start for x in self.idx[idx1:idx2]] 63 | yield seq 64 | 65 | def __len__(self): 66 | return len(self.idx) - 1 67 | 68 | def empty(self): 69 | return not self.text.strip() 70 | 71 | 72 | class TokenSequence(list): 73 | """A list of tokens. 74 | 75 | Args: 76 | tokens (list): list of symbols. 
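  Example:
    A small sketch of the ``sliding_window`` helper defined below::

      seq = TokenSequence([u'a', u'b', u'c'])
      list(seq.sliding_window(width=2))  # [(u'a', u'b'), (u'b', u'c')]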
77 | """ 78 | 79 | def sliding_window(self, width=2, padding=None): 80 | seq = self 81 | if padding: 82 | pad = [padding for x in range(width/2)] 83 | seq = pad + self + pad 84 | args = [islice(seq, i, None) for i in range(width)] 85 | for x in zip(*args): 86 | yield x 87 | 88 | 89 | class TextFile(object): 90 | """ Wrapper around text files. 91 | 92 | It uses io.open to guarantee reading text files with unicode encoding. 93 | It has an iterator that supports arbitrary delimiter instead of only 94 | new lines. 95 | 96 | Attributes: 97 | delimiter (string): A string that defines the limit of each chunk. 98 | file (string): A path to a file. 99 | buf (StringIO): a buffer to store the results of peeking into the file. 100 | """ 101 | 102 | def __init__(self, file, delimiter=u'\n'): 103 | self.name = file 104 | self.delimiter = delimiter 105 | self.open_file = open(file, 'r') 106 | self.buf = StringIO() 107 | 108 | def iter_delimiter(self, byte_size=8192): 109 | """ Generalization of the default iter file delimited by '\n'. 110 | Note: 111 | The newline string can be arbitrarily long; it need not be restricted to a 112 | single character. You can also set the read size and control whether or not 113 | the newline string is left on the end of the iterated lines. Setting 114 | newline to '\0' is particularly good for use with an input file created with 115 | something like "os.popen('find -print0')". 116 | 117 | Args: 118 | byte_size (integer): Number of bytes to be read at each time. 119 | """ 120 | partial = u'' 121 | while True: 122 | read_chars = self.read(byte_size) 123 | if not read_chars: break 124 | partial += read_chars 125 | lines = partial.split(self.delimiter) 126 | partial = lines.pop() 127 | 128 | for line in lines: 129 | yield line + self.delimiter 130 | 131 | if partial: 132 | yield partial 133 | 134 | def __iter__(self): 135 | for l in self.iter_delimiter(): 136 | yield l 137 | 138 | def iter_chunks(self, chunksize): 139 | chunk = [] 140 | for i, l in enumerate(self): 141 | chunk.append(l) 142 | if i % chunksize == chunksize -1: 143 | yield chunk 144 | chunk = [] 145 | if chunk: 146 | yield chunk 147 | 148 | def _append_to_buf(self, contents): 149 | oldpos = self.buf.tell() 150 | self.buf.seek(0, os.SEEK_END) 151 | self.buf.write(contents) 152 | self.buf.seek(oldpos) 153 | 154 | def peek(self, size): 155 | contents = self.open_file.read(size) 156 | self._append_to_buf(contents) 157 | return contents 158 | 159 | def read(self, size=None): 160 | """ Read `size` of bytes.""" 161 | if size is None: 162 | return self.buf.read() + self.open_file.read() 163 | contents = self.buf.read(size) 164 | if len(contents) < size: 165 | contents += self.open_file.read(size - len(contents)) 166 | return contents 167 | 168 | def readline(self): 169 | line = self.buf.readline() 170 | if not line.endswith('\n'): 171 | line += self.open_file.readline() 172 | return line 173 | 174 | def apply(self, func, workers=1, job_size=10000): 175 | """Apply `func` to lines of text in parallel or sequential. 176 | 177 | Args: 178 | func : a function that takes a list of lines. 
179 | """ 180 | if workers == 1: 181 | for lines in self.iter_chunks(job_size): 182 | yield func(lines) 183 | else: 184 | with ProcessPoolExecutor(max_workers=workers) as executor: 185 | for result in executor.map(func, self.iter_chunks(job_size)): 186 | yield result 187 | 188 | 189 | class TextFiles(TextFile): 190 | """Interface for a sequence of files.""" 191 | 192 | def __init__(self, files, delimiter=u'\n'): 193 | if isinstance(files[0], string_types): 194 | self.files = [TextFile(f) for f in files] 195 | self.files = files 196 | self.delimiter = delimiter 197 | self.buf = StringIO() 198 | self.i = 0 199 | self.open_file = self.files[self.i].open_file 200 | 201 | def readline(self): 202 | raise NotImplementedError("Future work") 203 | 204 | def peek(self, size): 205 | self.open_file.seek(0) 206 | contents = self.open_file.read(size) 207 | self.open_file.seek(0) 208 | return contents 209 | 210 | def read(self, size=None): 211 | content = super(TextFiles, self).read(size) 212 | if not content and self.i < len(self.files)-1: 213 | self.i += 1 214 | self.buf = StringIO() 215 | self.open_file = self.files[self.i].open_file 216 | return self.read(size) 217 | return content 218 | 219 | @property 220 | def names(self): 221 | return [f.name for f in self.files] 222 | -------------------------------------------------------------------------------- /docs/Embeddings.rst: -------------------------------------------------------------------------------- 1 | 2 | Word Embeddings 3 | =============== 4 | 5 | Word embedding is a mapping of a word to a d-dimensional vector space. 6 | This real valued vector representation captures semantic and syntactic 7 | features. Polyglot offers a simple interface to load several formats of 8 | word embeddings. 9 | 10 | .. code:: python 11 | 12 | from polyglot.mapping import Embedding 13 | 14 | Formats 15 | ------- 16 | 17 | The Embedding class can read word embeddings from different sources: 18 | 19 | - Gensim word2vec objects: (``from_gensim`` method) 20 | - Word2vec binary/text models: (``from_word2vec`` method) 21 | - polyglot pickle files: (``load`` method) 22 | 23 | .. code:: python 24 | 25 | embeddings = Embedding.load("/home/rmyeid/polyglot_data/embeddings2/en/embeddings_pkl.tar.bz2") 26 | 27 | Nearest Neighbors 28 | ----------------- 29 | 30 | A common way to investigate the space capture by the embeddings is to 31 | query for the nearest neightbors of any word. 32 | 33 | .. code:: python 34 | 35 | neighbors = embeddings.nearest_neighbors("green") 36 | neighbors 37 | 38 | 39 | 40 | 41 | .. parsed-literal:: 42 | 43 | [u'blue', 44 | u'white', 45 | u'red', 46 | u'yellow', 47 | u'black', 48 | u'grey', 49 | u'purple', 50 | u'pink', 51 | u'light', 52 | u'gray'] 53 | 54 | 55 | 56 | to calculate the distance between a word and the nieghbors, we can call 57 | the ``distances`` method 58 | 59 | .. code:: python 60 | 61 | embeddings.distances("green", neighbors) 62 | 63 | 64 | 65 | 66 | .. parsed-literal:: 67 | 68 | array([ 1.34894466, 1.37864077, 1.39504588, 1.39524949, 1.43183875, 69 | 1.68007386, 1.75897062, 1.88401115, 1.89186132, 1.902614 ], dtype=float32) 70 | 71 | 72 | 73 | The word embeddings are not unit vectors, actually the more frequent the 74 | word is the larger the norm of its own vector. 75 | 76 | .. code:: python 77 | 78 | %matplotlib inline 79 | import matplotlib.pyplot as plt 80 | import numpy as np 81 | 82 | .. 
code:: python 83 | 84 | norms = np.linalg.norm(embeddings.vectors, axis=1) 85 | window = 300 86 | smooth_line = np.convolve(norms, np.ones(window)/float(window), mode='valid') 87 | plt.plot(smooth_line) 88 | plt.xlabel("Word Rank"); _ = plt.ylabel("$L_2$ norm") 89 | 90 | 91 | 92 | .. image:: Embeddings_files/Embeddings_12_0.png 93 | 94 | 95 | This could be problematic for some applications and training algorithms. 96 | We can normalize them by :math:`L_2` norms to get unit vectors to reduce 97 | effects of word frequency, as the following 98 | 99 | .. code:: python 100 | 101 | embeddings = embeddings.normalize_words() 102 | 103 | .. code:: python 104 | 105 | neighbors = embeddings.nearest_neighbors("green") 106 | for w,d in zip(neighbors, embeddings.distances("green", neighbors)): 107 | print("{:<8}{:.4f}".format(w,d)) 108 | 109 | 110 | .. parsed-literal:: 111 | 112 | white 0.4261 113 | blue 0.4451 114 | black 0.4591 115 | red 0.4786 116 | yellow 0.4947 117 | grey 0.6072 118 | purple 0.6392 119 | light 0.6483 120 | pink 0.6574 121 | colour 0.6824 122 | 123 | 124 | Vocabulary Expansion 125 | -------------------- 126 | 127 | .. code:: python 128 | 129 | from polyglot.mapping import CaseExpander, DigitExpander 130 | 131 | Not all the words are available in the dictionary defined by the word 132 | embeddings. Sometimes it would be useful to map new words to similar 133 | ones that we have embeddings for. 134 | 135 | Case Expansion 136 | ~~~~~~~~~~~~~~ 137 | 138 | For example, the word ``GREEN`` is not available in the embeddings, 139 | 140 | .. code:: python 141 | 142 | "GREEN" in embeddings 143 | 144 | 145 | 146 | 147 | .. parsed-literal:: 148 | 149 | False 150 | 151 | 152 | 153 | we would like to return the vector that represents the word ``Green``, 154 | to do that we apply a case expansion: 155 | 156 | .. code:: python 157 | 158 | embeddings.apply_expansion(CaseExpander) 159 | 160 | .. code:: python 161 | 162 | "GREEN" in embeddings 163 | 164 | 165 | 166 | 167 | .. parsed-literal:: 168 | 169 | True 170 | 171 | 172 | 173 | .. code:: python 174 | 175 | embeddings.nearest_neighbors("GREEN") 176 | 177 | 178 | 179 | 180 | .. parsed-literal:: 181 | 182 | [u'White', 183 | u'Black', 184 | u'Brown', 185 | u'Blue', 186 | u'Diamond', 187 | u'Wood', 188 | u'Young', 189 | u'Hudson', 190 | u'Cook', 191 | u'Gold'] 192 | 193 | 194 | 195 | Digit Expansion 196 | ~~~~~~~~~~~~~~~ 197 | 198 | We reduce the size of the vocabulary while training the embeddings by 199 | grouping special classes of words. Once common case of such grouping is 200 | digits. Every digit in the training corpus get replaced by the symbol 201 | ``#``. For example, a number like ``123.54`` becomes ``###.##``. 202 | Therefore, querying the embedding for a new number like ``434`` will 203 | result in a failure 204 | 205 | .. code:: python 206 | 207 | "434" in embeddings 208 | 209 | 210 | 211 | 212 | .. parsed-literal:: 213 | 214 | False 215 | 216 | 217 | 218 | To fix that, we apply another type of vocabulary expansion 219 | ``DigitExpander``. It will map any number to a sequence of ``#``\ s. 220 | 221 | .. code:: python 222 | 223 | embeddings.apply_expansion(DigitExpander) 224 | 225 | .. code:: python 226 | 227 | "434" in embeddings 228 | 229 | 230 | 231 | 232 | .. parsed-literal:: 233 | 234 | True 235 | 236 | 237 | 238 | As expected, the neighbors of the new number ``434`` will be other 239 | numbers: 240 | 241 | .. code:: python 242 | 243 | embeddings.nearest_neighbors("434") 244 | 245 | 246 | 247 | 248 | .. 
parsed-literal:: 249 | 250 | [u'##', 251 | u'#', 252 | u'3', 253 | u'#####', 254 | u'#,###', 255 | u'##,###', 256 | u'##EN##', 257 | u'####', 258 | u'###EN###', 259 | u'n'] 260 | 261 | 262 | 263 | Demo 264 | ---- 265 | 266 | Demo is available `here `__. 267 | 268 | Citation 269 | ~~~~~~~~ 270 | 271 | This work is a direct implementation of the research being described in 272 | the `Polyglot: Distributed Word Representations for Multilingual 273 | NLP `__ paper. The author of 274 | this library strongly encourage you to cite the following paper if you 275 | are using this software. 276 | 277 | :: 278 | 279 | @InProceedings{polyglot:2013:ACL-CoNLL, 280 | author = {Al-Rfou, Rami and Perozzi, Bryan and Skiena, Steven}, 281 | title = {Polyglot: Distributed Word Representations for Multilingual NLP}, 282 | booktitle = {Proceedings of the Seventeenth Conference on Computational Natural Language Learning}, 283 | month = {August}, 284 | year = {2013}, 285 | address = {Sofia, Bulgaria}, 286 | publisher = {Association for Computational Linguistics}, 287 | pages = {183--192}, 288 | url = {http://www.aclweb.org/anthology/W13-3520} 289 | } 290 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | PAPER = 8 | BUILDDIR = _build 9 | 10 | # User-friendly check for sphinx-build 11 | ifeq ($(shell which $(SPHINXBUILD) >/dev/null 2>&1; echo $$?), 1) 12 | $(error The '$(SPHINXBUILD)' command was not found. Make sure you have Sphinx installed, then set the SPHINXBUILD environment variable to point to the full path of the '$(SPHINXBUILD)' executable. Alternatively you can add the directory with the executable to your PATH. If you don't have Sphinx installed, grab it from http://sphinx-doc.org/) 13 | endif 14 | 15 | # Internal variables. 16 | PAPEROPT_a4 = -D latex_paper_size=a4 17 | PAPEROPT_letter = -D latex_paper_size=letter 18 | ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 19 | # the i18n builder cannot share the environment and doctrees with the others 20 | I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 
21 | 22 | .PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest gettext 23 | 24 | help: 25 | @echo "Please use \`make ' where is one of" 26 | @echo " html to make standalone HTML files" 27 | @echo " dirhtml to make HTML files named index.html in directories" 28 | @echo " singlehtml to make a single large HTML file" 29 | @echo " pickle to make pickle files" 30 | @echo " json to make JSON files" 31 | @echo " htmlhelp to make HTML files and a HTML help project" 32 | @echo " qthelp to make HTML files and a qthelp project" 33 | @echo " devhelp to make HTML files and a Devhelp project" 34 | @echo " epub to make an epub" 35 | @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" 36 | @echo " latexpdf to make LaTeX files and run them through pdflatex" 37 | @echo " latexpdfja to make LaTeX files and run them through platex/dvipdfmx" 38 | @echo " text to make text files" 39 | @echo " man to make manual pages" 40 | @echo " texinfo to make Texinfo files" 41 | @echo " info to make Texinfo files and run them through makeinfo" 42 | @echo " gettext to make PO message catalogs" 43 | @echo " changes to make an overview of all changed/added/deprecated items" 44 | @echo " xml to make Docutils-native XML files" 45 | @echo " pseudoxml to make pseudoxml-XML files for display purposes" 46 | @echo " linkcheck to check all external links for integrity" 47 | @echo " doctest to run all doctests embedded in the documentation (if enabled)" 48 | 49 | clean: 50 | rm -rf $(BUILDDIR)/* 51 | 52 | html: 53 | $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html 54 | @echo 55 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." 56 | 57 | dirhtml: 58 | $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml 59 | @echo 60 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml." 61 | 62 | singlehtml: 63 | $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml 64 | @echo 65 | @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml." 66 | 67 | pickle: 68 | $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle 69 | @echo 70 | @echo "Build finished; now you can process the pickle files." 71 | 72 | json: 73 | $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json 74 | @echo 75 | @echo "Build finished; now you can process the JSON files." 76 | 77 | htmlhelp: 78 | $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp 79 | @echo 80 | @echo "Build finished; now you can run HTML Help Workshop with the" \ 81 | ".hhp project file in $(BUILDDIR)/htmlhelp." 82 | 83 | qthelp: 84 | $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp 85 | @echo 86 | @echo "Build finished; now you can run "qcollectiongenerator" with the" \ 87 | ".qhcp project file in $(BUILDDIR)/qthelp, like this:" 88 | @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/complexity.qhcp" 89 | @echo "To view the help file:" 90 | @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/complexity.qhc" 91 | 92 | devhelp: 93 | $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp 94 | @echo 95 | @echo "Build finished." 96 | @echo "To view the help file:" 97 | @echo "# mkdir -p $$HOME/.local/share/devhelp/complexity" 98 | @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/complexity" 99 | @echo "# devhelp" 100 | 101 | epub: 102 | $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub 103 | @echo 104 | @echo "Build finished. The epub file is in $(BUILDDIR)/epub." 
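# The rules below all wrap $(SPHINXBUILD) with a specific builder (-b) that
# writes into its own subdirectory of $(BUILDDIR); latexpdf/latexpdfja and
# info additionally post-process the output with (pdf)latex or makeinfo.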
105 | 106 | latex: 107 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 108 | @echo 109 | @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." 110 | @echo "Run \`make' in that directory to run these through (pdf)latex" \ 111 | "(use \`make latexpdf' here to do that automatically)." 112 | 113 | latexpdf: 114 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 115 | @echo "Running LaTeX files through pdflatex..." 116 | $(MAKE) -C $(BUILDDIR)/latex all-pdf 117 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 118 | 119 | latexpdfja: 120 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 121 | @echo "Running LaTeX files through platex and dvipdfmx..." 122 | $(MAKE) -C $(BUILDDIR)/latex all-pdf-ja 123 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 124 | 125 | text: 126 | $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text 127 | @echo 128 | @echo "Build finished. The text files are in $(BUILDDIR)/text." 129 | 130 | man: 131 | $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man 132 | @echo 133 | @echo "Build finished. The manual pages are in $(BUILDDIR)/man." 134 | 135 | texinfo: 136 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 137 | @echo 138 | @echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo." 139 | @echo "Run \`make' in that directory to run these through makeinfo" \ 140 | "(use \`make info' here to do that automatically)." 141 | 142 | info: 143 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 144 | @echo "Running Texinfo files through makeinfo..." 145 | make -C $(BUILDDIR)/texinfo info 146 | @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo." 147 | 148 | gettext: 149 | $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale 150 | @echo 151 | @echo "Build finished. The message catalogs are in $(BUILDDIR)/locale." 152 | 153 | changes: 154 | $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes 155 | @echo 156 | @echo "The overview file is in $(BUILDDIR)/changes." 157 | 158 | linkcheck: 159 | $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck 160 | @echo 161 | @echo "Link check complete; look for any errors in the above output " \ 162 | "or in $(BUILDDIR)/linkcheck/output.txt." 163 | 164 | doctest: 165 | $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest 166 | @echo "Testing of doctests in the sources finished, look at the " \ 167 | "results in $(BUILDDIR)/doctest/output.txt." 168 | 169 | xml: 170 | $(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml 171 | @echo 172 | @echo "Build finished. The XML files are in $(BUILDDIR)/xml." 173 | 174 | pseudoxml: 175 | $(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml 176 | @echo 177 | @echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml." -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | REM Command file for Sphinx documentation 4 | 5 | if "%SPHINXBUILD%" == "" ( 6 | set SPHINXBUILD=sphinx-build 7 | ) 8 | set BUILDDIR=_build 9 | set ALLSPHINXOPTS=-d %BUILDDIR%/doctrees %SPHINXOPTS% . 10 | set I18NSPHINXOPTS=%SPHINXOPTS% . 
11 | if NOT "%PAPER%" == "" ( 12 | set ALLSPHINXOPTS=-D latex_paper_size=%PAPER% %ALLSPHINXOPTS% 13 | set I18NSPHINXOPTS=-D latex_paper_size=%PAPER% %I18NSPHINXOPTS% 14 | ) 15 | 16 | if "%1" == "" goto help 17 | 18 | if "%1" == "help" ( 19 | :help 20 | echo.Please use `make ^` where ^ is one of 21 | echo. html to make standalone HTML files 22 | echo. dirhtml to make HTML files named index.html in directories 23 | echo. singlehtml to make a single large HTML file 24 | echo. pickle to make pickle files 25 | echo. json to make JSON files 26 | echo. htmlhelp to make HTML files and a HTML help project 27 | echo. qthelp to make HTML files and a qthelp project 28 | echo. devhelp to make HTML files and a Devhelp project 29 | echo. epub to make an epub 30 | echo. latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter 31 | echo. text to make text files 32 | echo. man to make manual pages 33 | echo. texinfo to make Texinfo files 34 | echo. gettext to make PO message catalogs 35 | echo. changes to make an overview over all changed/added/deprecated items 36 | echo. xml to make Docutils-native XML files 37 | echo. pseudoxml to make pseudoxml-XML files for display purposes 38 | echo. linkcheck to check all external links for integrity 39 | echo. doctest to run all doctests embedded in the documentation if enabled 40 | goto end 41 | ) 42 | 43 | if "%1" == "clean" ( 44 | for /d %%i in (%BUILDDIR%\*) do rmdir /q /s %%i 45 | del /q /s %BUILDDIR%\* 46 | goto end 47 | ) 48 | 49 | 50 | %SPHINXBUILD% 2> nul 51 | if errorlevel 9009 ( 52 | echo. 53 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 54 | echo.installed, then set the SPHINXBUILD environment variable to point 55 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 56 | echo.may add the Sphinx directory to PATH. 57 | echo. 58 | echo.If you don't have Sphinx installed, grab it from 59 | echo.http://sphinx-doc.org/ 60 | exit /b 1 61 | ) 62 | 63 | if "%1" == "html" ( 64 | %SPHINXBUILD% -b html %ALLSPHINXOPTS% %BUILDDIR%/html 65 | if errorlevel 1 exit /b 1 66 | echo. 67 | echo.Build finished. The HTML pages are in %BUILDDIR%/html. 68 | goto end 69 | ) 70 | 71 | if "%1" == "dirhtml" ( 72 | %SPHINXBUILD% -b dirhtml %ALLSPHINXOPTS% %BUILDDIR%/dirhtml 73 | if errorlevel 1 exit /b 1 74 | echo. 75 | echo.Build finished. The HTML pages are in %BUILDDIR%/dirhtml. 76 | goto end 77 | ) 78 | 79 | if "%1" == "singlehtml" ( 80 | %SPHINXBUILD% -b singlehtml %ALLSPHINXOPTS% %BUILDDIR%/singlehtml 81 | if errorlevel 1 exit /b 1 82 | echo. 83 | echo.Build finished. The HTML pages are in %BUILDDIR%/singlehtml. 84 | goto end 85 | ) 86 | 87 | if "%1" == "pickle" ( 88 | %SPHINXBUILD% -b pickle %ALLSPHINXOPTS% %BUILDDIR%/pickle 89 | if errorlevel 1 exit /b 1 90 | echo. 91 | echo.Build finished; now you can process the pickle files. 92 | goto end 93 | ) 94 | 95 | if "%1" == "json" ( 96 | %SPHINXBUILD% -b json %ALLSPHINXOPTS% %BUILDDIR%/json 97 | if errorlevel 1 exit /b 1 98 | echo. 99 | echo.Build finished; now you can process the JSON files. 100 | goto end 101 | ) 102 | 103 | if "%1" == "htmlhelp" ( 104 | %SPHINXBUILD% -b htmlhelp %ALLSPHINXOPTS% %BUILDDIR%/htmlhelp 105 | if errorlevel 1 exit /b 1 106 | echo. 107 | echo.Build finished; now you can run HTML Help Workshop with the ^ 108 | .hhp project file in %BUILDDIR%/htmlhelp. 109 | goto end 110 | ) 111 | 112 | if "%1" == "qthelp" ( 113 | %SPHINXBUILD% -b qthelp %ALLSPHINXOPTS% %BUILDDIR%/qthelp 114 | if errorlevel 1 exit /b 1 115 | echo. 
116 | echo.Build finished; now you can run "qcollectiongenerator" with the ^ 117 | .qhcp project file in %BUILDDIR%/qthelp, like this: 118 | echo.^> qcollectiongenerator %BUILDDIR%\qthelp\complexity.qhcp 119 | echo.To view the help file: 120 | echo.^> assistant -collectionFile %BUILDDIR%\qthelp\complexity.ghc 121 | goto end 122 | ) 123 | 124 | if "%1" == "devhelp" ( 125 | %SPHINXBUILD% -b devhelp %ALLSPHINXOPTS% %BUILDDIR%/devhelp 126 | if errorlevel 1 exit /b 1 127 | echo. 128 | echo.Build finished. 129 | goto end 130 | ) 131 | 132 | if "%1" == "epub" ( 133 | %SPHINXBUILD% -b epub %ALLSPHINXOPTS% %BUILDDIR%/epub 134 | if errorlevel 1 exit /b 1 135 | echo. 136 | echo.Build finished. The epub file is in %BUILDDIR%/epub. 137 | goto end 138 | ) 139 | 140 | if "%1" == "latex" ( 141 | %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex 142 | if errorlevel 1 exit /b 1 143 | echo. 144 | echo.Build finished; the LaTeX files are in %BUILDDIR%/latex. 145 | goto end 146 | ) 147 | 148 | if "%1" == "latexpdf" ( 149 | %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex 150 | cd %BUILDDIR%/latex 151 | make all-pdf 152 | cd %BUILDDIR%/.. 153 | echo. 154 | echo.Build finished; the PDF files are in %BUILDDIR%/latex. 155 | goto end 156 | ) 157 | 158 | if "%1" == "latexpdfja" ( 159 | %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex 160 | cd %BUILDDIR%/latex 161 | make all-pdf-ja 162 | cd %BUILDDIR%/.. 163 | echo. 164 | echo.Build finished; the PDF files are in %BUILDDIR%/latex. 165 | goto end 166 | ) 167 | 168 | if "%1" == "text" ( 169 | %SPHINXBUILD% -b text %ALLSPHINXOPTS% %BUILDDIR%/text 170 | if errorlevel 1 exit /b 1 171 | echo. 172 | echo.Build finished. The text files are in %BUILDDIR%/text. 173 | goto end 174 | ) 175 | 176 | if "%1" == "man" ( 177 | %SPHINXBUILD% -b man %ALLSPHINXOPTS% %BUILDDIR%/man 178 | if errorlevel 1 exit /b 1 179 | echo. 180 | echo.Build finished. The manual pages are in %BUILDDIR%/man. 181 | goto end 182 | ) 183 | 184 | if "%1" == "texinfo" ( 185 | %SPHINXBUILD% -b texinfo %ALLSPHINXOPTS% %BUILDDIR%/texinfo 186 | if errorlevel 1 exit /b 1 187 | echo. 188 | echo.Build finished. The Texinfo files are in %BUILDDIR%/texinfo. 189 | goto end 190 | ) 191 | 192 | if "%1" == "gettext" ( 193 | %SPHINXBUILD% -b gettext %I18NSPHINXOPTS% %BUILDDIR%/locale 194 | if errorlevel 1 exit /b 1 195 | echo. 196 | echo.Build finished. The message catalogs are in %BUILDDIR%/locale. 197 | goto end 198 | ) 199 | 200 | if "%1" == "changes" ( 201 | %SPHINXBUILD% -b changes %ALLSPHINXOPTS% %BUILDDIR%/changes 202 | if errorlevel 1 exit /b 1 203 | echo. 204 | echo.The overview file is in %BUILDDIR%/changes. 205 | goto end 206 | ) 207 | 208 | if "%1" == "linkcheck" ( 209 | %SPHINXBUILD% -b linkcheck %ALLSPHINXOPTS% %BUILDDIR%/linkcheck 210 | if errorlevel 1 exit /b 1 211 | echo. 212 | echo.Link check complete; look for any errors in the above output ^ 213 | or in %BUILDDIR%/linkcheck/output.txt. 214 | goto end 215 | ) 216 | 217 | if "%1" == "doctest" ( 218 | %SPHINXBUILD% -b doctest %ALLSPHINXOPTS% %BUILDDIR%/doctest 219 | if errorlevel 1 exit /b 1 220 | echo. 221 | echo.Testing of doctests in the sources finished, look at the ^ 222 | results in %BUILDDIR%/doctest/output.txt. 223 | goto end 224 | ) 225 | 226 | if "%1" == "xml" ( 227 | %SPHINXBUILD% -b xml %ALLSPHINXOPTS% %BUILDDIR%/xml 228 | if errorlevel 1 exit /b 1 229 | echo. 230 | echo.Build finished. The XML files are in %BUILDDIR%/xml. 
231 | goto end 232 | ) 233 | 234 | if "%1" == "pseudoxml" ( 235 | %SPHINXBUILD% -b pseudoxml %ALLSPHINXOPTS% %BUILDDIR%/pseudoxml 236 | if errorlevel 1 exit /b 1 237 | echo. 238 | echo.Build finished. The pseudo-XML files are in %BUILDDIR%/pseudoxml. 239 | goto end 240 | ) 241 | 242 | :end -------------------------------------------------------------------------------- /polyglot/mixins.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | import sys 4 | 5 | import six 6 | from six import PY2, binary_type 7 | 8 | if PY2: 9 | string_types = (str, unicode) 10 | basestring = basestring 11 | def implements_to_string(cls): 12 | """Class decorator that renames __str__ to __unicode__ and 13 | modifies __str__ that returns utf-8. 14 | """ 15 | cls.__unicode__ = cls.__str__ 16 | cls.__str__ = lambda x: x.__unicode__().encode('utf-8') 17 | return cls 18 | else: # PY3 19 | string_types = (str,) 20 | basestring = (str, bytes) 21 | implements_to_string = lambda x: x 22 | 23 | class ComparableMixin(object): 24 | 25 | '''Implements rich operators for an object.''' 26 | 27 | def _compare(self, other, method): 28 | try: 29 | return method(self._cmpkey(), other._cmpkey()) 30 | except (AttributeError, TypeError): 31 | # _cmpkey not implemented, or return different type, 32 | # so I can't compare with "other". Try the reverse comparison 33 | return NotImplemented 34 | 35 | def __lt__(self, other): 36 | return self._compare(other, lambda s, o: s < o) 37 | 38 | def __le__(self, other): 39 | return self._compare(other, lambda s, o: s <= o) 40 | 41 | def __eq__(self, other): 42 | return self._compare(other, lambda s, o: s == o) 43 | 44 | def __ge__(self, other): 45 | return self._compare(other, lambda s, o: s >= o) 46 | 47 | def __gt__(self, other): 48 | return self._compare(other, lambda s, o: s > o) 49 | 50 | def __ne__(self, other): 51 | return self._compare(other, lambda s, o: s != o) 52 | 53 | 54 | class BlobComparableMixin(ComparableMixin): 55 | 56 | '''Allow blob objects to be comparable with both strings and blobs.''' 57 | 58 | def _compare(self, other, method): 59 | if isinstance(other, basestring): 60 | # Just compare with the other string 61 | return method(self._cmpkey(), other) 62 | return super(BlobComparableMixin, self)._compare(other, method) 63 | 64 | 65 | @implements_to_string 66 | class StringlikeMixin(object): 67 | 68 | '''Make blob objects behave like Python strings. 69 | 70 | Expects that classes that use this mixin to have a _strkey() method that 71 | returns the string to apply string methods to. Using _strkey() instead 72 | of __str__ ensures consistent behavior between Python 2 and 3. 73 | ''' 74 | 75 | def __repr__(self): 76 | '''Returns a string representation for debugging.''' 77 | class_name = self.__class__.__name__ 78 | text = self.__unicode__().encode("utf-8") if PY2 else str(self) 79 | ret = '{cls}("{text}")'.format(cls=class_name, 80 | text=text) 81 | return binary_type(ret) if PY2 else ret 82 | 83 | def __str__(self): 84 | '''Returns a string representation used in print statements 85 | or str(my_blob).''' 86 | return self._strkey() 87 | 88 | def __len__(self): 89 | '''Returns the length of the raw text.''' 90 | return len(self._strkey()) 91 | 92 | def __iter__(self): 93 | '''Makes the object iterable as if it were a string, 94 | iterating through the raw string's characters. 
95 | ''' 96 | return iter(self._strkey()) 97 | 98 | def __contains__(self, sub): 99 | '''Implements the `in` keyword like a Python string.''' 100 | return sub in self._strkey() 101 | 102 | def __getitem__(self, index): 103 | '''Returns a substring. If index is an integer, returns a Python 104 | string of a single character. If a range is given, e.g. `blob[3:5]`, 105 | a new instance of the class is returned. 106 | ''' 107 | if isinstance(index, int): 108 | return self._strkey()[index] # Just return a single character 109 | else: 110 | # Return a new blob object 111 | return self.__class__(self._strkey()[index]) 112 | 113 | def find(self, sub, start=0, end=sys.maxsize): 114 | '''Behaves like the built-in str.find() method. Returns an integer, 115 | the index of the first occurrence of the substring argument sub in the 116 | sub-string given by [start:end]. 117 | ''' 118 | return self._strkey().find(sub, start, end) 119 | 120 | def rfind(self, sub, start=0, end=sys.maxsize): 121 | '''Behaves like the built-in str.rfind() method. Returns an integer, 122 | the index of he last (right-most) occurence of the substring argument 123 | sub in the sub-sequence given by [start:end]. 124 | ''' 125 | return self._strkey().rfind(sub, start, end) 126 | 127 | def index(self, sub, start=0, end=sys.maxsize): 128 | '''Like blob.find() but raise ValueError when the substring 129 | is not found. 130 | ''' 131 | return self._strkey().index(sub, start, end) 132 | 133 | def rindex(self, sub, start=0, end=sys.maxsize): 134 | '''Like blob.rfind() but raise ValueError when substring is not 135 | found. 136 | ''' 137 | return self._strkey().rindex(sub, start, end) 138 | 139 | def startswith(self, prefix, start=0, end=sys.maxsize): 140 | """Returns True if the blob starts with the given prefix.""" 141 | return self._strkey().startswith(prefix, start, end) 142 | 143 | def endswith(self, suffix, start=0, end=sys.maxsize): 144 | """Returns True if the blob ends with the given suffix.""" 145 | return self._strkey().endswith(suffix, start, end) 146 | 147 | # PEP8 aliases 148 | starts_with = startswith 149 | ends_with = endswith 150 | 151 | def title(self): 152 | """Returns a blob object with the text in title-case.""" 153 | return self.__class__(self._strkey().title()) 154 | 155 | def format(self, *args, **kwargs): 156 | """Perform a string formatting operation, like the built-in 157 | `str.format(*args, **kwargs)`. Returns a blob object. 158 | """ 159 | return self.__class__(self._strkey().format(*args, **kwargs)) 160 | 161 | def split(self, sep=None, maxsplit=sys.maxsize): 162 | """Behaves like the built-in str.split(). 163 | """ 164 | return self._strkey().split(sep, maxsplit) 165 | 166 | def strip(self, chars=None): 167 | """Behaves like the built-in str.strip([chars]) method. Returns 168 | an object with leading and trailing whitespace removed. 169 | """ 170 | return self.__class__(self._strkey().strip(chars)) 171 | 172 | def upper(self): 173 | """Like str.upper(), returns new object with all upper-cased characters. 174 | """ 175 | return self.__class__(self._strkey().upper()) 176 | 177 | def lower(self): 178 | """Like str.lower(), returns new object with all lower-cased characters. 179 | """ 180 | return self.__class__(self._strkey().lower()) 181 | 182 | def join(self, iterable): 183 | """Behaves like the built-in `str.join(iterable)` method, except 184 | returns a blob object. 185 | 186 | Returns a blob which is the concatenation of the strings or blobs 187 | in the iterable. 
188 | """ 189 | return self.__class__(self._strkey().join(iterable)) 190 | 191 | def replace(self, old, new, count=sys.maxsize): 192 | """Return a new blob object with all the occurence of `old` replaced 193 | by `new`. 194 | """ 195 | return self.__class__(self._strkey().replace(old, new, count)) 196 | -------------------------------------------------------------------------------- /docs/Sentiment.rst: -------------------------------------------------------------------------------- 1 | 2 | Sentiment 3 | ========= 4 | 5 | Polyglot has polarity lexicons for 136 languages. The scale of the 6 | words' polarity consisted of three degrees: +1 for positive words, and 7 | -1 for negatives words. Neutral words will have a score of 0. 8 | 9 | Languages Coverage 10 | ~~~~~~~~~~~~~~~~~~ 11 | 12 | .. code:: python 13 | 14 | from polyglot.downloader import downloader 15 | print(downloader.supported_languages_table("sentiment2", 3)) 16 | 17 | 18 | .. parsed-literal:: 19 | 20 | 1. Turkmen 2. Thai 3. Latvian 21 | 4. Zazaki 5. Tagalog 6. Tamil 22 | 7. Tajik 8. Telugu 9. Luxembourgish, Letzeb... 23 | 10. Alemannic 11. Latin 12. Turkish 24 | 13. Limburgish, Limburgan... 14. Egyptian Arabic 15. Tatar 25 | 16. Lithuanian 17. Spanish; Castilian 18. Basque 26 | 19. Estonian 20. Asturian 21. Greek, Modern 27 | 22. Esperanto 23. English 24. Ukrainian 28 | 25. Marathi (Marāṭhī) 26. Maltese 27. Burmese 29 | 28. Kapampangan 29. Uighur, Uyghur 30. Uzbek 30 | 31. Malagasy 32. Yiddish 33. Macedonian 31 | 34. Urdu 35. Malayalam 36. Mongolian 32 | 37. Breton 38. Bosnian 39. Bengali 33 | 40. Tibetan Standard, Tib... 41. Belarusian 42. Bulgarian 34 | 43. Bashkir 44. Vietnamese 45. Volapük 35 | 46. Gan Chinese 47. Manx 48. Gujarati 36 | 49. Yoruba 50. Occitan 51. Scottish Gaelic; Gaelic 37 | 52. Irish 53. Galician 54. Ossetian, Ossetic 38 | 55. Oriya 56. Walloon 57. Swedish 39 | 58. Silesian 59. Lombard language 60. Divehi; Dhivehi; Mald... 40 | 61. Danish 62. German 63. Armenian 41 | 64. Haitian; Haitian Creole 65. Hungarian 66. Croatian 42 | 67. Bishnupriya Manipuri 68. Hindi 69. Hebrew (modern) 43 | 70. Portuguese 71. Afrikaans 72. Pashto, Pushto 44 | 73. Amharic 74. Aragonese 75. Bavarian 45 | 76. Assamese 77. Panjabi, Punjabi 78. Polish 46 | 79. Azerbaijani 80. Italian 81. Arabic 47 | 82. Icelandic 83. Ido 84. Scots 48 | 85. Sicilian 86. Indonesian 87. Chinese Word 49 | 88. Interlingua 89. Waray-Waray 90. Piedmontese language 50 | 91. Quechua 92. French 93. Dutch 51 | 94. Norwegian Nynorsk 95. Norwegian 96. Western Frisian 52 | 97. Upper Sorbian 98. Nepali 99. Persian 53 | 100. Ilokano 101. Finnish 102. Faroese 54 | 103. Romansh 104. Javanese 105. Romanian, Moldavian, ... 55 | 106. Malay 107. Japanese 108. Russian 56 | 109. Catalan; Valencian 110. Fiji Hindi 111. Chinese 57 | 112. Cebuano 113. Czech 114. Chuvash 58 | 115. Welsh 116. West Flemish 117. Kirghiz, Kyrgyz 59 | 118. Kurdish 119. Kazakh 120. Korean 60 | 121. Kannada 122. Khmer 123. Georgian 61 | 124. Sakha 125. Serbian 126. Albanian 62 | 127. Swahili 128. Chechen 129. Sundanese 63 | 130. Sanskrit (Saṁskṛta) 131. Venetian 132. Northern Sami 64 | 133. Slovak 134. Sinhala, Sinhalese 135. Bosnian-Croatian-Serbian 65 | 136. Slovene 66 | 67 | 68 | .. code:: python 69 | 70 | from polyglot.text import Text 71 | 72 | Polarity 73 | -------- 74 | 75 | To inquiry the polarity of a word, we can just call its own attribute 76 | ``polarity`` 77 | 78 | .. code:: python 79 | 80 | text = Text("The movie was really good.") 81 | 82 | .. 
code:: python 83 | 84 | print("{:<16}{}".format("Word", "Polarity")+"\n"+"-"*30) 85 | for w in text.words: 86 | print("{:<16}{:>2}".format(w, w.polarity)) 87 | 88 | 89 | .. parsed-literal:: 90 | 91 | Word Polarity 92 | ------------------------------ 93 | The 0 94 | movie 0 95 | was 0 96 | really 0 97 | good 1 98 | . 0 99 | 100 | 101 | Entity Sentiment 102 | ---------------- 103 | 104 | We can calculate a more sophisticated sentiment score for an entity that 105 | is mentioned in the text as follows: 106 | 107 | .. code:: python 108 | 109 | blob = ("Barack Obama gave a fantastic speech last night. " 110 | "Reports indicate he will move next to New Hampshire.") 111 | text = Text(blob) 112 | 113 | First, we need to split the text into sentences; this will limit the words 114 | that affect the sentiment of an entity to the words mentioned in the same 115 | sentence. 116 | 117 | .. code:: python 118 | 119 | first_sentence = text.sentences[0] 120 | print(first_sentence) 121 | 122 | 123 | .. parsed-literal:: 124 | 125 | Barack Obama gave a fantastic speech last night. 126 | 127 | 128 | Second, we extract the entities. 129 | 130 | .. code:: python 131 | 132 | first_entity = first_sentence.entities[0] 133 | print(first_entity) 134 | 135 | 136 | .. parsed-literal:: 137 | 138 | [u'Obama'] 139 | 140 | 141 | Finally, for each entity we identified, we can calculate the strength of 142 | the positive or negative sentiment it has on a scale from 0 to 1. 143 | 144 | .. code:: python 145 | 146 | first_entity.positive_sentiment 147 | 148 | 149 | 150 | 151 | .. parsed-literal:: 152 | 153 | 0.9375 154 | 155 | 156 | 157 | .. code:: python 158 | 159 | first_entity.negative_sentiment 160 | 161 | 162 | 163 | 164 | .. parsed-literal:: 165 | 166 | 0 167 | 168 | 169 | 170 | Citation 171 | ~~~~~~~~ 172 | 173 | This work is a direct implementation of the research described in 174 | the `Building sentiment lexicons for all major 175 | languages `__ paper. The author of 176 | this library strongly encourages you to cite the following paper if you 177 | are using this software. 178 | 179 | :: 180 | 181 | @inproceedings{chen2014building, 182 | title={Building sentiment lexicons for all major languages}, 183 | author={Chen, Yanqing and Skiena, Steven}, 184 | booktitle={Proceedings of the 52nd Annual Meeting of the Association for Computational Linguistics (Short Papers)}, 185 | pages={383--389}, 186 | year={2014}} 187 | -------------------------------------------------------------------------------- /docs/CLI.rst: -------------------------------------------------------------------------------- 1 | 2 | Command Line Interface 3 | ====================== 4 | 5 | The polyglot package offers a command line interface along with the library 6 | access. For each task in polyglot, there is a subcommand with specific 7 | options for that task. Common options are gathered under the main 8 | command ``polyglot``. 9 | 10 | .. code:: python 11 | 12 | !polyglot --help 13 | 14 | 15 | .. parsed-literal:: 16 | 17 | usage: polyglot [-h] [--lang LANG] [--delimiter DELIMITER] [--workers WORKERS] [-l LOG] [--debug] 18 | {detect,morph,tokenize,download,count,cat,ner,pos,transliteration,sentiment} ... 19 | 20 | optional arguments: 21 | -h, --help show this help message and exit 22 | --lang LANG Language to be processed 23 | --delimiter DELIMITER 24 | Delimiter that seperates documents, records or even sentences. 25 | --workers WORKERS Number of parallel processes. 26 | -l LOG, --log LOG log verbosity level 27 | --debug drop a debugger if an exception is raised.
28 | 29 | tools: 30 | multilingual tools for all languages 31 | 32 | {detect,morph,tokenize,download,count,cat,ner,pos,transliteration,sentiment} 33 | detect Detect the language(s) used in text. 34 | tokenize Tokenize text into sentences and words. 35 | download Download polyglot resources and models. 36 | count Count words frequency in a corpus. 37 | cat Print the contents of the input file to the screen. 38 | ner Named entity recognition chunking. 39 | pos Part of Speech tagger. 40 | transliteration Rewriting the input in the target language script. 41 | sentiment Classify text to positive and negative polarity. 42 | 43 | 44 | Notice that most of the operations are language specific. For example, 45 | tokenization rules and part of speech taggers differ between languages. 46 | Therefore, it is important that the language of the input is detected 47 | or given. The ``--lang`` option allows you to tell polyglot which 48 | language the input is written in. 49 | 50 | .. code:: python 51 | 52 | !polyglot --lang en tokenize --input testdata/cricket.txt | head -n 3 53 | 54 | 55 | .. parsed-literal:: 56 | 57 | Australia posted a World Cup record total of 417 - 6 as they beat Afghanistan by 275 runs . 58 | David Warner hit 178 off 133 balls , Steve Smith scored 95 while Glenn Maxwell struck 88 in 39 deliveries in the Pool A encounter in Perth . 59 | Afghanistan were then dismissed for 142 , with Mitchell Johnson and Mitchell Starc taking six wickets between them . 60 | 61 | 62 | In case the user did not supply the language code, polyglot will 63 | peek ahead and read the first 1KB of data to detect the language used in 64 | the input. 65 | 66 | .. code:: python 67 | 68 | !polyglot tokenize --input testdata/cricket.txt | head -n 3 69 | 70 | 71 | .. parsed-literal:: 72 | 73 | 2015-03-15 17:06:45 INFO __main__.py: 276 Language English is detected while reading the first 1128 bytes. 74 | Australia posted a World Cup record total of 417 - 6 as they beat Afghanistan by 275 runs . 75 | David Warner hit 178 off 133 balls , Steve Smith scored 95 while Glenn Maxwell struck 88 in 39 deliveries in the Pool A encounter in Perth . 76 | Afghanistan were then dismissed for 142 , with Mitchell Johnson and Mitchell Starc taking six wickets between them . 77 | 78 | 79 | Input formats 80 | ------------- 81 | 82 | Polyglot will process the input contents line by line assuming that the 83 | lines are separated by "``\n``\ ". If the file is formatted differently, 84 | you can use the polyglot main command option ``delimiter`` to specify 85 | any string other than "``\n``\ ". 86 | 87 | You can pass text to the polyglot subcommands in several ways: 88 | 89 | - **Standard input**: This is usually useful for building processing 90 | pipelines. 91 | 92 | - **Text file**: The file contents will be processed line by line. 93 | 94 | - **Collection of text files**: Polyglot will iterate over the files 95 | one by one. If the polyglot main command option ``workers`` is 96 | activated, the execution will be parallelized and each file will be 97 | processed by a different process. 98 | 99 | Word Count Example 100 | ------------------ 101 | 102 | This example will demonstrate how to use the polyglot main command 103 | options and the subcommand count to generate a count of the words 104 | appearing in a collection of text files. 105 | 106 | First, let us examine the subcommand ``count`` options. 107 | 108 | .. code:: python 109 | 110 | !polyglot count --help 111 | 112 | 113 | ..
parsed-literal:: 114 | 115 | usage: polyglot count [-h] [--min-count MIN_COUNT | --most-freq MOST_FREQ] [--input [INPUT [INPUT ...]]] 116 | 117 | optional arguments: 118 | -h, --help show this help message and exit 119 | --min-count MIN_COUNT 120 | Ignore all words that appear <= min_freq. 121 | --most-freq MOST_FREQ 122 | Consider only the most frequent k words. 123 | --input [INPUT [INPUT ...]] 124 | 125 | 126 | To avoid long output, we will restrict the count to the words that 127 | appeared at least twice. 128 | 129 | .. code:: python 130 | 131 | !polyglot count --input testdata/cricket.txt --min-count 2 132 | 133 | 134 | .. parsed-literal:: 135 | 136 | in 10 137 | the 6 138 | by 3 139 | and 3 140 | of 3 141 | Bermuda 2 142 | West 2 143 | Mitchell 2 144 | South 2 145 | Indies 2 146 | against 2 147 | beat 2 148 | as 2 149 | India 2 150 | which 2 151 | score 2 152 | Afghanistan 2 153 | 154 | 155 | Let us consider the scenario where we have hundreds of files that 156 | contain words we want to count. Notice that we can parallelize the 157 | process by passing a number higher than 1 to the polyglot main command 158 | option ``workers``. 159 | 160 | .. code:: python 161 | 162 | !polyglot --log debug --workers 5 count --input testdata/cricket.txt testdata/cricket.txt --min-count 3 163 | 164 | 165 | .. parsed-literal:: 166 | 167 | in 20 168 | the 12 169 | of 6 170 | by 6 171 | and 6 172 | West 4 173 | Afghanistan 4 174 | India 4 175 | beat 4 176 | which 4 177 | Indies 4 178 | Bermuda 4 179 | as 4 180 | South 4 181 | Mitchell 4 182 | against 4 183 | score 4 184 | 185 | 186 | Building Pipelines 187 | ------------------ 188 | 189 | The previous subcommand ``count`` assumed that the words are separated by 190 | spaces. Given that we never tokenized the text file, that may result in 191 | suboptimal word counting. Let us take a closer look at the tail of the 192 | word counts. 193 | 194 | .. code:: python 195 | 196 | !polyglot count --input testdata/cricket.txt | tail -n 10 197 | 198 | 199 | .. parsed-literal:: 200 | 201 | Ireland 1 202 | surpassed 1 203 | amount 1 204 | equalled 1 205 | a 1 206 | The 1 207 | 413-5 1 208 | Africa's 1 209 | tournament 1 210 | Johnson 1 211 | 212 | 213 | Observe that words like "2007." could have been considered two words 214 | "2007" and "." and the same for "Africa's". To fix this issue, we can 215 | use the polyglot subcommand tokenize to deal with these cases. We can 216 | stage the counting to happen after the tokenization using stdin to 217 | build a simple pipe. 218 | 219 | .. code:: python 220 | 221 | !polyglot --lang en tokenize --input testdata/cricket.txt | polyglot count --min-count 2 222 | 223 | 224 | .. parsed-literal:: 225 | 226 | in 10 227 | the 6 228 | . 6 229 | - 5 230 | , 4 231 | of 3 232 | and 3 233 | by 3 234 | South 2 235 | 5 2 236 | 2007 2 237 | Bermuda 2 238 | which 2 239 | score 2 240 | against 2 241 | Mitchell 2 242 | as 2 243 | West 2 244 | India 2 245 | beat 2 246 | Afghanistan 2 247 | 248 | 249 | 250 | Notice that the word "2007" now appears in the word counts list.
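The same pipeline pattern composes with the other subcommands as well. As a minimal sketch, assuming the English morphological analysis model has already been downloaded (``polyglot download morph2.en``, as described in the Morphological Analysis documentation), the tokenizer output can be piped into the ``morph`` subcommand:

.. code:: python

    !polyglot --lang en tokenize --input testdata/cricket.txt | polyglot --lang en morph | tail -n 30

Each line of the output pairs a token with its segmentation into morphemes; the Morphological Analysis documentation shows the full output of this command.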
251 | -------------------------------------------------------------------------------- /polyglot/mapping/base.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ Supports word embeddings.""" 5 | 6 | from io import open, StringIO 7 | from collections import Counter 8 | import os 9 | from concurrent.futures import ProcessPoolExecutor 10 | 11 | import six 12 | from six.moves import zip 13 | from six import iteritems 14 | from six import text_type as unicode 15 | from six import string_types 16 | 17 | from ..base import TextFile 18 | from ..utils import _open 19 | 20 | def count(lines): 21 | """ Counts the word frequences in a list of sentences. 22 | 23 | Note: 24 | This is a helper function for parallel execution of `Vocabulary.from_text` 25 | method. 26 | """ 27 | words = [w for l in lines for w in l.strip().split()] 28 | return Counter(words) 29 | 30 | 31 | class VocabularyBase(object): 32 | """ A set of words/tokens that have consistent IDs. 33 | 34 | Note: 35 | Words will be sorted according to their lexicographic order. 36 | 37 | Attributes: 38 | word_id (dictionary): Mapping from words to IDs. 39 | id_word (dictionary): A reverse map of `word_id`. 40 | """ 41 | 42 | def __init__(self, words=None): 43 | """ Build attributes word_id and id_word from input. 44 | 45 | Args: 46 | words (list/set): list or set of words. 47 | """ 48 | words = self.sanitize_words(words) 49 | self.word_id = {w:i for i, w in enumerate(sorted(words))} 50 | self.id_word = {i:w for w,i in iteritems(self.word_id)} 51 | 52 | def sanitize_words(self, words): 53 | """Guarantees that all textual symbols are unicode. 54 | 55 | Note: 56 | We do not convert numbers, only strings to unicode. 57 | We assume that the strings are encoded in utf-8. 58 | """ 59 | _words = [] 60 | for w in words: 61 | if isinstance(w, string_types) and not isinstance(w, unicode): 62 | _words.append(unicode(w, encoding="utf-8")) 63 | else: 64 | _words.append(w) 65 | return _words 66 | 67 | def __iter__(self): 68 | """Iterate over the words in a vocabulary.""" 69 | for w,i in sorted(iteritems(self.word_id), key=lambda wc: wc[1]): 70 | yield w 71 | 72 | @property 73 | def words(self): 74 | """ Ordered list of words according to their IDs.""" 75 | return list(self) 76 | 77 | def __unicode__(self): 78 | return u"\n".join(self.words) 79 | 80 | def __str__(self): 81 | if six.PY3: 82 | return self.__unicode__() 83 | return self.__unicode__().encode("utf-8") 84 | 85 | def __getitem__(self, key): 86 | if isinstance(key, string_types) and not isinstance(key, unicode): 87 | key = unicode(key, encoding="utf-8") 88 | return self.word_id[key] 89 | 90 | def __contains__(self, key): 91 | return key in self.word_id 92 | 93 | def __delitem__(self, key): 94 | """Delete a word from vocabulary. 95 | 96 | Note: 97 | To maintain consecutive IDs, this operation implemented 98 | with a complexity of \\theta(n). 99 | """ 100 | del self.word_id[key] 101 | self.id_word = dict(enumerate(self.words)) 102 | self.word_id = {w:i for i,w in iteritems(self.id_word)} 103 | 104 | def __len__(self): 105 | return len(self.word_id) 106 | 107 | def get(self, k, default=None): 108 | try: 109 | return self[k] 110 | except KeyError as e: 111 | return default 112 | 113 | def getstate(self): 114 | return list(self.words) 115 | 116 | @classmethod 117 | def from_vocabfile(cls, filename): 118 | """ Construct a CountedVocabulary out of a vocabulary file. 
119 | 120 | Note: 121 | File has the following format word1 122 | word2 123 | """ 124 | words = [x.strip() for x in _open(filename, 'r').read().splitlines()] 125 | return cls(words=words) 126 | 127 | 128 | class OrderedVocabulary(VocabularyBase): 129 | """ An ordered list of words/tokens according to their frequency. 130 | 131 | Note: 132 | The words order is assumed to be sorted according to the word frequency. 133 | Most frequent words appear first in the list. 134 | 135 | Attributes: 136 | word_id (dictionary): Mapping from words to IDs. 137 | id_word (dictionary): A reverse map of `word_id`. 138 | """ 139 | 140 | def __init__(self, words=None): 141 | """ Build attributes word_id and id_word from input. 142 | 143 | Args: 144 | words (list): list of sorted words according to frequency. 145 | """ 146 | 147 | words = self.sanitize_words(words) 148 | self.word_id = {w:i for i, w in enumerate(words)} 149 | self.id_word = {i:w for w,i in iteritems(self.word_id)} 150 | 151 | 152 | def most_frequent(self, k): 153 | """ Returns a vocabulary with the most frequent `k` words. 154 | 155 | Args: 156 | k (integer): specifies the top k most frequent words to be returned. 157 | """ 158 | return OrderedVocabulary(words=self.words[:k]) 159 | 160 | 161 | class CountedVocabulary(OrderedVocabulary): 162 | """ List of words and counts sorted according to word count. 163 | """ 164 | 165 | def __init__(self, word_count=None): 166 | """ Build attributes word_id and id_word from input. 167 | 168 | Args: 169 | word_count (dictionary): A dictionary of the type word:count or 170 | list of tuples of the type (word, count). 171 | """ 172 | 173 | if isinstance(word_count, dict): 174 | word_count = iteritems(word_count) 175 | sorted_counts = list(sorted(word_count, key=lambda wc: wc[1], reverse=True)) 176 | words = [w for w,c in sorted_counts] 177 | super(CountedVocabulary, self).__init__(words=words) 178 | self.word_count = dict(sorted_counts) 179 | 180 | @staticmethod 181 | def from_textfiles(files, workers=1, job_size=1000): 182 | c = Counter() 183 | if workers == 1: 184 | for lines in files.iter_chunks(job_size): 185 | c.update(count(lines)) 186 | else: 187 | with ProcessPoolExecutor(max_workers=workers) as executor: 188 | for counter_ in executor.map(CountedVocabulary.from_textfile, files.names): 189 | c.update(Counter(counter_.word_count)) 190 | return CountedVocabulary(word_count=c) 191 | 192 | @classmethod 193 | def from_textfile(cls, textfile, workers=1, job_size=1000): 194 | """ Count the set of words appeared in a text file. 195 | 196 | Args: 197 | textfile (string): The name of the text file or `TextFile` object. 198 | min_count (integer): Minimum number of times a word/token appeared in the document 199 | to be considered part of the vocabulary. 200 | workers (integer): Number of parallel workers to read the file simulatenously. 201 | job_size (integer): Size of the batch send to each worker. 202 | most_frequent (integer): if no min_count is specified, consider the most frequent k words for the vocabulary. 203 | 204 | Returns: 205 | A vocabulary of the most frequent words appeared in the document. 206 | """ 207 | 208 | c = Counter() 209 | if isinstance(textfile, string_types): 210 | textfile = TextFile(textfile) 211 | for result in textfile.apply(count, workers, job_size): 212 | c.update(result) 213 | return CountedVocabulary(word_count=c) 214 | 215 | def most_frequent(self, k): 216 | """ Returns a vocabulary with the most frequent `k` words. 
217 | 218 | Args: 219 | k (integer): specifies the top k most frequent words to be returned. 220 | """ 221 | word_count = {w:self.word_count[w] for w in self.words[:k]} 222 | return CountedVocabulary(word_count=word_count) 223 | 224 | def min_count(self, n=1): 225 | """ Returns a vocabulary after eliminating the words that appear < `n`. 226 | 227 | Args: 228 | n (integer): specifies the minimum word frequency allowed. 229 | """ 230 | word_count = {w:c for w,c in iteritems(self.word_count) if c >= n} 231 | return CountedVocabulary(word_count=word_count) 232 | 233 | def __unicode__(self): 234 | return u"\n".join([u"{}\t{}".format(w,self.word_count[w]) for w in self.words]) 235 | 236 | def __delitem__(self, key): 237 | super(CountedVocabulary, self).__delitem__(key) 238 | self.word_count = {w:self.word_count[w] for w in self} 239 | 240 | def getstate(self): 241 | words = list(self.words) 242 | counts = [self.word_count[w] for w in words] 243 | return (words, counts) 244 | 245 | @staticmethod 246 | def from_vocabfile(filename): 247 | """ Construct a CountedVocabulary out of a vocabulary file. 248 | 249 | Note: 250 | File has the following format word1 count1 251 | word2 count2 252 | """ 253 | word_count = [x.strip().split() for x in _open(filename, 'r').read().splitlines()] 254 | word_count = {w:int(c) for w,c in word_count} 255 | return CountedVocabulary(word_count=word_count) 256 | -------------------------------------------------------------------------------- /notebooks/Transliteration.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Transliteration" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "Transliteration is the conversion of a text from one script to another.\n", 15 | "For instance, a Latin transliteration of the Greek phrase \"Ελληνική Δημοκρατία\", usually translated as 'Hellenic Republic', is \"Ellēnikḗ Dēmokratía\"." 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": 1, 21 | "metadata": { 22 | "collapsed": false 23 | }, 24 | "outputs": [], 25 | "source": [ 26 | "from polyglot.transliteration import Transliterator" 27 | ] 28 | }, 29 | { 30 | "cell_type": "markdown", 31 | "metadata": {}, 32 | "source": [ 33 | "## Languages Coverage" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": 2, 39 | "metadata": { 40 | "collapsed": false 41 | }, 42 | "outputs": [ 43 | { 44 | "name": "stdout", 45 | "output_type": "stream", 46 | "text": [ 47 | " 1. Haitian; Haitian Creole 2. Tamil 3. Vietnamese \n", 48 | " 4. Telugu 5. Croatian 6. Hungarian \n", 49 | " 7. Thai 8. Kannada 9. Tagalog \n", 50 | " 10. Armenian 11. Hebrew (modern) 12. Turkish \n", 51 | " 13. Portuguese 14. Belarusian 15. Norwegian Nynorsk \n", 52 | " 16. Norwegian 17. Dutch 18. Japanese \n", 53 | " 19. Albanian 20. Bulgarian 21. Serbian \n", 54 | " 22. Swahili 23. Swedish 24. French \n", 55 | " 25. Latin 26. Czech 27. Yiddish \n", 56 | " 28. Hindi 29. Danish 30. Finnish \n", 57 | " 31. German 32. Bosnian-Croatian-Serbian 33. Slovak \n", 58 | " 34. Persian 35. Lithuanian 36. Slovene \n", 59 | " 37. Latvian 38. Bosnian 39. Gujarati \n", 60 | " 40. Italian 41. Icelandic 42. Spanish; Castilian \n", 61 | " 43. Ukrainian 44. Georgian 45. Urdu \n", 62 | " 46. Indonesian 47. Marathi (Marāṭhī) 48. Korean \n", 63 | " 49. Galician 50. Khmer 51. Catalan; Valencian \n", 64 | " 52. Romanian, Moldavian, ... 53. Basque 54. 
Macedonian \n", 65 | " 55. Russian 56. Azerbaijani 57. Chinese \n", 66 | " 58. Estonian 59. Welsh 60. Arabic \n", 67 | " 61. Bengali 62. Amharic 63. Irish \n", 68 | " 64. Malay 65. Afrikaans 66. Polish \n", 69 | " 67. Greek, Modern 68. Esperanto 69. Maltese \n", 70 | "\n" 71 | ] 72 | } 73 | ], 74 | "source": [ 75 | "from polyglot.downloader import downloader\n", 76 | "print(downloader.supported_languages_table(\"transliteration2\"))" 77 | ] 78 | }, 79 | { 80 | "cell_type": "markdown", 81 | "metadata": {}, 82 | "source": [ 83 | "#### Downloading Necessary Models" 84 | ] 85 | }, 86 | { 87 | "cell_type": "code", 88 | "execution_count": 3, 89 | "metadata": { 90 | "collapsed": false 91 | }, 92 | "outputs": [ 93 | { 94 | "name": "stdout", 95 | "output_type": "stream", 96 | "text": [ 97 | "[polyglot_data] Downloading package embeddings2.en to\n", 98 | "[polyglot_data] /home/rmyeid/polyglot_data...\n", 99 | "[polyglot_data] Package embeddings2.en is already up-to-date!\n", 100 | "[polyglot_data] Downloading package pos2.en to\n", 101 | "[polyglot_data] /home/rmyeid/polyglot_data...\n", 102 | "[polyglot_data] Package pos2.en is already up-to-date!\n" 103 | ] 104 | } 105 | ], 106 | "source": [ 107 | "%%bash\n", 108 | "polyglot download embeddings2.en pos2.en" 109 | ] 110 | }, 111 | { 112 | "cell_type": "markdown", 113 | "metadata": {}, 114 | "source": [ 115 | "## Example" 116 | ] 117 | }, 118 | { 119 | "cell_type": "markdown", 120 | "metadata": {}, 121 | "source": [ 122 | "We tag each word in the text with one part of speech." 123 | ] 124 | }, 125 | { 126 | "cell_type": "code", 127 | "execution_count": 7, 128 | "metadata": { 129 | "collapsed": false 130 | }, 131 | "outputs": [], 132 | "source": [ 133 | "from polyglot.text import Text" 134 | ] 135 | }, 136 | { 137 | "cell_type": "code", 138 | "execution_count": 8, 139 | "metadata": { 140 | "collapsed": true 141 | }, 142 | "outputs": [], 143 | "source": [ 144 | "blob = \"\"\"We will meet at eight o'clock on Thursday morning.\"\"\"\n", 145 | "text = Text(blob)" 146 | ] 147 | }, 148 | { 149 | "cell_type": "markdown", 150 | "metadata": {}, 151 | "source": [ 152 | "We can query all the tagged words" 153 | ] 154 | }, 155 | { 156 | "cell_type": "code", 157 | "execution_count": 9, 158 | "metadata": { 159 | "collapsed": false 160 | }, 161 | "outputs": [ 162 | { 163 | "name": "stdout", 164 | "output_type": "stream", 165 | "text": [ 166 | "وي\n", 167 | "ويل\n", 168 | "ميت\n", 169 | "ات\n", 170 | "ييايت\n", 171 | "أوكلوك\n", 172 | "ون\n", 173 | "ثورسداي\n", 174 | "مورنينغ\n", 175 | "\n" 176 | ] 177 | } 178 | ], 179 | "source": [ 180 | "for x in text.transliterate(\"ar\"):\n", 181 | " print(x)" 182 | ] 183 | }, 184 | { 185 | "cell_type": "markdown", 186 | "metadata": {}, 187 | "source": [ 188 | "### Command Line Interface" 189 | ] 190 | }, 191 | { 192 | "cell_type": "code", 193 | "execution_count": 20, 194 | "metadata": { 195 | "collapsed": false 196 | }, 197 | "outputs": [ 198 | { 199 | "name": "stdout", 200 | "output_type": "stream", 201 | "text": [ 202 | "which ويكه \r\n", 203 | "India ينديا \r\n", 204 | "beat بيت \r\n", 205 | "Bermuda بيرمودا \r\n", 206 | "in ين \r\n", 207 | "Port بورت \r\n", 208 | "of وف \r\n", 209 | "Spain سباين \r\n", 210 | "in ين \r\n", 211 | "2007 \r\n", 212 | ", \r\n", 213 | "which ويكه \r\n", 214 | "was واس \r\n", 215 | "equalled يكالليد \r\n", 216 | "five فيفي \r\n", 217 | "days دايس \r\n", 218 | "ago اغو \r\n", 219 | "by بي \r\n", 220 | "South سووث \r\n", 221 | "Africa افريكا \r\n", 222 | "in ين \r\n", 223 | "their ثير \r\n", 224 | 
"victory فيكتوري \r\n", 225 | "over وفير \r\n", 226 | "West ويست \r\n", 227 | "Indies يندييس \r\n", 228 | "in ين \r\n", 229 | "Sydney سيدني \r\n", 230 | ". \r\n", 231 | "\r\n" 232 | ] 233 | } 234 | ], 235 | "source": [ 236 | "!polyglot --lang en tokenize --input testdata/cricket.txt | polyglot --lang en transliteration --target ar | tail -n 30" 237 | ] 238 | } 239 | ], 240 | "metadata": { 241 | "kernelspec": { 242 | "display_name": "Python 2", 243 | "language": "python", 244 | "name": "python2" 245 | }, 246 | "language_info": { 247 | "codemirror_mode": { 248 | "name": "ipython", 249 | "version": 2 250 | }, 251 | "file_extension": ".py", 252 | "mimetype": "text/x-python", 253 | "name": "python", 254 | "nbconvert_exporter": "python", 255 | "pygments_lexer": "ipython2", 256 | "version": "2.7.6" 257 | } 258 | }, 259 | "nbformat": 4, 260 | "nbformat_minor": 0 261 | } 262 | -------------------------------------------------------------------------------- /notebooks/Tokenization.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Tokenization" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "Toeknization is the process that identifies the text boundaries of words and sentences.\n", 15 | "We can identify the boundaries of sentences first then tokenize each sentence to identify the words that compose the sentence.\n", 16 | "Of course, we can do word tokenization first and then segment the token sequence into sentneces.\n", 17 | "Tokenization in polyglot relies on the [Unicode Text Segmentation](http://www.unicode.org/reports/tr29/) algorithm as implemented by the [ICU Project](http://site.icu-project.org/).\n", 18 | "\n", 19 | "You can use C/C++ ICU library by installing the required package `libicu-dev`. For example, on ubuntu/debian systems you should use `apt-get` utility as the following:" 20 | ] 21 | }, 22 | { 23 | "cell_type": "code", 24 | "execution_count": null, 25 | "metadata": { 26 | "collapsed": true 27 | }, 28 | "outputs": [], 29 | "source": [ 30 | "sudo apt-get install libicu-dev" 31 | ] 32 | }, 33 | { 34 | "cell_type": "code", 35 | "execution_count": 4, 36 | "metadata": { 37 | "collapsed": false 38 | }, 39 | "outputs": [], 40 | "source": [ 41 | "from polyglot.text import Text" 42 | ] 43 | }, 44 | { 45 | "cell_type": "markdown", 46 | "metadata": {}, 47 | "source": [ 48 | "## Word Tokenization" 49 | ] 50 | }, 51 | { 52 | "cell_type": "markdown", 53 | "metadata": {}, 54 | "source": [ 55 | "To call our word tokenizer, first we need to construct a Text object." 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": 9, 61 | "metadata": { 62 | "collapsed": true 63 | }, 64 | "outputs": [], 65 | "source": [ 66 | "blob = u\"\"\"\n", 67 | "两个月前遭受恐怖袭击的法国巴黎的犹太超市在装修之后周日重新开放,法国内政部长以及超市的管理者都表示,这显示了生命力要比野蛮行为更强大。\n", 68 | "该超市1月9日遭受枪手袭击,导致4人死亡,据悉这起事件与法国《查理周刊》杂志社恐怖袭击案有关。\n", 69 | "\"\"\"\n", 70 | "text = Text(blob)" 71 | ] 72 | }, 73 | { 74 | "cell_type": "markdown", 75 | "metadata": {}, 76 | "source": [ 77 | "The property words will call the word tokenizer." 
78 | ] 79 | }, 80 | { 81 | "cell_type": "code", 82 | "execution_count": 10, 83 | "metadata": { 84 | "collapsed": false 85 | }, 86 | "outputs": [ 87 | { 88 | "data": { 89 | "text/plain": [ 90 | "WordList(['两', '个', '月', '前', '遭受', '恐怖', '袭击', '的', '法国', '巴黎', '的', '犹太', '超市', '在', '装修', '之后', '周日', '重新', '开放', ',', '法国', '内政', '部长', '以及', '超市', '的', '管理者', '都', '表示', ',', '这', '显示', '了', '生命力', '要', '比', '野蛮', '行为', '更', '强大', '。', '该', '超市', '1', '月', '9', '日', '遭受', '枪手', '袭击', ',', '导致', '4', '人', '死亡', ',', '据悉', '这', '起', '事件', '与', '法国', '《', '查理', '周刊', '》', '杂志', '社', '恐怖', '袭击', '案', '有关', '。'])" 91 | ] 92 | }, 93 | "execution_count": 10, 94 | "metadata": {}, 95 | "output_type": "execute_result" 96 | } 97 | ], 98 | "source": [ 99 | "text.words" 100 | ] 101 | }, 102 | { 103 | "cell_type": "markdown", 104 | "metadata": {}, 105 | "source": [ 106 | "Since ICU boundary break algorithms are language aware, polyglot will detect the language used first before calling the tokenizer" 107 | ] 108 | }, 109 | { 110 | "cell_type": "code", 111 | "execution_count": 26, 112 | "metadata": { 113 | "collapsed": false 114 | }, 115 | "outputs": [ 116 | { 117 | "name": "stdout", 118 | "output_type": "stream", 119 | "text": [ 120 | "name: code: zh confidence: 99.0 read bytes: 1920\n" 121 | ] 122 | } 123 | ], 124 | "source": [ 125 | "print(text.language)" 126 | ] 127 | }, 128 | { 129 | "cell_type": "markdown", 130 | "metadata": { 131 | "collapsed": true 132 | }, 133 | "source": [ 134 | "## Sentence Segementation" 135 | ] 136 | }, 137 | { 138 | "cell_type": "markdown", 139 | "metadata": {}, 140 | "source": [ 141 | "If we are interested in segmenting the text first into sentences, we can query the `sentences` property" 142 | ] 143 | }, 144 | { 145 | "cell_type": "code", 146 | "execution_count": 20, 147 | "metadata": { 148 | "collapsed": false 149 | }, 150 | "outputs": [ 151 | { 152 | "data": { 153 | "text/plain": [ 154 | "[Sentence(\"两个月前遭受恐怖袭击的法国巴黎的犹太超市在装修之后周日重新开放,法国内政部长以及超市的管理者都表示,这显示了生命力要比野蛮行为更强大。\"),\n", 155 | " Sentence(\"该超市1月9日遭受枪手袭击,导致4人死亡,据悉这起事件与法国《查理周刊》杂志社恐怖袭击案有关。\")]" 156 | ] 157 | }, 158 | "execution_count": 20, 159 | "metadata": {}, 160 | "output_type": "execute_result" 161 | } 162 | ], 163 | "source": [ 164 | "text.sentences" 165 | ] 166 | }, 167 | { 168 | "cell_type": "markdown", 169 | "metadata": {}, 170 | "source": [ 171 | "`Sentence` class inherits `Text`, therefore, we can tokenize each sentence into words using the same property `words`" 172 | ] 173 | }, 174 | { 175 | "cell_type": "code", 176 | "execution_count": 21, 177 | "metadata": { 178 | "collapsed": false 179 | }, 180 | "outputs": [ 181 | { 182 | "data": { 183 | "text/plain": [ 184 | "WordList(['两', '个', '月', '前', '遭受', '恐怖', '袭击', '的', '法国', '巴黎', '的', '犹太', '超市', '在', '装修', '之后', '周日', '重新', '开放', ',', '法国', '内政', '部长', '以及', '超市', '的', '管理者', '都', '表示', ',', '这', '显示', '了', '生命力', '要', '比', '野蛮', '行为', '更', '强大', '。'])" 185 | ] 186 | }, 187 | "execution_count": 21, 188 | "metadata": {}, 189 | "output_type": "execute_result" 190 | } 191 | ], 192 | "source": [ 193 | "first_sentence = text.sentences[0]\n", 194 | "first_sentence.words" 195 | ] 196 | }, 197 | { 198 | "cell_type": "markdown", 199 | "metadata": {}, 200 | "source": [ 201 | "## Command Line" 202 | ] 203 | }, 204 | { 205 | "cell_type": "markdown", 206 | "metadata": {}, 207 | "source": [ 208 | "The subcommand tokenize does by default sentence segmentation and word tokenization." 
209 | ] 210 | }, 211 | { 212 | "cell_type": "code", 213 | "execution_count": 4, 214 | "metadata": { 215 | "collapsed": false 216 | }, 217 | "outputs": [ 218 | { 219 | "name": "stdout", 220 | "output_type": "stream", 221 | "text": [ 222 | "usage: polyglot tokenize [-h] [--only-sent | --only-word] [--input [INPUT [INPUT ...]]]\r\n", 223 | "\r\n", 224 | "optional arguments:\r\n", 225 | " -h, --help show this help message and exit\r\n", 226 | " --only-sent Segment sentences without word tokenization\r\n", 227 | " --only-word Tokenize words without sentence segmentation\r\n", 228 | " --input [INPUT [INPUT ...]]\r\n" 229 | ] 230 | } 231 | ], 232 | "source": [ 233 | "! polyglot tokenize --help" 234 | ] 235 | }, 236 | { 237 | "cell_type": "markdown", 238 | "metadata": {}, 239 | "source": [ 240 | "Each line represents a sentence where the words are split by spaces." 241 | ] 242 | }, 243 | { 244 | "cell_type": "code", 245 | "execution_count": 25, 246 | "metadata": { 247 | "collapsed": false 248 | }, 249 | "outputs": [ 250 | { 251 | "name": "stdout", 252 | "output_type": "stream", 253 | "text": [ 254 | "Australia posted a World Cup record total of 417 - 6 as they beat Afghanistan by 275 runs .\r\n", 255 | "David Warner hit 178 off 133 balls , Steve Smith scored 95 while Glenn Maxwell struck 88 in 39 deliveries in the Pool A encounter in Perth .\r\n", 256 | "Afghanistan were then dismissed for 142 , with Mitchell Johnson and Mitchell Starc taking six wickets between them .\r\n", 257 | "Australia's score surpassed the 413 - 5 India made against Bermuda in 2007 .\r\n", 258 | "It continues the pattern of bat dominating ball in this tournament as the third 400 plus score achieved in the pool stages , following South Africa's 408 - 5 and 411 - 4 against West Indies and Ireland respectively .\r\n", 259 | "The winning margin beats the 257 - run amount by which India beat Bermuda in Port of Spain in 2007 , which was equalled five days ago by South Africa in their victory over West Indies in Sydney .\r\n" 260 | ] 261 | } 262 | ], 263 | "source": [ 264 | "!polyglot --lang en tokenize --input testdata/cricket.txt" 265 | ] 266 | }, 267 | { 268 | "cell_type": "markdown", 269 | "metadata": {}, 270 | "source": [ 271 | "### References\n", 272 | "\n", 273 | "- [Unicode Text Segmentation Algorithm](http://www.unicode.org/reports/tr29/)\n", 274 | "- [Unicode Line Breaking Algorithm](http://www.unicode.org/reports/tr14/)\n", 275 | "- [Boundary Analysis](http://userguide.icu-project.org/boundaryanalysis)\n", 276 | "- [ICU Homepage](http://site.icu-project.org/)\n", 277 | "- [Python Wrapper for libicu](https://pypi.python.org/pypi/PyICU)" 278 | ] 279 | } 280 | ], 281 | "metadata": { 282 | "kernelspec": { 283 | "display_name": "Python 3", 284 | "language": "python", 285 | "name": "python3" 286 | }, 287 | "language_info": { 288 | "codemirror_mode": { 289 | "name": "ipython", 290 | "version": 3 291 | }, 292 | "file_extension": ".py", 293 | "mimetype": "text/x-python", 294 | "name": "python", 295 | "nbconvert_exporter": "python", 296 | "pygments_lexer": "ipython3", 297 | "version": "3.4.0" 298 | } 299 | }, 300 | "nbformat": 4, 301 | "nbformat_minor": 0 302 | } 303 | -------------------------------------------------------------------------------- /docs/MorphologicalAnalysis.rst: -------------------------------------------------------------------------------- 1 | 2 | Morphological Analysis 3 | ====================== 4 | 5 | Polyglot offers trained `morfessor 6 | models `__ to generate 7 | morphemes from words. 
The goal of the Morpho project is to develop 8 | unsupervised data-driven methods that discover the regularities behind 9 | word forming in natural languages. In particular, Morpho project is 10 | focussing on the discovery of morphemes, which are the primitive units 11 | of syntax, the smallest individually meaningful elements in the 12 | utterances of a language. Morphemes are important in automatic 13 | generation and recognition of a language, especially in languages in 14 | which words may have many different inflected forms. 15 | 16 | Languages Coverage 17 | ------------------ 18 | 19 | Using polyglot vocabulary dictionaries, we trained morfessor models on 20 | the most frequent words 50,000 words of each language. 21 | 22 | .. code:: python 23 | 24 | from polyglot.downloader import downloader 25 | print(downloader.supported_languages_table("morph2")) 26 | 27 | 28 | .. parsed-literal:: 29 | 30 | 1. Piedmontese language 2. Lombard language 3. Gan Chinese 31 | 4. Sicilian 5. Scots 6. Kirghiz, Kyrgyz 32 | 7. Pashto, Pushto 8. Kurdish 9. Portuguese 33 | 10. Kannada 11. Korean 12. Khmer 34 | 13. Kazakh 14. Ilokano 15. Polish 35 | 16. Panjabi, Punjabi 17. Georgian 18. Chuvash 36 | 19. Alemannic 20. Czech 21. Welsh 37 | 22. Chechen 23. Catalan; Valencian 24. Northern Sami 38 | 25. Sanskrit (Saṁskṛta) 26. Slovene 27. Javanese 39 | 28. Slovak 29. Bosnian-Croatian-Serbian 30. Bavarian 40 | 31. Swedish 32. Swahili 33. Sundanese 41 | 34. Serbian 35. Albanian 36. Japanese 42 | 37. Western Frisian 38. French 39. Finnish 43 | 40. Upper Sorbian 41. Faroese 42. Persian 44 | 43. Sinhala, Sinhalese 44. Italian 45. Amharic 45 | 46. Aragonese 47. Volapük 48. Icelandic 46 | 49. Sakha 50. Afrikaans 51. Indonesian 47 | 52. Interlingua 53. Azerbaijani 54. Ido 48 | 55. Arabic 56. Assamese 57. Yoruba 49 | 58. Yiddish 59. Waray-Waray 60. Croatian 50 | 61. Hungarian 62. Haitian; Haitian Creole 63. Quechua 51 | 64. Armenian 65. Hebrew (modern) 66. Silesian 52 | 67. Hindi 68. Divehi; Dhivehi; Mald... 69. German 53 | 70. Danish 71. Occitan 72. Tagalog 54 | 73. Turkmen 74. Thai 75. Tajik 55 | 76. Greek, Modern 77. Telugu 78. Tamil 56 | 79. Oriya 80. Ossetian, Ossetic 81. Tatar 57 | 82. Turkish 83. Kapampangan 84. Venetian 58 | 85. Manx 86. Gujarati 87. Galician 59 | 88. Irish 89. Scottish Gaelic; Gaelic 90. Nepali 60 | 91. Cebuano 92. Zazaki 93. Walloon 61 | 94. Dutch 95. Norwegian 96. Norwegian Nynorsk 62 | 97. West Flemish 98. Chinese 99. Bosnian 63 | 100. Breton 101. Belarusian 102. Bulgarian 64 | 103. Bashkir 104. Egyptian Arabic 105. Tibetan Standard, Tib... 65 | 106. Bengali 107. Burmese 108. Romansh 66 | 109. Marathi (Marāṭhī) 110. Malay 111. Maltese 67 | 112. Russian 113. Macedonian 114. Malayalam 68 | 115. Mongolian 116. Malagasy 117. Vietnamese 69 | 118. Spanish; Castilian 119. Estonian 120. Basque 70 | 121. Bishnupriya Manipuri 122. Asturian 123. English 71 | 124. Esperanto 125. Luxembourgish, Letzeb... 126. Latin 72 | 127. Uighur, Uyghur 128. Ukrainian 129. Limburgish, Limburgan... 73 | 130. Latvian 131. Urdu 132. Lithuanian 74 | 133. Fiji Hindi 134. Uzbek 135. Romanian, Moldavian, ... 75 | 76 | 77 | 78 | Download Necessary Models 79 | ^^^^^^^^^^^^^^^^^^^^^^^^^ 80 | 81 | .. code:: python 82 | 83 | %%bash 84 | polyglot download morph2.en morph2.ar 85 | 86 | 87 | .. parsed-literal:: 88 | 89 | [polyglot_data] Downloading package morph2.en to 90 | [polyglot_data] /home/rmyeid/polyglot_data... 91 | [polyglot_data] Package morph2.en is already up-to-date! 
92 | [polyglot_data] Downloading package morph2.ar to 93 | [polyglot_data] /home/rmyeid/polyglot_data... 94 | [polyglot_data] Package morph2.ar is already up-to-date! 95 | 96 | 97 | Example 98 | ------- 99 | 100 | Word Segmentation 101 | ~~~~~~~~~~~~~~~~~ 102 | 103 | .. code:: python 104 | 105 | from polyglot.text import Text, Word 106 | 107 | .. code:: python 108 | 109 | words = ["preprocessing", "processor", "invaluable", "thankful", "crossed"] 110 | for w in words: 111 | w = Word(w, language="en") 112 | print("{:<20}{}".format(w, w.morphemes)) 113 | 114 | 115 | .. parsed-literal:: 116 | 117 | preprocessing ['pre', 'process', 'ing'] 118 | processor ['process', 'or'] 119 | invaluable ['in', 'valuable'] 120 | thankful ['thank', 'ful'] 121 | crossed ['cross', 'ed'] 122 | 123 | 124 | Sentence Segmentation 125 | ~~~~~~~~~~~~~~~~~~~~~ 126 | 127 | If the text is not tokenized properly, morphological analysis could 128 | offer a smart of way of splitting the text into its original units. 129 | Here, is an example: 130 | 131 | .. code:: python 132 | 133 | blob = "Wewillmeettoday." 134 | text = Text(blob) 135 | text.language = "en" 136 | 137 | .. code:: python 138 | 139 | text.morphemes 140 | 141 | 142 | 143 | 144 | .. parsed-literal:: 145 | 146 | WordList([u'We', u'will', u'meet', u'to', u'day', u'.']) 147 | 148 | 149 | 150 | Command Line Interface 151 | ~~~~~~~~~~~~~~~~~~~~~~ 152 | 153 | .. code:: python 154 | 155 | !polyglot --lang en tokenize --input testdata/cricket.txt | polyglot --lang en morph | tail -n 30 156 | 157 | 158 | .. parsed-literal:: 159 | 160 | which which 161 | India In_dia 162 | beat beat 163 | Bermuda Ber_mud_a 164 | in in 165 | Port Port 166 | of of 167 | Spain Spa_in 168 | in in 169 | 2007 2007 170 | , , 171 | which which 172 | was wa_s 173 | equalled equal_led 174 | five five 175 | days day_s 176 | ago ago 177 | by by 178 | South South 179 | Africa Africa 180 | in in 181 | their t_heir 182 | victory victor_y 183 | over over 184 | West West 185 | Indies In_dies 186 | in in 187 | Sydney Syd_ney 188 | . . 189 | 190 | 191 | 192 | Demo 193 | ---- 194 | 195 | This demo does not reflect the models supplied by polyglot, however, we 196 | think it is indicative of what you should expect from morfessor 197 | 198 | `Demo `__ 199 | 200 | Citation 201 | ~~~~~~~~ 202 | 203 | This is an interface to the implementation being described in the 204 | `Morfessor2.0: Python Implementation and Extensions for Morfessor 205 | Baseline `__ 206 | technical report. 207 | 208 | :: 209 | 210 | @InProceedings{morfessor2, 211 | title:{Morfessor 2.0: Python Implementation and Extensions for Morfessor Baseline}, 212 | author: {Virpioja, Sami ; Smit, Peter ; Grönroos, Stig-Arne ; Kurimo, Mikko}, 213 | year: {2013}, 214 | publisher: {Department of Signal Processing and Acoustics, Aalto University}, 215 | booktitle:{Aalto University publication series} 216 | } 217 | 218 | References 219 | ---------- 220 | 221 | - `Morpho project `__ 222 | - `Background information on morpheme 223 | discovery `__. 
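For completeness, here is a rough pure-Python equivalent of the
command-line pipeline shown earlier. It is an illustrative sketch, not
the CLI's actual implementation; it assumes ``testdata/cricket.txt`` is
present and that the English ``morph2`` model has already been
downloaded, and it only uses the ``Text`` and ``morphemes`` APIs
demonstrated above.

.. code:: python

    from polyglot.text import Text

    # Roughly mirrors:
    #   polyglot --lang en tokenize --input testdata/cricket.txt | polyglot --lang en morph
    with open("testdata/cricket.txt") as f:
        blob = f.read()

    text = Text(blob)
    text.language = "en"

    for word in text.words:
        # Join each word's morphemes with "_", the same layout the CLI prints.
        print("{:<20}{}".format(word, "_".join(word.morphemes)))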
224 | -------------------------------------------------------------------------------- /polyglot/mapping/embeddings.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """Defines classes related to mapping vocabulary to n-dimensional points.""" 5 | 6 | from io import open 7 | import logging 8 | from os import path 9 | import tarfile 10 | 11 | import numpy as np 12 | from numpy import float32 13 | 14 | from six import PY2 15 | from six import text_type as unicode 16 | from six import iteritems 17 | from six.moves import map 18 | from six import string_types 19 | from six.moves import cPickle as pickle 20 | 21 | from .base import CountedVocabulary, OrderedVocabulary 22 | from ..utils import _open 23 | 24 | 25 | logger = logging.getLogger(__name__) 26 | 27 | 28 | class Embedding(object): 29 | """ Mapping a vocabulary to a d-dimensional points.""" 30 | 31 | def __init__(self, vocabulary, vectors): 32 | self.vocabulary = vocabulary 33 | self.vectors = np.asarray(vectors) 34 | 35 | if len(self.vocabulary) != self.vectors.shape[0]: 36 | raise ValueError("Vocabulary has {} items but we have {} " 37 | "vectors".format(len(vocabulary), self.vectors.shape[0])) 38 | 39 | def __getitem__(self, k): 40 | return self.vectors[self.vocabulary[k]] 41 | 42 | def __contains__(self, k): 43 | return k in self.vocabulary 44 | 45 | def __delitem__(self, k): 46 | """Remove the word and its vector from the embedding. 47 | 48 | Note: 49 | This operation costs \\theta(n). Be careful putting it in a loop. 50 | """ 51 | index = self.vocabulary[k] 52 | del self.vocabulary[k] 53 | self.vectors = np.delete(self.vectors, index, 0) 54 | 55 | def __len__(self): 56 | return len(self.vocabulary) 57 | 58 | def __iter__(self): 59 | for w in self.vocabulary: 60 | yield w, self[w] 61 | 62 | @property 63 | def words(self): 64 | return self.vocabulary.words 65 | 66 | @property 67 | def shape(self): 68 | return self.vectors.shape 69 | 70 | def apply_expansion(self, expansion): 71 | """Apply a vocabulary expansion to the current emebddings.""" 72 | self.vocabulary = expansion(self.vocabulary) 73 | 74 | def get(self, k, default=None): 75 | try: 76 | return self[k] 77 | except KeyError as e: 78 | return default 79 | 80 | def most_frequent(self, k, inplace=False): 81 | """Only most frequent k words to be included in the embeddings.""" 82 | vocabulary = self.vocabulary.most_frequent(k) 83 | vectors = np.asarray([self[w] for w in vocabulary]) 84 | if inplace: 85 | self.vocabulary = vocabulary 86 | self.vectors = vectors 87 | return self 88 | return Embedding(vectors=vectors, vocabulary=vocabulary) 89 | 90 | def normalize_words(self, ord=2, inplace=False): 91 | """Normalize embeddings matrix row-wise. 92 | 93 | Args: 94 | ord: normalization order. Possible values {1, 2, 'inf', '-inf'} 95 | """ 96 | if ord == 2: 97 | ord = None # numpy uses this flag to indicate l2. 98 | vectors = self.vectors.T / np.linalg.norm(self.vectors, ord, axis=1) 99 | if inplace: 100 | self.vectors = vectors.T 101 | return self 102 | return Embedding(vectors=vectors.T, vocabulary=self.vocabulary) 103 | 104 | def nearest_neighbors(self, word, top_k=10): 105 | """Return the nearest k words to the given `word`. 106 | 107 | Args: 108 | word (string): single word. 109 | top_k (integer): decides how many neighbors to report. 110 | 111 | Returns: 112 | A list of words sorted by the distances. The closest is the first. 
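      The single closest entry (normally the query word itself, at zero
      distance) is dropped before the list is returned.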
113 | 114 | Note: 115 | L2 metric is used to calculate distances. 116 | """ 117 | #TODO(rmyeid): Use scikit ball tree, if scikit is available 118 | point = self[word] 119 | diff = self.vectors - point 120 | distances = np.linalg.norm(diff, axis=1) 121 | top_ids = distances.argsort()[1:top_k+1] 122 | return [self.vocabulary.id_word[i] for i in top_ids] 123 | 124 | def distances(self, word, words): 125 | """Calculate eucledean pairwise distances between `word` and `words`. 126 | 127 | Args: 128 | word (string): single word. 129 | words (list): list of strings. 130 | 131 | Returns: 132 | numpy array of the distances. 133 | 134 | Note: 135 | L2 metric is used to calculate distances. 136 | """ 137 | 138 | point = self[word] 139 | vectors = np.asarray([self[w] for w in words]) 140 | diff = vectors - point 141 | distances = np.linalg.norm(diff, axis=1) 142 | return distances 143 | 144 | @staticmethod 145 | def from_gensim(model): 146 | word_count = {} 147 | vectors = [] 148 | for word, vocab in sorted(iteritems(model.vocab), key=lambda item: -item[1].count): 149 | vectors.append(model.syn0[vocab.index]) 150 | word_count[word] = vocab.count 151 | vocab = CountedVocabulary(word_count=word_count) 152 | vectors = np.asarray(vectors) 153 | return Embedding(vocabulary=vocab, vectors=vectors) 154 | 155 | @staticmethod 156 | def from_word2vec_vocab(fvocab): 157 | counts = {} 158 | with _open(fvocab) as fin: 159 | for line in fin: 160 | word, count = unicode(line).strip().split() 161 | counts[word] = int(count) 162 | return CountedVocabulary(word_count=counts) 163 | 164 | @staticmethod 165 | def _from_word2vec_binary(fname): 166 | with _open(fname, 'rb') as fin: 167 | words = [] 168 | header = unicode(fin.readline()) 169 | vocab_size, layer1_size = list(map(int, header.split())) # throws for invalid file format 170 | vectors = np.zeros((vocab_size, layer1_size), dtype=float32) 171 | binary_len = np.dtype(float32).itemsize * layer1_size 172 | for line_no in xrange(vocab_size): 173 | # mixed text and binary: read text first, then binary 174 | word = [] 175 | while True: 176 | ch = fin.read(1) 177 | if ch == b' ': 178 | break 179 | if ch != b'\n': # ignore newlines in front of words (some binary files have newline, some don't) 180 | word.append(ch) 181 | word = b''.join(word) 182 | index = line_no 183 | words.append(word) 184 | vectors[index, :] = np.fromstring(fin.read(binary_len), dtype=float32) 185 | return words, vectors 186 | 187 | @staticmethod 188 | def _from_word2vec_text(fname): 189 | with _open(fname, 'rb') as fin: 190 | words = [] 191 | header = unicode(fin.readline()) 192 | vocab_size, layer1_size = list(map(int, header.split())) # throws for invalid file format 193 | vectors = [] 194 | for line_no, line in enumerate(fin): 195 | try: 196 | parts = unicode(line, encoding="utf-8").strip().split() 197 | except TypeError as e: 198 | parts = line.strip().split() 199 | except Exception as e: 200 | logger.warning("We ignored line number {} because of erros in parsing" 201 | "\n{}".format(line_no, e)) 202 | continue 203 | # We differ from Gensim implementation. 204 | # Our assumption that a difference of one happens because of having a 205 | # space in the word. 
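        # The branches below handle two layouts: `layer1_size + 1` columns means
        # the first token is the word and the rest are the vector weights;
        # `layer1_size + 2` columns means the word itself contained one space,
        # so the first two tokens are re-joined before reading the weights.
        # Any other column count is logged as unrecognized and the line is skipped.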
206 | if len(parts) == layer1_size + 1: 207 | word, weights = parts[0], list(map(float32, parts[1:])) 208 | elif len(parts) == layer1_size + 2: 209 | word, weights = parts[:2], list(map(float32, parts[2:])) 210 | word = u" ".join(word) 211 | else: 212 | logger.warning("We ignored line number {} because of unrecognized " 213 | "number of columns {}".format(line_no, parts[:-layer1_size])) 214 | continue 215 | index = line_no 216 | words.append(word) 217 | vectors.append(weights) 218 | vectors = np.asarray(vectors, dtype=np.float32) 219 | return words, vectors 220 | 221 | @staticmethod 222 | def from_word2vec(fname, fvocab=None, binary=False): 223 | """ 224 | Load the input-hidden weight matrix from the original C word2vec-tool format. 225 | 226 | Note that the information stored in the file is incomplete (the binary tree is missing), 227 | so while you can query for word similarity etc., you cannot continue training 228 | with a model loaded this way. 229 | 230 | `binary` is a boolean indicating whether the data is in binary word2vec format. 231 | Word counts are read from `fvocab` filename, if set (this is the file generated 232 | by `-save-vocab` flag of the original C tool). 233 | """ 234 | vocabulary = None 235 | if fvocab is not None: 236 | logger.info("loading word counts from %s" % (fvocab)) 237 | vocabulary = Embedding.from_word2vec_vocab(fvocab) 238 | 239 | logger.info("loading projection weights from %s" % (fname)) 240 | if binary: 241 | words, vectors = Embedding._from_word2vec_binary(fname) 242 | else: 243 | words, vectors = Embedding._from_word2vec_text(fname) 244 | 245 | if not vocabulary: 246 | vocabulary = OrderedVocabulary(words=words) 247 | 248 | return Embedding(vocabulary=vocabulary, vectors=vectors) 249 | 250 | @staticmethod 251 | def load(fname): 252 | """Load an embedding dump generated by `save`""" 253 | 254 | content = _open(fname).read() 255 | if PY2: 256 | state = pickle.loads(content) 257 | else: 258 | state = pickle.loads(content, encoding='latin1') 259 | voc, vec = state 260 | if len(voc) == 2: 261 | words, counts = voc 262 | word_count = dict(zip(words, counts)) 263 | vocab = CountedVocabulary(word_count=word_count) 264 | else: 265 | vocab = OrderedVocabulary(voc) 266 | return Embedding(vocabulary=vocab, vectors=vec) 267 | 268 | def save(self, fname): 269 | """Save a pickled version of the embedding into `fname`.""" 270 | 271 | vec = self.vectors 272 | voc = self.vocabulary.getstate() 273 | state = (voc, vec) 274 | with open(fname, 'wb') as f: 275 | pickle.dump(state, f, protocol=pickle.HIGHEST_PROTOCOL) 276 | -------------------------------------------------------------------------------- /notebooks/POS.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Part of Speech Tagging" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "Part of speech tagging task aims to assign every word/token in plain text a category that identifies the syntactic functionality of the word occurrence.\n", 15 | "\n", 16 | "Polyglot recognizes 17 parts of speech, this set is called the `universal part of speech tag set`:\n", 17 | "\n", 18 | "- **ADJ**: adjective\n", 19 | "- **ADP**: adposition\n", 20 | "- **ADV**: adverb\n", 21 | "- **AUX**: auxiliary verb\n", 22 | "- **CONJ**: coordinating conjunction\n", 23 | "- **DET**: determiner\n", 24 | "- **INTJ**: interjection\n", 25 | "- **NOUN**: noun\n", 26 | "- 
**NUM**: numeral\n", 27 | "- **PART**: particle\n", 28 | "- **PRON**: pronoun\n", 29 | "- **PROPN**: proper noun\n", 30 | "- **PUNCT**: punctuation\n", 31 | "- **SCONJ**: subordinating conjunction\n", 32 | "- **SYM**: symbol\n", 33 | "- **VERB**: verb\n", 34 | "- **X**: other" 35 | ] 36 | }, 37 | { 38 | "cell_type": "markdown", 39 | "metadata": {}, 40 | "source": [ 41 | "## Languages Coverage" 42 | ] 43 | }, 44 | { 45 | "cell_type": "markdown", 46 | "metadata": {}, 47 | "source": [ 48 | "The models were trained on a combination of:\n", 49 | "\n", 50 | "- Original CONLL datasets after the tags were converted using the [universal POS tables](http://universaldependencies.github.io/docs/tagset-conversion/index.html).\n", 51 | "\n", 52 | "- Universal Dependencies 1.0 corpora whenever they are available." 53 | ] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "execution_count": 1, 58 | "metadata": { 59 | "collapsed": false 60 | }, 61 | "outputs": [ 62 | { 63 | "name": "stdout", 64 | "output_type": "stream", 65 | "text": [ 66 | " 1. German 2. Italian 3. Danish \n", 67 | " 4. Czech 5. Slovene 6. French \n", 68 | " 7. English 8. Swedish 9. Bulgarian \n", 69 | " 10. Spanish; Castilian 11. Indonesian 12. Portuguese \n", 70 | " 13. Finnish 14. Irish 15. Hungarian \n", 71 | " 16. Dutch \n" 72 | ] 73 | } 74 | ], 75 | "source": [ 76 | "from polyglot.downloader import downloader\n", 77 | "print(downloader.supported_languages_table(\"pos2\"))" 78 | ] 79 | }, 80 | { 81 | "cell_type": "markdown", 82 | "metadata": {}, 83 | "source": [ 84 | "#### Download Necessary Models" 85 | ] 86 | }, 87 | { 88 | "cell_type": "code", 89 | "execution_count": 2, 90 | "metadata": { 91 | "collapsed": false 92 | }, 93 | "outputs": [ 94 | { 95 | "name": "stdout", 96 | "output_type": "stream", 97 | "text": [ 98 | "[polyglot_data] Downloading package embeddings2.en to\n", 99 | "[polyglot_data] /home/rmyeid/polyglot_data...\n", 100 | "[polyglot_data] Package embeddings2.en is already up-to-date!\n", 101 | "[polyglot_data] Downloading package pos2.en to\n", 102 | "[polyglot_data] /home/rmyeid/polyglot_data...\n", 103 | "[polyglot_data] Package pos2.en is already up-to-date!\n" 104 | ] 105 | } 106 | ], 107 | "source": [ 108 | "%%bash\n", 109 | "polyglot download embeddings2.en pos2.en" 110 | ] 111 | }, 112 | { 113 | "cell_type": "markdown", 114 | "metadata": {}, 115 | "source": [ 116 | "## Example" 117 | ] 118 | }, 119 | { 120 | "cell_type": "markdown", 121 | "metadata": {}, 122 | "source": [ 123 | "We tag each word in the text with one part of speech." 
124 | ] 125 | }, 126 | { 127 | "cell_type": "code", 128 | "execution_count": 3, 129 | "metadata": { 130 | "collapsed": false 131 | }, 132 | "outputs": [], 133 | "source": [ 134 | "from polyglot.text import Text" 135 | ] 136 | }, 137 | { 138 | "cell_type": "code", 139 | "execution_count": 4, 140 | "metadata": { 141 | "collapsed": true 142 | }, 143 | "outputs": [], 144 | "source": [ 145 | "blob = \"\"\"We will meet at eight o'clock on Thursday morning.\"\"\"\n", 146 | "text = Text(blob)" 147 | ] 148 | }, 149 | { 150 | "cell_type": "markdown", 151 | "metadata": {}, 152 | "source": [ 153 | "We can query all the tagged words" 154 | ] 155 | }, 156 | { 157 | "cell_type": "code", 158 | "execution_count": 5, 159 | "metadata": { 160 | "collapsed": false 161 | }, 162 | "outputs": [ 163 | { 164 | "data": { 165 | "text/plain": [ 166 | "[(u'We', u'PRON'),\n", 167 | " (u'will', u'AUX'),\n", 168 | " (u'meet', u'VERB'),\n", 169 | " (u'at', u'ADP'),\n", 170 | " (u'eight', u'NUM'),\n", 171 | " (u\"o'clock\", u'NOUN'),\n", 172 | " (u'on', u'ADP'),\n", 173 | " (u'Thursday', u'PROPN'),\n", 174 | " (u'morning', u'NOUN'),\n", 175 | " (u'.', u'PUNCT')]" 176 | ] 177 | }, 178 | "execution_count": 5, 179 | "metadata": {}, 180 | "output_type": "execute_result" 181 | } 182 | ], 183 | "source": [ 184 | "text.pos_tags" 185 | ] 186 | }, 187 | { 188 | "cell_type": "markdown", 189 | "metadata": {}, 190 | "source": [ 191 | "After calling the pos_tags property once, the words objects will carry the POS tags." 192 | ] 193 | }, 194 | { 195 | "cell_type": "code", 196 | "execution_count": 6, 197 | "metadata": { 198 | "collapsed": false, 199 | "scrolled": true 200 | }, 201 | "outputs": [ 202 | { 203 | "data": { 204 | "text/plain": [ 205 | "u'PRON'" 206 | ] 207 | }, 208 | "execution_count": 6, 209 | "metadata": {}, 210 | "output_type": "execute_result" 211 | } 212 | ], 213 | "source": [ 214 | "text.words[0].pos_tag" 215 | ] 216 | }, 217 | { 218 | "cell_type": "markdown", 219 | "metadata": {}, 220 | "source": [ 221 | "### Command Line Interface" 222 | ] 223 | }, 224 | { 225 | "cell_type": "code", 226 | "execution_count": 7, 227 | "metadata": { 228 | "collapsed": false 229 | }, 230 | "outputs": [ 231 | { 232 | "name": "stdout", 233 | "output_type": "stream", 234 | "text": [ 235 | "which DET \r\n", 236 | "India PROPN\r\n", 237 | "beat VERB \r\n", 238 | "Bermuda PROPN\r\n", 239 | "in ADP \r\n", 240 | "Port PROPN\r\n", 241 | "of ADP \r\n", 242 | "Spain PROPN\r\n", 243 | "in ADP \r\n", 244 | "2007 NUM \r\n", 245 | ", PUNCT\r\n", 246 | "which DET \r\n", 247 | "was AUX \r\n", 248 | "equalled VERB \r\n", 249 | "five NUM \r\n", 250 | "days NOUN \r\n", 251 | "ago ADV \r\n", 252 | "by ADP \r\n", 253 | "South PROPN\r\n", 254 | "Africa PROPN\r\n", 255 | "in ADP \r\n", 256 | "their PRON \r\n", 257 | "victory NOUN \r\n", 258 | "over ADP \r\n", 259 | "West PROPN\r\n", 260 | "Indies PROPN\r\n", 261 | "in ADP \r\n", 262 | "Sydney PROPN\r\n", 263 | ". 
PUNCT\r\n", 264 | "\r\n" 265 | ] 266 | } 267 | ], 268 | "source": [ 269 | "!polyglot --lang en tokenize --input testdata/cricket.txt | polyglot --lang en pos | tail -n 30" 270 | ] 271 | }, 272 | { 273 | "cell_type": "markdown", 274 | "metadata": {}, 275 | "source": [ 276 | "\n", 277 | "### Citation\n", 278 | "\n", 279 | "This work is a direct implementation of the research being described in the [Polyglot: Distributed Word Representations for Multilingual NLP](http://www.aclweb.org/anthology/W13-3520) paper.\n", 280 | "The author of this library strongly encourage you to cite the following paper if you are using this software." 281 | ] 282 | }, 283 | { 284 | "cell_type": "markdown", 285 | "metadata": {}, 286 | "source": [ 287 | "\n", 288 | "```\n", 289 | " @InProceedings{polyglot:2013:ACL-CoNLL,\n", 290 | " author = {Al-Rfou, Rami and Perozzi, Bryan and Skiena, Steven},\n", 291 | " title = {Polyglot: Distributed Word Representations for Multilingual NLP},\n", 292 | " booktitle = {Proceedings of the Seventeenth Conference on Computational Natural Language Learning},\n", 293 | " month = {August},\n", 294 | " year = {2013},\n", 295 | " address = {Sofia, Bulgaria},\n", 296 | " publisher = {Association for Computational Linguistics},\n", 297 | " pages = {183--192}, \n", 298 | " url = {http://www.aclweb.org/anthology/W13-3520}\n", 299 | " }\n", 300 | "```" 301 | ] 302 | }, 303 | { 304 | "cell_type": "markdown", 305 | "metadata": {}, 306 | "source": [ 307 | "## References\n", 308 | "\n", 309 | "- [Universal Part of Speech Tagging](http://universaldependencies.github.io/docs/u/pos/index.html)\n", 310 | "- [Universal Dependencies 1.0](https://lindat.mff.cuni.cz/repository/xmlui/handle/11234/1-1464)." 311 | ] 312 | } 313 | ], 314 | "metadata": { 315 | "kernelspec": { 316 | "display_name": "Python 2", 317 | "language": "python", 318 | "name": "python2" 319 | }, 320 | "language_info": { 321 | "codemirror_mode": { 322 | "name": "ipython", 323 | "version": 2 324 | }, 325 | "file_extension": ".py", 326 | "mimetype": "text/x-python", 327 | "name": "python", 328 | "nbconvert_exporter": "python", 329 | "pygments_lexer": "ipython2", 330 | "version": "2.7.6" 331 | } 332 | }, 333 | "nbformat": 4, 334 | "nbformat_minor": 0 335 | } 336 | -------------------------------------------------------------------------------- /notebooks/README.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "polyglot\n", 8 | "===============================" 9 | ] 10 | }, 11 | { 12 | "cell_type": "markdown", 13 | "metadata": {}, 14 | "source": [ 15 | "[![Downloads](https://img.shields.io/pypi/dm/polyglot.svg \"Downloads\")](https://pypi.python.org/pypi/polyglot)\n", 16 | "[![Latest Version](https://badge.fury.io/py/polyglot.svg \"Latest Version\")](https://pypi.python.org/pypi/polyglot)\n", 17 | "[![Build Status](https://travis-ci.org/aboSamoor/polyglot.png?branch=master \"Build Status\")](https://travis-ci.org/aboSamoor/polyglot)\n", 18 | "[![Documentation Status](https://readthedocs.org/projects/polyglot/badge/?version=latest \"Documentation Status\")](https://readthedocs.org/builds/polyglot/)" 19 | ] 20 | }, 21 | { 22 | "cell_type": "markdown", 23 | "metadata": {}, 24 | "source": [ 25 | "Polyglot is a natural language pipeline that supports massive multilingual applications.\n", 26 | "\n", 27 | "* Free software: GPLv3 license\n", 28 | "* Documentation: http://polyglot.readthedocs.org.\n", 29 | "\n", 
30 | "###Features\n", 31 | "\n", 32 | "\n", 33 | "* Tokenization (165 Languages)\n", 34 | "* Language detection (196 Languages)\n", 35 | "* Named Entity Recognition (40 Languages)\n", 36 | "* Part of Speech Tagging (16 Languages)\n", 37 | "* Sentiment Analysis (136 Languages)\n", 38 | "* Word Embeddings (137 Languages)\n", 39 | "* Morphological analysis (135 Languages)\n", 40 | "* Transliteration (69 Languages)" 41 | ] 42 | }, 43 | { 44 | "cell_type": "markdown", 45 | "metadata": {}, 46 | "source": [ 47 | "### Developer\n", 48 | "\n", 49 | "* Rami Al-Rfou @ `rmyeid gmail com`" 50 | ] 51 | }, 52 | { 53 | "cell_type": "markdown", 54 | "metadata": {}, 55 | "source": [ 56 | "\n", 57 | "## Quick Tutorial" 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": 9, 63 | "metadata": { 64 | "collapsed": false 65 | }, 66 | "outputs": [], 67 | "source": [ 68 | "import polyglot\n", 69 | "from polyglot.text import Text, Word" 70 | ] 71 | }, 72 | { 73 | "cell_type": "markdown", 74 | "metadata": {}, 75 | "source": [ 76 | "### Language Detection" 77 | ] 78 | }, 79 | { 80 | "cell_type": "code", 81 | "execution_count": 10, 82 | "metadata": { 83 | "collapsed": false 84 | }, 85 | "outputs": [ 86 | { 87 | "name": "stdout", 88 | "output_type": "stream", 89 | "text": [ 90 | "Language Detected: Code=fr, Name=French\n", 91 | "\n" 92 | ] 93 | } 94 | ], 95 | "source": [ 96 | "text = Text(\"Bonjour, Mesdames.\")\n", 97 | "print(\"Language Detected: Code={}, Name={}\\n\".format(text.language.code, text.language.name))" 98 | ] 99 | }, 100 | { 101 | "cell_type": "markdown", 102 | "metadata": {}, 103 | "source": [ 104 | "### Tokenization" 105 | ] 106 | }, 107 | { 108 | "cell_type": "code", 109 | "execution_count": 11, 110 | "metadata": { 111 | "collapsed": false 112 | }, 113 | "outputs": [ 114 | { 115 | "name": "stdout", 116 | "output_type": "stream", 117 | "text": [ 118 | "[u'Beautiful', u'is', u'better', u'than', u'ugly', u'.', u'Explicit', u'is', u'better', u'than', u'implicit', u'.', u'Simple', u'is', u'better', u'than', u'complex', u'.']\n" 119 | ] 120 | } 121 | ], 122 | "source": [ 123 | "zen = Text(\"Beautiful is better than ugly. \"\n", 124 | " \"Explicit is better than implicit. \"\n", 125 | " \"Simple is better than complex.\")\n", 126 | "print(zen.words)" 127 | ] 128 | }, 129 | { 130 | "cell_type": "code", 131 | "execution_count": 12, 132 | "metadata": { 133 | "collapsed": false, 134 | "scrolled": true 135 | }, 136 | "outputs": [ 137 | { 138 | "name": "stdout", 139 | "output_type": "stream", 140 | "text": [ 141 | "[Sentence(\"Beautiful is better than ugly.\"), Sentence(\"Explicit is better than implicit.\"), Sentence(\"Simple is better than complex.\")]\n" 142 | ] 143 | } 144 | ], 145 | "source": [ 146 | "print(zen.sentences)" 147 | ] 148 | }, 149 | { 150 | "cell_type": "markdown", 151 | "metadata": {}, 152 | "source": [ 153 | "### Part of Speech Tagging" 154 | ] 155 | }, 156 | { 157 | "cell_type": "code", 158 | "execution_count": 13, 159 | "metadata": { 160 | "collapsed": false 161 | }, 162 | "outputs": [ 163 | { 164 | "name": "stdout", 165 | "output_type": "stream", 166 | "text": [ 167 | "Word POS Tag\n", 168 | "------------------------------\n", 169 | "O DET\n", 170 | "primeiro ADJ\n", 171 | "uso NOUN\n", 172 | "de ADP\n", 173 | "desobediência NOUN\n", 174 | "civil ADJ\n", 175 | "em ADP\n", 176 | "massa NOUN\n", 177 | "ocorreu ADJ\n", 178 | "em ADP\n", 179 | "setembro NOUN\n", 180 | "de ADP\n", 181 | "1906 NUM\n", 182 | ". 
PUNCT\n" 183 | ] 184 | } 185 | ], 186 | "source": [ 187 | "text = Text(u\"O primeiro uso de desobediência civil em massa ocorreu em setembro de 1906.\")\n", 188 | "\n", 189 | "print(\"{:<16}{}\".format(\"Word\", \"POS Tag\")+\"\\n\"+\"-\"*30)\n", 190 | "for word, tag in text.pos_tags:\n", 191 | " print(u\"{:<16}{:>2}\".format(word, tag))" 192 | ] 193 | }, 194 | { 195 | "cell_type": "markdown", 196 | "metadata": {}, 197 | "source": [ 198 | "### Named Entity Recognition" 199 | ] 200 | }, 201 | { 202 | "cell_type": "code", 203 | "execution_count": 14, 204 | "metadata": { 205 | "collapsed": false 206 | }, 207 | "outputs": [ 208 | { 209 | "name": "stdout", 210 | "output_type": "stream", 211 | "text": [ 212 | "[I-LOC([u'Gro\\xdfbritannien']), I-PER([u'Gandhi'])]\n" 213 | ] 214 | } 215 | ], 216 | "source": [ 217 | "text = Text(u\"In Großbritannien war Gandhi mit dem westlichen Lebensstil vertraut geworden\")\n", 218 | "print(text.entities)" 219 | ] 220 | }, 221 | { 222 | "cell_type": "markdown", 223 | "metadata": {}, 224 | "source": [ 225 | "### Polarity" 226 | ] 227 | }, 228 | { 229 | "cell_type": "code", 230 | "execution_count": 15, 231 | "metadata": { 232 | "collapsed": false 233 | }, 234 | "outputs": [ 235 | { 236 | "name": "stdout", 237 | "output_type": "stream", 238 | "text": [ 239 | "Word Polarity\n", 240 | "------------------------------\n", 241 | "Beautiful 0\n", 242 | "is 0\n", 243 | "better 1\n", 244 | "than 0\n", 245 | "ugly -1\n", 246 | ". 0\n" 247 | ] 248 | } 249 | ], 250 | "source": [ 251 | "print(\"{:<16}{}\".format(\"Word\", \"Polarity\")+\"\\n\"+\"-\"*30)\n", 252 | "for w in zen.words[:6]:\n", 253 | " print(\"{:<16}{:>2}\".format(w, w.polarity))" 254 | ] 255 | }, 256 | { 257 | "cell_type": "markdown", 258 | "metadata": {}, 259 | "source": [ 260 | "### Embeddings" 261 | ] 262 | }, 263 | { 264 | "cell_type": "code", 265 | "execution_count": 19, 266 | "metadata": { 267 | "collapsed": false 268 | }, 269 | "outputs": [ 270 | { 271 | "name": "stdout", 272 | "output_type": "stream", 273 | "text": [ 274 | "Neighbors (Synonms) of Obama\n", 275 | "------------------------------\n", 276 | "Bush \n", 277 | "Reagan \n", 278 | "Clinton \n", 279 | "Ahmadinejad \n", 280 | "Nixon \n", 281 | "Karzai \n", 282 | "McCain \n", 283 | "Biden \n", 284 | "Huckabee \n", 285 | "Lula \n", 286 | "\n", 287 | "\n", 288 | "The first 10 dimensions out the 256 dimensions\n", 289 | "\n", 290 | "[-2.57382345 1.52175975 0.51070285 1.08678675 -0.74386948 -1.18616164\n", 291 | " 2.92784619 -0.25694436 -1.40958667 -2.39675403]\n" 292 | ] 293 | } 294 | ], 295 | "source": [ 296 | "word = Word(\"Obama\", language=\"en\")\n", 297 | "print(\"Neighbors (Synonms) of {}\".format(word)+\"\\n\"+\"-\"*30)\n", 298 | "for w in word.neighbors:\n", 299 | " print(\"{:<16}\".format(w))\n", 300 | "print(\"\\n\\nThe first 10 dimensions out the {} dimensions\\n\".format(word.vector.shape[0]))\n", 301 | "print(word.vector[:10])" 302 | ] 303 | }, 304 | { 305 | "cell_type": "markdown", 306 | "metadata": {}, 307 | "source": [ 308 | "### Morphology" 309 | ] 310 | }, 311 | { 312 | "cell_type": "code", 313 | "execution_count": 17, 314 | "metadata": { 315 | "collapsed": false 316 | }, 317 | "outputs": [ 318 | { 319 | "name": "stdout", 320 | "output_type": "stream", 321 | "text": [ 322 | "[u'Pre', u'process', u'ing']\n" 323 | ] 324 | } 325 | ], 326 | "source": [ 327 | "word = Text(\"Preprocessing is an essential step.\").words[0]\n", 328 | "print(word.morphemes)" 329 | ] 330 | }, 331 | { 332 | "cell_type": "markdown", 333 | "metadata": {}, 334 | 
"source": [ 335 | "### Transliteration" 336 | ] 337 | }, 338 | { 339 | "cell_type": "code", 340 | "execution_count": 18, 341 | "metadata": { 342 | "collapsed": false 343 | }, 344 | "outputs": [ 345 | { 346 | "name": "stdout", 347 | "output_type": "stream", 348 | "text": [ 349 | "препрокессинг\n" 350 | ] 351 | } 352 | ], 353 | "source": [ 354 | "from polyglot.transliteration import Transliterator\n", 355 | "transliterator = Transliterator(source_lang=\"en\", target_lang=\"ru\")\n", 356 | "print(transliterator.transliterate(u\"preprocessing\"))" 357 | ] 358 | } 359 | ], 360 | "metadata": { 361 | "kernelspec": { 362 | "display_name": "Python 2", 363 | "language": "python", 364 | "name": "python2" 365 | }, 366 | "language_info": { 367 | "codemirror_mode": { 368 | "name": "ipython", 369 | "version": 2 370 | }, 371 | "file_extension": ".py", 372 | "mimetype": "text/x-python", 373 | "name": "python", 374 | "nbconvert_exporter": "python", 375 | "pygments_lexer": "ipython2", 376 | "version": "2.7.6" 377 | } 378 | }, 379 | "nbformat": 4, 380 | "nbformat_minor": 0 381 | } 382 | --------------------------------------------------------------------------------