├── README.rst ├── polyglot ├── tag │ ├── tests │ │ ├── __init__.py │ │ └── test_base.py │ ├── __init__.py │ └── base.py ├── mapping │ ├── tests │ │ ├── __init__.py │ │ ├── test_embeddings.py │ │ └── test_expansion.py │ ├── __init__.py │ ├── expansion.py │ ├── base.py │ └── embeddings.py ├── tokenize │ ├── tests │ │ ├── __init__.py │ │ └── test_base.py │ ├── __init__.py │ └── base.py ├── transliteration │ ├── tests │ │ ├── __init__.py │ │ └── test_base.py │ ├── __init__.py │ └── base.py ├── detect │ ├── __init__.py │ └── base.py ├── __init__.py ├── decorators.py ├── utils.py ├── load.py ├── base.py └── mixins.py ├── setup.cfg ├── docs ├── authors.rst ├── history.rst ├── readme.rst ├── contributing.rst ├── modules.rst ├── usage.rst ├── Embeddings_files │ └── Embeddings_12_0.png ├── polyglot.tag.rst ├── polyglot.tokenize.rst ├── index_latex.rst ├── polyglot.transliteration.rst ├── polyglot.detect.rst ├── index.rst ├── polyglot.mapping.rst ├── TODO.rst ├── Installation.rst ├── polyglot.rst ├── sphinxext │ └── github_link.py ├── Tokenization.rst ├── Transliteration.rst ├── README.rst ├── POS.rst ├── NamedEntityRecognition.rst ├── Embeddings.rst ├── Makefile ├── make.bat ├── Sentiment.rst ├── CLI.rst └── MorphologicalAnalysis.rst ├── tests ├── __init__.py └── test_polyglot.py ├── requirements.txt ├── rtd_requirements.txt ├── tox.ini ├── AUTHORS.rst ├── nb2rst.sh ├── MANIFEST.in ├── .travis.yml ├── HISTORY.rst ├── .gitignore ├── notebooks ├── testdata │ └── cricket.txt ├── TODO.ipynb ├── Installation.ipynb ├── Transliteration.ipynb ├── Tokenization.ipynb ├── POS.ipynb └── README.ipynb ├── Makefile ├── CONTRIBUTING.rst └── setup.py /README.rst: -------------------------------------------------------------------------------- 1 | docs/README.rst -------------------------------------------------------------------------------- /polyglot/tag/tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /polyglot/mapping/tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /polyglot/tokenize/tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [wheel] 2 | universal = 1 -------------------------------------------------------------------------------- /docs/authors.rst: -------------------------------------------------------------------------------- 1 | .. include:: ../AUTHORS.rst -------------------------------------------------------------------------------- /docs/history.rst: -------------------------------------------------------------------------------- 1 | .. include:: ../HISTORY.rst -------------------------------------------------------------------------------- /docs/readme.rst: -------------------------------------------------------------------------------- 1 | .. 
include:: ../README.rst -------------------------------------------------------------------------------- /polyglot/transliteration/tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- -------------------------------------------------------------------------------- /docs/contributing.rst: -------------------------------------------------------------------------------- 1 | .. include:: ../CONTRIBUTING.rst -------------------------------------------------------------------------------- /docs/modules.rst: -------------------------------------------------------------------------------- 1 | polyglot 2 | ======== 3 | 4 | .. toctree:: 5 | :maxdepth: 4 6 | 7 | polyglot 8 | -------------------------------------------------------------------------------- /docs/usage.rst: -------------------------------------------------------------------------------- 1 | ======== 2 | Usage 3 | ======== 4 | 5 | To use polyglot in a project:: 6 | 7 | import polyglot -------------------------------------------------------------------------------- /polyglot/detect/__init__.py: -------------------------------------------------------------------------------- 1 | from .base import Detector, Language 2 | 3 | __all__ = ['Detector', 'Language'] 4 | -------------------------------------------------------------------------------- /polyglot/transliteration/__init__.py: -------------------------------------------------------------------------------- 1 | from .base import Transliterator 2 | 3 | __all__ = ["Transliterator"] 4 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | wheel>=0.23.0 2 | PyICU>=1.8 3 | pycld2>=0.3 4 | six>=1.7.3 5 | futures>=2.1.6 6 | morfessor>=2.0.2a1 7 | -------------------------------------------------------------------------------- /docs/Embeddings_files/Embeddings_12_0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Todo/polyglot/master/docs/Embeddings_files/Embeddings_12_0.png -------------------------------------------------------------------------------- /polyglot/tag/__init__.py: -------------------------------------------------------------------------------- 1 | from .base import NEChunker, POSTagger, get_pos_tagger, get_ner_tagger 2 | 3 | __all__ = ['NEChunker', "POSTagger", "get_pos_tagger", "get_ner_tagger"] 4 | -------------------------------------------------------------------------------- /rtd_requirements.txt: -------------------------------------------------------------------------------- 1 | wheel>=0.23.0 2 | pycld2>=0.20 3 | six>=1.7.3 4 | futures>=2.1.6 5 | sphinxcontrib-napoleon>=0.2.8 6 | mock>=1.0.1 7 | sphinx-bootstrap-theme>=0.4.5 8 | alabaster>=0.7.1 9 | -------------------------------------------------------------------------------- /polyglot/tokenize/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | from .base import WordTokenizer, SentenceTokenizer 5 | 6 | 7 | __all__ = ['WordTokenizer', 8 | 'SentenceTokenizer'] 9 | -------------------------------------------------------------------------------- /tox.ini: 
-------------------------------------------------------------------------------- 1 | [tox] 2 | envlist = py26, py27, py33, py34 3 | 4 | [testenv] 5 | setenv = 6 | PYTHONPATH = {toxinidir}:{toxinidir}/polyglot 7 | commands = python setup.py test 8 | deps = 9 | -r{toxinidir}/requirements.txt -------------------------------------------------------------------------------- /AUTHORS.rst: -------------------------------------------------------------------------------- 1 | ======= 2 | Credits 3 | ======= 4 | 5 | Development Lead 6 | ---------------- 7 | 8 | * Rami Al-Rfou 9 | 10 | Contributors 11 | ------------ 12 | 13 | * Yingtao Tian 14 | -------------------------------------------------------------------------------- /nb2rst.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | 4 | files=`ls notebooks/*ipynb` 5 | cd docs 6 | for f in $files 7 | do 8 | b=`basename -s .ipynb $f` 9 | ipython nbconvert ../notebooks/${b}.ipynb --to rst --output ${b}.rst 10 | done 11 | cd - 12 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include *.rst 2 | include notebooks/*.ipynb 3 | include *.txt 4 | include Makefile 5 | 6 | recursive-include tests * 7 | recursive-exclude * __pycache__ 8 | recursive-exclude * *.py[co] 9 | 10 | recursive-include docs *.rst conf.py Makefile make.bat 11 | -------------------------------------------------------------------------------- /polyglot/transliteration/tests/test_base.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """Test basic Transliterators facilities.""" 5 | 6 | import unittest 7 | from .. import Transliterator 8 | 9 | class TransliteratorTest(unittest.TestCase): 10 | def __init__(self): 11 | pass 12 | 13 | if __name__ == "__main__": 14 | unittest.main() 15 | -------------------------------------------------------------------------------- /polyglot/tag/tests/test_base.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """Test basic Taggers.""" 5 | 6 | import unittest 7 | from .. import NEChunker, POSTagger 8 | 9 | from io import StringIO 10 | 11 | class NERChunkerTest(unittest.TestCase): 12 | def __init__(self): 13 | pass 14 | 15 | if __name__ == "__main__": 16 | unittest.main() 17 | -------------------------------------------------------------------------------- /polyglot/mapping/__init__.py: -------------------------------------------------------------------------------- 1 | from .base import CountedVocabulary, OrderedVocabulary, VocabularyBase 2 | from .embeddings import Embedding 3 | from .expansion import CaseExpander, DigitExpander 4 | 5 | __all__ = ['CountedVocabulary', 6 | 'OrderedVocabulary', 7 | 'VocabularyBase', 8 | 'Embedding', 9 | 'CaseExpander', 10 | 'DigitExpander'] 11 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | # Config file for automatic testing at travis-ci.org 2 | 3 | language: python 4 | 5 | python: 6 | - "3.4" 7 | - "2.7" 8 | # - "pypy" 9 | 10 | # command to install dependencies, e.g. 
pip install -r requirements.txt --use-mirrors 11 | install: 12 | - sudo apt-get install python-numpy libicu-dev 13 | - pip install -r requirements.txt 14 | 15 | # command to run tests, e.g. python setup.py test 16 | script: nosetests 17 | -------------------------------------------------------------------------------- /HISTORY.rst: -------------------------------------------------------------------------------- 1 | .. :changelog: 2 | 3 | History 4 | ------- 5 | 6 | "14.11" (2014-01-11) 7 | --------------------- 8 | 9 | * First release on PyPI. 10 | 11 | 12 | "15.5.2" (2015-05-02) 13 | --------------------- 14 | 15 | * Polyglot is feature complete. 16 | 17 | 18 | "15.10.03" (2015-10-03) 19 | --------------------------- 20 | 21 | * Change the polyglot models mirror to Stony Brook University DSL lab instead 22 | of Google cloud storage. 23 | -------------------------------------------------------------------------------- /polyglot/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | __author__ = 'Rami Al-Rfou' 4 | __email__ = 'rmyeid@gmail.com' 5 | __version__ = '15.10.03' 6 | 7 | import types 8 | 9 | from six.moves import copyreg 10 | from .base import Sequence, TokenSequence 11 | from .utils import _pickle_method, _unpickle_method 12 | 13 | __all__ = ['Sequence', 'TokenSequence'] 14 | 15 | data_path = '~/' 16 | 17 | copyreg.pickle(types.MethodType, _pickle_method, _unpickle_method) 18 | -------------------------------------------------------------------------------- /tests/test_polyglot.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | test_polyglot 6 | ---------------------------------- 7 | 8 | Tests for `polyglot` module. 9 | """ 10 | 11 | import unittest 12 | 13 | from polyglot import polyglot 14 | 15 | 16 | class TestPolyglot(unittest.TestCase): 17 | 18 | def setUp(self): 19 | pass 20 | 21 | def test_something(self): 22 | pass 23 | 24 | def tearDown(self): 25 | pass 26 | 27 | if __name__ == '__main__': 28 | unittest.main() -------------------------------------------------------------------------------- /docs/polyglot.tag.rst: -------------------------------------------------------------------------------- 1 | polyglot.tag package 2 | ==================== 3 | 4 | Subpackages 5 | ----------- 6 | 7 | .. toctree:: 8 | 9 | polyglot.tag.tests 10 | 11 | Submodules 12 | ---------- 13 | 14 | polyglot.tag.base module 15 | ------------------------ 16 | 17 | .. automodule:: polyglot.tag.base 18 | :members: 19 | :undoc-members: 20 | :show-inheritance: 21 | 22 | 23 | Module contents 24 | --------------- 25 | 26 | .. 
automodule:: polyglot.tag 27 | :members: 28 | :undoc-members: 29 | :show-inheritance: 30 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.py[cod] 2 | 3 | # C extensions 4 | *.so 5 | 6 | # Packages 7 | *.egg 8 | *.egg-info 9 | dist 10 | build 11 | eggs 12 | parts 13 | bin 14 | var 15 | sdist 16 | develop-eggs 17 | .installed.cfg 18 | lib 19 | lib64 20 | 21 | # Installer logs 22 | pip-log.txt 23 | 24 | # Unit test / coverage reports 25 | .coverage 26 | .tox 27 | nosetests.xml 28 | htmlcov 29 | 30 | # Translations 31 | *.mo 32 | 33 | # Mr Developer 34 | .mr.developer.cfg 35 | .project 36 | .pydevproject 37 | 38 | # Complexity 39 | output/*.html 40 | output/*/index.html 41 | 42 | # Sphinx 43 | docs/_build -------------------------------------------------------------------------------- /docs/polyglot.tokenize.rst: -------------------------------------------------------------------------------- 1 | polyglot.tokenize package 2 | ========================= 3 | 4 | Subpackages 5 | ----------- 6 | 7 | .. toctree:: 8 | 9 | polyglot.tokenize.tests 10 | 11 | Submodules 12 | ---------- 13 | 14 | polyglot.tokenize.base module 15 | ----------------------------- 16 | 17 | .. automodule:: polyglot.tokenize.base 18 | :members: 19 | :undoc-members: 20 | :show-inheritance: 21 | 22 | 23 | Module contents 24 | --------------- 25 | 26 | .. automodule:: polyglot.tokenize 27 | :members: 28 | :undoc-members: 29 | :show-inheritance: 30 | -------------------------------------------------------------------------------- /docs/index_latex.rst: -------------------------------------------------------------------------------- 1 | .. complexity documentation master file, created by 2 | sphinx-quickstart on Tue Jul 9 22:26:36 2013. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | Welcome to polyglot's documentation! 7 | ====================================== 8 | 9 | .. toctree:: 10 | :maxdepth: 3 11 | 12 | Installation 13 | Detection 14 | Tokenization 15 | CLI 16 | Download 17 | Embeddings 18 | POS 19 | NamedEntityRecognition 20 | MorphologicalAnalysis 21 | Transliteration 22 | Sentiment 23 | -------------------------------------------------------------------------------- /docs/polyglot.transliteration.rst: -------------------------------------------------------------------------------- 1 | polyglot.transliteration package 2 | ================================ 3 | 4 | Subpackages 5 | ----------- 6 | 7 | .. toctree:: 8 | 9 | polyglot.transliteration.tests 10 | 11 | Submodules 12 | ---------- 13 | 14 | polyglot.transliteration.base module 15 | ------------------------------------ 16 | 17 | .. automodule:: polyglot.transliteration.base 18 | :members: 19 | :undoc-members: 20 | :show-inheritance: 21 | 22 | 23 | Module contents 24 | --------------- 25 | 26 | .. automodule:: polyglot.transliteration 27 | :members: 28 | :undoc-members: 29 | :show-inheritance: 30 | -------------------------------------------------------------------------------- /docs/polyglot.detect.rst: -------------------------------------------------------------------------------- 1 | polyglot.detect package 2 | ======================= 3 | 4 | Submodules 5 | ---------- 6 | 7 | polyglot.detect.base module 8 | --------------------------- 9 | 10 | .. 
automodule:: polyglot.detect.base 11 | :members: 12 | :undoc-members: 13 | :show-inheritance: 14 | 15 | polyglot.detect.langids module 16 | ------------------------------ 17 | 18 | .. automodule:: polyglot.detect.langids 19 | :members: 20 | :undoc-members: 21 | :show-inheritance: 22 | 23 | 24 | Module contents 25 | --------------- 26 | 27 | .. automodule:: polyglot.detect 28 | :members: 29 | :undoc-members: 30 | :show-inheritance: 31 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | .. complexity documentation master file, created by 2 | sphinx-quickstart on Tue Jul 9 22:26:36 2013. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | Welcome to polyglot's documentation! 7 | ====================================== 8 | 9 | .. include:: 10 | README.rst 11 | 12 | Contents: 13 | ========= 14 | 15 | .. toctree:: 16 | :maxdepth: 2 17 | 18 | Installation 19 | Detection 20 | Tokenization 21 | CLI 22 | Download 23 | Embeddings 24 | POS 25 | NamedEntityRecognition 26 | MorphologicalAnalysis 27 | Transliteration 28 | Sentiment 29 | modules 30 | 31 | -------------------------------------------------------------------------------- /notebooks/testdata/cricket.txt: -------------------------------------------------------------------------------- 1 | Australia posted a World Cup record total of 417-6 as they beat Afghanistan by 275 runs. 2 | David Warner hit 178 off 133 balls, Steve Smith scored 95 while Glenn Maxwell struck 88 in 39 deliveries in the Pool A encounter in Perth. 3 | Afghanistan were then dismissed for 142, with Mitchell Johnson and Mitchell Starc taking six wickets between them. 4 | Australia's score surpassed the 413-5 India made against Bermuda in 2007. 5 | It continues the pattern of bat dominating ball in this tournament as the third 400 plus score achieved in the pool stages, following South Africa's 408-5 and 411-4 against West Indies and Ireland respectively. 6 | The winning margin beats the 257-run amount by which India beat Bermuda in Port of Spain in 2007, which was equalled five days ago by South Africa in their victory over West Indies in Sydney. 7 | -------------------------------------------------------------------------------- /polyglot/decorators.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import functools 5 | 6 | class cached_property(object): 7 | """A property that is only computed once per instance and then replaces 8 | itself with an ordinary attribute. Deleting the attribute resets the 9 | property. 10 | Credit to Marcel Hellkamp, author of bottle.py. 
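A minimal usage sketch (the Circle class below is hypothetical; it only
    illustrates the caching behaviour):

        class Circle(object):
            def __init__(self, radius):
                self.radius = radius

            @cached_property
            def area(self):
                # Computed once on the first attribute access, then stored in
                # the instance __dict__ and reused on later accesses.
                return 3.14159 * self.radius ** 2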
11 | """ 12 | 13 | def __init__(self, func): 14 | self.__doc__ = getattr(func, '__doc__') 15 | self.func = func 16 | 17 | def __get__(self, obj, cls): 18 | if obj is None: 19 | return self 20 | value = obj.__dict__[self.func.__name__] = self.func(obj) 21 | return value 22 | 23 | def memoize(obj): 24 | cache = obj.cache = {} 25 | 26 | @functools.wraps(obj) 27 | def memoizer(*args, **kwargs): 28 | key = tuple(list(args) + sorted(kwargs.items())) 29 | if key not in cache: 30 | cache[key] = obj(*args, **kwargs) 31 | return cache[key] 32 | return memoizer 33 | -------------------------------------------------------------------------------- /docs/polyglot.mapping.rst: -------------------------------------------------------------------------------- 1 | polyglot.mapping package 2 | ======================== 3 | 4 | Subpackages 5 | ----------- 6 | 7 | .. toctree:: 8 | 9 | polyglot.mapping.tests 10 | 11 | Submodules 12 | ---------- 13 | 14 | polyglot.mapping.base module 15 | ---------------------------- 16 | 17 | .. automodule:: polyglot.mapping.base 18 | :members: 19 | :undoc-members: 20 | :show-inheritance: 21 | 22 | polyglot.mapping.embeddings module 23 | ---------------------------------- 24 | 25 | .. automodule:: polyglot.mapping.embeddings 26 | :members: 27 | :undoc-members: 28 | :show-inheritance: 29 | 30 | polyglot.mapping.expansion module 31 | --------------------------------- 32 | 33 | .. automodule:: polyglot.mapping.expansion 34 | :members: 35 | :undoc-members: 36 | :show-inheritance: 37 | 38 | 39 | Module contents 40 | --------------- 41 | 42 | .. automodule:: polyglot.mapping 43 | :members: 44 | :undoc-members: 45 | :show-inheritance: 46 | -------------------------------------------------------------------------------- /docs/TODO.rst: -------------------------------------------------------------------------------- 1 | 2 | Tasks 3 | ===== 4 | 5 | - [STRIKEOUT:POS] 6 | - [STRIKEOUT:morphological analysis] 7 | - [STRIKEOUT:transliteration] 8 | 9 | Library Interface 10 | ================= 11 | 12 | - [STRIKEOUT:Sentiment] 13 | - [STRIKEOUT:NER] 14 | - Frequency based comparison 15 | 16 | Command Line interface 17 | ====================== 18 | 19 | - Sentiment 20 | - Reading stdin column format 21 | 22 | Infrastructure 23 | ============== 24 | 25 | - [STRIKEOUT:Cache models] 26 | - [STRIKEOUT:Add normalization to the embeddings] 27 | - [STRIKEOUT:Detect supported languages] 28 | - [STRIKEOUT:added task/lang as part of the identifier, what is left is 29 | to iterate over the collections.] 30 | - [STRIKEOUT:Throw different exception for missing package than 31 | undownloaded one] 32 | - [STRIKEOUT:Define NotSupportedLanguage/Task Exception for the 33 | downloader] 34 | - [STRIKEOUT:Remove noun phrases support.] 35 | - [STRIKEOUT:Train more/new POS taggers] 36 | 37 | Documentation 38 | ============= 39 | 40 | - Add a quick tutorial 41 | - Embed demos in our documentation 42 | - [STRIKEOUT:pycld2 README] 43 | - [STRIKEOUT:Update rtdcs with the new submodules.] 
44 | -------------------------------------------------------------------------------- /polyglot/tokenize/base.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ Basic text segmenters.""" 5 | 6 | from icu import Locale, BreakIterator 7 | from polyglot.base import Sequence 8 | 9 | 10 | class Breaker(object): 11 | """ Base class to segment text.""" 12 | 13 | def __init__(self, locale): 14 | self.locale = Locale(locale) 15 | self.breaker = None 16 | 17 | def transform(self, sequence): 18 | seq = Sequence(sequence.text) 19 | seq.idx = [0] 20 | for segment in sequence: 21 | offset = seq.idx[-1] 22 | self.breaker.setText(segment) 23 | seq.idx.extend([offset+x for x in self.breaker]) 24 | return seq 25 | 26 | 27 | class SentenceTokenizer(Breaker): 28 | """ Segment text to sentences. """ 29 | 30 | def __init__(self, locale='en'): 31 | super(SentenceTokenizer, self).__init__(locale) 32 | self.breaker = BreakIterator.createSentenceInstance(self.locale) 33 | 34 | 35 | class WordTokenizer(Breaker): 36 | """ Segment text to words or tokens.""" 37 | 38 | def __init__(self, locale='en'): 39 | super(WordTokenizer, self).__init__(locale) 40 | self.breaker = BreakIterator.createWordInstance(self.locale) 41 | -------------------------------------------------------------------------------- /docs/Installation.rst: -------------------------------------------------------------------------------- 1 | 2 | Installation 3 | ============ 4 | 5 | Installing/Upgrading From the PyPI 6 | ---------------------------------- 7 | 8 | :: 9 | 10 | $ pip install polyglot 11 | 12 | Dependencies 13 | ~~~~~~~~~~~~ 14 | 15 | polyglot depends on `numpy <http://www.numpy.org/>`__ and 16 | `libicu-dev <https://packages.debian.org/sid/libicu-dev>`__. On 17 | Ubuntu/Debian Linux distributions you can install these packages by 18 | executing the following command: 19 | 20 | .. code:: bash 21 | 22 | sudo apt-get install python-numpy libicu-dev 23 | 24 | From Source 25 | ----------- 26 | 27 | polyglot is actively developed on 28 | `Github <https://github.com/aboSamoor/polyglot>`__. 29 | 30 | You can clone the public repo: 31 | 32 | .. code:: bash 33 | 34 | git clone https://github.com/aboSamoor/polyglot 35 | 36 | Or download one of the following: 37 | 38 | - `tarball <https://github.com/aboSamoor/polyglot/tarball/master>`__ 39 | - `zipball <https://github.com/aboSamoor/polyglot/zipball/master>`__ 40 | 41 | Once you have the source, you can install it into your site-packages 42 | with: 43 | 44 | .. code:: bash 45 | 46 | python setup.py install 47 | 48 | Get the bleeding edge version 49 | ----------------------------- 50 | 51 | To get the latest development version of polyglot, run: 52 | 53 | :: 54 | 55 | $ pip install -U git+https://github.com/aboSamoor/polyglot.git@master 56 | 57 | Python 58 | ~~~~~~ 59 | 60 | polyglot supports Python >=2.7 or >=3.4.
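To verify that the installation worked, a minimal sanity check is to import
the package and print its version:

.. code:: python

    import polyglot
    print(polyglot.__version__)

Models used by the individual tasks (embeddings, taggers, transliteration)
are downloaded separately through the polyglot downloader covered in the
following sections.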
61 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: clean-pyc clean-build docs clean 2 | 3 | help: 4 | @echo "clean-build - remove build artifacts" 5 | @echo "clean-pyc - remove Python file artifacts" 6 | @echo "lint - check style with flake8" 7 | @echo "test - run tests quickly with the default Python" 8 | @echo "test-all - run tests on every Python version with tox" 9 | @echo "coverage - check code coverage quickly with the default Python" 10 | @echo "docs - generate Sphinx HTML documentation, including API docs" 11 | @echo "release - package and upload a release" 12 | @echo "dist - package" 13 | 14 | clean: clean-build clean-pyc 15 | rm -fr htmlcov/ 16 | 17 | clean-build: 18 | rm -fr build/ 19 | rm -fr dist/ 20 | rm -fr *.egg-info 21 | 22 | clean-pyc: 23 | find . -name '*.pyc' -exec rm -f {} + 24 | find . -name '*.pyo' -exec rm -f {} + 25 | find . -name '*~' -exec rm -f {} + 26 | 27 | lint: 28 | flake8 polyglot tests 29 | 30 | test: 31 | python setup.py test 32 | 33 | test-all: 34 | tox 35 | 36 | coverage: 37 | coverage run --source polyglot setup.py test 38 | coverage report -m 39 | coverage html 40 | open htmlcov/index.html 41 | 42 | docs: 43 | ./nb2rst.sh 44 | rm -f docs/polyglot.rst 45 | rm -f docs/modules.rst 46 | sphinx-apidoc -o docs/ polyglot 47 | $(MAKE) -C docs clean 48 | $(MAKE) -C docs html 49 | rm -f docs/*tests*rst 50 | xdg-open docs/_build/html/index.html 51 | 52 | release: clean 53 | python setup.py sdist upload 54 | python setup.py bdist_wheel upload 55 | 56 | dist: clean 57 | python setup.py sdist 58 | python setup.py bdist_wheel 59 | ls -l dist 60 | -------------------------------------------------------------------------------- /docs/polyglot.rst: -------------------------------------------------------------------------------- 1 | polyglot package 2 | ================ 3 | 4 | Subpackages 5 | ----------- 6 | 7 | .. toctree:: 8 | 9 | polyglot.detect 10 | polyglot.mapping 11 | polyglot.tag 12 | polyglot.tokenize 13 | polyglot.transliteration 14 | 15 | Submodules 16 | ---------- 17 | 18 | polyglot.base module 19 | -------------------- 20 | 21 | .. automodule:: polyglot.base 22 | :members: 23 | :undoc-members: 24 | :show-inheritance: 25 | 26 | polyglot.decorators module 27 | -------------------------- 28 | 29 | .. automodule:: polyglot.decorators 30 | :members: 31 | :undoc-members: 32 | :show-inheritance: 33 | 34 | polyglot.downloader module 35 | -------------------------- 36 | 37 | .. automodule:: polyglot.downloader 38 | :members: 39 | :undoc-members: 40 | :show-inheritance: 41 | 42 | polyglot.load module 43 | -------------------- 44 | 45 | .. automodule:: polyglot.load 46 | :members: 47 | :undoc-members: 48 | :show-inheritance: 49 | 50 | polyglot.mixins module 51 | ---------------------- 52 | 53 | .. automodule:: polyglot.mixins 54 | :members: 55 | :undoc-members: 56 | :show-inheritance: 57 | 58 | polyglot.text module 59 | -------------------- 60 | 61 | .. automodule:: polyglot.text 62 | :members: 63 | :undoc-members: 64 | :show-inheritance: 65 | 66 | polyglot.utils module 67 | --------------------- 68 | 69 | .. automodule:: polyglot.utils 70 | :members: 71 | :undoc-members: 72 | :show-inheritance: 73 | 74 | 75 | Module contents 76 | --------------- 77 | 78 | .. 
automodule:: polyglot 79 | :members: 80 | :undoc-members: 81 | :show-inheritance: 82 | -------------------------------------------------------------------------------- /notebooks/TODO.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Tasks\n", 8 | "\n", 9 | "- ~~POS~~\n", 10 | "- ~~morphological analysis~~\n", 11 | "- ~~transliteration~~\n", 12 | "\n", 13 | "# Library Interface\n", 14 | "- ~~Sentiment~~\n", 15 | "- ~~NER~~\n", 16 | "- Frequency based comparison\n", 17 | "\n", 18 | "\n", 19 | "# Command Line interface\n", 20 | "\n", 21 | "- Sentiment\n", 22 | "- Reading stdin column format\n", 23 | "\n", 24 | "\n", 25 | "# Infrastructure\n", 26 | "- ~~Cache models~~\n", 27 | "- ~~Add normalization to the embeddings~~\n", 28 | "- ~~Detect supported languages~~\n", 29 | " - ~~added task/lang as part of the identifier, what is left is to iterate over the collections.~~\n", 30 | "- ~~Throw different exception for missing package than undownloaded one~~\n", 31 | "- ~~Define NotSupportedLanguage/Task Exception for the downloader~~\n", 32 | "- ~~Remove noun phrases support.~~\n", 33 | "- ~~Train more/new POS taggers~~\n", 34 | "\n", 35 | "\n", 36 | "# Documentation\n", 37 | "\n", 38 | "- Add a quick tutorial\n", 39 | "- Embed demos in our documentation\n", 40 | "- ~~pycld2 README~~\n", 41 | "- ~~Update rtdcs with the new submodules.~~" 42 | ] 43 | } 44 | ], 45 | "metadata": { 46 | "kernelspec": { 47 | "display_name": "Python 2", 48 | "language": "python", 49 | "name": "python2" 50 | }, 51 | "language_info": { 52 | "codemirror_mode": { 53 | "name": "ipython", 54 | "version": 2 55 | }, 56 | "file_extension": ".py", 57 | "mimetype": "text/x-python", 58 | "name": "python", 59 | "nbconvert_exporter": "python", 60 | "pygments_lexer": "ipython2", 61 | "version": "2.7.6" 62 | } 63 | }, 64 | "nbformat": 4, 65 | "nbformat_minor": 0 66 | } 67 | -------------------------------------------------------------------------------- /polyglot/mapping/tests/test_embeddings.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """Test basic embedding utilities.""" 5 | 6 | import unittest 7 | from ..embeddings import Embedding 8 | 9 | from io import StringIO 10 | 11 | word2vec_dump = u""" 12 | 9 5 13 | 0.001329 -0.000965 -0.001856 -0.000425 -0.000381 14 | the -0.144928 0.074345 -0.069327 -0.017698 0.090774 15 | , -0.022361 -0.033252 -0.000350 -0.027688 -0.025736 16 | . 
0.006878 0.064503 0.074926 -0.048397 -0.041165 17 | of 0.182565 0.125933 0.065001 -0.004585 0.164688 18 | and 0.013473 0.012923 0.027855 0.046051 -0.043293 19 | in -0.003114 -0.126757 0.099654 0.059442 0.003293 20 | to 0.223011 -0.080497 -0.083754 -0.182311 0.057853 21 | a -0.136669 0.161203 0.192028 0.068527 0.292363 22 | """.strip() 23 | 24 | 25 | class EmbeddingTest(unittest.TestCase): 26 | def setUp(self): 27 | self.fname = StringIO(word2vec_dump) 28 | self.model = Embedding.from_word2vec(self.fname, binary=0, fvocab=None) 29 | self.words = ["", "the", ",", ".", "of", "and", "in", "to", "a"] 30 | 31 | def tearDown(self): 32 | pass 33 | 34 | def test_model_words(self): 35 | self.assertEqual(self.model.words, self.words) 36 | self.assertAlmostEqual(self.model[self.words[-1]][-1], 0.292363) 37 | 38 | def test_most_frequent(self): 39 | model = self.model.most_frequent(3) 40 | self.assertEqual(model.words, self.words[:3]) 41 | self.assertEqual(model.shape, (3, 5)) 42 | 43 | def test_model_shape(self): 44 | self.assertEqual(self.model.shape, (9, 5)) 45 | 46 | def test_deletion(self): 47 | del self.model[self.words[5]] 48 | self.assertEqual(self.model.shape, (8, 5)) 49 | self.assertEqual(self.model.words, self.words[:5]+self.words[6:]) 50 | self.assertFalse(self.words[5] in self.model) 51 | 52 | def test_word_with_space(self): 53 | new_dump = word2vec_dump.replace("9", "10") + u"\na b 1.0 2.0 3.0 4.0 5.0" 54 | fname = StringIO(new_dump) 55 | model = Embedding.from_word2vec(fname, binary=0, fvocab=None) 56 | self.assertEqual(model.words[-1], u"a b") 57 | 58 | def test_norm(self): 59 | model = self.model.normalize_words() 60 | norms = (model.vectors ** 2).sum(axis=1) 61 | _ = [self.assertAlmostEqual(x,y, places=6) for x,y in zip(norms, [1.]*model.shape[0])] 62 | 63 | 64 | if __name__ == "__main__": 65 | unittest.main() 66 | -------------------------------------------------------------------------------- /polyglot/utils.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """Collection of general utilities.""" 5 | 6 | from __future__ import print_function 7 | from os import path 8 | import os 9 | import tarfile 10 | 11 | import six 12 | from six import text_type as unicode 13 | from six import string_types 14 | 15 | 16 | def _open(file_, mode='r'): 17 | """Open file object given filenames, open files or even archives.""" 18 | if isinstance(file_, string_types): 19 | _, ext = path.splitext(file_) 20 | if ext in {'.bz2', '.gz'}: 21 | s = tarfile.open(file_) 22 | return s.extractfile(s.next()) 23 | else: 24 | return open(file_, mode) 25 | return file_ 26 | 27 | 28 | def _print(text): 29 | """Handle the differences between Pytho2,3 print functions. 30 | Args: 31 | text (string): Should be in unicode. 
32 | """ 33 | if six.PY3: 34 | print(text) 35 | else: 36 | print(text.encode("utf8")) 37 | 38 | def _pickle_method(method): 39 | """Pickle methods properly, including class methods.""" 40 | func_name = method.im_func.__name__ 41 | obj = method.im_self 42 | cls = method.im_class 43 | if isinstance(cls, type): 44 | # handle classmethods differently 45 | cls = obj 46 | obj = None 47 | if func_name.startswith('__') and not func_name.endswith('__'): 48 | #deal with mangled names 49 | cls_name = cls.__name__.lstrip('_') 50 | func_name = '_%s%s' % (cls_name, func_name) 51 | return _unpickle_method, (func_name, obj, cls) 52 | 53 | def _unpickle_method(func_name, obj, cls): 54 | """Unpickle methods properly, including class methods.""" 55 | 56 | if obj is None: 57 | return cls.__dict__[func_name].__get__(obj, cls) 58 | for cls in cls.__mro__: 59 | try: 60 | func = cls.__dict__[func_name] 61 | except KeyError: 62 | pass 63 | else: 64 | break 65 | return func.__get__(obj, cls) 66 | 67 | def pretty_list(items, cols=3): 68 | text = [] 69 | width = 24 70 | col_width = u"{" + u":<" + str(width) + u"} " 71 | for i, lang in enumerate(items): 72 | if not six.PY3: 73 | lang = lang.decode(u"utf-8") 74 | if len(lang) > width: 75 | lang = lang[:width-3] + "..." 76 | text.append(u"{:>3}. ".format(i+1)) 77 | text.append(col_width.format(lang)) 78 | if (i+1) % cols == 0: 79 | text.append(u"\n") 80 | return u"".join(text) 81 | -------------------------------------------------------------------------------- /polyglot/mapping/tests/test_expansion.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """Test Expanding vocbulary.""" 5 | 6 | import unittest 7 | from io import StringIO 8 | 9 | from ..base import OrderedVocabulary 10 | from ..expansion import DigitExpander, CaseExpander 11 | 12 | 13 | vocab = u""" 14 | the 15 | book 16 | Book 17 | 3 18 | upper 19 | lower 20 | 5 21 | cool 22 | McCain 23 | """.strip() 24 | 25 | 26 | class DigitExpanderTest(unittest.TestCase): 27 | def setUp(self): 28 | self.v = OrderedVocabulary.from_vocabfile(StringIO(vocab)) 29 | 30 | def test_load(self): 31 | self.assertEqual(len(self.v), 9) 32 | 33 | def test_digit_expansion(self): 34 | v = DigitExpander(vocabulary=self.v, strategy='most_frequent') 35 | self.assertEqual(len(v), 10) 36 | 37 | def test_digit_membership(self): 38 | v = DigitExpander(vocabulary=self.v, strategy='most_frequent') 39 | self.assertTrue(u"8" in v) 40 | self.assertTrue(u"3" in v) 41 | self.assertFalse(u"71" in v) 42 | 43 | def test_digit_ids(self): 44 | v = DigitExpander(vocabulary=self.v, strategy='most_frequent') 45 | self.assertEqual(v["6"], 3) 46 | self.assertEqual(v["7"], v["2"]) 47 | self.assertNotEqual(v["3"], v["5"]) 48 | 49 | class CaseExpanderTest(unittest.TestCase): 50 | def setUp(self): 51 | self.v = OrderedVocabulary.from_vocabfile(StringIO(vocab)) 52 | 53 | def test_load(self): 54 | self.assertEqual(len(self.v), 9) 55 | 56 | def test_case_expansion(self): 57 | v = CaseExpander(vocabulary=self.v, strategy='most_frequent') 58 | self.assertEqual(len(v), 21) 59 | 60 | def test_digit_membership(self): 61 | v = CaseExpander(vocabulary=self.v, strategy='most_frequent') 62 | self.assertTrue(u"3" in v) 63 | self.assertTrue(u"BOOK" in v) 64 | self.assertTrue(u"mccain" in v) 65 | 66 | def test_digit_ids(self): 67 | v = CaseExpander(vocabulary=self.v, strategy='most_frequent') 68 | self.assertEqual(v["THE"], 0) 69 | self.assertEqual(v["UPPER"], v["upper"]) 70 | 71 | 
class MixedExpansionTest(unittest.TestCase): 72 | def setUp(self): 73 | self.v = OrderedVocabulary.from_vocabfile(StringIO(vocab)) 74 | self.v1 = CaseExpander(vocabulary=self.v, strategy='most_frequent') 75 | self.v2 = DigitExpander(vocabulary=self.v1, strategy='most_frequent') 76 | 77 | def test_expansion(self): 78 | self.assertEqual(len(self.v2), 22) 79 | 80 | def test_membership(self): 81 | self.assertTrue(u"3" in self.v2) 82 | self.assertTrue(u"9" in self.v2) 83 | self.assertTrue(u"#" in self.v2) 84 | self.assertTrue(u"BOOK" in self.v2) 85 | self.assertTrue(u"mccain" in self.v2) 86 | 87 | def test_ids(self): 88 | self.assertEqual(self.v2["THE"], 0) 89 | self.assertEqual(self.v2["UPPER"], self.v2["upper"]) 90 | self.assertEqual(self.v2["3"], self.v2["7"]) 91 | 92 | 93 | if __name__ == "__main__": 94 | unittest.main() 95 | -------------------------------------------------------------------------------- /docs/sphinxext/github_link.py: -------------------------------------------------------------------------------- 1 | from operator import attrgetter 2 | import inspect 3 | import subprocess 4 | import os 5 | import sys 6 | from functools import partial 7 | 8 | REVISION_CMD = 'git rev-parse --short HEAD' 9 | 10 | 11 | def _get_git_revision(): 12 | try: 13 | revision = subprocess.check_output(REVISION_CMD.split()).strip() 14 | except subprocess.CalledProcessError: 15 | print('Failed to execute git to get revision') 16 | return None 17 | return revision.decode('utf-8') 18 | 19 | 20 | def _linkcode_resolve(domain, info, package, url_fmt, revision): 21 | """Determine a link to online source for a class/method/function 22 | 23 | This is called by sphinx.ext.linkcode 24 | 25 | An example with a long-untouched module that everyone has 26 | >>> _linkcode_resolve('py', {'module': 'tty', 27 | ... 'fullname': 'setraw'}, 28 | ... package='tty', 29 | ... url_fmt='http://hg.python.org/cpython/file/' 30 | ... '{revision}/Lib/{package}/{path}#L{lineno}', 31 | ... 
revision='xxxx') 32 | 'http://hg.python.org/cpython/file/xxxx/Lib/tty/tty.py#L18' 33 | """ 34 | 35 | if revision is None: 36 | return 37 | if domain not in ('py', 'pyx'): 38 | return 39 | if not info.get('module') or not info.get('fullname'): 40 | return 41 | 42 | class_name = info['fullname'].split('.')[0] 43 | if type(class_name) != str: 44 | # Python 2 only 45 | class_name = class_name.encode('utf-8') 46 | module = __import__(info['module'], fromlist=[class_name]) 47 | try: 48 | obj = attrgetter(info['fullname'])(module) 49 | except AttributeError: 50 | return 51 | 52 | try: 53 | fn = inspect.getsourcefile(obj) 54 | except Exception: 55 | fn = None 56 | if not fn: 57 | try: 58 | fn = inspect.getsourcefile(sys.modules[obj.__module__]) 59 | except Exception: 60 | fn = None 61 | if not fn: 62 | return 63 | 64 | fn = os.path.relpath(fn, 65 | start=os.path.dirname(__import__(package).__file__)) 66 | try: 67 | lineno = inspect.getsourcelines(obj)[1] 68 | except Exception: 69 | lineno = '' 70 | return url_fmt.format(revision=revision, package=package, 71 | path=fn, lineno=lineno) 72 | 73 | 74 | def make_linkcode_resolve(package, url_fmt): 75 | """Returns a linkcode_resolve function for the given URL format 76 | 77 | revision is a git commit reference (hash or name) 78 | 79 | package is the name of the root module of the package 80 | 81 | url_fmt is along the lines of ('https://github.com/USER/PROJECT/' 82 | 'blob/{revision}/{package}/' 83 | '{path}#L{lineno}') 84 | """ 85 | revision = _get_git_revision() 86 | return partial(_linkcode_resolve, revision=revision, package=package, 87 | url_fmt=url_fmt) 88 | -------------------------------------------------------------------------------- /polyglot/detect/base.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | 5 | """Detecting languages""" 6 | 7 | 8 | import logging 9 | 10 | 11 | from icu import Locale 12 | import pycld2 as cld2 13 | 14 | logger = logging.getLogger(__name__) 15 | 16 | 17 | class Error(Exception): 18 | """Base exception class for this class.""" 19 | 20 | 21 | class UnknownLanguage(Error): 22 | """Raised if we can not detect the language of a text snippet.""" 23 | 24 | 25 | class Language(object): 26 | def __init__(self, choice): 27 | basic_name, code, confidence, bytesize = choice 28 | self.locale = Locale(code) 29 | self.confidence = float(confidence) 30 | self.read_bytes = int(bytesize) 31 | 32 | @property 33 | def name(self): 34 | return self.locale.getDisplayLanguage() 35 | 36 | @property 37 | def code(self): 38 | return self.locale.getName() 39 | 40 | def __str__(self): 41 | return ("name: {:<12}code: {:<9}confidence: {:>5.1f} " 42 | "read bytes:{:>6}".format(self.name, self.code, 43 | self.confidence, self.read_bytes)) 44 | 45 | @staticmethod 46 | def from_code(code): 47 | return Language(("", code, 100, 0)) 48 | 49 | 50 | class Detector(object): 51 | """ Detect the language used in a snippet of text. 52 | """ 53 | 54 | def __init__(self, text, quiet=False): 55 | """ Detector of the language used in `text`. 56 | 57 | Args: 58 | text (string): unicode string. 
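Example (an illustrative sketch; the actual prediction depends on the
        pycld2 models backing the detector):

            detector = Detector(u"Bonjour tout le monde")
            print(detector.language.name)        # expected: French
            print(detector.language.confidence)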
59 | """ 60 | self.__text = text 61 | self.reliable = True 62 | """False if the detector used Best Effort strategy in detection.""" 63 | self.quiet = quiet 64 | """If true, exceptions will be silenced.""" 65 | self.detect(text) 66 | 67 | @staticmethod 68 | def supported_languages(): 69 | """Returns a list of the languages that can be detected by pycld2.""" 70 | return [name.capitalize() for name,code in cld2.LANGUAGES if not name.startswith("X_")] 71 | 72 | def detect(self, text): 73 | """Decide which language is used to write the text. 74 | 75 | The method tries first to detect the language with high reliability. If 76 | that is not possible, the method switches to best effort strategy. 77 | 78 | 79 | Args: 80 | text (string): A snippet of text, the longer it is the more reliable we 81 | can detect the language used to write the text. 82 | """ 83 | t = text.encode("utf-8") 84 | reliable, index, top_3_choices = cld2.detect(t, bestEffort=False) 85 | 86 | if not reliable: 87 | self.reliable = False 88 | reliable, index, top_3_choices = cld2.detect(t, bestEffort=True) 89 | 90 | if not reliable and not self.quiet: 91 | raise UnknownLanguage("Try passing a longer snippet of text") 92 | else: 93 | logger.warning("Detector is not able to detect the language reliably.") 94 | 95 | self.languages = [Language(x) for x in top_3_choices] 96 | self.language = self.languages[0] 97 | return self.language 98 | 99 | def __str__(self): 100 | text = "Prediction is reliable: {}\n".format(self.reliable) 101 | text += u"\n".join(["Language {}: {}".format(i+1, str(l)) 102 | for i,l in enumerate(self.languages)]) 103 | return text 104 | -------------------------------------------------------------------------------- /notebooks/Installation.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "#Installation" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "## Installing/Upgrading From the PyPI\n", 15 | "\n", 16 | "\n", 17 | " $ pip install polyglot" 18 | ] 19 | }, 20 | { 21 | "cell_type": "markdown", 22 | "metadata": {}, 23 | "source": [ 24 | "### Dependencies\n", 25 | "\n", 26 | "polyglot depends on [numpy](http://www.numpy.org/) and [libicu-dev](https://packages.debian.org/sid/libicu-dev), on ubuntu/debian linux distribution you can install such packages by executing the following command:" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": null, 32 | "metadata": { 33 | "collapsed": false 34 | }, 35 | "outputs": [], 36 | "source": [ 37 | "sudo apt-get install python-numpy libicu-dev" 38 | ] 39 | }, 40 | { 41 | "cell_type": "markdown", 42 | "metadata": {}, 43 | "source": [ 44 | "## From Source\n", 45 | "\n", 46 | "\n", 47 | "polyglot is actively developed on\n", 48 | "[Github](https://github.com/aboSamoor/polyglot).\n", 49 | "\n", 50 | "You can clone the public repo:" 51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": null, 56 | "metadata": { 57 | "collapsed": true 58 | }, 59 | "outputs": [], 60 | "source": [ 61 | "git clone https://github.com/aboSamoor/polyglot" 62 | ] 63 | }, 64 | { 65 | "cell_type": "markdown", 66 | "metadata": {}, 67 | "source": [ 68 | "Or download one of the following:\n", 69 | "\n", 70 | "- [tarball](https://github.com/aboSamoor/polyglot/tarball/master)\n", 71 | "- [zipball](https://github.com/aboSamoor/polyglot/zipball/master)" 72 | ] 73 | }, 74 | { 75 | "cell_type": 
"markdown", 76 | "metadata": {}, 77 | "source": [ 78 | "Once you have the source, you can install it into your site-packages\n", 79 | "with:" 80 | ] 81 | }, 82 | { 83 | "cell_type": "code", 84 | "execution_count": null, 85 | "metadata": { 86 | "collapsed": true 87 | }, 88 | "outputs": [], 89 | "source": [ 90 | "python setup.py install" 91 | ] 92 | }, 93 | { 94 | "cell_type": "markdown", 95 | "metadata": {}, 96 | "source": [ 97 | "## Get the bleeding edge version\n", 98 | "\n", 99 | "To get the latest development version of polyglot, run :\n", 100 | "\n", 101 | " $ pip install -U git+https://github.com/aboSamoor/polyglot.git@master" 102 | ] 103 | }, 104 | { 105 | "cell_type": "markdown", 106 | "metadata": {}, 107 | "source": [ 108 | "### Python\n", 109 | "\n", 110 | "polyglot supports Python \\>=2.7 or \\>=3.4." 111 | ] 112 | } 113 | ], 114 | "metadata": { 115 | "kernelspec": { 116 | "display_name": "Python 2", 117 | "language": "python", 118 | "name": "python2" 119 | }, 120 | "language_info": { 121 | "codemirror_mode": { 122 | "name": "ipython", 123 | "version": 2 124 | }, 125 | "file_extension": ".py", 126 | "mimetype": "text/x-python", 127 | "name": "python", 128 | "nbconvert_exporter": "python", 129 | "pygments_lexer": "ipython2", 130 | "version": "2.7.6" 131 | } 132 | }, 133 | "nbformat": 4, 134 | "nbformat_minor": 0 135 | } 136 | -------------------------------------------------------------------------------- /polyglot/mapping/expansion.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | from .base import OrderedVocabulary 5 | from collections import defaultdict 6 | from six import iteritems 7 | import re 8 | import logging 9 | 10 | logger = logging.getLogger(__name__) 11 | 12 | class VocabExpander(OrderedVocabulary): 13 | def __init__(self, vocabulary, formatters, strategy): 14 | super(VocabExpander, self).__init__(vocabulary.words) 15 | self.strategy = strategy 16 | self._vocab = vocabulary 17 | self.aux_word_id = defaultdict(lambda: []) 18 | self.formatters = formatters 19 | self.expand(formatters) 20 | self.aux_id_word = {id_:w for w, id_ in iteritems(self.aux_word_id)} 21 | 22 | def __getitem__(self, key): 23 | try: 24 | return self._vocab[key] 25 | except KeyError as e: 26 | try: 27 | return self.aux_word_id[key] 28 | except KeyError as e: 29 | return self.approximate_ids(key) 30 | 31 | def __contains__(self, key): 32 | return ((key in self._vocab) or 33 | (key in self.aux_word_id) or 34 | self.approximate(key)) 35 | 36 | def __len__(self): 37 | return len(self._vocab) + len(self.aux_word_id) 38 | 39 | def __delitem__(self): 40 | raise NotImplementedError("It is quite complex, let us do it in the future") 41 | 42 | def format(self, w): 43 | return [f(w) for f in self.formatters] 44 | 45 | def approximate(self, w): 46 | f = lambda key: (key in self._vocab) or (key in self.aux_word_id) 47 | return {w_:self[w_] for w_ in self.format(w) if f(w_)} 48 | 49 | def approximate_ids(self, key): 50 | ids = [id_ for w, id_ in self.approximate(key).items()] 51 | if not ids: 52 | raise KeyError(u"{} not found".format(key)) 53 | else: 54 | if self.strategy == 'most_frequent': 55 | return min(ids) 56 | else: 57 | return tuple(sorted(ids)) 58 | 59 | def _expand(self, formatter): 60 | for w in self.word_id: 61 | w_ = formatter(w) 62 | if w_ not in self._vocab: 63 | id_ = self.word_id[w] 64 | self.aux_word_id[w_].append(id_) 65 | 66 | def expand(self, formatters): 67 | for formatter in formatters: 68 | 
self._expand(formatter) 69 | if self.strategy == 'average': 70 | self.aux_word_id = {w: tuple(sorted(ids)) for w, ids in iteritems(self.aux_word_id)} 71 | elif self.strategy == 'most_frequent': 72 | self.aux_word_id = {w: min(ids) for w, ids in iteritems(self.aux_word_id)} 73 | else: 74 | raise ValueError("A strategy is needed") 75 | 76 | words_added = self.aux_word_id.keys() 77 | old_no = len(self._vocab) 78 | new_no = len(self.aux_word_id) 79 | logger.info("We have {} original words.".format(old_no)) 80 | logger.info("Added {} new words.".format(new_no)) 81 | logger.info("The new total number of words is {}".format(len(self))) 82 | logger.debug(u"Words added\n{}\n".format(u" ".join(words_added))) 83 | 84 | 85 | class CaseExpander(VocabExpander): 86 | def __init__(self, vocabulary, strategy='most_frequent'): 87 | formatters = [lambda x: x.lower(), 88 | lambda x: x.title(), 89 | lambda x: x.upper()] 90 | super(CaseExpander, self).__init__(vocabulary=vocabulary, formatters=formatters, strategy=strategy) 91 | 92 | 93 | class DigitExpander(VocabExpander): 94 | def __init__(self, vocabulary, strategy='most_frequent'): 95 | pattern = re.compile("[0-9]", flags=re.UNICODE) 96 | formatters = [lambda x: pattern.sub("#", x)] 97 | super(DigitExpander, self).__init__(vocabulary=vocabulary, formatters=formatters, strategy=strategy) 98 | -------------------------------------------------------------------------------- /CONTRIBUTING.rst: -------------------------------------------------------------------------------- 1 | ============ 2 | Contributing 3 | ============ 4 | 5 | Contributions are welcome, and they are greatly appreciated! Every 6 | little bit helps, and credit will always be given. 7 | 8 | You can contribute in many ways: 9 | 10 | Types of Contributions 11 | ---------------------- 12 | 13 | Report Bugs 14 | ~~~~~~~~~~~ 15 | 16 | Report bugs at https://github.com/aboSamoor/polyglot/issues. 17 | 18 | If you are reporting a bug, please include: 19 | 20 | * Your operating system name and version. 21 | * Any details about your local setup that might be helpful in troubleshooting. 22 | * Detailed steps to reproduce the bug. 23 | 24 | Fix Bugs 25 | ~~~~~~~~ 26 | 27 | Look through the GitHub issues for bugs. Anything tagged with "bug" 28 | is open to whoever wants to implement it. 29 | 30 | Implement Features 31 | ~~~~~~~~~~~~~~~~~~ 32 | 33 | Look through the GitHub issues for features. Anything tagged with "feature" 34 | is open to whoever wants to implement it. 35 | 36 | Write Documentation 37 | ~~~~~~~~~~~~~~~~~~~ 38 | 39 | polyglot could always use more documentation, whether as part of the 40 | official polyglot docs, in docstrings, or even on the web in blog posts, 41 | articles, and such. 42 | 43 | Submit Feedback 44 | ~~~~~~~~~~~~~~~ 45 | 46 | The best way to send feedback is to file an issue at https://github.com/aboSamoor/polyglot/issues. 47 | 48 | If you are proposing a feature: 49 | 50 | * Explain in detail how it would work. 51 | * Keep the scope as narrow as possible, to make it easier to implement. 52 | * Remember that this is a volunteer-driven project, and that contributions 53 | are welcome :) 54 | 55 | Get Started! 56 | ------------ 57 | 58 | Ready to contribute? Here's how to set up `polyglot` for local development. 59 | 60 | 1. Fork the `polyglot` repo on GitHub. 61 | 2. Clone your fork locally:: 62 | 63 | $ git clone git@github.com:your_name_here/polyglot.git 64 | 65 | 3. Install your local copy into a virtualenv. 
Assuming you have virtualenvwrapper installed, this is how you set up your fork for local development:: 66 | 67 | $ mkvirtualenv polyglot 68 | $ cd polyglot/ 69 | $ python setup.py develop 70 | 71 | 4. Create a branch for local development:: 72 | 73 | $ git checkout -b name-of-your-bugfix-or-feature 74 | 75 | Now you can make your changes locally. 76 | 77 | 5. When you're done making changes, check that your changes pass flake8 and the tests, including testing other Python versions with tox:: 78 | 79 | $ flake8 polyglot tests 80 | $ python setup.py test 81 | $ tox 82 | 83 | To get flake8 and tox, just pip install them into your virtualenv. 84 | 85 | 6. Commit your changes and push your branch to GitHub:: 86 | 87 | $ git add . 88 | $ git commit -m "Your detailed description of your changes." 89 | $ git push origin name-of-your-bugfix-or-feature 90 | 91 | 7. Submit a pull request through the GitHub website. 92 | 93 | Pull Request Guidelines 94 | ----------------------- 95 | 96 | Before you submit a pull request, check that it meets these guidelines: 97 | 98 | 1. The pull request should include tests. 99 | 2. If the pull request adds functionality, the docs should be updated. Put 100 | your new functionality into a function with a docstring, and add the 101 | feature to the list in README.rst. 102 | 3. The pull request should work for Python 2.6, 2.7, and 3.3, 3.4, and for PyPy. Check 103 | https://travis-ci.org/aboSamoor/polyglot/pull_requests 104 | and make sure that the tests pass for all supported Python versions. 105 | 106 | Tips 107 | ---- 108 | 109 | To run a subset of tests:: 110 | 111 | $ python -m unittest tests.test_polyglot -------------------------------------------------------------------------------- /polyglot/tokenize/tests/test_base.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ Test basic tokenization utilities.""" 5 | 6 | import unittest 7 | from ..base import SentenceTokenizer, WordTokenizer 8 | from ...base import Sequence 9 | 10 | en_text = u"""A Ukrainian separatist leader is calling on Russia to "absorb" the eastern region of Donetsk after Sunday's referendum on self rule. Self-declared Donetsk People's Republic leader Denis Pushilin urged Moscow to listen to the "will of the people". In neighbouring Luhansk, where a vote was also held, rebels declared independence. Ukraine, the EU and US have declared the referendums illegal but Russia says the results should be "implemented". Moscow has so far not commented on the call for Donetsk to become part of Russia but has appealed for dialogue between the militants and Kiev, with the participation of the Organisation for Security and Co-operation in Europe. 11 | """ 12 | 13 | ar_text = u"""عبر أحد قادة المتمردين الموالين لروسيا في أوكرانيا عن مساندته لفكرة الوحدة مع روسيا في أعقاب الإعلان عن نتائج الاستفتاء المثير للجدل في شرق البلاد. وقال رومان لياجين، رئيس لجنة المتمردين للانتخابات في دونيتسك إن الانضمام لروسيا "قد يكون خطوة مناسبة". 
14 | """ 15 | 16 | ja_text = u"""やった!""" 17 | 18 | 19 | class BaseTest(unittest.TestCase): 20 | def setUp(self): 21 | self.en_seq = Sequence(en_text) 22 | self.ar_seq = Sequence(ar_text) 23 | self.ja_seq = Sequence(ja_text) 24 | 25 | self.en_sent = SentenceTokenizer(locale='en') 26 | self.ar_sent = SentenceTokenizer(locale='ar') 27 | self.ja_sent = SentenceTokenizer(locale='ja') 28 | 29 | self.en_word = WordTokenizer(locale='en') 30 | self.ar_word = WordTokenizer(locale='ar') 31 | self.ja_word = WordTokenizer(locale='ja') 32 | 33 | self.en_sents = self.en_sent.transform(self.en_seq) 34 | self.ar_sents = self.ar_sent.transform(self.ar_seq) 35 | self.ja_sents = self.ja_sent.transform(self.ja_seq) 36 | 37 | self.en_words = self.en_word.transform(self.en_seq) 38 | self.ar_words = self.ar_word.transform(self.ar_seq) 39 | self.ja_words = self.ja_word.transform(self.ja_seq) 40 | 41 | def tearDown(self): 42 | pass 43 | 44 | def test_sentences_count(self): 45 | """ Sentence segmentation produces correct number of sentences.""" 46 | 47 | self.assertEqual(5, len(self.en_sents)) 48 | self.assertEqual(2, len(self.ar_sents)) 49 | self.assertEqual(1, len(self.ja_sents)) 50 | 51 | def test_redundant_idx(self): 52 | """ Test if there are redundant indices.""" 53 | 54 | self.assertEqual(len(self.en_sents.idx), len(set(self.en_sents.idx))) 55 | self.assertEqual(len(self.ar_sents.idx), len(set(self.ar_sents.idx))) 56 | self.assertEqual(len(self.ja_sents.idx), len(set(self.ja_sents.idx))) 57 | 58 | self.assertEqual(len(self.en_words.idx), len(set(self.en_words.idx))) 59 | self.assertEqual(len(self.ar_words.idx), len(set(self.ar_words.idx))) 60 | self.assertEqual(len(self.ja_words.idx), len(set(self.ja_words.idx))) 61 | 62 | def test_boundaries(self): 63 | """ Sentence boundaries should be also word boundaries.""" 64 | 65 | self.assertTrue(set(self.en_sents.idx).issubset(set(self.en_words.idx))) 66 | self.assertTrue(set(self.ar_sents.idx).issubset(set(self.ar_words.idx))) 67 | self.assertTrue(set(self.ja_sents.idx).issubset(set(self.ja_words.idx))) 68 | 69 | def test_transformations_equal(self): 70 | """ Word toeknization over text is equal to over sentences.""" 71 | 72 | idx1 = self.en_words.idx 73 | idx2 = self.en_word.transform(self.en_sents).idx 74 | self.assertListEqual(idx1, idx2) 75 | 76 | idx1 = self.ar_words.idx 77 | idx2 = self.ar_word.transform(self.ar_sents).idx 78 | self.assertListEqual(idx1, idx2) 79 | 80 | idx1 = self.ja_words.idx 81 | idx2 = self.ja_word.transform(self.ja_sents).idx 82 | self.assertListEqual(idx1, idx2) 83 | 84 | 85 | if __name__ == "__main__": 86 | unittest.main() 87 | -------------------------------------------------------------------------------- /polyglot/transliteration/base.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """Transliteration. 5 | 6 | Transliteration across pair of languages. 7 | 8 | """ 9 | 10 | from math import log 11 | 12 | from ..load import load_transliteration_table 13 | from ..decorators import cached_property 14 | 15 | 16 | class Transliterator(object): 17 | """Transliterator between pair of languages. """ 18 | 19 | def __init__(self, source_lang="en", target_lang="en"): 20 | """ 21 | Args: 22 | source_lang (string): language code of the input langauge. 23 | target_lang (string): language code of the generated output langauge. 
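Example (a minimal sketch; it assumes the transliteration table for the
        target language has already been downloaded):

            translit = Transliterator(source_lang="en", target_lang="ru")
            print(translit.transliterate(u"hello"))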
24 | """ 25 | self.source_lang = source_lang 26 | self.target_lang = target_lang 27 | 28 | self.decoder = self._decoder() 29 | """Transliterate a string from English to the target language.""" 30 | self.encoder = self._encoder() 31 | """Transliterate a string from the input language to English.""" 32 | 33 | def _decoder(self): 34 | """Transliterate a string from English to the target language.""" 35 | if self.target_lang == 'en': 36 | return Transliterator._dummy_coder 37 | else: 38 | weights = load_transliteration_table(self.target_lang) 39 | decoder_weights = weights["decoder"] 40 | return Transliterator._transliterate_string(decoder_weights) 41 | 42 | def _encoder(self): 43 | """Transliterate a string from the input language to English.""" 44 | if self.source_lang == 'en': 45 | return Transliterator._dummy_coder 46 | else: 47 | weights = load_transliteration_table(self.source_lang) 48 | encoder_weights = weights["encoder"] 49 | return Transliterator._transliterate_string(encoder_weights) 50 | 51 | @staticmethod 52 | def _dummy_coder(word): 53 | """Returns the string as it is, no transliteration is done.""" 54 | return word 55 | 56 | def transliterate(self, word): 57 | """Transliterate the word from its source language to the target one. 58 | 59 | The method works by encoding the word into English then decoding the new 60 | Enlgish word to the target language. 61 | """ 62 | encoded_word = self.encoder(word) 63 | decoded_word = self.decoder(encoded_word) 64 | return decoded_word 65 | 66 | @staticmethod 67 | def _transliterate_string(weight, ngram1=6, ngram2=6): 68 | def translate_string(word): 69 | unlimited5 = 99999 70 | # Convert input to lower case 71 | word = word.lower().strip() 72 | # Initialize bestk results 73 | best_source_string = [] 74 | best_target_string = [] 75 | best_string_cost = [] 76 | for i in range(len(word)+1): 77 | best_source_string.append('') 78 | best_target_string.append('') 79 | best_string_cost.append(unlimited5) 80 | # Only 1 initial state 81 | best_string_cost[0] = 0 82 | # Start DP to generate bestk results 83 | for i in range(1, len(word)+1): 84 | for j in range(1, ngram1+1): 85 | if i >= j: 86 | piece = word[i-j:i] 87 | for item in weight: 88 | if item[0].strip() == piece: 89 | vfinal = -log(weight[item]) 90 | if best_string_cost[i - j] < unlimited5: 91 | tmp_string_cost = best_string_cost[i - j] 92 | # Final cost value. 
93 | # Things need to be considered: 94 | # 1) Individual cost of tranliterating from piece to tar 95 | # 2) Length of piece and tar 96 | # 3) Prefix of piece 97 | # 4) Prefix of tar 98 | tmp_string_cost += vfinal 99 | if tmp_string_cost < best_string_cost[i]: 100 | tmp_source_string = best_source_string[i - j] + piece 101 | tmp_target_string = best_target_string[i - j] + item[1].strip() 102 | best_source_string[i] = tmp_source_string 103 | best_target_string[i] = tmp_target_string 104 | best_string_cost[i] = tmp_string_cost 105 | return best_target_string[len(word)] 106 | return translate_string 107 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import os 5 | import sys 6 | 7 | 8 | try: 9 | from setuptools import setup 10 | except ImportError: 11 | from distutils.core import setup 12 | 13 | 14 | with open('README.rst') as readme_file: 15 | readme = readme_file.read() 16 | 17 | with open('HISTORY.rst') as history_file: 18 | history = history_file.read().replace('.. :changelog:', '') 19 | 20 | packages = set(open("requirements.txt", "r").read().splitlines()) 21 | 22 | requirements = filter(lambda x: "http" not in x, packages) 23 | 24 | 25 | test_requirements = [ 26 | # TODO: put package test requirements here 27 | ] 28 | 29 | setup( 30 | name='polyglot', 31 | version='15.10.03', 32 | description='Polyglot is a natural language pipeline that supports massive multilingual applications.', 33 | long_description=readme + '\n\n' + history, 34 | author='Rami Al-Rfou', 35 | author_email='rmyeid@gmail.com', 36 | url='https://github.com/aboSamoor/polyglot', 37 | packages = ['polyglot', 38 | 'polyglot.detect', 39 | 'polyglot.tokenize', 40 | 'polyglot.mapping', 41 | 'polyglot.tag', 42 | 'polyglot.transliteration'], 43 | entry_points={ 44 | 'console_scripts': [ 45 | 'polyglot = polyglot.__main__:main', 46 | ], 47 | }, 48 | include_package_data=True, 49 | install_requires=requirements, 50 | license="GPLv3", 51 | zip_safe=False, 52 | keywords='polyglot', 53 | classifiers=[ 54 | 'Development Status :: 4 - Beta', 55 | 'Environment :: Console', 56 | 'Intended Audience :: Science/Research', 57 | 'Intended Audience :: Education', 58 | 'License :: OSI Approved :: GNU General Public License v3 or later (GPLv3+)', 59 | 'Natural Language :: Afrikaans', 60 | 'Natural Language :: Arabic', 61 | 'Natural Language :: Bengali', 62 | 'Natural Language :: Bosnian', 63 | 'Natural Language :: Bulgarian', 64 | 'Natural Language :: Catalan', 65 | 'Natural Language :: Chinese (Simplified)', 66 | 'Natural Language :: Chinese (Traditional)', 67 | 'Natural Language :: Croatian', 68 | 'Natural Language :: Czech', 69 | 'Natural Language :: Danish', 70 | 'Natural Language :: Dutch', 71 | 'Natural Language :: English', 72 | 'Natural Language :: Esperanto', 73 | 'Natural Language :: Finnish', 74 | 'Natural Language :: French', 75 | 'Natural Language :: Galician', 76 | 'Natural Language :: German', 77 | 'Natural Language :: Greek', 78 | 'Natural Language :: Hebrew', 79 | 'Natural Language :: Hindi', 80 | 'Natural Language :: Hungarian', 81 | 'Natural Language :: Icelandic', 82 | 'Natural Language :: Indonesian', 83 | 'Natural Language :: Italian', 84 | 'Natural Language :: Japanese', 85 | 'Natural Language :: Javanese', 86 | 'Natural Language :: Korean', 87 | 'Natural Language :: Latin', 88 | 'Natural Language :: Latvian', 89 | 'Natural Language :: 
Macedonian', 90 | 'Natural Language :: Malay', 91 | 'Natural Language :: Marathi', 92 | 'Natural Language :: Norwegian', 93 | 'Natural Language :: Panjabi', 94 | 'Natural Language :: Persian', 95 | 'Natural Language :: Polish', 96 | 'Natural Language :: Portuguese', 97 | 'Natural Language :: Portuguese (Brazilian)', 98 | 'Natural Language :: Romanian', 99 | 'Natural Language :: Russian', 100 | 'Natural Language :: Serbian', 101 | 'Natural Language :: Slovak', 102 | 'Natural Language :: Slovenian', 103 | 'Natural Language :: Spanish', 104 | 'Natural Language :: Swedish', 105 | 'Natural Language :: Tamil', 106 | 'Natural Language :: Telugu', 107 | 'Natural Language :: Thai', 108 | 'Natural Language :: Turkish', 109 | 'Natural Language :: Ukranian', 110 | 'Natural Language :: Urdu', 111 | 'Natural Language :: Vietnamese', 112 | "Programming Language :: Python :: 2", 113 | 'Programming Language :: Python :: 2.7', 114 | 'Programming Language :: Python :: 3', 115 | 'Programming Language :: Python :: 3.4', 116 | 'Topic :: Scientific/Engineering :: Artificial Intelligence', 117 | 'Topic :: Text Processing :: Linguistic', 118 | ], 119 | test_suite='tests', 120 | tests_require=test_requirements, 121 | ) 122 | -------------------------------------------------------------------------------- /polyglot/load.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | from os import path 5 | import os 6 | from tempfile import NamedTemporaryFile 7 | 8 | import numpy as np 9 | import morfessor 10 | 11 | from six import PY2 12 | from six.moves import cPickle as pickle 13 | 14 | from . import data_path 15 | from .decorators import memoize 16 | from .downloader import downloader 17 | from .mapping import Embedding, CountedVocabulary, CaseExpander, DigitExpander 18 | 19 | from .utils import _open 20 | 21 | if "~" in data_path: 22 | data_path = path.expanduser(data_path) 23 | 24 | polyglot_path = path.join(path.abspath(data_path), "polyglot_data") 25 | 26 | 27 | resource_dir = { 28 | "cw_embeddings":"embeddings2", 29 | "sgns_embeddings":"sgns2", 30 | "visualization": "tsne2", 31 | "wiki_vocab": "counts2", 32 | "sentiment": "sentiment2", 33 | } 34 | 35 | 36 | def locate_resource(name, lang, filter=None): 37 | """Return filename that contains specific language resource name. 38 | 39 | Args: 40 | name (string): Name of the resource. 41 | lang (string): language code to be loaded. 42 | """ 43 | task_dir = resource_dir.get(name, name) 44 | package_id = u"{}.{}".format(task_dir, lang) 45 | p = path.join(polyglot_path, task_dir, lang) 46 | if not path.isdir(p): 47 | if downloader.status(package_id) != downloader.INSTALLED: 48 | raise ValueError("This resource is available in the index " 49 | "but not downloaded, yet. Try to run\n\n" 50 | "polyglot download {}".format(package_id)) 51 | return path.join(p, os.listdir(p)[0]) 52 | 53 | 54 | @memoize 55 | def load_embeddings(lang="en", task="embeddings", type="cw"): 56 | """Return a word embeddings object for `lang` and of type `type` 57 | 58 | Args: 59 | lang (string): language code. 60 | task (string): parameters that define task. 61 | type (string): skipgram, cw, cbow ... 
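  Example:
    A usage sketch; this mirrors how the taggers in ``polyglot.tag`` obtain
    their lookup table and assumes ``embeddings2.en`` has been downloaded::

      embeddings = load_embeddings(lang="en", type="cw")
      vector = embeddings["green"]            # 256-dimensional word vector
      neighbors = embeddings.nearest_neighbors("green")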
62 | """ 63 | src_dir = "_".join((type, task)) if type else task 64 | p = locate_resource(src_dir, lang) 65 | e = Embedding.load(p) 66 | if type == "cw": 67 | e.apply_expansion(CaseExpander) 68 | e.apply_expansion(DigitExpander) 69 | if type == "sgns": 70 | e.apply_expansion(CaseExpander) 71 | return e 72 | 73 | 74 | @memoize 75 | def load_vocabulary(lang="en", type="wiki"): 76 | """Return a CountedVocabulary object. 77 | 78 | Args: 79 | lang (string): language code. 80 | type (string): wiki,... 81 | """ 82 | src_dir = "{}_vocab".format(type) 83 | p = locate_resource(src_dir, lang) 84 | return CountedVocabulary.from_vocabfile(p) 85 | 86 | 87 | @memoize 88 | def load_ner_model(lang="en", version="2"): 89 | """Return a named entity extractor parameters for `lang` and of version `version` 90 | 91 | Args: 92 | lang (string): language code. 93 | version (string): version of the parameters to be used. 94 | """ 95 | src_dir = "ner{}".format(version) 96 | p = locate_resource(src_dir, lang) 97 | fh = _open(p) 98 | try: 99 | return pickle.load(fh) 100 | except UnicodeDecodeError: 101 | fh.seek(0) 102 | return pickle.load(fh, encoding='latin1') 103 | 104 | 105 | @memoize 106 | def load_pos_model(lang="en", version="2"): 107 | """Return a part of speech tagger parameters for `lang` and of version `version` 108 | 109 | Args: 110 | lang (string): language code. 111 | version (string): version of the parameters to be used. 112 | """ 113 | src_dir = "pos{}".format(version) 114 | p = locate_resource(src_dir, lang) 115 | fh = _open(p) 116 | return dict(np.load(fh)) 117 | 118 | 119 | @memoize 120 | def load_morfessor_model(lang="en", version="2"): 121 | """Return a morfessor model for `lang` and of version `version` 122 | 123 | Args: 124 | lang (string): language code. 125 | version (string): version of the parameters to be used. 126 | """ 127 | src_dir = "morph{}".format(version) 128 | p = locate_resource(src_dir, lang) 129 | file_handler = _open(p) 130 | tmp_file_ = NamedTemporaryFile(delete=False) 131 | tmp_file_.write(file_handler.read()) 132 | tmp_file_.close() 133 | io = morfessor.MorfessorIO() 134 | model = io.read_any_model(tmp_file_.name) 135 | os.remove(tmp_file_.name) 136 | return model 137 | 138 | 139 | @memoize 140 | def load_transliteration_table(lang="en", version="2"): 141 | """Return a morfessor model for `lang` and of version `version` 142 | 143 | Args: 144 | lang (string): language code. 145 | version (string): version of the parameters to be used. 146 | """ 147 | src_dir = "transliteration{}".format(version) 148 | p = locate_resource(src_dir, lang) 149 | file_handler = _open(p) 150 | return pickle.load(file_handler) 151 | -------------------------------------------------------------------------------- /polyglot/tag/base.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """POS and NER Taggers. 5 | 6 | Part of speech taggers (POS) classifies words into 17 syntactic category. 
7 | Named entity Recognition extractors (NER) Detect three types of entities: {Person, Location, Organization.} 8 | 9 | """ 10 | 11 | import numpy as np 12 | from six.moves import range 13 | 14 | from ..decorators import memoize 15 | from ..load import load_embeddings, load_ner_model, load_pos_model 16 | 17 | 18 | NER_ID_TAG = {0: u'O', 1: u'I-PER', 2: u'I-LOC', 3: u'I-ORG'} 19 | 20 | POS_TAG_ID = {u'ADJ': 0, u'ADP': 1, u'ADV': 2, u'AUX': 3, u'CONJ': 4, 21 | u'DET': 5, u'INTJ': 6, u'NOUN': 7, u'NUM': 8, u'PART': 9, 22 | u'PRON': 10, u'PROPN': 11, u'PUNCT': 12, u'SCONJ': 13, 23 | u'SYM': 14, u'VERB': 15, u'X': 16} 24 | 25 | POS_ID_TAG = {v:k for k,v in POS_TAG_ID.items()} 26 | 27 | class TaggerBase(object): 28 | """Tagger base class that defines the interface. """ 29 | PAD = u'' 30 | START = u'' 31 | END = u'' 32 | UNK = u'' 33 | 34 | def __init__(self, lang='en'): 35 | """ 36 | Args: 37 | lang: language code to decide which chunker to use. 38 | """ 39 | self.lang = lang 40 | self.predictor = self._load_network() 41 | self.ID_TAG = {} 42 | self.add_bias = True 43 | self.context = 2 44 | 45 | @staticmethod 46 | def ngrams(sequence, n): 47 | ngrams_ = [] 48 | seq = ((n-1) * [TaggerBase.PAD] + [TaggerBase.START] + 49 | sequence + 50 | [TaggerBase.END] + (n-1) * [TaggerBase.PAD]) 51 | for i in range(n, n+len(sequence)): 52 | yield seq[i-n: i+n+1] 53 | 54 | def _load_network(self): 55 | raise NotImplementedError() 56 | 57 | def annotate(self, sent): 58 | """Annotate a squence of words with entity tags. 59 | 60 | Args: 61 | sent: sequence of strings/words. 62 | """ 63 | preds = [] 64 | words = [] 65 | for word, fv in self.sent2examples(sent): 66 | probs = self.predictor(fv) 67 | tags = probs.argsort() 68 | tag = self.ID_TAG[tags[-1]] 69 | 70 | words.append(word) 71 | preds.append(tag) 72 | 73 | # fix_chunks(preds) 74 | annotations = zip(words, preds) 75 | return annotations 76 | 77 | def sent2examples(self, sent): 78 | """ Convert ngrams into feature vectors.""" 79 | 80 | # TODO(rmyeid): use expanders. 81 | words = [w if w in self.embeddings else TaggerBase.UNK for w in sent] 82 | ngrams = TaggerBase.ngrams(words, self.context) 83 | fvs = [] 84 | for word, ngram in zip(sent, ngrams): 85 | fv = np.array([self.embeddings[w] for w in ngram]).flatten() 86 | if self.add_bias: 87 | fv = np.hstack((fv, np.array(1))) 88 | yield word, fv 89 | 90 | 91 | class NEChunker(TaggerBase): 92 | """Named entity extractor.""" 93 | 94 | def __init__(self, lang='en'): 95 | """ 96 | Args: 97 | lang: language code to decide which chunker to use. 98 | """ 99 | super(NEChunker, self).__init__(lang=lang) 100 | self.ID_TAG = NER_ID_TAG 101 | 102 | def _load_network(self): 103 | """ Building the predictor out of the model.""" 104 | self.embeddings = load_embeddings(self.lang, type='cw') 105 | self.embeddings.normalize_words(inplace=True) 106 | self.model = load_ner_model(lang=self.lang, version=2) 107 | first_layer, second_layer = self.model 108 | def predict_proba(input_): 109 | hidden = np.tanh(np.dot(first_layer, input_)) 110 | hidden = np.hstack((hidden, np.ones((hidden.shape[0], 1)))) 111 | output = (second_layer * hidden).sum(axis=1) 112 | output_ = 1.0/(1.0 + np.exp(-output)) 113 | probs = output_/output_.sum() 114 | return probs 115 | return predict_proba 116 | 117 | 118 | class POSTagger(TaggerBase): 119 | """Universal Part of Speech Tagger.""" 120 | 121 | def __init__(self, lang='en'): 122 | """ 123 | Args: 124 | lang: language code to decide which chunker to use. 
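    Example:
      A sketch of direct usage; ``annotate`` expects an already tokenized
      sequence of words, ``embeddings2.en`` and ``pos2.en`` must be
      downloaded, and the expected tags below follow docs/POS.rst::

        tagger = POSTagger(lang='en')
        list(tagger.annotate([u'We', u'will', u'meet', u'.']))
        # [(u'We', u'PRON'), (u'will', u'AUX'), (u'meet', u'VERB'), (u'.', u'PUNCT')]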
125 | """ 126 | super(POSTagger, self).__init__(lang=lang) 127 | self.ID_TAG = POS_ID_TAG 128 | self.add_bias = False 129 | 130 | def _load_network(self): 131 | """ Building the predictor out of the model.""" 132 | self.embeddings = load_embeddings(self.lang, type='cw') 133 | #self.embeddings.normalize_words(inplace=True) 134 | self.model = load_pos_model(lang=self.lang, version=2) 135 | 136 | def predict_proba(input_): 137 | hidden = np.tanh(np.dot(input_, self.model["W1"]) + self.model["b1"]) 138 | output = np.dot(hidden, self.model["W2"]) + self.model["b2"] 139 | scores = np.exp(output) 140 | probs = scores/scores.sum() 141 | return probs 142 | return predict_proba 143 | 144 | @memoize 145 | def get_pos_tagger(lang='en'): 146 | """Return a POS tagger from the models cache.""" 147 | return POSTagger(lang=lang) 148 | 149 | @memoize 150 | def get_ner_tagger(lang='en'): 151 | """Return a NER tagger from the models cache.""" 152 | return NEChunker(lang=lang) 153 | -------------------------------------------------------------------------------- /docs/Tokenization.rst: -------------------------------------------------------------------------------- 1 | 2 | Tokenization 3 | ============ 4 | 5 | Toeknization is the process that identifies the text boundaries of words 6 | and sentences. We can identify the boundaries of sentences first then 7 | tokenize each sentence to identify the words that compose the sentence. 8 | Of course, we can do word tokenization first and then segment the token 9 | sequence into sentneces. Tokenization in polyglot relies on the `Unicode 10 | Text Segmentation `__ algorithm as 11 | implemented by the `ICU Project `__. 12 | 13 | You can use C/C++ ICU library by installing the required package 14 | ``libicu-dev``. For example, on ubuntu/debian systems you should use 15 | ``apt-get`` utility as the following: 16 | 17 | .. code:: python 18 | 19 | sudo apt-get install libicu-dev 20 | 21 | .. code:: python 22 | 23 | from polyglot.text import Text 24 | 25 | Word Tokenization 26 | ----------------- 27 | 28 | To call our word tokenizer, first we need to construct a Text object. 29 | 30 | .. code:: python 31 | 32 | blob = u""" 33 | 两个月前遭受恐怖袭击的法国巴黎的犹太超市在装修之后周日重新开放,法国内政部长以及超市的管理者都表示,这显示了生命力要比野蛮行为更强大。 34 | 该超市1月9日遭受枪手袭击,导致4人死亡,据悉这起事件与法国《查理周刊》杂志社恐怖袭击案有关。 35 | """ 36 | text = Text(blob) 37 | 38 | The property words will call the word tokenizer. 39 | 40 | .. code:: python 41 | 42 | text.words 43 | 44 | 45 | 46 | 47 | .. parsed-literal:: 48 | 49 | WordList(['两', '个', '月', '前', '遭受', '恐怖', '袭击', '的', '法国', '巴黎', '的', '犹太', '超市', '在', '装修', '之后', '周日', '重新', '开放', ',', '法国', '内政', '部长', '以及', '超市', '的', '管理者', '都', '表示', ',', '这', '显示', '了', '生命力', '要', '比', '野蛮', '行为', '更', '强大', '。', '该', '超市', '1', '月', '9', '日', '遭受', '枪手', '袭击', ',', '导致', '4', '人', '死亡', ',', '据悉', '这', '起', '事件', '与', '法国', '《', '查理', '周刊', '》', '杂志', '社', '恐怖', '袭击', '案', '有关', '。']) 50 | 51 | 52 | 53 | Since ICU boundary break algorithms are language aware, polyglot will 54 | detect the language used first before calling the tokenizer 55 | 56 | .. code:: python 57 | 58 | print(text.language) 59 | 60 | 61 | .. parsed-literal:: 62 | 63 | name: code: zh confidence: 99.0 read bytes: 1920 64 | 65 | 66 | Sentence Segementation 67 | ---------------------- 68 | 69 | If we are interested in segmenting the text first into sentences, we can 70 | query the ``sentences`` property 71 | 72 | .. code:: python 73 | 74 | text.sentences 75 | 76 | 77 | 78 | 79 | .. 
parsed-literal:: 80 | 81 | [Sentence("两个月前遭受恐怖袭击的法国巴黎的犹太超市在装修之后周日重新开放,法国内政部长以及超市的管理者都表示,这显示了生命力要比野蛮行为更强大。"), 82 | Sentence("该超市1月9日遭受枪手袭击,导致4人死亡,据悉这起事件与法国《查理周刊》杂志社恐怖袭击案有关。")] 83 | 84 | 85 | 86 | ``Sentence`` class inherits ``Text``, therefore, we can tokenize each 87 | sentence into words using the same property ``words`` 88 | 89 | .. code:: python 90 | 91 | first_sentence = text.sentences[0] 92 | first_sentence.words 93 | 94 | 95 | 96 | 97 | .. parsed-literal:: 98 | 99 | WordList(['两', '个', '月', '前', '遭受', '恐怖', '袭击', '的', '法国', '巴黎', '的', '犹太', '超市', '在', '装修', '之后', '周日', '重新', '开放', ',', '法国', '内政', '部长', '以及', '超市', '的', '管理者', '都', '表示', ',', '这', '显示', '了', '生命力', '要', '比', '野蛮', '行为', '更', '强大', '。']) 100 | 101 | 102 | 103 | Command Line 104 | ------------ 105 | 106 | The subcommand tokenize does by default sentence segmentation and word 107 | tokenization. 108 | 109 | .. code:: python 110 | 111 | ! polyglot tokenize --help 112 | 113 | 114 | .. parsed-literal:: 115 | 116 | usage: polyglot tokenize [-h] [--only-sent | --only-word] [--input [INPUT [INPUT ...]]] 117 | 118 | optional arguments: 119 | -h, --help show this help message and exit 120 | --only-sent Segment sentences without word tokenization 121 | --only-word Tokenize words without sentence segmentation 122 | --input [INPUT [INPUT ...]] 123 | 124 | 125 | Each line represents a sentence where the words are split by spaces. 126 | 127 | .. code:: python 128 | 129 | !polyglot --lang en tokenize --input testdata/cricket.txt 130 | 131 | 132 | .. parsed-literal:: 133 | 134 | Australia posted a World Cup record total of 417 - 6 as they beat Afghanistan by 275 runs . 135 | David Warner hit 178 off 133 balls , Steve Smith scored 95 while Glenn Maxwell struck 88 in 39 deliveries in the Pool A encounter in Perth . 136 | Afghanistan were then dismissed for 142 , with Mitchell Johnson and Mitchell Starc taking six wickets between them . 137 | Australia's score surpassed the 413 - 5 India made against Bermuda in 2007 . 138 | It continues the pattern of bat dominating ball in this tournament as the third 400 plus score achieved in the pool stages , following South Africa's 408 - 5 and 411 - 4 against West Indies and Ireland respectively . 139 | The winning margin beats the 257 - run amount by which India beat Bermuda in Port of Spain in 2007 , which was equalled five days ago by South Africa in their victory over West Indies in Sydney . 140 | 141 | 142 | References 143 | ~~~~~~~~~~ 144 | 145 | - `Unicode Text Segmentation 146 | Algorithm `__ 147 | - `Unicode Line Breaking 148 | Algorithm `__ 149 | - `Boundary 150 | Analysis `__ 151 | - `ICU Homepage `__ 152 | - `Python Wrapper for libicu `__ 153 | -------------------------------------------------------------------------------- /docs/Transliteration.rst: -------------------------------------------------------------------------------- 1 | 2 | Transliteration 3 | =============== 4 | 5 | Transliteration is the conversion of a text from one script to another. 6 | For instance, a Latin transliteration of the Greek phrase "Ελληνική 7 | Δημοκρατία", usually translated as 'Hellenic Republic', is "Ellēnikḗ 8 | Dēmokratía". 9 | 10 | .. code:: python 11 | 12 | from polyglot.transliteration import Transliterator 13 | 14 | Languages Coverage 15 | ------------------ 16 | 17 | .. code:: python 18 | 19 | from polyglot.downloader import downloader 20 | print(downloader.supported_languages_table("transliteration2")) 21 | 22 | 23 | .. parsed-literal:: 24 | 25 | 1. Haitian; Haitian Creole 2. Tamil 3. 
Vietnamese 26 | 4. Telugu 5. Croatian 6. Hungarian 27 | 7. Thai 8. Kannada 9. Tagalog 28 | 10. Armenian 11. Hebrew (modern) 12. Turkish 29 | 13. Portuguese 14. Belarusian 15. Norwegian Nynorsk 30 | 16. Norwegian 17. Dutch 18. Japanese 31 | 19. Albanian 20. Bulgarian 21. Serbian 32 | 22. Swahili 23. Swedish 24. French 33 | 25. Latin 26. Czech 27. Yiddish 34 | 28. Hindi 29. Danish 30. Finnish 35 | 31. German 32. Bosnian-Croatian-Serbian 33. Slovak 36 | 34. Persian 35. Lithuanian 36. Slovene 37 | 37. Latvian 38. Bosnian 39. Gujarati 38 | 40. Italian 41. Icelandic 42. Spanish; Castilian 39 | 43. Ukrainian 44. Georgian 45. Urdu 40 | 46. Indonesian 47. Marathi (Marāṭhī) 48. Korean 41 | 49. Galician 50. Khmer 51. Catalan; Valencian 42 | 52. Romanian, Moldavian, ... 53. Basque 54. Macedonian 43 | 55. Russian 56. Azerbaijani 57. Chinese 44 | 58. Estonian 59. Welsh 60. Arabic 45 | 61. Bengali 62. Amharic 63. Irish 46 | 64. Malay 65. Afrikaans 66. Polish 47 | 67. Greek, Modern 68. Esperanto 69. Maltese 48 | 49 | 50 | 51 | Downloading Necessary Models 52 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 53 | 54 | .. code:: python 55 | 56 | %%bash 57 | polyglot download embeddings2.en pos2.en 58 | 59 | 60 | .. parsed-literal:: 61 | 62 | [polyglot_data] Downloading package embeddings2.en to 63 | [polyglot_data] /home/rmyeid/polyglot_data... 64 | [polyglot_data] Package embeddings2.en is already up-to-date! 65 | [polyglot_data] Downloading package pos2.en to 66 | [polyglot_data] /home/rmyeid/polyglot_data... 67 | [polyglot_data] Package pos2.en is already up-to-date! 68 | 69 | 70 | Example 71 | ------- 72 | 73 | We tag each word in the text with one part of speech. 74 | 75 | .. code:: python 76 | 77 | from polyglot.text import Text 78 | 79 | .. code:: python 80 | 81 | blob = """We will meet at eight o'clock on Thursday morning.""" 82 | text = Text(blob) 83 | 84 | We can query all the tagged words 85 | 86 | .. code:: python 87 | 88 | for x in text.transliterate("ar"): 89 | print(x) 90 | 91 | 92 | .. parsed-literal:: 93 | 94 | وي 95 | ويل 96 | ميت 97 | ات 98 | ييايت 99 | أوكلوك 100 | ون 101 | ثورسداي 102 | مورنينغ 103 | 104 | 105 | 106 | Command Line Interface 107 | ~~~~~~~~~~~~~~~~~~~~~~ 108 | 109 | .. code:: python 110 | 111 | !polyglot --lang en tokenize --input testdata/cricket.txt | polyglot --lang en transliteration --target ar | tail -n 30 112 | 113 | 114 | .. parsed-literal:: 115 | 116 | which ويكه 117 | India ينديا 118 | beat بيت 119 | Bermuda بيرمودا 120 | in ين 121 | Port بورت 122 | of وف 123 | Spain سباين 124 | in ين 125 | 2007 126 | , 127 | which ويكه 128 | was واس 129 | equalled يكالليد 130 | five فيفي 131 | days دايس 132 | ago اغو 133 | by بي 134 | South سووث 135 | Africa افريكا 136 | in ين 137 | their ثير 138 | victory فيكتوري 139 | over وفير 140 | West ويست 141 | Indies يندييس 142 | in ين 143 | Sydney سيدني 144 | . 145 | 146 | 147 | -------------------------------------------------------------------------------- /docs/README.rst: -------------------------------------------------------------------------------- 1 | 2 | polyglot 3 | ======== 4 | 5 | |Downloads| |Latest Version| |Build Status| |Documentation Status| 6 | 7 | .. |Downloads| image:: https://img.shields.io/pypi/dm/polyglot.svg 8 | :target: https://pypi.python.org/pypi/polyglot 9 | .. |Latest Version| image:: https://badge.fury.io/py/polyglot.svg 10 | :target: https://pypi.python.org/pypi/polyglot 11 | .. |Build Status| image:: https://travis-ci.org/aboSamoor/polyglot.png?branch=master 12 | :target: https://travis-ci.org/aboSamoor/polyglot 13 | .. 
|Documentation Status| image:: https://readthedocs.org/projects/polyglot/badge/?version=latest 14 | :target: https://readthedocs.org/builds/polyglot/ 15 | 16 | Polyglot is a natural language pipeline that supports massive 17 | multilingual applications. 18 | 19 | - Free software: GPLv3 license 20 | - Documentation: http://polyglot.readthedocs.org. 21 | 22 | Features 23 | ~~~~~~~~ 24 | 25 | - Tokenization (165 Languages) 26 | - Language detection (196 Languages) 27 | - Named Entity Recognition (40 Languages) 28 | - Part of Speech Tagging (16 Languages) 29 | - Sentiment Analysis (136 Languages) 30 | - Word Embeddings (137 Languages) 31 | - Morphological analysis (135 Languages) 32 | - Transliteration (69 Languages) 33 | 34 | Developer 35 | ~~~~~~~~~ 36 | 37 | - Rami Al-Rfou @ ``rmyeid gmail com`` 38 | 39 | Quick Tutorial 40 | -------------- 41 | 42 | .. code:: python 43 | 44 | import polyglot 45 | from polyglot.text import Text, Word 46 | 47 | Language Detection 48 | ~~~~~~~~~~~~~~~~~~ 49 | 50 | .. code:: python 51 | 52 | text = Text("Bonjour, Mesdames.") 53 | print("Language Detected: Code={}, Name={}\n".format(text.language.code, text.language.name)) 54 | 55 | 56 | .. parsed-literal:: 57 | 58 | Language Detected: Code=fr, Name=French 59 | 60 | 61 | 62 | Tokenization 63 | ~~~~~~~~~~~~ 64 | 65 | .. code:: python 66 | 67 | zen = Text("Beautiful is better than ugly. " 68 | "Explicit is better than implicit. " 69 | "Simple is better than complex.") 70 | print(zen.words) 71 | 72 | 73 | .. parsed-literal:: 74 | 75 | [u'Beautiful', u'is', u'better', u'than', u'ugly', u'.', u'Explicit', u'is', u'better', u'than', u'implicit', u'.', u'Simple', u'is', u'better', u'than', u'complex', u'.'] 76 | 77 | 78 | .. code:: python 79 | 80 | print(zen.sentences) 81 | 82 | 83 | .. parsed-literal:: 84 | 85 | [Sentence("Beautiful is better than ugly."), Sentence("Explicit is better than implicit."), Sentence("Simple is better than complex.")] 86 | 87 | 88 | Part of Speech Tagging 89 | ~~~~~~~~~~~~~~~~~~~~~~ 90 | 91 | .. code:: python 92 | 93 | text = Text(u"O primeiro uso de desobediência civil em massa ocorreu em setembro de 1906.") 94 | 95 | print("{:<16}{}".format("Word", "POS Tag")+"\n"+"-"*30) 96 | for word, tag in text.pos_tags: 97 | print(u"{:<16}{:>2}".format(word, tag)) 98 | 99 | 100 | .. parsed-literal:: 101 | 102 | Word POS Tag 103 | ------------------------------ 104 | O DET 105 | primeiro ADJ 106 | uso NOUN 107 | de ADP 108 | desobediência NOUN 109 | civil ADJ 110 | em ADP 111 | massa NOUN 112 | ocorreu ADJ 113 | em ADP 114 | setembro NOUN 115 | de ADP 116 | 1906 NUM 117 | . PUNCT 118 | 119 | 120 | Named Entity Recognition 121 | ~~~~~~~~~~~~~~~~~~~~~~~~ 122 | 123 | .. code:: python 124 | 125 | text = Text(u"In Großbritannien war Gandhi mit dem westlichen Lebensstil vertraut geworden") 126 | print(text.entities) 127 | 128 | 129 | .. parsed-literal:: 130 | 131 | [I-LOC([u'Gro\xdfbritannien']), I-PER([u'Gandhi'])] 132 | 133 | 134 | Polarity 135 | ~~~~~~~~ 136 | 137 | .. code:: python 138 | 139 | print("{:<16}{}".format("Word", "Polarity")+"\n"+"-"*30) 140 | for w in zen.words[:6]: 141 | print("{:<16}{:>2}".format(w, w.polarity)) 142 | 143 | 144 | .. parsed-literal:: 145 | 146 | Word Polarity 147 | ------------------------------ 148 | Beautiful 0 149 | is 0 150 | better 1 151 | than 0 152 | ugly -1 153 | . 0 154 | 155 | 156 | Embeddings 157 | ~~~~~~~~~~ 158 | 159 | .. 
code:: python 160 | 161 | word = Word("Obama", language="en") 162 | print("Neighbors (Synonms) of {}".format(word)+"\n"+"-"*30) 163 | for w in word.neighbors: 164 | print("{:<16}".format(w)) 165 | print("\n\nThe first 10 dimensions out the {} dimensions\n".format(word.vector.shape[0])) 166 | print(word.vector[:10]) 167 | 168 | 169 | .. parsed-literal:: 170 | 171 | Neighbors (Synonms) of Obama 172 | ------------------------------ 173 | Bush 174 | Reagan 175 | Clinton 176 | Ahmadinejad 177 | Nixon 178 | Karzai 179 | McCain 180 | Biden 181 | Huckabee 182 | Lula 183 | 184 | 185 | The first 10 dimensions out the 256 dimensions 186 | 187 | [-2.57382345 1.52175975 0.51070285 1.08678675 -0.74386948 -1.18616164 188 | 2.92784619 -0.25694436 -1.40958667 -2.39675403] 189 | 190 | 191 | Morphology 192 | ~~~~~~~~~~ 193 | 194 | .. code:: python 195 | 196 | word = Text("Preprocessing is an essential step.").words[0] 197 | print(word.morphemes) 198 | 199 | 200 | .. parsed-literal:: 201 | 202 | [u'Pre', u'process', u'ing'] 203 | 204 | 205 | Transliteration 206 | ~~~~~~~~~~~~~~~ 207 | 208 | .. code:: python 209 | 210 | from polyglot.transliteration import Transliterator 211 | transliterator = Transliterator(source_lang="en", target_lang="ru") 212 | print(transliterator.transliterate(u"preprocessing")) 213 | 214 | 215 | .. parsed-literal:: 216 | 217 | препрокессинг 218 | 219 | -------------------------------------------------------------------------------- /docs/POS.rst: -------------------------------------------------------------------------------- 1 | 2 | Part of Speech Tagging 3 | ====================== 4 | 5 | Part of speech tagging task aims to assign every word/token in plain 6 | text a category that identifies the syntactic functionality of the word 7 | occurrence. 8 | 9 | Polyglot recognizes 17 parts of speech, this set is called the 10 | ``universal part of speech tag set``: 11 | 12 | - **ADJ**: adjective 13 | - **ADP**: adposition 14 | - **ADV**: adverb 15 | - **AUX**: auxiliary verb 16 | - **CONJ**: coordinating conjunction 17 | - **DET**: determiner 18 | - **INTJ**: interjection 19 | - **NOUN**: noun 20 | - **NUM**: numeral 21 | - **PART**: particle 22 | - **PRON**: pronoun 23 | - **PROPN**: proper noun 24 | - **PUNCT**: punctuation 25 | - **SCONJ**: subordinating conjunction 26 | - **SYM**: symbol 27 | - **VERB**: verb 28 | - **X**: other 29 | 30 | Languages Coverage 31 | ------------------ 32 | 33 | The models were trained on a combination of: 34 | 35 | - Original CONLL datasets after the tags were converted using the 36 | `universal POS 37 | tables `__. 38 | 39 | - Universal Dependencies 1.0 corpora whenever they are available. 40 | 41 | .. code:: python 42 | 43 | from polyglot.downloader import downloader 44 | print(downloader.supported_languages_table("pos2")) 45 | 46 | 47 | .. parsed-literal:: 48 | 49 | 1. German 2. Italian 3. Danish 50 | 4. Czech 5. Slovene 6. French 51 | 7. English 8. Swedish 9. Bulgarian 52 | 10. Spanish; Castilian 11. Indonesian 12. Portuguese 53 | 13. Finnish 14. Irish 15. Hungarian 54 | 16. Dutch 55 | 56 | 57 | Download Necessary Models 58 | ^^^^^^^^^^^^^^^^^^^^^^^^^ 59 | 60 | .. code:: python 61 | 62 | %%bash 63 | polyglot download embeddings2.en pos2.en 64 | 65 | 66 | .. parsed-literal:: 67 | 68 | [polyglot_data] Downloading package embeddings2.en to 69 | [polyglot_data] /home/rmyeid/polyglot_data... 70 | [polyglot_data] Package embeddings2.en is already up-to-date! 71 | [polyglot_data] Downloading package pos2.en to 72 | [polyglot_data] /home/rmyeid/polyglot_data... 
73 | [polyglot_data] Package pos2.en is already up-to-date! 74 | 75 | 76 | Example 77 | ------- 78 | 79 | We tag each word in the text with one part of speech. 80 | 81 | .. code:: python 82 | 83 | from polyglot.text import Text 84 | 85 | .. code:: python 86 | 87 | blob = """We will meet at eight o'clock on Thursday morning.""" 88 | text = Text(blob) 89 | 90 | We can query all the tagged words 91 | 92 | .. code:: python 93 | 94 | text.pos_tags 95 | 96 | 97 | 98 | 99 | .. parsed-literal:: 100 | 101 | [(u'We', u'PRON'), 102 | (u'will', u'AUX'), 103 | (u'meet', u'VERB'), 104 | (u'at', u'ADP'), 105 | (u'eight', u'NUM'), 106 | (u"o'clock", u'NOUN'), 107 | (u'on', u'ADP'), 108 | (u'Thursday', u'PROPN'), 109 | (u'morning', u'NOUN'), 110 | (u'.', u'PUNCT')] 111 | 112 | 113 | 114 | After calling the pos\_tags property once, the words objects will carry 115 | the POS tags. 116 | 117 | .. code:: python 118 | 119 | text.words[0].pos_tag 120 | 121 | 122 | 123 | 124 | .. parsed-literal:: 125 | 126 | u'PRON' 127 | 128 | 129 | 130 | Command Line Interface 131 | ~~~~~~~~~~~~~~~~~~~~~~ 132 | 133 | .. code:: python 134 | 135 | !polyglot --lang en tokenize --input testdata/cricket.txt | polyglot --lang en pos | tail -n 30 136 | 137 | 138 | .. parsed-literal:: 139 | 140 | which DET 141 | India PROPN 142 | beat VERB 143 | Bermuda PROPN 144 | in ADP 145 | Port PROPN 146 | of ADP 147 | Spain PROPN 148 | in ADP 149 | 2007 NUM 150 | , PUNCT 151 | which DET 152 | was AUX 153 | equalled VERB 154 | five NUM 155 | days NOUN 156 | ago ADV 157 | by ADP 158 | South PROPN 159 | Africa PROPN 160 | in ADP 161 | their PRON 162 | victory NOUN 163 | over ADP 164 | West PROPN 165 | Indies PROPN 166 | in ADP 167 | Sydney PROPN 168 | . PUNCT 169 | 170 | 171 | 172 | Citation 173 | ~~~~~~~~ 174 | 175 | This work is a direct implementation of the research being described in 176 | the `Polyglot: Distributed Word Representations for Multilingual 177 | NLP `__ paper. The author of 178 | this library strongly encourage you to cite the following paper if you 179 | are using this software. 180 | 181 | :: 182 | 183 | @InProceedings{polyglot:2013:ACL-CoNLL, 184 | author = {Al-Rfou, Rami and Perozzi, Bryan and Skiena, Steven}, 185 | title = {Polyglot: Distributed Word Representations for Multilingual NLP}, 186 | booktitle = {Proceedings of the Seventeenth Conference on Computational Natural Language Learning}, 187 | month = {August}, 188 | year = {2013}, 189 | address = {Sofia, Bulgaria}, 190 | publisher = {Association for Computational Linguistics}, 191 | pages = {183--192}, 192 | url = {http://www.aclweb.org/anthology/W13-3520} 193 | } 194 | 195 | References 196 | ---------- 197 | 198 | - `Universal Part of Speech 199 | Tagging `__ 200 | - `Universal Dependencies 201 | 1.0 `__. 202 | -------------------------------------------------------------------------------- /docs/NamedEntityRecognition.rst: -------------------------------------------------------------------------------- 1 | 2 | Named Entity Extraction 3 | ======================= 4 | 5 | Named entity extraction task aims to extract phrases from plain text 6 | that correpond to entities. Polyglot recognizes 3 categories of 7 | entities: 8 | 9 | - Locations (Tag: ``I-LOC``): cities, countries, regions, continents, 10 | neighborhoods, administrative divisions ... 11 | - Organizations (Tag: ``I-ORG``): sports teams, newspapers, banks, 12 | universities, schools, non-profits, companies, ... 13 | - Persons (Tag: ``I-PER``): politicians, scientists, artists, atheletes 14 | ... 
15 | 16 | Languages Coverage 17 | ------------------ 18 | 19 | The models were trained on datasets extracted automatically from 20 | Wikipedia. Polyglot currently supports 40 major languages. 21 | 22 | .. code:: python 23 | 24 | from polyglot.downloader import downloader 25 | print(downloader.supported_languages_table("ner2", 3)) 26 | 27 | 28 | .. parsed-literal:: 29 | 30 | 1. Polish 2. Turkish 3. Russian 31 | 4. Indonesian 5. Czech 6. Arabic 32 | 7. Korean 8. Catalan; Valencian 9. Italian 33 | 10. Thai 11. Romanian, Moldavian, ... 12. Tagalog 34 | 13. Danish 14. Finnish 15. German 35 | 16. Persian 17. Dutch 18. Chinese 36 | 19. French 20. Portuguese 21. Slovak 37 | 22. Hebrew (modern) 23. Malay 24. Slovene 38 | 25. Bulgarian 26. Hindi 27. Japanese 39 | 28. Hungarian 29. Croatian 30. Ukrainian 40 | 31. Serbian 32. Lithuanian 33. Norwegian 41 | 34. Latvian 35. Swedish 36. English 42 | 37. Greek, Modern 38. Spanish; Castilian 39. Vietnamese 43 | 40. Estonian 44 | 45 | 46 | Download Necessary Models 47 | ^^^^^^^^^^^^^^^^^^^^^^^^^ 48 | 49 | .. code:: python 50 | 51 | %%bash 52 | polyglot download embeddings2.en ner2.en 53 | 54 | 55 | .. parsed-literal:: 56 | 57 | [polyglot_data] Downloading package embeddings2.en to 58 | [polyglot_data] /home/rmyeid/polyglot_data... 59 | [polyglot_data] Package embeddings2.en is already up-to-date! 60 | [polyglot_data] Downloading package ner2.en to 61 | [polyglot_data] /home/rmyeid/polyglot_data... 62 | [polyglot_data] Package ner2.en is already up-to-date! 63 | 64 | 65 | Example 66 | ------- 67 | 68 | Entities inside a text object or a sentence are represented as chunks. 69 | Each chunk identifies the start and the end indices of the word 70 | subsequence within the text. 71 | 72 | .. code:: python 73 | 74 | from polyglot.text import Text 75 | 76 | .. code:: python 77 | 78 | blob = """The Israeli Prime Minister Benjamin Netanyahu has warned that Iran poses a "threat to the entire world".""" 79 | text = Text(blob) 80 | 81 | We can query all entities mentioned in a text. 82 | 83 | .. code:: python 84 | 85 | text.entities 86 | 87 | 88 | 89 | 90 | .. parsed-literal:: 91 | 92 | [I-ORG([u'Israeli']), I-PER([u'Benjamin', u'Netanyahu']), I-LOC([u'Iran'])] 93 | 94 | 95 | 96 | Or, we can query entites per sentence 97 | 98 | .. code:: python 99 | 100 | for sent in text.sentences: 101 | print(sent, "\n") 102 | for entity in sent.entities: 103 | print(entity.tag, entity) 104 | 105 | 106 | .. parsed-literal:: 107 | 108 | The Israeli Prime Minister Benjamin Netanyahu has warned that Iran poses a "threat to the entire world". 109 | 110 | I-ORG [u'Israeli'] 111 | I-PER [u'Benjamin', u'Netanyahu'] 112 | I-LOC [u'Iran'] 113 | 114 | 115 | By doing more careful inspection of the second entity 116 | ``Benjamin Netanyahu``, we can locate the position of the entity within 117 | the sentence. 118 | 119 | .. code:: python 120 | 121 | benjamin = sent.entities[1] 122 | sent.words[benjamin.start: benjamin.end] 123 | 124 | 125 | 126 | 127 | .. parsed-literal:: 128 | 129 | WordList([u'Benjamin', u'Netanyahu']) 130 | 131 | 132 | 133 | Command Line Interface 134 | ~~~~~~~~~~~~~~~~~~~~~~ 135 | 136 | .. code:: python 137 | 138 | !polyglot --lang en tokenize --input testdata/cricket.txt | polyglot --lang en ner | tail -n 20 139 | 140 | 141 | .. 
parsed-literal:: 142 | 143 | , O 144 | which O 145 | was O 146 | equalled O 147 | five O 148 | days O 149 | ago O 150 | by O 151 | South I-LOC 152 | Africa I-LOC 153 | in O 154 | their O 155 | victory O 156 | over O 157 | West I-ORG 158 | Indies I-ORG 159 | in O 160 | Sydney I-LOC 161 | . O 162 | 163 | 164 | 165 | Demo 166 | ---- 167 | 168 | .. raw:: html 169 | 170 | 171 | 172 | 173 | Citation 174 | ~~~~~~~~ 175 | 176 | This work is a direct implementation of the research being described in 177 | the `Polyglot-NER: Multilingual Named Entity 178 | Recognition `__ 179 | paper. The author of this library strongly encourage you to cite the 180 | following paper if you are using this software. 181 | 182 | :: 183 | 184 | @article{polyglotner, 185 | author = {Al-Rfou, Rami and Kulkarni, Vivek and Perozzi, Bryan and Skiena, Steven}, 186 | title = {{Polyglot-NER}: Massive Multilingual Named Entity Recognition}, 187 | journal = {{Proceedings of the 2015 {SIAM} International Conference on Data Mining, Vancouver, British Columbia, Canada, April 30 - May 2, 2015}}, 188 | month = {April}, 189 | year = {2015}, 190 | publisher = {SIAM} 191 | } 192 | 193 | References 194 | ---------- 195 | 196 | - `Polyglot-NER project page. `__ 197 | - `Wikipedia on 198 | NER `__. 199 | -------------------------------------------------------------------------------- /polyglot/base.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """Basic data types.""" 5 | 6 | from io import open, StringIO 7 | from collections import Counter 8 | import os 9 | from concurrent.futures import ProcessPoolExecutor 10 | from itertools import islice 11 | 12 | import six 13 | from six.moves import zip 14 | from six import text_type as unicode 15 | from six import iteritems 16 | from six import string_types 17 | 18 | 19 | class Sequence(object): 20 | """ Text with indices indicates boundaries.""" 21 | 22 | def __init__(self, text): 23 | 24 | if not text: 25 | raise ValueError("This Sequence is Empty") 26 | if not isinstance(text, unicode): 27 | raise ValueError("This is not unicode text instead {}".format(type(text))) 28 | 29 | self.__text = text 30 | self.idx = [0, len(self.text)] 31 | 32 | @property 33 | def text(self): 34 | return self.__text 35 | 36 | def __iter__(self): 37 | for start, end in zip(self.idx[:-1], self.idx[1:]): 38 | yield self.text[start: end] 39 | 40 | def tokens(self): 41 | """ Returns segmented text after stripping whitespace.""" 42 | 43 | return [x.strip() for x in self if x.strip()] 44 | 45 | def __str__(self): 46 | if six.PY3: 47 | return self.__unicode__() 48 | return self.__unicode__().encode("utf-8") 49 | 50 | def __unicode__(self): 51 | return u'\n'.join(self.tokens()) 52 | 53 | def split(self, sequence): 54 | """ Split into subsequences according to `sequence`.""" 55 | 56 | major_idx = sequence.idx 57 | idx2 = 0 58 | for start, end in zip(major_idx[:-1], major_idx[1:]): 59 | idx1 = self.idx.index(start, idx2) 60 | idx2 = self.idx.index(end, idx2) 61 | seq = Sequence(self.text[start:end]) 62 | seq.idx = [x-start for x in self.idx[idx1:idx2]] 63 | yield seq 64 | 65 | def __len__(self): 66 | return len(self.idx) - 1 67 | 68 | def empty(self): 69 | return not self.text.strip() 70 | 71 | 72 | class TokenSequence(list): 73 | """A list of tokens. 74 | 75 | Args: 76 | tokens (list): list of symbols. 
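  Example:
    A small sketch of the ``sliding_window`` helper defined below::

      seq = TokenSequence([u'a', u'b', u'c'])
      list(seq.sliding_window(width=2))  # [(u'a', u'b'), (u'b', u'c')]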
77 | """ 78 | 79 | def sliding_window(self, width=2, padding=None): 80 | seq = self 81 | if padding: 82 | pad = [padding for x in range(width/2)] 83 | seq = pad + self + pad 84 | args = [islice(seq, i, None) for i in range(width)] 85 | for x in zip(*args): 86 | yield x 87 | 88 | 89 | class TextFile(object): 90 | """ Wrapper around text files. 91 | 92 | It uses io.open to guarantee reading text files with unicode encoding. 93 | It has an iterator that supports arbitrary delimiter instead of only 94 | new lines. 95 | 96 | Attributes: 97 | delimiter (string): A string that defines the limit of each chunk. 98 | file (string): A path to a file. 99 | buf (StringIO): a buffer to store the results of peeking into the file. 100 | """ 101 | 102 | def __init__(self, file, delimiter=u'\n'): 103 | self.name = file 104 | self.delimiter = delimiter 105 | self.open_file = open(file, 'r') 106 | self.buf = StringIO() 107 | 108 | def iter_delimiter(self, byte_size=8192): 109 | """ Generalization of the default iter file delimited by '\n'. 110 | Note: 111 | The newline string can be arbitrarily long; it need not be restricted to a 112 | single character. You can also set the read size and control whether or not 113 | the newline string is left on the end of the iterated lines. Setting 114 | newline to '\0' is particularly good for use with an input file created with 115 | something like "os.popen('find -print0')". 116 | 117 | Args: 118 | byte_size (integer): Number of bytes to be read at each time. 119 | """ 120 | partial = u'' 121 | while True: 122 | read_chars = self.read(byte_size) 123 | if not read_chars: break 124 | partial += read_chars 125 | lines = partial.split(self.delimiter) 126 | partial = lines.pop() 127 | 128 | for line in lines: 129 | yield line + self.delimiter 130 | 131 | if partial: 132 | yield partial 133 | 134 | def __iter__(self): 135 | for l in self.iter_delimiter(): 136 | yield l 137 | 138 | def iter_chunks(self, chunksize): 139 | chunk = [] 140 | for i, l in enumerate(self): 141 | chunk.append(l) 142 | if i % chunksize == chunksize -1: 143 | yield chunk 144 | chunk = [] 145 | if chunk: 146 | yield chunk 147 | 148 | def _append_to_buf(self, contents): 149 | oldpos = self.buf.tell() 150 | self.buf.seek(0, os.SEEK_END) 151 | self.buf.write(contents) 152 | self.buf.seek(oldpos) 153 | 154 | def peek(self, size): 155 | contents = self.open_file.read(size) 156 | self._append_to_buf(contents) 157 | return contents 158 | 159 | def read(self, size=None): 160 | """ Read `size` of bytes.""" 161 | if size is None: 162 | return self.buf.read() + self.open_file.read() 163 | contents = self.buf.read(size) 164 | if len(contents) < size: 165 | contents += self.open_file.read(size - len(contents)) 166 | return contents 167 | 168 | def readline(self): 169 | line = self.buf.readline() 170 | if not line.endswith('\n'): 171 | line += self.open_file.readline() 172 | return line 173 | 174 | def apply(self, func, workers=1, job_size=10000): 175 | """Apply `func` to lines of text in parallel or sequential. 176 | 177 | Args: 178 | func : a function that takes a list of lines. 
179 | """ 180 | if workers == 1: 181 | for lines in self.iter_chunks(job_size): 182 | yield func(lines) 183 | else: 184 | with ProcessPoolExecutor(max_workers=workers) as executor: 185 | for result in executor.map(func, self.iter_chunks(job_size)): 186 | yield result 187 | 188 | 189 | class TextFiles(TextFile): 190 | """Interface for a sequence of files.""" 191 | 192 | def __init__(self, files, delimiter=u'\n'): 193 | if isinstance(files[0], string_types): 194 | self.files = [TextFile(f) for f in files] 195 | self.files = files 196 | self.delimiter = delimiter 197 | self.buf = StringIO() 198 | self.i = 0 199 | self.open_file = self.files[self.i].open_file 200 | 201 | def readline(self): 202 | raise NotImplementedError("Future work") 203 | 204 | def peek(self, size): 205 | self.open_file.seek(0) 206 | contents = self.open_file.read(size) 207 | self.open_file.seek(0) 208 | return contents 209 | 210 | def read(self, size=None): 211 | content = super(TextFiles, self).read(size) 212 | if not content and self.i < len(self.files)-1: 213 | self.i += 1 214 | self.buf = StringIO() 215 | self.open_file = self.files[self.i].open_file 216 | return self.read(size) 217 | return content 218 | 219 | @property 220 | def names(self): 221 | return [f.name for f in self.files] 222 | -------------------------------------------------------------------------------- /docs/Embeddings.rst: -------------------------------------------------------------------------------- 1 | 2 | Word Embeddings 3 | =============== 4 | 5 | Word embedding is a mapping of a word to a d-dimensional vector space. 6 | This real valued vector representation captures semantic and syntactic 7 | features. Polyglot offers a simple interface to load several formats of 8 | word embeddings. 9 | 10 | .. code:: python 11 | 12 | from polyglot.mapping import Embedding 13 | 14 | Formats 15 | ------- 16 | 17 | The Embedding class can read word embeddings from different sources: 18 | 19 | - Gensim word2vec objects: (``from_gensim`` method) 20 | - Word2vec binary/text models: (``from_word2vec`` method) 21 | - polyglot pickle files: (``load`` method) 22 | 23 | .. code:: python 24 | 25 | embeddings = Embedding.load("/home/rmyeid/polyglot_data/embeddings2/en/embeddings_pkl.tar.bz2") 26 | 27 | Nearest Neighbors 28 | ----------------- 29 | 30 | A common way to investigate the space capture by the embeddings is to 31 | query for the nearest neightbors of any word. 32 | 33 | .. code:: python 34 | 35 | neighbors = embeddings.nearest_neighbors("green") 36 | neighbors 37 | 38 | 39 | 40 | 41 | .. parsed-literal:: 42 | 43 | [u'blue', 44 | u'white', 45 | u'red', 46 | u'yellow', 47 | u'black', 48 | u'grey', 49 | u'purple', 50 | u'pink', 51 | u'light', 52 | u'gray'] 53 | 54 | 55 | 56 | to calculate the distance between a word and the nieghbors, we can call 57 | the ``distances`` method 58 | 59 | .. code:: python 60 | 61 | embeddings.distances("green", neighbors) 62 | 63 | 64 | 65 | 66 | .. parsed-literal:: 67 | 68 | array([ 1.34894466, 1.37864077, 1.39504588, 1.39524949, 1.43183875, 69 | 1.68007386, 1.75897062, 1.88401115, 1.89186132, 1.902614 ], dtype=float32) 70 | 71 | 72 | 73 | The word embeddings are not unit vectors, actually the more frequent the 74 | word is the larger the norm of its own vector. 75 | 76 | .. code:: python 77 | 78 | %matplotlib inline 79 | import matplotlib.pyplot as plt 80 | import numpy as np 81 | 82 | .. 
code:: python 83 | 84 | norms = np.linalg.norm(embeddings.vectors, axis=1) 85 | window = 300 86 | smooth_line = np.convolve(norms, np.ones(window)/float(window), mode='valid') 87 | plt.plot(smooth_line) 88 | plt.xlabel("Word Rank"); _ = plt.ylabel("$L_2$ norm") 89 | 90 | 91 | 92 | .. image:: Embeddings_files/Embeddings_12_0.png 93 | 94 | 95 | This could be problematic for some applications and training algorithms. 96 | We can normalize them by :math:`L_2` norms to get unit vectors to reduce 97 | effects of word frequency, as the following 98 | 99 | .. code:: python 100 | 101 | embeddings = embeddings.normalize_words() 102 | 103 | .. code:: python 104 | 105 | neighbors = embeddings.nearest_neighbors("green") 106 | for w,d in zip(neighbors, embeddings.distances("green", neighbors)): 107 | print("{:<8}{:.4f}".format(w,d)) 108 | 109 | 110 | .. parsed-literal:: 111 | 112 | white 0.4261 113 | blue 0.4451 114 | black 0.4591 115 | red 0.4786 116 | yellow 0.4947 117 | grey 0.6072 118 | purple 0.6392 119 | light 0.6483 120 | pink 0.6574 121 | colour 0.6824 122 | 123 | 124 | Vocabulary Expansion 125 | -------------------- 126 | 127 | .. code:: python 128 | 129 | from polyglot.mapping import CaseExpander, DigitExpander 130 | 131 | Not all the words are available in the dictionary defined by the word 132 | embeddings. Sometimes it would be useful to map new words to similar 133 | ones that we have embeddings for. 134 | 135 | Case Expansion 136 | ~~~~~~~~~~~~~~ 137 | 138 | For example, the word ``GREEN`` is not available in the embeddings, 139 | 140 | .. code:: python 141 | 142 | "GREEN" in embeddings 143 | 144 | 145 | 146 | 147 | .. parsed-literal:: 148 | 149 | False 150 | 151 | 152 | 153 | we would like to return the vector that represents the word ``Green``, 154 | to do that we apply a case expansion: 155 | 156 | .. code:: python 157 | 158 | embeddings.apply_expansion(CaseExpander) 159 | 160 | .. code:: python 161 | 162 | "GREEN" in embeddings 163 | 164 | 165 | 166 | 167 | .. parsed-literal:: 168 | 169 | True 170 | 171 | 172 | 173 | .. code:: python 174 | 175 | embeddings.nearest_neighbors("GREEN") 176 | 177 | 178 | 179 | 180 | .. parsed-literal:: 181 | 182 | [u'White', 183 | u'Black', 184 | u'Brown', 185 | u'Blue', 186 | u'Diamond', 187 | u'Wood', 188 | u'Young', 189 | u'Hudson', 190 | u'Cook', 191 | u'Gold'] 192 | 193 | 194 | 195 | Digit Expansion 196 | ~~~~~~~~~~~~~~~ 197 | 198 | We reduce the size of the vocabulary while training the embeddings by 199 | grouping special classes of words. Once common case of such grouping is 200 | digits. Every digit in the training corpus get replaced by the symbol 201 | ``#``. For example, a number like ``123.54`` becomes ``###.##``. 202 | Therefore, querying the embedding for a new number like ``434`` will 203 | result in a failure 204 | 205 | .. code:: python 206 | 207 | "434" in embeddings 208 | 209 | 210 | 211 | 212 | .. parsed-literal:: 213 | 214 | False 215 | 216 | 217 | 218 | To fix that, we apply another type of vocabulary expansion 219 | ``DigitExpander``. It will map any number to a sequence of ``#``\ s. 220 | 221 | .. code:: python 222 | 223 | embeddings.apply_expansion(DigitExpander) 224 | 225 | .. code:: python 226 | 227 | "434" in embeddings 228 | 229 | 230 | 231 | 232 | .. parsed-literal:: 233 | 234 | True 235 | 236 | 237 | 238 | As expected, the neighbors of the new number ``434`` will be other 239 | numbers: 240 | 241 | .. code:: python 242 | 243 | embeddings.nearest_neighbors("434") 244 | 245 | 246 | 247 | 248 | .. 
parsed-literal:: 249 | 250 | [u'##', 251 | u'#', 252 | u'3', 253 | u'#####', 254 | u'#,###', 255 | u'##,###', 256 | u'##EN##', 257 | u'####', 258 | u'###EN###', 259 | u'n'] 260 | 261 | 262 | 263 | Demo 264 | ---- 265 | 266 | Demo is available `here `__. 267 | 268 | Citation 269 | ~~~~~~~~ 270 | 271 | This work is a direct implementation of the research being described in 272 | the `Polyglot: Distributed Word Representations for Multilingual 273 | NLP `__ paper. The author of 274 | this library strongly encourage you to cite the following paper if you 275 | are using this software. 276 | 277 | :: 278 | 279 | @InProceedings{polyglot:2013:ACL-CoNLL, 280 | author = {Al-Rfou, Rami and Perozzi, Bryan and Skiena, Steven}, 281 | title = {Polyglot: Distributed Word Representations for Multilingual NLP}, 282 | booktitle = {Proceedings of the Seventeenth Conference on Computational Natural Language Learning}, 283 | month = {August}, 284 | year = {2013}, 285 | address = {Sofia, Bulgaria}, 286 | publisher = {Association for Computational Linguistics}, 287 | pages = {183--192}, 288 | url = {http://www.aclweb.org/anthology/W13-3520} 289 | } 290 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | PAPER = 8 | BUILDDIR = _build 9 | 10 | # User-friendly check for sphinx-build 11 | ifeq ($(shell which $(SPHINXBUILD) >/dev/null 2>&1; echo $$?), 1) 12 | $(error The '$(SPHINXBUILD)' command was not found. Make sure you have Sphinx installed, then set the SPHINXBUILD environment variable to point to the full path of the '$(SPHINXBUILD)' executable. Alternatively you can add the directory with the executable to your PATH. If you don't have Sphinx installed, grab it from http://sphinx-doc.org/) 13 | endif 14 | 15 | # Internal variables. 16 | PAPEROPT_a4 = -D latex_paper_size=a4 17 | PAPEROPT_letter = -D latex_paper_size=letter 18 | ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 19 | # the i18n builder cannot share the environment and doctrees with the others 20 | I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 
21 | 22 | .PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest gettext 23 | 24 | help: 25 | @echo "Please use \`make ' where is one of" 26 | @echo " html to make standalone HTML files" 27 | @echo " dirhtml to make HTML files named index.html in directories" 28 | @echo " singlehtml to make a single large HTML file" 29 | @echo " pickle to make pickle files" 30 | @echo " json to make JSON files" 31 | @echo " htmlhelp to make HTML files and a HTML help project" 32 | @echo " qthelp to make HTML files and a qthelp project" 33 | @echo " devhelp to make HTML files and a Devhelp project" 34 | @echo " epub to make an epub" 35 | @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" 36 | @echo " latexpdf to make LaTeX files and run them through pdflatex" 37 | @echo " latexpdfja to make LaTeX files and run them through platex/dvipdfmx" 38 | @echo " text to make text files" 39 | @echo " man to make manual pages" 40 | @echo " texinfo to make Texinfo files" 41 | @echo " info to make Texinfo files and run them through makeinfo" 42 | @echo " gettext to make PO message catalogs" 43 | @echo " changes to make an overview of all changed/added/deprecated items" 44 | @echo " xml to make Docutils-native XML files" 45 | @echo " pseudoxml to make pseudoxml-XML files for display purposes" 46 | @echo " linkcheck to check all external links for integrity" 47 | @echo " doctest to run all doctests embedded in the documentation (if enabled)" 48 | 49 | clean: 50 | rm -rf $(BUILDDIR)/* 51 | 52 | html: 53 | $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html 54 | @echo 55 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." 56 | 57 | dirhtml: 58 | $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml 59 | @echo 60 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml." 61 | 62 | singlehtml: 63 | $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml 64 | @echo 65 | @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml." 66 | 67 | pickle: 68 | $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle 69 | @echo 70 | @echo "Build finished; now you can process the pickle files." 71 | 72 | json: 73 | $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json 74 | @echo 75 | @echo "Build finished; now you can process the JSON files." 76 | 77 | htmlhelp: 78 | $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp 79 | @echo 80 | @echo "Build finished; now you can run HTML Help Workshop with the" \ 81 | ".hhp project file in $(BUILDDIR)/htmlhelp." 82 | 83 | qthelp: 84 | $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp 85 | @echo 86 | @echo "Build finished; now you can run "qcollectiongenerator" with the" \ 87 | ".qhcp project file in $(BUILDDIR)/qthelp, like this:" 88 | @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/complexity.qhcp" 89 | @echo "To view the help file:" 90 | @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/complexity.qhc" 91 | 92 | devhelp: 93 | $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp 94 | @echo 95 | @echo "Build finished." 96 | @echo "To view the help file:" 97 | @echo "# mkdir -p $$HOME/.local/share/devhelp/complexity" 98 | @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/complexity" 99 | @echo "# devhelp" 100 | 101 | epub: 102 | $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub 103 | @echo 104 | @echo "Build finished. The epub file is in $(BUILDDIR)/epub." 
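# The rules below all wrap $(SPHINXBUILD) with a specific builder (-b) that
# writes into its own subdirectory of $(BUILDDIR); latexpdf/latexpdfja and
# info additionally post-process the output with (pdf)latex or makeinfo.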
105 | 106 | latex: 107 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 108 | @echo 109 | @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." 110 | @echo "Run \`make' in that directory to run these through (pdf)latex" \ 111 | "(use \`make latexpdf' here to do that automatically)." 112 | 113 | latexpdf: 114 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 115 | @echo "Running LaTeX files through pdflatex..." 116 | $(MAKE) -C $(BUILDDIR)/latex all-pdf 117 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 118 | 119 | latexpdfja: 120 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 121 | @echo "Running LaTeX files through platex and dvipdfmx..." 122 | $(MAKE) -C $(BUILDDIR)/latex all-pdf-ja 123 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 124 | 125 | text: 126 | $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text 127 | @echo 128 | @echo "Build finished. The text files are in $(BUILDDIR)/text." 129 | 130 | man: 131 | $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man 132 | @echo 133 | @echo "Build finished. The manual pages are in $(BUILDDIR)/man." 134 | 135 | texinfo: 136 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 137 | @echo 138 | @echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo." 139 | @echo "Run \`make' in that directory to run these through makeinfo" \ 140 | "(use \`make info' here to do that automatically)." 141 | 142 | info: 143 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 144 | @echo "Running Texinfo files through makeinfo..." 145 | make -C $(BUILDDIR)/texinfo info 146 | @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo." 147 | 148 | gettext: 149 | $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale 150 | @echo 151 | @echo "Build finished. The message catalogs are in $(BUILDDIR)/locale." 152 | 153 | changes: 154 | $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes 155 | @echo 156 | @echo "The overview file is in $(BUILDDIR)/changes." 157 | 158 | linkcheck: 159 | $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck 160 | @echo 161 | @echo "Link check complete; look for any errors in the above output " \ 162 | "or in $(BUILDDIR)/linkcheck/output.txt." 163 | 164 | doctest: 165 | $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest 166 | @echo "Testing of doctests in the sources finished, look at the " \ 167 | "results in $(BUILDDIR)/doctest/output.txt." 168 | 169 | xml: 170 | $(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml 171 | @echo 172 | @echo "Build finished. The XML files are in $(BUILDDIR)/xml." 173 | 174 | pseudoxml: 175 | $(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml 176 | @echo 177 | @echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml." -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | REM Command file for Sphinx documentation 4 | 5 | if "%SPHINXBUILD%" == "" ( 6 | set SPHINXBUILD=sphinx-build 7 | ) 8 | set BUILDDIR=_build 9 | set ALLSPHINXOPTS=-d %BUILDDIR%/doctrees %SPHINXOPTS% . 10 | set I18NSPHINXOPTS=%SPHINXOPTS% . 
11 | if NOT "%PAPER%" == "" ( 12 | set ALLSPHINXOPTS=-D latex_paper_size=%PAPER% %ALLSPHINXOPTS% 13 | set I18NSPHINXOPTS=-D latex_paper_size=%PAPER% %I18NSPHINXOPTS% 14 | ) 15 | 16 | if "%1" == "" goto help 17 | 18 | if "%1" == "help" ( 19 | :help 20 | echo.Please use `make ^` where ^ is one of 21 | echo. html to make standalone HTML files 22 | echo. dirhtml to make HTML files named index.html in directories 23 | echo. singlehtml to make a single large HTML file 24 | echo. pickle to make pickle files 25 | echo. json to make JSON files 26 | echo. htmlhelp to make HTML files and a HTML help project 27 | echo. qthelp to make HTML files and a qthelp project 28 | echo. devhelp to make HTML files and a Devhelp project 29 | echo. epub to make an epub 30 | echo. latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter 31 | echo. text to make text files 32 | echo. man to make manual pages 33 | echo. texinfo to make Texinfo files 34 | echo. gettext to make PO message catalogs 35 | echo. changes to make an overview over all changed/added/deprecated items 36 | echo. xml to make Docutils-native XML files 37 | echo. pseudoxml to make pseudoxml-XML files for display purposes 38 | echo. linkcheck to check all external links for integrity 39 | echo. doctest to run all doctests embedded in the documentation if enabled 40 | goto end 41 | ) 42 | 43 | if "%1" == "clean" ( 44 | for /d %%i in (%BUILDDIR%\*) do rmdir /q /s %%i 45 | del /q /s %BUILDDIR%\* 46 | goto end 47 | ) 48 | 49 | 50 | %SPHINXBUILD% 2> nul 51 | if errorlevel 9009 ( 52 | echo. 53 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 54 | echo.installed, then set the SPHINXBUILD environment variable to point 55 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 56 | echo.may add the Sphinx directory to PATH. 57 | echo. 58 | echo.If you don't have Sphinx installed, grab it from 59 | echo.http://sphinx-doc.org/ 60 | exit /b 1 61 | ) 62 | 63 | if "%1" == "html" ( 64 | %SPHINXBUILD% -b html %ALLSPHINXOPTS% %BUILDDIR%/html 65 | if errorlevel 1 exit /b 1 66 | echo. 67 | echo.Build finished. The HTML pages are in %BUILDDIR%/html. 68 | goto end 69 | ) 70 | 71 | if "%1" == "dirhtml" ( 72 | %SPHINXBUILD% -b dirhtml %ALLSPHINXOPTS% %BUILDDIR%/dirhtml 73 | if errorlevel 1 exit /b 1 74 | echo. 75 | echo.Build finished. The HTML pages are in %BUILDDIR%/dirhtml. 76 | goto end 77 | ) 78 | 79 | if "%1" == "singlehtml" ( 80 | %SPHINXBUILD% -b singlehtml %ALLSPHINXOPTS% %BUILDDIR%/singlehtml 81 | if errorlevel 1 exit /b 1 82 | echo. 83 | echo.Build finished. The HTML pages are in %BUILDDIR%/singlehtml. 84 | goto end 85 | ) 86 | 87 | if "%1" == "pickle" ( 88 | %SPHINXBUILD% -b pickle %ALLSPHINXOPTS% %BUILDDIR%/pickle 89 | if errorlevel 1 exit /b 1 90 | echo. 91 | echo.Build finished; now you can process the pickle files. 92 | goto end 93 | ) 94 | 95 | if "%1" == "json" ( 96 | %SPHINXBUILD% -b json %ALLSPHINXOPTS% %BUILDDIR%/json 97 | if errorlevel 1 exit /b 1 98 | echo. 99 | echo.Build finished; now you can process the JSON files. 100 | goto end 101 | ) 102 | 103 | if "%1" == "htmlhelp" ( 104 | %SPHINXBUILD% -b htmlhelp %ALLSPHINXOPTS% %BUILDDIR%/htmlhelp 105 | if errorlevel 1 exit /b 1 106 | echo. 107 | echo.Build finished; now you can run HTML Help Workshop with the ^ 108 | .hhp project file in %BUILDDIR%/htmlhelp. 109 | goto end 110 | ) 111 | 112 | if "%1" == "qthelp" ( 113 | %SPHINXBUILD% -b qthelp %ALLSPHINXOPTS% %BUILDDIR%/qthelp 114 | if errorlevel 1 exit /b 1 115 | echo. 
116 | echo.Build finished; now you can run "qcollectiongenerator" with the ^ 117 | .qhcp project file in %BUILDDIR%/qthelp, like this: 118 | echo.^> qcollectiongenerator %BUILDDIR%\qthelp\complexity.qhcp 119 | echo.To view the help file: 120 | echo.^> assistant -collectionFile %BUILDDIR%\qthelp\complexity.ghc 121 | goto end 122 | ) 123 | 124 | if "%1" == "devhelp" ( 125 | %SPHINXBUILD% -b devhelp %ALLSPHINXOPTS% %BUILDDIR%/devhelp 126 | if errorlevel 1 exit /b 1 127 | echo. 128 | echo.Build finished. 129 | goto end 130 | ) 131 | 132 | if "%1" == "epub" ( 133 | %SPHINXBUILD% -b epub %ALLSPHINXOPTS% %BUILDDIR%/epub 134 | if errorlevel 1 exit /b 1 135 | echo. 136 | echo.Build finished. The epub file is in %BUILDDIR%/epub. 137 | goto end 138 | ) 139 | 140 | if "%1" == "latex" ( 141 | %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex 142 | if errorlevel 1 exit /b 1 143 | echo. 144 | echo.Build finished; the LaTeX files are in %BUILDDIR%/latex. 145 | goto end 146 | ) 147 | 148 | if "%1" == "latexpdf" ( 149 | %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex 150 | cd %BUILDDIR%/latex 151 | make all-pdf 152 | cd %BUILDDIR%/.. 153 | echo. 154 | echo.Build finished; the PDF files are in %BUILDDIR%/latex. 155 | goto end 156 | ) 157 | 158 | if "%1" == "latexpdfja" ( 159 | %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex 160 | cd %BUILDDIR%/latex 161 | make all-pdf-ja 162 | cd %BUILDDIR%/.. 163 | echo. 164 | echo.Build finished; the PDF files are in %BUILDDIR%/latex. 165 | goto end 166 | ) 167 | 168 | if "%1" == "text" ( 169 | %SPHINXBUILD% -b text %ALLSPHINXOPTS% %BUILDDIR%/text 170 | if errorlevel 1 exit /b 1 171 | echo. 172 | echo.Build finished. The text files are in %BUILDDIR%/text. 173 | goto end 174 | ) 175 | 176 | if "%1" == "man" ( 177 | %SPHINXBUILD% -b man %ALLSPHINXOPTS% %BUILDDIR%/man 178 | if errorlevel 1 exit /b 1 179 | echo. 180 | echo.Build finished. The manual pages are in %BUILDDIR%/man. 181 | goto end 182 | ) 183 | 184 | if "%1" == "texinfo" ( 185 | %SPHINXBUILD% -b texinfo %ALLSPHINXOPTS% %BUILDDIR%/texinfo 186 | if errorlevel 1 exit /b 1 187 | echo. 188 | echo.Build finished. The Texinfo files are in %BUILDDIR%/texinfo. 189 | goto end 190 | ) 191 | 192 | if "%1" == "gettext" ( 193 | %SPHINXBUILD% -b gettext %I18NSPHINXOPTS% %BUILDDIR%/locale 194 | if errorlevel 1 exit /b 1 195 | echo. 196 | echo.Build finished. The message catalogs are in %BUILDDIR%/locale. 197 | goto end 198 | ) 199 | 200 | if "%1" == "changes" ( 201 | %SPHINXBUILD% -b changes %ALLSPHINXOPTS% %BUILDDIR%/changes 202 | if errorlevel 1 exit /b 1 203 | echo. 204 | echo.The overview file is in %BUILDDIR%/changes. 205 | goto end 206 | ) 207 | 208 | if "%1" == "linkcheck" ( 209 | %SPHINXBUILD% -b linkcheck %ALLSPHINXOPTS% %BUILDDIR%/linkcheck 210 | if errorlevel 1 exit /b 1 211 | echo. 212 | echo.Link check complete; look for any errors in the above output ^ 213 | or in %BUILDDIR%/linkcheck/output.txt. 214 | goto end 215 | ) 216 | 217 | if "%1" == "doctest" ( 218 | %SPHINXBUILD% -b doctest %ALLSPHINXOPTS% %BUILDDIR%/doctest 219 | if errorlevel 1 exit /b 1 220 | echo. 221 | echo.Testing of doctests in the sources finished, look at the ^ 222 | results in %BUILDDIR%/doctest/output.txt. 223 | goto end 224 | ) 225 | 226 | if "%1" == "xml" ( 227 | %SPHINXBUILD% -b xml %ALLSPHINXOPTS% %BUILDDIR%/xml 228 | if errorlevel 1 exit /b 1 229 | echo. 230 | echo.Build finished. The XML files are in %BUILDDIR%/xml. 
231 | goto end 232 | ) 233 | 234 | if "%1" == "pseudoxml" ( 235 | %SPHINXBUILD% -b pseudoxml %ALLSPHINXOPTS% %BUILDDIR%/pseudoxml 236 | if errorlevel 1 exit /b 1 237 | echo. 238 | echo.Build finished. The pseudo-XML files are in %BUILDDIR%/pseudoxml. 239 | goto end 240 | ) 241 | 242 | :end -------------------------------------------------------------------------------- /polyglot/mixins.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | import sys 4 | 5 | import six 6 | from six import PY2, binary_type 7 | 8 | if PY2: 9 | string_types = (str, unicode) 10 | basestring = basestring 11 | def implements_to_string(cls): 12 | """Class decorator that renames __str__ to __unicode__ and 13 | modifies __str__ that returns utf-8. 14 | """ 15 | cls.__unicode__ = cls.__str__ 16 | cls.__str__ = lambda x: x.__unicode__().encode('utf-8') 17 | return cls 18 | else: # PY3 19 | string_types = (str,) 20 | basestring = (str, bytes) 21 | implements_to_string = lambda x: x 22 | 23 | class ComparableMixin(object): 24 | 25 | '''Implements rich operators for an object.''' 26 | 27 | def _compare(self, other, method): 28 | try: 29 | return method(self._cmpkey(), other._cmpkey()) 30 | except (AttributeError, TypeError): 31 | # _cmpkey not implemented, or return different type, 32 | # so I can't compare with "other". Try the reverse comparison 33 | return NotImplemented 34 | 35 | def __lt__(self, other): 36 | return self._compare(other, lambda s, o: s < o) 37 | 38 | def __le__(self, other): 39 | return self._compare(other, lambda s, o: s <= o) 40 | 41 | def __eq__(self, other): 42 | return self._compare(other, lambda s, o: s == o) 43 | 44 | def __ge__(self, other): 45 | return self._compare(other, lambda s, o: s >= o) 46 | 47 | def __gt__(self, other): 48 | return self._compare(other, lambda s, o: s > o) 49 | 50 | def __ne__(self, other): 51 | return self._compare(other, lambda s, o: s != o) 52 | 53 | 54 | class BlobComparableMixin(ComparableMixin): 55 | 56 | '''Allow blob objects to be comparable with both strings and blobs.''' 57 | 58 | def _compare(self, other, method): 59 | if isinstance(other, basestring): 60 | # Just compare with the other string 61 | return method(self._cmpkey(), other) 62 | return super(BlobComparableMixin, self)._compare(other, method) 63 | 64 | 65 | @implements_to_string 66 | class StringlikeMixin(object): 67 | 68 | '''Make blob objects behave like Python strings. 69 | 70 | Expects that classes that use this mixin to have a _strkey() method that 71 | returns the string to apply string methods to. Using _strkey() instead 72 | of __str__ ensures consistent behavior between Python 2 and 3. 73 | ''' 74 | 75 | def __repr__(self): 76 | '''Returns a string representation for debugging.''' 77 | class_name = self.__class__.__name__ 78 | text = self.__unicode__().encode("utf-8") if PY2 else str(self) 79 | ret = '{cls}("{text}")'.format(cls=class_name, 80 | text=text) 81 | return binary_type(ret) if PY2 else ret 82 | 83 | def __str__(self): 84 | '''Returns a string representation used in print statements 85 | or str(my_blob).''' 86 | return self._strkey() 87 | 88 | def __len__(self): 89 | '''Returns the length of the raw text.''' 90 | return len(self._strkey()) 91 | 92 | def __iter__(self): 93 | '''Makes the object iterable as if it were a string, 94 | iterating through the raw string's characters. 
95 | ''' 96 | return iter(self._strkey()) 97 | 98 | def __contains__(self, sub): 99 | '''Implements the `in` keyword like a Python string.''' 100 | return sub in self._strkey() 101 | 102 | def __getitem__(self, index): 103 | '''Returns a substring. If index is an integer, returns a Python 104 | string of a single character. If a range is given, e.g. `blob[3:5]`, 105 | a new instance of the class is returned. 106 | ''' 107 | if isinstance(index, int): 108 | return self._strkey()[index] # Just return a single character 109 | else: 110 | # Return a new blob object 111 | return self.__class__(self._strkey()[index]) 112 | 113 | def find(self, sub, start=0, end=sys.maxsize): 114 | '''Behaves like the built-in str.find() method. Returns an integer, 115 | the index of the first occurrence of the substring argument sub in the 116 | sub-string given by [start:end]. 117 | ''' 118 | return self._strkey().find(sub, start, end) 119 | 120 | def rfind(self, sub, start=0, end=sys.maxsize): 121 | '''Behaves like the built-in str.rfind() method. Returns an integer, 122 | the index of he last (right-most) occurence of the substring argument 123 | sub in the sub-sequence given by [start:end]. 124 | ''' 125 | return self._strkey().rfind(sub, start, end) 126 | 127 | def index(self, sub, start=0, end=sys.maxsize): 128 | '''Like blob.find() but raise ValueError when the substring 129 | is not found. 130 | ''' 131 | return self._strkey().index(sub, start, end) 132 | 133 | def rindex(self, sub, start=0, end=sys.maxsize): 134 | '''Like blob.rfind() but raise ValueError when substring is not 135 | found. 136 | ''' 137 | return self._strkey().rindex(sub, start, end) 138 | 139 | def startswith(self, prefix, start=0, end=sys.maxsize): 140 | """Returns True if the blob starts with the given prefix.""" 141 | return self._strkey().startswith(prefix, start, end) 142 | 143 | def endswith(self, suffix, start=0, end=sys.maxsize): 144 | """Returns True if the blob ends with the given suffix.""" 145 | return self._strkey().endswith(suffix, start, end) 146 | 147 | # PEP8 aliases 148 | starts_with = startswith 149 | ends_with = endswith 150 | 151 | def title(self): 152 | """Returns a blob object with the text in title-case.""" 153 | return self.__class__(self._strkey().title()) 154 | 155 | def format(self, *args, **kwargs): 156 | """Perform a string formatting operation, like the built-in 157 | `str.format(*args, **kwargs)`. Returns a blob object. 158 | """ 159 | return self.__class__(self._strkey().format(*args, **kwargs)) 160 | 161 | def split(self, sep=None, maxsplit=sys.maxsize): 162 | """Behaves like the built-in str.split(). 163 | """ 164 | return self._strkey().split(sep, maxsplit) 165 | 166 | def strip(self, chars=None): 167 | """Behaves like the built-in str.strip([chars]) method. Returns 168 | an object with leading and trailing whitespace removed. 169 | """ 170 | return self.__class__(self._strkey().strip(chars)) 171 | 172 | def upper(self): 173 | """Like str.upper(), returns new object with all upper-cased characters. 174 | """ 175 | return self.__class__(self._strkey().upper()) 176 | 177 | def lower(self): 178 | """Like str.lower(), returns new object with all lower-cased characters. 179 | """ 180 | return self.__class__(self._strkey().lower()) 181 | 182 | def join(self, iterable): 183 | """Behaves like the built-in `str.join(iterable)` method, except 184 | returns a blob object. 185 | 186 | Returns a blob which is the concatenation of the strings or blobs 187 | in the iterable. 
188 | """ 189 | return self.__class__(self._strkey().join(iterable)) 190 | 191 | def replace(self, old, new, count=sys.maxsize): 192 | """Return a new blob object with all the occurence of `old` replaced 193 | by `new`. 194 | """ 195 | return self.__class__(self._strkey().replace(old, new, count)) 196 | -------------------------------------------------------------------------------- /docs/Sentiment.rst: -------------------------------------------------------------------------------- 1 | 2 | Sentiment 3 | ========= 4 | 5 | Polyglot has polarity lexicons for 136 languages. The scale of the 6 | words' polarity consisted of three degrees: +1 for positive words, and 7 | -1 for negatives words. Neutral words will have a score of 0. 8 | 9 | Languages Coverage 10 | ~~~~~~~~~~~~~~~~~~ 11 | 12 | .. code:: python 13 | 14 | from polyglot.downloader import downloader 15 | print(downloader.supported_languages_table("sentiment2", 3)) 16 | 17 | 18 | .. parsed-literal:: 19 | 20 | 1. Turkmen 2. Thai 3. Latvian 21 | 4. Zazaki 5. Tagalog 6. Tamil 22 | 7. Tajik 8. Telugu 9. Luxembourgish, Letzeb... 23 | 10. Alemannic 11. Latin 12. Turkish 24 | 13. Limburgish, Limburgan... 14. Egyptian Arabic 15. Tatar 25 | 16. Lithuanian 17. Spanish; Castilian 18. Basque 26 | 19. Estonian 20. Asturian 21. Greek, Modern 27 | 22. Esperanto 23. English 24. Ukrainian 28 | 25. Marathi (Marāṭhī) 26. Maltese 27. Burmese 29 | 28. Kapampangan 29. Uighur, Uyghur 30. Uzbek 30 | 31. Malagasy 32. Yiddish 33. Macedonian 31 | 34. Urdu 35. Malayalam 36. Mongolian 32 | 37. Breton 38. Bosnian 39. Bengali 33 | 40. Tibetan Standard, Tib... 41. Belarusian 42. Bulgarian 34 | 43. Bashkir 44. Vietnamese 45. Volapük 35 | 46. Gan Chinese 47. Manx 48. Gujarati 36 | 49. Yoruba 50. Occitan 51. Scottish Gaelic; Gaelic 37 | 52. Irish 53. Galician 54. Ossetian, Ossetic 38 | 55. Oriya 56. Walloon 57. Swedish 39 | 58. Silesian 59. Lombard language 60. Divehi; Dhivehi; Mald... 40 | 61. Danish 62. German 63. Armenian 41 | 64. Haitian; Haitian Creole 65. Hungarian 66. Croatian 42 | 67. Bishnupriya Manipuri 68. Hindi 69. Hebrew (modern) 43 | 70. Portuguese 71. Afrikaans 72. Pashto, Pushto 44 | 73. Amharic 74. Aragonese 75. Bavarian 45 | 76. Assamese 77. Panjabi, Punjabi 78. Polish 46 | 79. Azerbaijani 80. Italian 81. Arabic 47 | 82. Icelandic 83. Ido 84. Scots 48 | 85. Sicilian 86. Indonesian 87. Chinese Word 49 | 88. Interlingua 89. Waray-Waray 90. Piedmontese language 50 | 91. Quechua 92. French 93. Dutch 51 | 94. Norwegian Nynorsk 95. Norwegian 96. Western Frisian 52 | 97. Upper Sorbian 98. Nepali 99. Persian 53 | 100. Ilokano 101. Finnish 102. Faroese 54 | 103. Romansh 104. Javanese 105. Romanian, Moldavian, ... 55 | 106. Malay 107. Japanese 108. Russian 56 | 109. Catalan; Valencian 110. Fiji Hindi 111. Chinese 57 | 112. Cebuano 113. Czech 114. Chuvash 58 | 115. Welsh 116. West Flemish 117. Kirghiz, Kyrgyz 59 | 118. Kurdish 119. Kazakh 120. Korean 60 | 121. Kannada 122. Khmer 123. Georgian 61 | 124. Sakha 125. Serbian 126. Albanian 62 | 127. Swahili 128. Chechen 129. Sundanese 63 | 130. Sanskrit (Saṁskṛta) 131. Venetian 132. Northern Sami 64 | 133. Slovak 134. Sinhala, Sinhalese 135. Bosnian-Croatian-Serbian 65 | 136. Slovene 66 | 67 | 68 | .. code:: python 69 | 70 | from polyglot.text import Text 71 | 72 | Polarity 73 | -------- 74 | 75 | To inquiry the polarity of a word, we can just call its own attribute 76 | ``polarity`` 77 | 78 | .. code:: python 79 | 80 | text = Text("The movie was really good.") 81 | 82 | .. 
code:: python 83 | 84 | print("{:<16}{}".format("Word", "Polarity")+"\n"+"-"*30) 85 | for w in text.words: 86 | print("{:<16}{:>2}".format(w, w.polarity)) 87 | 88 | 89 | .. parsed-literal:: 90 | 91 | Word Polarity 92 | ------------------------------ 93 | The 0 94 | movie 0 95 | was 0 96 | really 0 97 | good 1 98 | . 0 99 | 100 | 101 | Entity Sentiment 102 | ---------------- 103 | 104 | We can calculate a more sophisticated sentiment score for an entity that 105 | is mentioned in the text as follows: 106 | 107 | .. code:: python 108 | 109 | blob = ("Barack Obama gave a fantastic speech last night. " 110 | "Reports indicate he will move next to New Hampshire.") 111 | text = Text(blob) 112 | 113 | First, we need to split the text into sentences; this will limit the words 114 | that affect the sentiment of an entity to the words mentioned in the same 115 | sentence. 116 | 117 | .. code:: python 118 | 119 | first_sentence = text.sentences[0] 120 | print(first_sentence) 121 | 122 | 123 | .. parsed-literal:: 124 | 125 | Barack Obama gave a fantastic speech last night. 126 | 127 | 128 | Second, we extract the entities. 129 | 130 | .. code:: python 131 | 132 | first_entity = first_sentence.entities[0] 133 | print(first_entity) 134 | 135 | 136 | .. parsed-literal:: 137 | 138 | [u'Obama'] 139 | 140 | 141 | Finally, for each entity we identified, we can calculate the strength of 142 | the positive or negative sentiment it has on a scale from 0 to 1. 143 | 144 | .. code:: python 145 | 146 | first_entity.positive_sentiment 147 | 148 | 149 | 150 | 151 | .. parsed-literal:: 152 | 153 | 0.9375 154 | 155 | 156 | 157 | .. code:: python 158 | 159 | first_entity.negative_sentiment 160 | 161 | 162 | 163 | 164 | .. parsed-literal:: 165 | 166 | 0 167 | 168 | 169 | 170 | Citation 171 | ~~~~~~~~ 172 | 173 | This work is a direct implementation of the research described in 174 | the `Building sentiment lexicons for all major 175 | languages `__ paper. The author of 176 | this library strongly encourages you to cite the following paper if you 177 | are using this software. 178 | 179 | :: 180 | 181 | @inproceedings{chen2014building, 182 | title={Building sentiment lexicons for all major languages}, 183 | author={Chen, Yanqing and Skiena, Steven}, 184 | booktitle={Proceedings of the 52nd Annual Meeting of the Association for Computational Linguistics (Short Papers)}, 185 | pages={383--389}, 186 | year={2014}} 187 | -------------------------------------------------------------------------------- /docs/CLI.rst: -------------------------------------------------------------------------------- 1 | 2 | Command Line Interface 3 | ====================== 4 | 5 | The polyglot package offers a command line interface along with the library 6 | access. For each task in polyglot, there is a subcommand with specific 7 | options for that task. Common options are gathered under the main 8 | command ``polyglot``. 9 | 10 | .. code:: python 11 | 12 | !polyglot --help 13 | 14 | 15 | .. parsed-literal:: 16 | 17 | usage: polyglot [-h] [--lang LANG] [--delimiter DELIMITER] [--workers WORKERS] [-l LOG] [--debug] 18 | {detect,morph,tokenize,download,count,cat,ner,pos,transliteration,sentiment} ... 19 | 20 | optional arguments: 21 | -h, --help show this help message and exit 22 | --lang LANG Language to be processed 23 | --delimiter DELIMITER 24 | Delimiter that seperates documents, records or even sentences. 25 | --workers WORKERS Number of parallel processes. 26 | -l LOG, --log LOG log verbosity level 27 | --debug drop a debugger if an exception is raised.
28 | 29 | tools: 30 | multilingual tools for all languages 31 | 32 | {detect,morph,tokenize,download,count,cat,ner,pos,transliteration,sentiment} 33 | detect Detect the language(s) used in text. 34 | tokenize Tokenize text into sentences and words. 35 | download Download polyglot resources and models. 36 | count Count words frequency in a corpus. 37 | cat Print the contents of the input file to the screen. 38 | ner Named entity recognition chunking. 39 | pos Part of Speech tagger. 40 | transliteration Rewriting the input in the target language script. 41 | sentiment Classify text to positive and negative polarity. 42 | 43 | 44 | Notice that most of the operations are language specific. For example, 45 | tokenization rules and part of speech taggers differ between languages. 46 | Therefore, it is important that the language of the input is detected 47 | or given. The ``--lang`` option allows you to tell polyglot which 48 | language the input is written in. 49 | 50 | .. code:: python 51 | 52 | !polyglot --lang en tokenize --input testdata/cricket.txt | head -n 3 53 | 54 | 55 | .. parsed-literal:: 56 | 57 | Australia posted a World Cup record total of 417 - 6 as they beat Afghanistan by 275 runs . 58 | David Warner hit 178 off 133 balls , Steve Smith scored 95 while Glenn Maxwell struck 88 in 39 deliveries in the Pool A encounter in Perth . 59 | Afghanistan were then dismissed for 142 , with Mitchell Johnson and Mitchell Starc taking six wickets between them . 60 | 61 | 62 | In case the user did not supply the language code, polyglot will 63 | peek ahead and read the first 1KB of data to detect the language used in 64 | the input. 65 | 66 | .. code:: python 67 | 68 | !polyglot tokenize --input testdata/cricket.txt | head -n 3 69 | 70 | 71 | .. parsed-literal:: 72 | 73 | 2015-03-15 17:06:45 INFO __main__.py: 276 Language English is detected while reading the first 1128 bytes. 74 | Australia posted a World Cup record total of 417 - 6 as they beat Afghanistan by 275 runs . 75 | David Warner hit 178 off 133 balls , Steve Smith scored 95 while Glenn Maxwell struck 88 in 39 deliveries in the Pool A encounter in Perth . 76 | Afghanistan were then dismissed for 142 , with Mitchell Johnson and Mitchell Starc taking six wickets between them . 77 | 78 | 79 | Input formats 80 | ------------- 81 | 82 | Polyglot will process the input contents line by line assuming that the 83 | lines are separated by "``\n``\ ". If the file is formatted differently, 84 | you can use the polyglot main command option ``delimiter`` to specify 85 | any string other than "``\n``\ ". 86 | 87 | You can pass text to the polyglot subcommands in several ways: 88 | 89 | - **Standard input**: This is usually useful for building processing 90 | pipelines. 91 | 92 | - **Text file**: The file contents will be processed line by line. 93 | 94 | - **Collection of text files**: Polyglot will iterate over the files 95 | one by one. If the polyglot main command option ``workers`` is 96 | activated, the execution will be parallelized and each file will be 97 | processed by a different process. 98 | 99 | Word Count Example 100 | ------------------ 101 | 102 | This example will demonstrate how to use the polyglot main command 103 | options and the subcommand count to generate a count of the words 104 | appearing in a collection of text files. 105 | 106 | First, let us examine the subcommand ``count`` options. 107 | 108 | .. code:: python 109 | 110 | !polyglot count --help 111 | 112 | 113 | ..
parsed-literal:: 114 | 115 | usage: polyglot count [-h] [--min-count MIN_COUNT | --most-freq MOST_FREQ] [--input [INPUT [INPUT ...]]] 116 | 117 | optional arguments: 118 | -h, --help show this help message and exit 119 | --min-count MIN_COUNT 120 | Ignore all words that appear <= min_freq. 121 | --most-freq MOST_FREQ 122 | Consider only the most frequent k words. 123 | --input [INPUT [INPUT ...]] 124 | 125 | 126 | To avoid long output, we will restrict the count to the words that 127 | appeared at least twice. 128 | 129 | .. code:: python 130 | 131 | !polyglot count --input testdata/cricket.txt --min-count 2 132 | 133 | 134 | .. parsed-literal:: 135 | 136 | in 10 137 | the 6 138 | by 3 139 | and 3 140 | of 3 141 | Bermuda 2 142 | West 2 143 | Mitchell 2 144 | South 2 145 | Indies 2 146 | against 2 147 | beat 2 148 | as 2 149 | India 2 150 | which 2 151 | score 2 152 | Afghanistan 2 153 | 154 | 155 | Let us consider the scenario where we have hundreds of files that 156 | contain words we want to count. Notice that we can parallelize the 157 | process by passing a number higher than 1 to the polyglot main command 158 | option ``workers``. 159 | 160 | .. code:: python 161 | 162 | !polyglot --log debug --workers 5 count --input testdata/cricket.txt testdata/cricket.txt --min-count 3 163 | 164 | 165 | .. parsed-literal:: 166 | 167 | in 20 168 | the 12 169 | of 6 170 | by 6 171 | and 6 172 | West 4 173 | Afghanistan 4 174 | India 4 175 | beat 4 176 | which 4 177 | Indies 4 178 | Bermuda 4 179 | as 4 180 | South 4 181 | Mitchell 4 182 | against 4 183 | score 4 184 | 185 | 186 | Building Pipelines 187 | ------------------ 188 | 189 | The previous subcommand ``count`` assumed that the words are separated by 190 | spaces. Given that we never tokenized the text file, that may result in 191 | suboptimal word counting. Let us take a closer look at the tail of the 192 | word counts. 193 | 194 | .. code:: python 195 | 196 | !polyglot count --input testdata/cricket.txt | tail -n 10 197 | 198 | 199 | .. parsed-literal:: 200 | 201 | Ireland 1 202 | surpassed 1 203 | amount 1 204 | equalled 1 205 | a 1 206 | The 1 207 | 413-5 1 208 | Africa's 1 209 | tournament 1 210 | Johnson 1 211 | 212 | 213 | Observe that words like "2007." could have been considered two words 214 | "2007" and "." and the same for "Africa's". To fix this issue, we can 215 | use the polyglot subcommand tokenize to deal with these cases. We can 216 | stage the counting to happen after the tokenization using stdin to 217 | build a simple pipe. 218 | 219 | .. code:: python 220 | 221 | !polyglot --lang en tokenize --input testdata/cricket.txt | polyglot count --min-count 2 222 | 223 | 224 | .. parsed-literal:: 225 | 226 | in 10 227 | the 6 228 | . 6 229 | - 5 230 | , 4 231 | of 3 232 | and 3 233 | by 3 234 | South 2 235 | 5 2 236 | 2007 2 237 | Bermuda 2 238 | which 2 239 | score 2 240 | against 2 241 | Mitchell 2 242 | as 2 243 | West 2 244 | India 2 245 | beat 2 246 | Afghanistan 2 247 | 248 | 249 | 250 | Notice that the word "2007" now appears in the word counts list.
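The same pipeline pattern composes with the other subcommands as well. As a minimal sketch, assuming the English morphological analysis model has already been downloaded (``polyglot download morph2.en``, as described in the Morphological Analysis documentation), the tokenizer output can be piped into the ``morph`` subcommand:

.. code:: python

    !polyglot --lang en tokenize --input testdata/cricket.txt | polyglot --lang en morph | tail -n 30

Each line of the output pairs a token with its segmentation into morphemes; the Morphological Analysis documentation shows the full output of this command.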
251 | -------------------------------------------------------------------------------- /polyglot/mapping/base.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ Supports word embeddings.""" 5 | 6 | from io import open, StringIO 7 | from collections import Counter 8 | import os 9 | from concurrent.futures import ProcessPoolExecutor 10 | 11 | import six 12 | from six.moves import zip 13 | from six import iteritems 14 | from six import text_type as unicode 15 | from six import string_types 16 | 17 | from ..base import TextFile 18 | from ..utils import _open 19 | 20 | def count(lines): 21 | """ Counts the word frequences in a list of sentences. 22 | 23 | Note: 24 | This is a helper function for parallel execution of `Vocabulary.from_text` 25 | method. 26 | """ 27 | words = [w for l in lines for w in l.strip().split()] 28 | return Counter(words) 29 | 30 | 31 | class VocabularyBase(object): 32 | """ A set of words/tokens that have consistent IDs. 33 | 34 | Note: 35 | Words will be sorted according to their lexicographic order. 36 | 37 | Attributes: 38 | word_id (dictionary): Mapping from words to IDs. 39 | id_word (dictionary): A reverse map of `word_id`. 40 | """ 41 | 42 | def __init__(self, words=None): 43 | """ Build attributes word_id and id_word from input. 44 | 45 | Args: 46 | words (list/set): list or set of words. 47 | """ 48 | words = self.sanitize_words(words) 49 | self.word_id = {w:i for i, w in enumerate(sorted(words))} 50 | self.id_word = {i:w for w,i in iteritems(self.word_id)} 51 | 52 | def sanitize_words(self, words): 53 | """Guarantees that all textual symbols are unicode. 54 | 55 | Note: 56 | We do not convert numbers, only strings to unicode. 57 | We assume that the strings are encoded in utf-8. 58 | """ 59 | _words = [] 60 | for w in words: 61 | if isinstance(w, string_types) and not isinstance(w, unicode): 62 | _words.append(unicode(w, encoding="utf-8")) 63 | else: 64 | _words.append(w) 65 | return _words 66 | 67 | def __iter__(self): 68 | """Iterate over the words in a vocabulary.""" 69 | for w,i in sorted(iteritems(self.word_id), key=lambda wc: wc[1]): 70 | yield w 71 | 72 | @property 73 | def words(self): 74 | """ Ordered list of words according to their IDs.""" 75 | return list(self) 76 | 77 | def __unicode__(self): 78 | return u"\n".join(self.words) 79 | 80 | def __str__(self): 81 | if six.PY3: 82 | return self.__unicode__() 83 | return self.__unicode__().encode("utf-8") 84 | 85 | def __getitem__(self, key): 86 | if isinstance(key, string_types) and not isinstance(key, unicode): 87 | key = unicode(key, encoding="utf-8") 88 | return self.word_id[key] 89 | 90 | def __contains__(self, key): 91 | return key in self.word_id 92 | 93 | def __delitem__(self, key): 94 | """Delete a word from vocabulary. 95 | 96 | Note: 97 | To maintain consecutive IDs, this operation implemented 98 | with a complexity of \\theta(n). 99 | """ 100 | del self.word_id[key] 101 | self.id_word = dict(enumerate(self.words)) 102 | self.word_id = {w:i for i,w in iteritems(self.id_word)} 103 | 104 | def __len__(self): 105 | return len(self.word_id) 106 | 107 | def get(self, k, default=None): 108 | try: 109 | return self[k] 110 | except KeyError as e: 111 | return default 112 | 113 | def getstate(self): 114 | return list(self.words) 115 | 116 | @classmethod 117 | def from_vocabfile(cls, filename): 118 | """ Construct a CountedVocabulary out of a vocabulary file. 
119 | 120 | Note: 121 | File has the following format word1 122 | word2 123 | """ 124 | words = [x.strip() for x in _open(filename, 'r').read().splitlines()] 125 | return cls(words=words) 126 | 127 | 128 | class OrderedVocabulary(VocabularyBase): 129 | """ An ordered list of words/tokens according to their frequency. 130 | 131 | Note: 132 | The words order is assumed to be sorted according to the word frequency. 133 | Most frequent words appear first in the list. 134 | 135 | Attributes: 136 | word_id (dictionary): Mapping from words to IDs. 137 | id_word (dictionary): A reverse map of `word_id`. 138 | """ 139 | 140 | def __init__(self, words=None): 141 | """ Build attributes word_id and id_word from input. 142 | 143 | Args: 144 | words (list): list of sorted words according to frequency. 145 | """ 146 | 147 | words = self.sanitize_words(words) 148 | self.word_id = {w:i for i, w in enumerate(words)} 149 | self.id_word = {i:w for w,i in iteritems(self.word_id)} 150 | 151 | 152 | def most_frequent(self, k): 153 | """ Returns a vocabulary with the most frequent `k` words. 154 | 155 | Args: 156 | k (integer): specifies the top k most frequent words to be returned. 157 | """ 158 | return OrderedVocabulary(words=self.words[:k]) 159 | 160 | 161 | class CountedVocabulary(OrderedVocabulary): 162 | """ List of words and counts sorted according to word count. 163 | """ 164 | 165 | def __init__(self, word_count=None): 166 | """ Build attributes word_id and id_word from input. 167 | 168 | Args: 169 | word_count (dictionary): A dictionary of the type word:count or 170 | list of tuples of the type (word, count). 171 | """ 172 | 173 | if isinstance(word_count, dict): 174 | word_count = iteritems(word_count) 175 | sorted_counts = list(sorted(word_count, key=lambda wc: wc[1], reverse=True)) 176 | words = [w for w,c in sorted_counts] 177 | super(CountedVocabulary, self).__init__(words=words) 178 | self.word_count = dict(sorted_counts) 179 | 180 | @staticmethod 181 | def from_textfiles(files, workers=1, job_size=1000): 182 | c = Counter() 183 | if workers == 1: 184 | for lines in files.iter_chunks(job_size): 185 | c.update(count(lines)) 186 | else: 187 | with ProcessPoolExecutor(max_workers=workers) as executor: 188 | for counter_ in executor.map(CountedVocabulary.from_textfile, files.names): 189 | c.update(Counter(counter_.word_count)) 190 | return CountedVocabulary(word_count=c) 191 | 192 | @classmethod 193 | def from_textfile(cls, textfile, workers=1, job_size=1000): 194 | """ Count the set of words appeared in a text file. 195 | 196 | Args: 197 | textfile (string): The name of the text file or `TextFile` object. 198 | min_count (integer): Minimum number of times a word/token appeared in the document 199 | to be considered part of the vocabulary. 200 | workers (integer): Number of parallel workers to read the file simulatenously. 201 | job_size (integer): Size of the batch send to each worker. 202 | most_frequent (integer): if no min_count is specified, consider the most frequent k words for the vocabulary. 203 | 204 | Returns: 205 | A vocabulary of the most frequent words appeared in the document. 206 | """ 207 | 208 | c = Counter() 209 | if isinstance(textfile, string_types): 210 | textfile = TextFile(textfile) 211 | for result in textfile.apply(count, workers, job_size): 212 | c.update(result) 213 | return CountedVocabulary(word_count=c) 214 | 215 | def most_frequent(self, k): 216 | """ Returns a vocabulary with the most frequent `k` words. 
217 | 218 | Args: 219 | k (integer): specifies the top k most frequent words to be returned. 220 | """ 221 | word_count = {w:self.word_count[w] for w in self.words[:k]} 222 | return CountedVocabulary(word_count=word_count) 223 | 224 | def min_count(self, n=1): 225 | """ Returns a vocabulary after eliminating the words that appear < `n`. 226 | 227 | Args: 228 | n (integer): specifies the minimum word frequency allowed. 229 | """ 230 | word_count = {w:c for w,c in iteritems(self.word_count) if c >= n} 231 | return CountedVocabulary(word_count=word_count) 232 | 233 | def __unicode__(self): 234 | return u"\n".join([u"{}\t{}".format(w,self.word_count[w]) for w in self.words]) 235 | 236 | def __delitem__(self, key): 237 | super(CountedVocabulary, self).__delitem__(key) 238 | self.word_count = {w:self.word_count[w] for w in self} 239 | 240 | def getstate(self): 241 | words = list(self.words) 242 | counts = [self.word_count[w] for w in words] 243 | return (words, counts) 244 | 245 | @staticmethod 246 | def from_vocabfile(filename): 247 | """ Construct a CountedVocabulary out of a vocabulary file. 248 | 249 | Note: 250 | File has the following format word1 count1 251 | word2 count2 252 | """ 253 | word_count = [x.strip().split() for x in _open(filename, 'r').read().splitlines()] 254 | word_count = {w:int(c) for w,c in word_count} 255 | return CountedVocabulary(word_count=word_count) 256 | -------------------------------------------------------------------------------- /notebooks/Transliteration.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Transliteration" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "Transliteration is the conversion of a text from one script to another.\n", 15 | "For instance, a Latin transliteration of the Greek phrase \"Ελληνική Δημοκρατία\", usually translated as 'Hellenic Republic', is \"Ellēnikḗ Dēmokratía\"." 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": 1, 21 | "metadata": { 22 | "collapsed": false 23 | }, 24 | "outputs": [], 25 | "source": [ 26 | "from polyglot.transliteration import Transliterator" 27 | ] 28 | }, 29 | { 30 | "cell_type": "markdown", 31 | "metadata": {}, 32 | "source": [ 33 | "## Languages Coverage" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": 2, 39 | "metadata": { 40 | "collapsed": false 41 | }, 42 | "outputs": [ 43 | { 44 | "name": "stdout", 45 | "output_type": "stream", 46 | "text": [ 47 | " 1. Haitian; Haitian Creole 2. Tamil 3. Vietnamese \n", 48 | " 4. Telugu 5. Croatian 6. Hungarian \n", 49 | " 7. Thai 8. Kannada 9. Tagalog \n", 50 | " 10. Armenian 11. Hebrew (modern) 12. Turkish \n", 51 | " 13. Portuguese 14. Belarusian 15. Norwegian Nynorsk \n", 52 | " 16. Norwegian 17. Dutch 18. Japanese \n", 53 | " 19. Albanian 20. Bulgarian 21. Serbian \n", 54 | " 22. Swahili 23. Swedish 24. French \n", 55 | " 25. Latin 26. Czech 27. Yiddish \n", 56 | " 28. Hindi 29. Danish 30. Finnish \n", 57 | " 31. German 32. Bosnian-Croatian-Serbian 33. Slovak \n", 58 | " 34. Persian 35. Lithuanian 36. Slovene \n", 59 | " 37. Latvian 38. Bosnian 39. Gujarati \n", 60 | " 40. Italian 41. Icelandic 42. Spanish; Castilian \n", 61 | " 43. Ukrainian 44. Georgian 45. Urdu \n", 62 | " 46. Indonesian 47. Marathi (Marāṭhī) 48. Korean \n", 63 | " 49. Galician 50. Khmer 51. Catalan; Valencian \n", 64 | " 52. Romanian, Moldavian, ... 53. Basque 54. 
Macedonian \n", 65 | " 55. Russian 56. Azerbaijani 57. Chinese \n", 66 | " 58. Estonian 59. Welsh 60. Arabic \n", 67 | " 61. Bengali 62. Amharic 63. Irish \n", 68 | " 64. Malay 65. Afrikaans 66. Polish \n", 69 | " 67. Greek, Modern 68. Esperanto 69. Maltese \n", 70 | "\n" 71 | ] 72 | } 73 | ], 74 | "source": [ 75 | "from polyglot.downloader import downloader\n", 76 | "print(downloader.supported_languages_table(\"transliteration2\"))" 77 | ] 78 | }, 79 | { 80 | "cell_type": "markdown", 81 | "metadata": {}, 82 | "source": [ 83 | "#### Downloading Necessary Models" 84 | ] 85 | }, 86 | { 87 | "cell_type": "code", 88 | "execution_count": 3, 89 | "metadata": { 90 | "collapsed": false 91 | }, 92 | "outputs": [ 93 | { 94 | "name": "stdout", 95 | "output_type": "stream", 96 | "text": [ 97 | "[polyglot_data] Downloading package embeddings2.en to\n", 98 | "[polyglot_data] /home/rmyeid/polyglot_data...\n", 99 | "[polyglot_data] Package embeddings2.en is already up-to-date!\n", 100 | "[polyglot_data] Downloading package pos2.en to\n", 101 | "[polyglot_data] /home/rmyeid/polyglot_data...\n", 102 | "[polyglot_data] Package pos2.en is already up-to-date!\n" 103 | ] 104 | } 105 | ], 106 | "source": [ 107 | "%%bash\n", 108 | "polyglot download embeddings2.en pos2.en" 109 | ] 110 | }, 111 | { 112 | "cell_type": "markdown", 113 | "metadata": {}, 114 | "source": [ 115 | "## Example" 116 | ] 117 | }, 118 | { 119 | "cell_type": "markdown", 120 | "metadata": {}, 121 | "source": [ 122 | "We tag each word in the text with one part of speech." 123 | ] 124 | }, 125 | { 126 | "cell_type": "code", 127 | "execution_count": 7, 128 | "metadata": { 129 | "collapsed": false 130 | }, 131 | "outputs": [], 132 | "source": [ 133 | "from polyglot.text import Text" 134 | ] 135 | }, 136 | { 137 | "cell_type": "code", 138 | "execution_count": 8, 139 | "metadata": { 140 | "collapsed": true 141 | }, 142 | "outputs": [], 143 | "source": [ 144 | "blob = \"\"\"We will meet at eight o'clock on Thursday morning.\"\"\"\n", 145 | "text = Text(blob)" 146 | ] 147 | }, 148 | { 149 | "cell_type": "markdown", 150 | "metadata": {}, 151 | "source": [ 152 | "We can query all the tagged words" 153 | ] 154 | }, 155 | { 156 | "cell_type": "code", 157 | "execution_count": 9, 158 | "metadata": { 159 | "collapsed": false 160 | }, 161 | "outputs": [ 162 | { 163 | "name": "stdout", 164 | "output_type": "stream", 165 | "text": [ 166 | "وي\n", 167 | "ويل\n", 168 | "ميت\n", 169 | "ات\n", 170 | "ييايت\n", 171 | "أوكلوك\n", 172 | "ون\n", 173 | "ثورسداي\n", 174 | "مورنينغ\n", 175 | "\n" 176 | ] 177 | } 178 | ], 179 | "source": [ 180 | "for x in text.transliterate(\"ar\"):\n", 181 | " print(x)" 182 | ] 183 | }, 184 | { 185 | "cell_type": "markdown", 186 | "metadata": {}, 187 | "source": [ 188 | "### Command Line Interface" 189 | ] 190 | }, 191 | { 192 | "cell_type": "code", 193 | "execution_count": 20, 194 | "metadata": { 195 | "collapsed": false 196 | }, 197 | "outputs": [ 198 | { 199 | "name": "stdout", 200 | "output_type": "stream", 201 | "text": [ 202 | "which ويكه \r\n", 203 | "India ينديا \r\n", 204 | "beat بيت \r\n", 205 | "Bermuda بيرمودا \r\n", 206 | "in ين \r\n", 207 | "Port بورت \r\n", 208 | "of وف \r\n", 209 | "Spain سباين \r\n", 210 | "in ين \r\n", 211 | "2007 \r\n", 212 | ", \r\n", 213 | "which ويكه \r\n", 214 | "was واس \r\n", 215 | "equalled يكالليد \r\n", 216 | "five فيفي \r\n", 217 | "days دايس \r\n", 218 | "ago اغو \r\n", 219 | "by بي \r\n", 220 | "South سووث \r\n", 221 | "Africa افريكا \r\n", 222 | "in ين \r\n", 223 | "their ثير \r\n", 224 | 
"victory فيكتوري \r\n", 225 | "over وفير \r\n", 226 | "West ويست \r\n", 227 | "Indies يندييس \r\n", 228 | "in ين \r\n", 229 | "Sydney سيدني \r\n", 230 | ". \r\n", 231 | "\r\n" 232 | ] 233 | } 234 | ], 235 | "source": [ 236 | "!polyglot --lang en tokenize --input testdata/cricket.txt | polyglot --lang en transliteration --target ar | tail -n 30" 237 | ] 238 | } 239 | ], 240 | "metadata": { 241 | "kernelspec": { 242 | "display_name": "Python 2", 243 | "language": "python", 244 | "name": "python2" 245 | }, 246 | "language_info": { 247 | "codemirror_mode": { 248 | "name": "ipython", 249 | "version": 2 250 | }, 251 | "file_extension": ".py", 252 | "mimetype": "text/x-python", 253 | "name": "python", 254 | "nbconvert_exporter": "python", 255 | "pygments_lexer": "ipython2", 256 | "version": "2.7.6" 257 | } 258 | }, 259 | "nbformat": 4, 260 | "nbformat_minor": 0 261 | } 262 | -------------------------------------------------------------------------------- /notebooks/Tokenization.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Tokenization" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "Toeknization is the process that identifies the text boundaries of words and sentences.\n", 15 | "We can identify the boundaries of sentences first then tokenize each sentence to identify the words that compose the sentence.\n", 16 | "Of course, we can do word tokenization first and then segment the token sequence into sentneces.\n", 17 | "Tokenization in polyglot relies on the [Unicode Text Segmentation](http://www.unicode.org/reports/tr29/) algorithm as implemented by the [ICU Project](http://site.icu-project.org/).\n", 18 | "\n", 19 | "You can use C/C++ ICU library by installing the required package `libicu-dev`. For example, on ubuntu/debian systems you should use `apt-get` utility as the following:" 20 | ] 21 | }, 22 | { 23 | "cell_type": "code", 24 | "execution_count": null, 25 | "metadata": { 26 | "collapsed": true 27 | }, 28 | "outputs": [], 29 | "source": [ 30 | "sudo apt-get install libicu-dev" 31 | ] 32 | }, 33 | { 34 | "cell_type": "code", 35 | "execution_count": 4, 36 | "metadata": { 37 | "collapsed": false 38 | }, 39 | "outputs": [], 40 | "source": [ 41 | "from polyglot.text import Text" 42 | ] 43 | }, 44 | { 45 | "cell_type": "markdown", 46 | "metadata": {}, 47 | "source": [ 48 | "## Word Tokenization" 49 | ] 50 | }, 51 | { 52 | "cell_type": "markdown", 53 | "metadata": {}, 54 | "source": [ 55 | "To call our word tokenizer, first we need to construct a Text object." 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": 9, 61 | "metadata": { 62 | "collapsed": true 63 | }, 64 | "outputs": [], 65 | "source": [ 66 | "blob = u\"\"\"\n", 67 | "两个月前遭受恐怖袭击的法国巴黎的犹太超市在装修之后周日重新开放,法国内政部长以及超市的管理者都表示,这显示了生命力要比野蛮行为更强大。\n", 68 | "该超市1月9日遭受枪手袭击,导致4人死亡,据悉这起事件与法国《查理周刊》杂志社恐怖袭击案有关。\n", 69 | "\"\"\"\n", 70 | "text = Text(blob)" 71 | ] 72 | }, 73 | { 74 | "cell_type": "markdown", 75 | "metadata": {}, 76 | "source": [ 77 | "The property words will call the word tokenizer." 
78 | ] 79 | }, 80 | { 81 | "cell_type": "code", 82 | "execution_count": 10, 83 | "metadata": { 84 | "collapsed": false 85 | }, 86 | "outputs": [ 87 | { 88 | "data": { 89 | "text/plain": [ 90 | "WordList(['两', '个', '月', '前', '遭受', '恐怖', '袭击', '的', '法国', '巴黎', '的', '犹太', '超市', '在', '装修', '之后', '周日', '重新', '开放', ',', '法国', '内政', '部长', '以及', '超市', '的', '管理者', '都', '表示', ',', '这', '显示', '了', '生命力', '要', '比', '野蛮', '行为', '更', '强大', '。', '该', '超市', '1', '月', '9', '日', '遭受', '枪手', '袭击', ',', '导致', '4', '人', '死亡', ',', '据悉', '这', '起', '事件', '与', '法国', '《', '查理', '周刊', '》', '杂志', '社', '恐怖', '袭击', '案', '有关', '。'])" 91 | ] 92 | }, 93 | "execution_count": 10, 94 | "metadata": {}, 95 | "output_type": "execute_result" 96 | } 97 | ], 98 | "source": [ 99 | "text.words" 100 | ] 101 | }, 102 | { 103 | "cell_type": "markdown", 104 | "metadata": {}, 105 | "source": [ 106 | "Since ICU boundary break algorithms are language aware, polyglot will detect the language used first before calling the tokenizer" 107 | ] 108 | }, 109 | { 110 | "cell_type": "code", 111 | "execution_count": 26, 112 | "metadata": { 113 | "collapsed": false 114 | }, 115 | "outputs": [ 116 | { 117 | "name": "stdout", 118 | "output_type": "stream", 119 | "text": [ 120 | "name: code: zh confidence: 99.0 read bytes: 1920\n" 121 | ] 122 | } 123 | ], 124 | "source": [ 125 | "print(text.language)" 126 | ] 127 | }, 128 | { 129 | "cell_type": "markdown", 130 | "metadata": { 131 | "collapsed": true 132 | }, 133 | "source": [ 134 | "## Sentence Segementation" 135 | ] 136 | }, 137 | { 138 | "cell_type": "markdown", 139 | "metadata": {}, 140 | "source": [ 141 | "If we are interested in segmenting the text first into sentences, we can query the `sentences` property" 142 | ] 143 | }, 144 | { 145 | "cell_type": "code", 146 | "execution_count": 20, 147 | "metadata": { 148 | "collapsed": false 149 | }, 150 | "outputs": [ 151 | { 152 | "data": { 153 | "text/plain": [ 154 | "[Sentence(\"两个月前遭受恐怖袭击的法国巴黎的犹太超市在装修之后周日重新开放,法国内政部长以及超市的管理者都表示,这显示了生命力要比野蛮行为更强大。\"),\n", 155 | " Sentence(\"该超市1月9日遭受枪手袭击,导致4人死亡,据悉这起事件与法国《查理周刊》杂志社恐怖袭击案有关。\")]" 156 | ] 157 | }, 158 | "execution_count": 20, 159 | "metadata": {}, 160 | "output_type": "execute_result" 161 | } 162 | ], 163 | "source": [ 164 | "text.sentences" 165 | ] 166 | }, 167 | { 168 | "cell_type": "markdown", 169 | "metadata": {}, 170 | "source": [ 171 | "`Sentence` class inherits `Text`, therefore, we can tokenize each sentence into words using the same property `words`" 172 | ] 173 | }, 174 | { 175 | "cell_type": "code", 176 | "execution_count": 21, 177 | "metadata": { 178 | "collapsed": false 179 | }, 180 | "outputs": [ 181 | { 182 | "data": { 183 | "text/plain": [ 184 | "WordList(['两', '个', '月', '前', '遭受', '恐怖', '袭击', '的', '法国', '巴黎', '的', '犹太', '超市', '在', '装修', '之后', '周日', '重新', '开放', ',', '法国', '内政', '部长', '以及', '超市', '的', '管理者', '都', '表示', ',', '这', '显示', '了', '生命力', '要', '比', '野蛮', '行为', '更', '强大', '。'])" 185 | ] 186 | }, 187 | "execution_count": 21, 188 | "metadata": {}, 189 | "output_type": "execute_result" 190 | } 191 | ], 192 | "source": [ 193 | "first_sentence = text.sentences[0]\n", 194 | "first_sentence.words" 195 | ] 196 | }, 197 | { 198 | "cell_type": "markdown", 199 | "metadata": {}, 200 | "source": [ 201 | "## Command Line" 202 | ] 203 | }, 204 | { 205 | "cell_type": "markdown", 206 | "metadata": {}, 207 | "source": [ 208 | "The subcommand tokenize does by default sentence segmentation and word tokenization." 
209 | ] 210 | }, 211 | { 212 | "cell_type": "code", 213 | "execution_count": 4, 214 | "metadata": { 215 | "collapsed": false 216 | }, 217 | "outputs": [ 218 | { 219 | "name": "stdout", 220 | "output_type": "stream", 221 | "text": [ 222 | "usage: polyglot tokenize [-h] [--only-sent | --only-word] [--input [INPUT [INPUT ...]]]\r\n", 223 | "\r\n", 224 | "optional arguments:\r\n", 225 | " -h, --help show this help message and exit\r\n", 226 | " --only-sent Segment sentences without word tokenization\r\n", 227 | " --only-word Tokenize words without sentence segmentation\r\n", 228 | " --input [INPUT [INPUT ...]]\r\n" 229 | ] 230 | } 231 | ], 232 | "source": [ 233 | "! polyglot tokenize --help" 234 | ] 235 | }, 236 | { 237 | "cell_type": "markdown", 238 | "metadata": {}, 239 | "source": [ 240 | "Each line represents a sentence where the words are split by spaces." 241 | ] 242 | }, 243 | { 244 | "cell_type": "code", 245 | "execution_count": 25, 246 | "metadata": { 247 | "collapsed": false 248 | }, 249 | "outputs": [ 250 | { 251 | "name": "stdout", 252 | "output_type": "stream", 253 | "text": [ 254 | "Australia posted a World Cup record total of 417 - 6 as they beat Afghanistan by 275 runs .\r\n", 255 | "David Warner hit 178 off 133 balls , Steve Smith scored 95 while Glenn Maxwell struck 88 in 39 deliveries in the Pool A encounter in Perth .\r\n", 256 | "Afghanistan were then dismissed for 142 , with Mitchell Johnson and Mitchell Starc taking six wickets between them .\r\n", 257 | "Australia's score surpassed the 413 - 5 India made against Bermuda in 2007 .\r\n", 258 | "It continues the pattern of bat dominating ball in this tournament as the third 400 plus score achieved in the pool stages , following South Africa's 408 - 5 and 411 - 4 against West Indies and Ireland respectively .\r\n", 259 | "The winning margin beats the 257 - run amount by which India beat Bermuda in Port of Spain in 2007 , which was equalled five days ago by South Africa in their victory over West Indies in Sydney .\r\n" 260 | ] 261 | } 262 | ], 263 | "source": [ 264 | "!polyglot --lang en tokenize --input testdata/cricket.txt" 265 | ] 266 | }, 267 | { 268 | "cell_type": "markdown", 269 | "metadata": {}, 270 | "source": [ 271 | "### References\n", 272 | "\n", 273 | "- [Unicode Text Segmentation Algorithm](http://www.unicode.org/reports/tr29/)\n", 274 | "- [Unicode Line Breaking Algorithm](http://www.unicode.org/reports/tr14/)\n", 275 | "- [Boundary Analysis](http://userguide.icu-project.org/boundaryanalysis)\n", 276 | "- [ICU Homepage](http://site.icu-project.org/)\n", 277 | "- [Python Wrapper for libicu](https://pypi.python.org/pypi/PyICU)" 278 | ] 279 | } 280 | ], 281 | "metadata": { 282 | "kernelspec": { 283 | "display_name": "Python 3", 284 | "language": "python", 285 | "name": "python3" 286 | }, 287 | "language_info": { 288 | "codemirror_mode": { 289 | "name": "ipython", 290 | "version": 3 291 | }, 292 | "file_extension": ".py", 293 | "mimetype": "text/x-python", 294 | "name": "python", 295 | "nbconvert_exporter": "python", 296 | "pygments_lexer": "ipython3", 297 | "version": "3.4.0" 298 | } 299 | }, 300 | "nbformat": 4, 301 | "nbformat_minor": 0 302 | } 303 | -------------------------------------------------------------------------------- /docs/MorphologicalAnalysis.rst: -------------------------------------------------------------------------------- 1 | 2 | Morphological Analysis 3 | ====================== 4 | 5 | Polyglot offers trained `morfessor 6 | models `__ to generate 7 | morphemes from words. 
The goal of the Morpho project is to develop 8 | unsupervised data-driven methods that discover the regularities behind 9 | word forming in natural languages. In particular, Morpho project is 10 | focussing on the discovery of morphemes, which are the primitive units 11 | of syntax, the smallest individually meaningful elements in the 12 | utterances of a language. Morphemes are important in automatic 13 | generation and recognition of a language, especially in languages in 14 | which words may have many different inflected forms. 15 | 16 | Languages Coverage 17 | ------------------ 18 | 19 | Using polyglot vocabulary dictionaries, we trained morfessor models on 20 | the most frequent words 50,000 words of each language. 21 | 22 | .. code:: python 23 | 24 | from polyglot.downloader import downloader 25 | print(downloader.supported_languages_table("morph2")) 26 | 27 | 28 | .. parsed-literal:: 29 | 30 | 1. Piedmontese language 2. Lombard language 3. Gan Chinese 31 | 4. Sicilian 5. Scots 6. Kirghiz, Kyrgyz 32 | 7. Pashto, Pushto 8. Kurdish 9. Portuguese 33 | 10. Kannada 11. Korean 12. Khmer 34 | 13. Kazakh 14. Ilokano 15. Polish 35 | 16. Panjabi, Punjabi 17. Georgian 18. Chuvash 36 | 19. Alemannic 20. Czech 21. Welsh 37 | 22. Chechen 23. Catalan; Valencian 24. Northern Sami 38 | 25. Sanskrit (Saṁskṛta) 26. Slovene 27. Javanese 39 | 28. Slovak 29. Bosnian-Croatian-Serbian 30. Bavarian 40 | 31. Swedish 32. Swahili 33. Sundanese 41 | 34. Serbian 35. Albanian 36. Japanese 42 | 37. Western Frisian 38. French 39. Finnish 43 | 40. Upper Sorbian 41. Faroese 42. Persian 44 | 43. Sinhala, Sinhalese 44. Italian 45. Amharic 45 | 46. Aragonese 47. Volapük 48. Icelandic 46 | 49. Sakha 50. Afrikaans 51. Indonesian 47 | 52. Interlingua 53. Azerbaijani 54. Ido 48 | 55. Arabic 56. Assamese 57. Yoruba 49 | 58. Yiddish 59. Waray-Waray 60. Croatian 50 | 61. Hungarian 62. Haitian; Haitian Creole 63. Quechua 51 | 64. Armenian 65. Hebrew (modern) 66. Silesian 52 | 67. Hindi 68. Divehi; Dhivehi; Mald... 69. German 53 | 70. Danish 71. Occitan 72. Tagalog 54 | 73. Turkmen 74. Thai 75. Tajik 55 | 76. Greek, Modern 77. Telugu 78. Tamil 56 | 79. Oriya 80. Ossetian, Ossetic 81. Tatar 57 | 82. Turkish 83. Kapampangan 84. Venetian 58 | 85. Manx 86. Gujarati 87. Galician 59 | 88. Irish 89. Scottish Gaelic; Gaelic 90. Nepali 60 | 91. Cebuano 92. Zazaki 93. Walloon 61 | 94. Dutch 95. Norwegian 96. Norwegian Nynorsk 62 | 97. West Flemish 98. Chinese 99. Bosnian 63 | 100. Breton 101. Belarusian 102. Bulgarian 64 | 103. Bashkir 104. Egyptian Arabic 105. Tibetan Standard, Tib... 65 | 106. Bengali 107. Burmese 108. Romansh 66 | 109. Marathi (Marāṭhī) 110. Malay 111. Maltese 67 | 112. Russian 113. Macedonian 114. Malayalam 68 | 115. Mongolian 116. Malagasy 117. Vietnamese 69 | 118. Spanish; Castilian 119. Estonian 120. Basque 70 | 121. Bishnupriya Manipuri 122. Asturian 123. English 71 | 124. Esperanto 125. Luxembourgish, Letzeb... 126. Latin 72 | 127. Uighur, Uyghur 128. Ukrainian 129. Limburgish, Limburgan... 73 | 130. Latvian 131. Urdu 132. Lithuanian 74 | 133. Fiji Hindi 134. Uzbek 135. Romanian, Moldavian, ... 75 | 76 | 77 | 78 | Download Necessary Models 79 | ^^^^^^^^^^^^^^^^^^^^^^^^^ 80 | 81 | .. code:: python 82 | 83 | %%bash 84 | polyglot download morph2.en morph2.ar 85 | 86 | 87 | .. parsed-literal:: 88 | 89 | [polyglot_data] Downloading package morph2.en to 90 | [polyglot_data] /home/rmyeid/polyglot_data... 91 | [polyglot_data] Package morph2.en is already up-to-date! 
92 | [polyglot_data] Downloading package morph2.ar to 93 | [polyglot_data] /home/rmyeid/polyglot_data... 94 | [polyglot_data] Package morph2.ar is already up-to-date! 95 | 96 | 97 | Example 98 | ------- 99 | 100 | Word Segmentation 101 | ~~~~~~~~~~~~~~~~~ 102 | 103 | .. code:: python 104 | 105 | from polyglot.text import Text, Word 106 | 107 | .. code:: python 108 | 109 | words = ["preprocessing", "processor", "invaluable", "thankful", "crossed"] 110 | for w in words: 111 | w = Word(w, language="en") 112 | print("{:<20}{}".format(w, w.morphemes)) 113 | 114 | 115 | .. parsed-literal:: 116 | 117 | preprocessing ['pre', 'process', 'ing'] 118 | processor ['process', 'or'] 119 | invaluable ['in', 'valuable'] 120 | thankful ['thank', 'ful'] 121 | crossed ['cross', 'ed'] 122 | 123 | 124 | Sentence Segmentation 125 | ~~~~~~~~~~~~~~~~~~~~~ 126 | 127 | If the text is not tokenized properly, morphological analysis could 128 | offer a smart of way of splitting the text into its original units. 129 | Here, is an example: 130 | 131 | .. code:: python 132 | 133 | blob = "Wewillmeettoday." 134 | text = Text(blob) 135 | text.language = "en" 136 | 137 | .. code:: python 138 | 139 | text.morphemes 140 | 141 | 142 | 143 | 144 | .. parsed-literal:: 145 | 146 | WordList([u'We', u'will', u'meet', u'to', u'day', u'.']) 147 | 148 | 149 | 150 | Command Line Interface 151 | ~~~~~~~~~~~~~~~~~~~~~~ 152 | 153 | .. code:: python 154 | 155 | !polyglot --lang en tokenize --input testdata/cricket.txt | polyglot --lang en morph | tail -n 30 156 | 157 | 158 | .. parsed-literal:: 159 | 160 | which which 161 | India In_dia 162 | beat beat 163 | Bermuda Ber_mud_a 164 | in in 165 | Port Port 166 | of of 167 | Spain Spa_in 168 | in in 169 | 2007 2007 170 | , , 171 | which which 172 | was wa_s 173 | equalled equal_led 174 | five five 175 | days day_s 176 | ago ago 177 | by by 178 | South South 179 | Africa Africa 180 | in in 181 | their t_heir 182 | victory victor_y 183 | over over 184 | West West 185 | Indies In_dies 186 | in in 187 | Sydney Syd_ney 188 | . . 189 | 190 | 191 | 192 | Demo 193 | ---- 194 | 195 | This demo does not reflect the models supplied by polyglot, however, we 196 | think it is indicative of what you should expect from morfessor 197 | 198 | `Demo `__ 199 | 200 | Citation 201 | ~~~~~~~~ 202 | 203 | This is an interface to the implementation being described in the 204 | `Morfessor2.0: Python Implementation and Extensions for Morfessor 205 | Baseline `__ 206 | technical report. 207 | 208 | :: 209 | 210 | @InProceedings{morfessor2, 211 | title:{Morfessor 2.0: Python Implementation and Extensions for Morfessor Baseline}, 212 | author: {Virpioja, Sami ; Smit, Peter ; Grönroos, Stig-Arne ; Kurimo, Mikko}, 213 | year: {2013}, 214 | publisher: {Department of Signal Processing and Acoustics, Aalto University}, 215 | booktitle:{Aalto University publication series} 216 | } 217 | 218 | References 219 | ---------- 220 | 221 | - `Morpho project `__ 222 | - `Background information on morpheme 223 | discovery `__. 
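For completeness, here is a rough pure-Python equivalent of the
command-line pipeline shown earlier. It is an illustrative sketch, not
the CLI's actual implementation; it assumes ``testdata/cricket.txt`` is
present and that the English ``morph2`` model has already been
downloaded, and it only uses the ``Text`` and ``morphemes`` APIs
demonstrated above.

.. code:: python

    from polyglot.text import Text

    # Roughly mirrors:
    #   polyglot --lang en tokenize --input testdata/cricket.txt | polyglot --lang en morph
    with open("testdata/cricket.txt") as f:
        blob = f.read()

    text = Text(blob)
    text.language = "en"

    for word in text.words:
        # Join each word's morphemes with "_", the same layout the CLI prints.
        print("{:<20}{}".format(word, "_".join(word.morphemes)))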
224 | -------------------------------------------------------------------------------- /polyglot/mapping/embeddings.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """Defines classes related to mapping vocabulary to n-dimensional points.""" 5 | 6 | from io import open 7 | import logging 8 | from os import path 9 | import tarfile 10 | 11 | import numpy as np 12 | from numpy import float32 13 | 14 | from six import PY2 15 | from six import text_type as unicode 16 | from six import iteritems 17 | from six.moves import map 18 | from six import string_types 19 | from six.moves import cPickle as pickle 20 | 21 | from .base import CountedVocabulary, OrderedVocabulary 22 | from ..utils import _open 23 | 24 | 25 | logger = logging.getLogger(__name__) 26 | 27 | 28 | class Embedding(object): 29 | """ Mapping a vocabulary to a d-dimensional points.""" 30 | 31 | def __init__(self, vocabulary, vectors): 32 | self.vocabulary = vocabulary 33 | self.vectors = np.asarray(vectors) 34 | 35 | if len(self.vocabulary) != self.vectors.shape[0]: 36 | raise ValueError("Vocabulary has {} items but we have {} " 37 | "vectors".format(len(vocabulary), self.vectors.shape[0])) 38 | 39 | def __getitem__(self, k): 40 | return self.vectors[self.vocabulary[k]] 41 | 42 | def __contains__(self, k): 43 | return k in self.vocabulary 44 | 45 | def __delitem__(self, k): 46 | """Remove the word and its vector from the embedding. 47 | 48 | Note: 49 | This operation costs \\theta(n). Be careful putting it in a loop. 50 | """ 51 | index = self.vocabulary[k] 52 | del self.vocabulary[k] 53 | self.vectors = np.delete(self.vectors, index, 0) 54 | 55 | def __len__(self): 56 | return len(self.vocabulary) 57 | 58 | def __iter__(self): 59 | for w in self.vocabulary: 60 | yield w, self[w] 61 | 62 | @property 63 | def words(self): 64 | return self.vocabulary.words 65 | 66 | @property 67 | def shape(self): 68 | return self.vectors.shape 69 | 70 | def apply_expansion(self, expansion): 71 | """Apply a vocabulary expansion to the current emebddings.""" 72 | self.vocabulary = expansion(self.vocabulary) 73 | 74 | def get(self, k, default=None): 75 | try: 76 | return self[k] 77 | except KeyError as e: 78 | return default 79 | 80 | def most_frequent(self, k, inplace=False): 81 | """Only most frequent k words to be included in the embeddings.""" 82 | vocabulary = self.vocabulary.most_frequent(k) 83 | vectors = np.asarray([self[w] for w in vocabulary]) 84 | if inplace: 85 | self.vocabulary = vocabulary 86 | self.vectors = vectors 87 | return self 88 | return Embedding(vectors=vectors, vocabulary=vocabulary) 89 | 90 | def normalize_words(self, ord=2, inplace=False): 91 | """Normalize embeddings matrix row-wise. 92 | 93 | Args: 94 | ord: normalization order. Possible values {1, 2, 'inf', '-inf'} 95 | """ 96 | if ord == 2: 97 | ord = None # numpy uses this flag to indicate l2. 98 | vectors = self.vectors.T / np.linalg.norm(self.vectors, ord, axis=1) 99 | if inplace: 100 | self.vectors = vectors.T 101 | return self 102 | return Embedding(vectors=vectors.T, vocabulary=self.vocabulary) 103 | 104 | def nearest_neighbors(self, word, top_k=10): 105 | """Return the nearest k words to the given `word`. 106 | 107 | Args: 108 | word (string): single word. 109 | top_k (integer): decides how many neighbors to report. 110 | 111 | Returns: 112 | A list of words sorted by the distances. The closest is the first. 
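      The single closest entry (normally the query word itself, at zero
      distance) is dropped before the list is returned.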
113 | 114 | Note: 115 | L2 metric is used to calculate distances. 116 | """ 117 | #TODO(rmyeid): Use scikit ball tree, if scikit is available 118 | point = self[word] 119 | diff = self.vectors - point 120 | distances = np.linalg.norm(diff, axis=1) 121 | top_ids = distances.argsort()[1:top_k+1] 122 | return [self.vocabulary.id_word[i] for i in top_ids] 123 | 124 | def distances(self, word, words): 125 | """Calculate eucledean pairwise distances between `word` and `words`. 126 | 127 | Args: 128 | word (string): single word. 129 | words (list): list of strings. 130 | 131 | Returns: 132 | numpy array of the distances. 133 | 134 | Note: 135 | L2 metric is used to calculate distances. 136 | """ 137 | 138 | point = self[word] 139 | vectors = np.asarray([self[w] for w in words]) 140 | diff = vectors - point 141 | distances = np.linalg.norm(diff, axis=1) 142 | return distances 143 | 144 | @staticmethod 145 | def from_gensim(model): 146 | word_count = {} 147 | vectors = [] 148 | for word, vocab in sorted(iteritems(model.vocab), key=lambda item: -item[1].count): 149 | vectors.append(model.syn0[vocab.index]) 150 | word_count[word] = vocab.count 151 | vocab = CountedVocabulary(word_count=word_count) 152 | vectors = np.asarray(vectors) 153 | return Embedding(vocabulary=vocab, vectors=vectors) 154 | 155 | @staticmethod 156 | def from_word2vec_vocab(fvocab): 157 | counts = {} 158 | with _open(fvocab) as fin: 159 | for line in fin: 160 | word, count = unicode(line).strip().split() 161 | counts[word] = int(count) 162 | return CountedVocabulary(word_count=counts) 163 | 164 | @staticmethod 165 | def _from_word2vec_binary(fname): 166 | with _open(fname, 'rb') as fin: 167 | words = [] 168 | header = unicode(fin.readline()) 169 | vocab_size, layer1_size = list(map(int, header.split())) # throws for invalid file format 170 | vectors = np.zeros((vocab_size, layer1_size), dtype=float32) 171 | binary_len = np.dtype(float32).itemsize * layer1_size 172 | for line_no in xrange(vocab_size): 173 | # mixed text and binary: read text first, then binary 174 | word = [] 175 | while True: 176 | ch = fin.read(1) 177 | if ch == b' ': 178 | break 179 | if ch != b'\n': # ignore newlines in front of words (some binary files have newline, some don't) 180 | word.append(ch) 181 | word = b''.join(word) 182 | index = line_no 183 | words.append(word) 184 | vectors[index, :] = np.fromstring(fin.read(binary_len), dtype=float32) 185 | return words, vectors 186 | 187 | @staticmethod 188 | def _from_word2vec_text(fname): 189 | with _open(fname, 'rb') as fin: 190 | words = [] 191 | header = unicode(fin.readline()) 192 | vocab_size, layer1_size = list(map(int, header.split())) # throws for invalid file format 193 | vectors = [] 194 | for line_no, line in enumerate(fin): 195 | try: 196 | parts = unicode(line, encoding="utf-8").strip().split() 197 | except TypeError as e: 198 | parts = line.strip().split() 199 | except Exception as e: 200 | logger.warning("We ignored line number {} because of erros in parsing" 201 | "\n{}".format(line_no, e)) 202 | continue 203 | # We differ from Gensim implementation. 204 | # Our assumption that a difference of one happens because of having a 205 | # space in the word. 
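        # The branches below handle two layouts: `layer1_size + 1` columns means
        # the first token is the word and the rest are the vector weights;
        # `layer1_size + 2` columns means the word itself contained one space,
        # so the first two tokens are re-joined before reading the weights.
        # Any other column count is logged as unrecognized and the line is skipped.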
206 | if len(parts) == layer1_size + 1: 207 | word, weights = parts[0], list(map(float32, parts[1:])) 208 | elif len(parts) == layer1_size + 2: 209 | word, weights = parts[:2], list(map(float32, parts[2:])) 210 | word = u" ".join(word) 211 | else: 212 | logger.warning("We ignored line number {} because of unrecognized " 213 | "number of columns {}".format(line_no, parts[:-layer1_size])) 214 | continue 215 | index = line_no 216 | words.append(word) 217 | vectors.append(weights) 218 | vectors = np.asarray(vectors, dtype=np.float32) 219 | return words, vectors 220 | 221 | @staticmethod 222 | def from_word2vec(fname, fvocab=None, binary=False): 223 | """ 224 | Load the input-hidden weight matrix from the original C word2vec-tool format. 225 | 226 | Note that the information stored in the file is incomplete (the binary tree is missing), 227 | so while you can query for word similarity etc., you cannot continue training 228 | with a model loaded this way. 229 | 230 | `binary` is a boolean indicating whether the data is in binary word2vec format. 231 | Word counts are read from `fvocab` filename, if set (this is the file generated 232 | by `-save-vocab` flag of the original C tool). 233 | """ 234 | vocabulary = None 235 | if fvocab is not None: 236 | logger.info("loading word counts from %s" % (fvocab)) 237 | vocabulary = Embedding.from_word2vec_vocab(fvocab) 238 | 239 | logger.info("loading projection weights from %s" % (fname)) 240 | if binary: 241 | words, vectors = Embedding._from_word2vec_binary(fname) 242 | else: 243 | words, vectors = Embedding._from_word2vec_text(fname) 244 | 245 | if not vocabulary: 246 | vocabulary = OrderedVocabulary(words=words) 247 | 248 | return Embedding(vocabulary=vocabulary, vectors=vectors) 249 | 250 | @staticmethod 251 | def load(fname): 252 | """Load an embedding dump generated by `save`""" 253 | 254 | content = _open(fname).read() 255 | if PY2: 256 | state = pickle.loads(content) 257 | else: 258 | state = pickle.loads(content, encoding='latin1') 259 | voc, vec = state 260 | if len(voc) == 2: 261 | words, counts = voc 262 | word_count = dict(zip(words, counts)) 263 | vocab = CountedVocabulary(word_count=word_count) 264 | else: 265 | vocab = OrderedVocabulary(voc) 266 | return Embedding(vocabulary=vocab, vectors=vec) 267 | 268 | def save(self, fname): 269 | """Save a pickled version of the embedding into `fname`.""" 270 | 271 | vec = self.vectors 272 | voc = self.vocabulary.getstate() 273 | state = (voc, vec) 274 | with open(fname, 'wb') as f: 275 | pickle.dump(state, f, protocol=pickle.HIGHEST_PROTOCOL) 276 | -------------------------------------------------------------------------------- /notebooks/POS.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Part of Speech Tagging" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "Part of speech tagging task aims to assign every word/token in plain text a category that identifies the syntactic functionality of the word occurrence.\n", 15 | "\n", 16 | "Polyglot recognizes 17 parts of speech, this set is called the `universal part of speech tag set`:\n", 17 | "\n", 18 | "- **ADJ**: adjective\n", 19 | "- **ADP**: adposition\n", 20 | "- **ADV**: adverb\n", 21 | "- **AUX**: auxiliary verb\n", 22 | "- **CONJ**: coordinating conjunction\n", 23 | "- **DET**: determiner\n", 24 | "- **INTJ**: interjection\n", 25 | "- **NOUN**: noun\n", 26 | "- 
**NUM**: numeral\n", 27 | "- **PART**: particle\n", 28 | "- **PRON**: pronoun\n", 29 | "- **PROPN**: proper noun\n", 30 | "- **PUNCT**: punctuation\n", 31 | "- **SCONJ**: subordinating conjunction\n", 32 | "- **SYM**: symbol\n", 33 | "- **VERB**: verb\n", 34 | "- **X**: other" 35 | ] 36 | }, 37 | { 38 | "cell_type": "markdown", 39 | "metadata": {}, 40 | "source": [ 41 | "## Languages Coverage" 42 | ] 43 | }, 44 | { 45 | "cell_type": "markdown", 46 | "metadata": {}, 47 | "source": [ 48 | "The models were trained on a combination of:\n", 49 | "\n", 50 | "- Original CONLL datasets after the tags were converted using the [universal POS tables](http://universaldependencies.github.io/docs/tagset-conversion/index.html).\n", 51 | "\n", 52 | "- Universal Dependencies 1.0 corpora whenever they are available." 53 | ] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "execution_count": 1, 58 | "metadata": { 59 | "collapsed": false 60 | }, 61 | "outputs": [ 62 | { 63 | "name": "stdout", 64 | "output_type": "stream", 65 | "text": [ 66 | " 1. German 2. Italian 3. Danish \n", 67 | " 4. Czech 5. Slovene 6. French \n", 68 | " 7. English 8. Swedish 9. Bulgarian \n", 69 | " 10. Spanish; Castilian 11. Indonesian 12. Portuguese \n", 70 | " 13. Finnish 14. Irish 15. Hungarian \n", 71 | " 16. Dutch \n" 72 | ] 73 | } 74 | ], 75 | "source": [ 76 | "from polyglot.downloader import downloader\n", 77 | "print(downloader.supported_languages_table(\"pos2\"))" 78 | ] 79 | }, 80 | { 81 | "cell_type": "markdown", 82 | "metadata": {}, 83 | "source": [ 84 | "#### Download Necessary Models" 85 | ] 86 | }, 87 | { 88 | "cell_type": "code", 89 | "execution_count": 2, 90 | "metadata": { 91 | "collapsed": false 92 | }, 93 | "outputs": [ 94 | { 95 | "name": "stdout", 96 | "output_type": "stream", 97 | "text": [ 98 | "[polyglot_data] Downloading package embeddings2.en to\n", 99 | "[polyglot_data] /home/rmyeid/polyglot_data...\n", 100 | "[polyglot_data] Package embeddings2.en is already up-to-date!\n", 101 | "[polyglot_data] Downloading package pos2.en to\n", 102 | "[polyglot_data] /home/rmyeid/polyglot_data...\n", 103 | "[polyglot_data] Package pos2.en is already up-to-date!\n" 104 | ] 105 | } 106 | ], 107 | "source": [ 108 | "%%bash\n", 109 | "polyglot download embeddings2.en pos2.en" 110 | ] 111 | }, 112 | { 113 | "cell_type": "markdown", 114 | "metadata": {}, 115 | "source": [ 116 | "## Example" 117 | ] 118 | }, 119 | { 120 | "cell_type": "markdown", 121 | "metadata": {}, 122 | "source": [ 123 | "We tag each word in the text with one part of speech." 
124 | ] 125 | }, 126 | { 127 | "cell_type": "code", 128 | "execution_count": 3, 129 | "metadata": { 130 | "collapsed": false 131 | }, 132 | "outputs": [], 133 | "source": [ 134 | "from polyglot.text import Text" 135 | ] 136 | }, 137 | { 138 | "cell_type": "code", 139 | "execution_count": 4, 140 | "metadata": { 141 | "collapsed": true 142 | }, 143 | "outputs": [], 144 | "source": [ 145 | "blob = \"\"\"We will meet at eight o'clock on Thursday morning.\"\"\"\n", 146 | "text = Text(blob)" 147 | ] 148 | }, 149 | { 150 | "cell_type": "markdown", 151 | "metadata": {}, 152 | "source": [ 153 | "We can query all the tagged words" 154 | ] 155 | }, 156 | { 157 | "cell_type": "code", 158 | "execution_count": 5, 159 | "metadata": { 160 | "collapsed": false 161 | }, 162 | "outputs": [ 163 | { 164 | "data": { 165 | "text/plain": [ 166 | "[(u'We', u'PRON'),\n", 167 | " (u'will', u'AUX'),\n", 168 | " (u'meet', u'VERB'),\n", 169 | " (u'at', u'ADP'),\n", 170 | " (u'eight', u'NUM'),\n", 171 | " (u\"o'clock\", u'NOUN'),\n", 172 | " (u'on', u'ADP'),\n", 173 | " (u'Thursday', u'PROPN'),\n", 174 | " (u'morning', u'NOUN'),\n", 175 | " (u'.', u'PUNCT')]" 176 | ] 177 | }, 178 | "execution_count": 5, 179 | "metadata": {}, 180 | "output_type": "execute_result" 181 | } 182 | ], 183 | "source": [ 184 | "text.pos_tags" 185 | ] 186 | }, 187 | { 188 | "cell_type": "markdown", 189 | "metadata": {}, 190 | "source": [ 191 | "After calling the pos_tags property once, the words objects will carry the POS tags." 192 | ] 193 | }, 194 | { 195 | "cell_type": "code", 196 | "execution_count": 6, 197 | "metadata": { 198 | "collapsed": false, 199 | "scrolled": true 200 | }, 201 | "outputs": [ 202 | { 203 | "data": { 204 | "text/plain": [ 205 | "u'PRON'" 206 | ] 207 | }, 208 | "execution_count": 6, 209 | "metadata": {}, 210 | "output_type": "execute_result" 211 | } 212 | ], 213 | "source": [ 214 | "text.words[0].pos_tag" 215 | ] 216 | }, 217 | { 218 | "cell_type": "markdown", 219 | "metadata": {}, 220 | "source": [ 221 | "### Command Line Interface" 222 | ] 223 | }, 224 | { 225 | "cell_type": "code", 226 | "execution_count": 7, 227 | "metadata": { 228 | "collapsed": false 229 | }, 230 | "outputs": [ 231 | { 232 | "name": "stdout", 233 | "output_type": "stream", 234 | "text": [ 235 | "which DET \r\n", 236 | "India PROPN\r\n", 237 | "beat VERB \r\n", 238 | "Bermuda PROPN\r\n", 239 | "in ADP \r\n", 240 | "Port PROPN\r\n", 241 | "of ADP \r\n", 242 | "Spain PROPN\r\n", 243 | "in ADP \r\n", 244 | "2007 NUM \r\n", 245 | ", PUNCT\r\n", 246 | "which DET \r\n", 247 | "was AUX \r\n", 248 | "equalled VERB \r\n", 249 | "five NUM \r\n", 250 | "days NOUN \r\n", 251 | "ago ADV \r\n", 252 | "by ADP \r\n", 253 | "South PROPN\r\n", 254 | "Africa PROPN\r\n", 255 | "in ADP \r\n", 256 | "their PRON \r\n", 257 | "victory NOUN \r\n", 258 | "over ADP \r\n", 259 | "West PROPN\r\n", 260 | "Indies PROPN\r\n", 261 | "in ADP \r\n", 262 | "Sydney PROPN\r\n", 263 | ". 
PUNCT\r\n", 264 | "\r\n" 265 | ] 266 | } 267 | ], 268 | "source": [ 269 | "!polyglot --lang en tokenize --input testdata/cricket.txt | polyglot --lang en pos | tail -n 30" 270 | ] 271 | }, 272 | { 273 | "cell_type": "markdown", 274 | "metadata": {}, 275 | "source": [ 276 | "\n", 277 | "### Citation\n", 278 | "\n", 279 | "This work is a direct implementation of the research being described in the [Polyglot: Distributed Word Representations for Multilingual NLP](http://www.aclweb.org/anthology/W13-3520) paper.\n", 280 | "The author of this library strongly encourage you to cite the following paper if you are using this software." 281 | ] 282 | }, 283 | { 284 | "cell_type": "markdown", 285 | "metadata": {}, 286 | "source": [ 287 | "\n", 288 | "```\n", 289 | " @InProceedings{polyglot:2013:ACL-CoNLL,\n", 290 | " author = {Al-Rfou, Rami and Perozzi, Bryan and Skiena, Steven},\n", 291 | " title = {Polyglot: Distributed Word Representations for Multilingual NLP},\n", 292 | " booktitle = {Proceedings of the Seventeenth Conference on Computational Natural Language Learning},\n", 293 | " month = {August},\n", 294 | " year = {2013},\n", 295 | " address = {Sofia, Bulgaria},\n", 296 | " publisher = {Association for Computational Linguistics},\n", 297 | " pages = {183--192}, \n", 298 | " url = {http://www.aclweb.org/anthology/W13-3520}\n", 299 | " }\n", 300 | "```" 301 | ] 302 | }, 303 | { 304 | "cell_type": "markdown", 305 | "metadata": {}, 306 | "source": [ 307 | "## References\n", 308 | "\n", 309 | "- [Universal Part of Speech Tagging](http://universaldependencies.github.io/docs/u/pos/index.html)\n", 310 | "- [Universal Dependencies 1.0](https://lindat.mff.cuni.cz/repository/xmlui/handle/11234/1-1464)." 311 | ] 312 | } 313 | ], 314 | "metadata": { 315 | "kernelspec": { 316 | "display_name": "Python 2", 317 | "language": "python", 318 | "name": "python2" 319 | }, 320 | "language_info": { 321 | "codemirror_mode": { 322 | "name": "ipython", 323 | "version": 2 324 | }, 325 | "file_extension": ".py", 326 | "mimetype": "text/x-python", 327 | "name": "python", 328 | "nbconvert_exporter": "python", 329 | "pygments_lexer": "ipython2", 330 | "version": "2.7.6" 331 | } 332 | }, 333 | "nbformat": 4, 334 | "nbformat_minor": 0 335 | } 336 | -------------------------------------------------------------------------------- /notebooks/README.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "polyglot\n", 8 | "===============================" 9 | ] 10 | }, 11 | { 12 | "cell_type": "markdown", 13 | "metadata": {}, 14 | "source": [ 15 | "[![Downloads](https://img.shields.io/pypi/dm/polyglot.svg \"Downloads\")](https://pypi.python.org/pypi/polyglot)\n", 16 | "[![Latest Version](https://badge.fury.io/py/polyglot.svg \"Latest Version\")](https://pypi.python.org/pypi/polyglot)\n", 17 | "[![Build Status](https://travis-ci.org/aboSamoor/polyglot.png?branch=master \"Build Status\")](https://travis-ci.org/aboSamoor/polyglot)\n", 18 | "[![Documentation Status](https://readthedocs.org/projects/polyglot/badge/?version=latest \"Documentation Status\")](https://readthedocs.org/builds/polyglot/)" 19 | ] 20 | }, 21 | { 22 | "cell_type": "markdown", 23 | "metadata": {}, 24 | "source": [ 25 | "Polyglot is a natural language pipeline that supports massive multilingual applications.\n", 26 | "\n", 27 | "* Free software: GPLv3 license\n", 28 | "* Documentation: http://polyglot.readthedocs.org.\n", 29 | "\n", 
30 | "###Features\n", 31 | "\n", 32 | "\n", 33 | "* Tokenization (165 Languages)\n", 34 | "* Language detection (196 Languages)\n", 35 | "* Named Entity Recognition (40 Languages)\n", 36 | "* Part of Speech Tagging (16 Languages)\n", 37 | "* Sentiment Analysis (136 Languages)\n", 38 | "* Word Embeddings (137 Languages)\n", 39 | "* Morphological analysis (135 Languages)\n", 40 | "* Transliteration (69 Languages)" 41 | ] 42 | }, 43 | { 44 | "cell_type": "markdown", 45 | "metadata": {}, 46 | "source": [ 47 | "### Developer\n", 48 | "\n", 49 | "* Rami Al-Rfou @ `rmyeid gmail com`" 50 | ] 51 | }, 52 | { 53 | "cell_type": "markdown", 54 | "metadata": {}, 55 | "source": [ 56 | "\n", 57 | "## Quick Tutorial" 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": 9, 63 | "metadata": { 64 | "collapsed": false 65 | }, 66 | "outputs": [], 67 | "source": [ 68 | "import polyglot\n", 69 | "from polyglot.text import Text, Word" 70 | ] 71 | }, 72 | { 73 | "cell_type": "markdown", 74 | "metadata": {}, 75 | "source": [ 76 | "### Language Detection" 77 | ] 78 | }, 79 | { 80 | "cell_type": "code", 81 | "execution_count": 10, 82 | "metadata": { 83 | "collapsed": false 84 | }, 85 | "outputs": [ 86 | { 87 | "name": "stdout", 88 | "output_type": "stream", 89 | "text": [ 90 | "Language Detected: Code=fr, Name=French\n", 91 | "\n" 92 | ] 93 | } 94 | ], 95 | "source": [ 96 | "text = Text(\"Bonjour, Mesdames.\")\n", 97 | "print(\"Language Detected: Code={}, Name={}\\n\".format(text.language.code, text.language.name))" 98 | ] 99 | }, 100 | { 101 | "cell_type": "markdown", 102 | "metadata": {}, 103 | "source": [ 104 | "### Tokenization" 105 | ] 106 | }, 107 | { 108 | "cell_type": "code", 109 | "execution_count": 11, 110 | "metadata": { 111 | "collapsed": false 112 | }, 113 | "outputs": [ 114 | { 115 | "name": "stdout", 116 | "output_type": "stream", 117 | "text": [ 118 | "[u'Beautiful', u'is', u'better', u'than', u'ugly', u'.', u'Explicit', u'is', u'better', u'than', u'implicit', u'.', u'Simple', u'is', u'better', u'than', u'complex', u'.']\n" 119 | ] 120 | } 121 | ], 122 | "source": [ 123 | "zen = Text(\"Beautiful is better than ugly. \"\n", 124 | " \"Explicit is better than implicit. \"\n", 125 | " \"Simple is better than complex.\")\n", 126 | "print(zen.words)" 127 | ] 128 | }, 129 | { 130 | "cell_type": "code", 131 | "execution_count": 12, 132 | "metadata": { 133 | "collapsed": false, 134 | "scrolled": true 135 | }, 136 | "outputs": [ 137 | { 138 | "name": "stdout", 139 | "output_type": "stream", 140 | "text": [ 141 | "[Sentence(\"Beautiful is better than ugly.\"), Sentence(\"Explicit is better than implicit.\"), Sentence(\"Simple is better than complex.\")]\n" 142 | ] 143 | } 144 | ], 145 | "source": [ 146 | "print(zen.sentences)" 147 | ] 148 | }, 149 | { 150 | "cell_type": "markdown", 151 | "metadata": {}, 152 | "source": [ 153 | "### Part of Speech Tagging" 154 | ] 155 | }, 156 | { 157 | "cell_type": "code", 158 | "execution_count": 13, 159 | "metadata": { 160 | "collapsed": false 161 | }, 162 | "outputs": [ 163 | { 164 | "name": "stdout", 165 | "output_type": "stream", 166 | "text": [ 167 | "Word POS Tag\n", 168 | "------------------------------\n", 169 | "O DET\n", 170 | "primeiro ADJ\n", 171 | "uso NOUN\n", 172 | "de ADP\n", 173 | "desobediência NOUN\n", 174 | "civil ADJ\n", 175 | "em ADP\n", 176 | "massa NOUN\n", 177 | "ocorreu ADJ\n", 178 | "em ADP\n", 179 | "setembro NOUN\n", 180 | "de ADP\n", 181 | "1906 NUM\n", 182 | ". 
PUNCT\n" 183 | ] 184 | } 185 | ], 186 | "source": [ 187 | "text = Text(u\"O primeiro uso de desobediência civil em massa ocorreu em setembro de 1906.\")\n", 188 | "\n", 189 | "print(\"{:<16}{}\".format(\"Word\", \"POS Tag\")+\"\\n\"+\"-\"*30)\n", 190 | "for word, tag in text.pos_tags:\n", 191 | " print(u\"{:<16}{:>2}\".format(word, tag))" 192 | ] 193 | }, 194 | { 195 | "cell_type": "markdown", 196 | "metadata": {}, 197 | "source": [ 198 | "### Named Entity Recognition" 199 | ] 200 | }, 201 | { 202 | "cell_type": "code", 203 | "execution_count": 14, 204 | "metadata": { 205 | "collapsed": false 206 | }, 207 | "outputs": [ 208 | { 209 | "name": "stdout", 210 | "output_type": "stream", 211 | "text": [ 212 | "[I-LOC([u'Gro\\xdfbritannien']), I-PER([u'Gandhi'])]\n" 213 | ] 214 | } 215 | ], 216 | "source": [ 217 | "text = Text(u\"In Großbritannien war Gandhi mit dem westlichen Lebensstil vertraut geworden\")\n", 218 | "print(text.entities)" 219 | ] 220 | }, 221 | { 222 | "cell_type": "markdown", 223 | "metadata": {}, 224 | "source": [ 225 | "### Polarity" 226 | ] 227 | }, 228 | { 229 | "cell_type": "code", 230 | "execution_count": 15, 231 | "metadata": { 232 | "collapsed": false 233 | }, 234 | "outputs": [ 235 | { 236 | "name": "stdout", 237 | "output_type": "stream", 238 | "text": [ 239 | "Word Polarity\n", 240 | "------------------------------\n", 241 | "Beautiful 0\n", 242 | "is 0\n", 243 | "better 1\n", 244 | "than 0\n", 245 | "ugly -1\n", 246 | ". 0\n" 247 | ] 248 | } 249 | ], 250 | "source": [ 251 | "print(\"{:<16}{}\".format(\"Word\", \"Polarity\")+\"\\n\"+\"-\"*30)\n", 252 | "for w in zen.words[:6]:\n", 253 | " print(\"{:<16}{:>2}\".format(w, w.polarity))" 254 | ] 255 | }, 256 | { 257 | "cell_type": "markdown", 258 | "metadata": {}, 259 | "source": [ 260 | "### Embeddings" 261 | ] 262 | }, 263 | { 264 | "cell_type": "code", 265 | "execution_count": 19, 266 | "metadata": { 267 | "collapsed": false 268 | }, 269 | "outputs": [ 270 | { 271 | "name": "stdout", 272 | "output_type": "stream", 273 | "text": [ 274 | "Neighbors (Synonms) of Obama\n", 275 | "------------------------------\n", 276 | "Bush \n", 277 | "Reagan \n", 278 | "Clinton \n", 279 | "Ahmadinejad \n", 280 | "Nixon \n", 281 | "Karzai \n", 282 | "McCain \n", 283 | "Biden \n", 284 | "Huckabee \n", 285 | "Lula \n", 286 | "\n", 287 | "\n", 288 | "The first 10 dimensions out the 256 dimensions\n", 289 | "\n", 290 | "[-2.57382345 1.52175975 0.51070285 1.08678675 -0.74386948 -1.18616164\n", 291 | " 2.92784619 -0.25694436 -1.40958667 -2.39675403]\n" 292 | ] 293 | } 294 | ], 295 | "source": [ 296 | "word = Word(\"Obama\", language=\"en\")\n", 297 | "print(\"Neighbors (Synonms) of {}\".format(word)+\"\\n\"+\"-\"*30)\n", 298 | "for w in word.neighbors:\n", 299 | " print(\"{:<16}\".format(w))\n", 300 | "print(\"\\n\\nThe first 10 dimensions out the {} dimensions\\n\".format(word.vector.shape[0]))\n", 301 | "print(word.vector[:10])" 302 | ] 303 | }, 304 | { 305 | "cell_type": "markdown", 306 | "metadata": {}, 307 | "source": [ 308 | "### Morphology" 309 | ] 310 | }, 311 | { 312 | "cell_type": "code", 313 | "execution_count": 17, 314 | "metadata": { 315 | "collapsed": false 316 | }, 317 | "outputs": [ 318 | { 319 | "name": "stdout", 320 | "output_type": "stream", 321 | "text": [ 322 | "[u'Pre', u'process', u'ing']\n" 323 | ] 324 | } 325 | ], 326 | "source": [ 327 | "word = Text(\"Preprocessing is an essential step.\").words[0]\n", 328 | "print(word.morphemes)" 329 | ] 330 | }, 331 | { 332 | "cell_type": "markdown", 333 | "metadata": {}, 334 | 
"source": [ 335 | "### Transliteration" 336 | ] 337 | }, 338 | { 339 | "cell_type": "code", 340 | "execution_count": 18, 341 | "metadata": { 342 | "collapsed": false 343 | }, 344 | "outputs": [ 345 | { 346 | "name": "stdout", 347 | "output_type": "stream", 348 | "text": [ 349 | "препрокессинг\n" 350 | ] 351 | } 352 | ], 353 | "source": [ 354 | "from polyglot.transliteration import Transliterator\n", 355 | "transliterator = Transliterator(source_lang=\"en\", target_lang=\"ru\")\n", 356 | "print(transliterator.transliterate(u\"preprocessing\"))" 357 | ] 358 | } 359 | ], 360 | "metadata": { 361 | "kernelspec": { 362 | "display_name": "Python 2", 363 | "language": "python", 364 | "name": "python2" 365 | }, 366 | "language_info": { 367 | "codemirror_mode": { 368 | "name": "ipython", 369 | "version": 2 370 | }, 371 | "file_extension": ".py", 372 | "mimetype": "text/x-python", 373 | "name": "python", 374 | "nbconvert_exporter": "python", 375 | "pygments_lexer": "ipython2", 376 | "version": "2.7.6" 377 | } 378 | }, 379 | "nbformat": 4, 380 | "nbformat_minor": 0 381 | } 382 | --------------------------------------------------------------------------------