├── .gitignore ├── MANIFEST.in ├── setup.cfg ├── tox.ini ├── _dump.pl ├── .travis.yml ├── src └── text_unidecode │ └── __init__.py ├── test_unidecode.py ├── README.rst ├── setup.py └── LICENSE /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | .tox/ 3 | MANIFEST 4 | build/ 5 | dist/ 6 | *.egg-info/ 7 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include LICENSE 2 | include _dump.pl 3 | include test_unidecode.py 4 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [bdist_wheel] 2 | universal = 1 3 | 4 | [metadata] 5 | license_file = LICENSE 6 | -------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | [tox] 2 | envlist = py27,py34,py35,py36,py37,pypy,pypy3 3 | 4 | [testenv] 5 | deps = 6 | pytest 7 | commands= 8 | py.test [] 9 | -------------------------------------------------------------------------------- /_dump.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | use Text::Unidecode; 3 | use Encode; 4 | 5 | for ($c=1; $c<65535; $c++){ # limit ourselves to narrow python builds 6 | $trans = unidecode(chr($c)); 7 | print encode("utf8", "$trans\x00"); 8 | } 9 | 10 | # usage: perl _dump.pl > src/text_unidecode/data.bin 11 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | cache: pip 3 | sudo: false 4 | matrix: 5 | include: 6 | - python: 2.7 7 | env: TOXENV=py27 8 | - python: 3.4 9 | env: TOXENV=py34 10 | - python: 3.5 11 | env: TOXENV=py35 12 | - python: 3.6 13 | env: TOXENV=py36 14 | - python: 3.7 15 | env: TOXENV=py37 16 | - python: pypy 17 | env: TOXENV=pypy 18 | - python: pypy3 19 | env: TOXENV=pypy3 20 | install: 21 | - pip install -U tox 22 | script: tox 23 | -------------------------------------------------------------------------------- /src/text_unidecode/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import, unicode_literals 3 | import os 4 | import pkgutil 5 | 6 | _replaces = pkgutil.get_data(__name__, 'data.bin').decode('utf8').split('\x00') 7 | 8 | def unidecode(txt): 9 | chars = [] 10 | for ch in txt: 11 | codepoint = ord(ch) 12 | 13 | if not codepoint: 14 | chars.append('\x00') 15 | continue 16 | 17 | try: 18 | chars.append(_replaces[codepoint-1]) 19 | except IndexError: 20 | pass 21 | return "".join(chars) 22 | -------------------------------------------------------------------------------- /test_unidecode.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import, unicode_literals 3 | from text_unidecode import unidecode 4 | import pytest 5 | 6 | 7 | @pytest.mark.parametrize(("text", "result"), [ 8 | ("Programmes de publicité - Solutions d'entreprise", "Programmes de publicite - Solutions d'entreprise"), 9 | ("Транслитерирует и русский", "Transliteriruet i russkii"), 10 | ("kožušček", "kozuscek"), 11 | ("北亰", "Bei Jing "), 12 | ]) 13 | def test_transliterate(text, result): 14 | assert unidecode(text) == result 15 | 16 | 17 | @pytest.mark.parametrize("code", range(128)) 18 | def test_7bit_purity(code): 19 | ch = chr(code) 20 | assert unidecode(ch) == ch 21 | 22 | 23 | def test_7bit_text_purity(): 24 | txt = "".join([chr(x) for x in range(128)]) 25 | assert unidecode(txt) == txt 26 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | Text-Unidecode 2 | ============== 3 | 4 | .. image:: https://travis-ci.org/kmike/text-unidecode.svg?branch=master 5 | :target: https://travis-ci.org/kmike/text-unidecode 6 | :alt: Build Status 7 | 8 | text-unidecode is the most basic port of the 9 | `Text::Unidecode `_ 10 | Perl library. 11 | 12 | There are other Python ports of Text::Unidecode (unidecode_ 13 | and isounidecode_). unidecode_ is GPL; isounidecode_ uses too much memory, 14 | and it didn't support Python 3 when this package was created. 15 | 16 | You can redistribute it and/or modify this port under the terms of either: 17 | 18 | * `Artistic License`_, or 19 | * GPL or GPLv2+ 20 | 21 | If you're OK with GPL-only, use unidecode_ (it has better memory usage and 22 | better transliteration quality). 23 | 24 | ``text-unidecode`` supports Python 2.7 and 3.4+. 25 | 26 | .. _unidecode: https://pypi.python.org/pypi/Unidecode/ 27 | .. _isounidecode: https://pypi.python.org/pypi/isounidecode/ 28 | .. _Artistic License: https://opensource.org/licenses/Artistic-Perl-1.0 29 | 30 | Installation 31 | ------------ 32 | 33 | :: 34 | 35 | pip install text-unidecode 36 | 37 | Usage 38 | ----- 39 | 40 | :: 41 | 42 | >>> from text_unidecode import unidecode 43 | >>> unidecode(u'какой-то текст') 44 | 'kakoi-to tekst' 45 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | import codecs 3 | import sys 4 | from setuptools import setup 5 | 6 | __version__ = '1.3' 7 | 8 | 9 | if sys.version_info >= (3, ): 10 | with codecs.open('README.rst', encoding='utf-8') as f: 11 | long_description = f.read() 12 | else: 13 | with open('README.rst') as f: 14 | long_description = f.read() 15 | 16 | setup( 17 | name="text-unidecode", 18 | version=__version__, 19 | description="The most basic Text::Unidecode port", 20 | long_description=long_description, 21 | license='Artistic License', 22 | author='Mikhail Korobov', 23 | author_email='kmike84@gmail.com', 24 | 25 | url='https://github.com/kmike/text-unidecode/', 26 | 27 | package_dir={'': 'src'}, 28 | packages=['text_unidecode'], 29 | package_data={'text_unidecode': ['data.bin']}, 30 | 31 | classifiers=[ 32 | 'Development Status :: 5 - Production/Stable', 33 | 'Intended Audience :: Developers', 34 | 'License :: OSI Approved :: Artistic License', 35 | 'License :: OSI Approved :: GNU General Public License (GPL)', 36 | 'License :: OSI Approved :: GNU General Public License v2 or later (GPLv2+)', 37 | 'Programming Language :: Python', 38 | 'Programming Language :: Python :: 2', 39 | 'Programming Language :: Python :: 2.7', 40 | 'Programming Language :: Python :: 3', 41 | 'Programming Language :: Python :: 3.4', 42 | 'Programming Language :: Python :: 3.5', 43 | 'Programming Language :: Python :: 3.6', 44 | 'Programming Language :: Python :: 3.7', 45 | 'Programming Language :: Python :: Implementation :: CPython', 46 | 'Programming Language :: Python :: Implementation :: PyPy', 47 | 'Topic :: Software Development :: Libraries :: Python Modules', 48 | 'Topic :: Text Processing :: Linguistic', 49 | ], 50 | ) 51 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | text-unidecode is a free software; you can redistribute 2 | it and/or modify it under the terms of either: 3 | 4 | * GPL or GPLv2+ (see https://www.gnu.org/licenses/license-list.html#GNUGPL), or 5 | * Artistic License - see below: 6 | 7 | 8 | The "Artistic License" 9 | 10 | Preamble 11 | 12 | The intent of this document is to state the conditions under which a 13 | Package may be copied, such that the Copyright Holder maintains some 14 | semblance of artistic control over the development of the package, 15 | while giving the users of the package the right to use and distribute 16 | the Package in a more-or-less customary fashion, plus the right to make 17 | reasonable modifications. 18 | 19 | Definitions: 20 | 21 | "Package" refers to the collection of files distributed by the 22 | Copyright Holder, and derivatives of that collection of files 23 | created through textual modification. 24 | 25 | "Standard Version" refers to such a Package if it has not been 26 | modified, or has been modified in accordance with the wishes 27 | of the Copyright Holder as specified below. 28 | 29 | "Copyright Holder" is whoever is named in the copyright or 30 | copyrights for the package. 31 | 32 | "You" is you, if you're thinking about copying or distributing 33 | this Package. 34 | 35 | "Reasonable copying fee" is whatever you can justify on the 36 | basis of media cost, duplication charges, time of people involved, 37 | and so on. (You will not be required to justify it to the 38 | Copyright Holder, but only to the computing community at large 39 | as a market that must bear the fee.) 40 | 41 | "Freely Available" means that no fee is charged for the item 42 | itself, though there may be fees involved in handling the item. 43 | It also means that recipients of the item may redistribute it 44 | under the same conditions they received it. 45 | 46 | 1. You may make and give away verbatim copies of the source form of the 47 | Standard Version of this Package without restriction, provided that you 48 | duplicate all of the original copyright notices and associated disclaimers. 49 | 50 | 2. You may apply bug fixes, portability fixes and other modifications 51 | derived from the Public Domain or from the Copyright Holder. A Package 52 | modified in such a way shall still be considered the Standard Version. 53 | 54 | 3. You may otherwise modify your copy of this Package in any way, provided 55 | that you insert a prominent notice in each changed file stating how and 56 | when you changed that file, and provided that you do at least ONE of the 57 | following: 58 | 59 | a) place your modifications in the Public Domain or otherwise make them 60 | Freely Available, such as by posting said modifications to Usenet or 61 | an equivalent medium, or placing the modifications on a major archive 62 | site such as uunet.uu.net, or by allowing the Copyright Holder to include 63 | your modifications in the Standard Version of the Package. 64 | 65 | b) use the modified Package only within your corporation or organization. 66 | 67 | c) rename any non-standard executables so the names do not conflict 68 | with standard executables, which must also be provided, and provide 69 | a separate manual page for each non-standard executable that clearly 70 | documents how it differs from the Standard Version. 71 | 72 | d) make other distribution arrangements with the Copyright Holder. 73 | 74 | 4. You may distribute the programs of this Package in object code or 75 | executable form, provided that you do at least ONE of the following: 76 | 77 | a) distribute a Standard Version of the executables and library files, 78 | together with instructions (in the manual page or equivalent) on where 79 | to get the Standard Version. 80 | 81 | b) accompany the distribution with the machine-readable source of 82 | the Package with your modifications. 83 | 84 | c) give non-standard executables non-standard names, and clearly 85 | document the differences in manual pages (or equivalent), together 86 | with instructions on where to get the Standard Version. 87 | 88 | d) make other distribution arrangements with the Copyright Holder. 89 | 90 | 5. You may charge a reasonable copying fee for any distribution of this 91 | Package. You may charge any fee you choose for support of this 92 | Package. You may not charge a fee for this Package itself. However, 93 | you may distribute this Package in aggregate with other (possibly 94 | commercial) programs as part of a larger (possibly commercial) software 95 | distribution provided that you do not advertise this Package as a 96 | product of your own. You may embed this Package's interpreter within 97 | an executable of yours (by linking); this shall be construed as a mere 98 | form of aggregation, provided that the complete Standard Version of the 99 | interpreter is so embedded. 100 | 101 | 6. The scripts and library files supplied as input to or produced as 102 | output from the programs of this Package do not automatically fall 103 | under the copyright of this Package, but belong to whoever generated 104 | them, and may be sold commercially, and may be aggregated with this 105 | Package. If such scripts or library files are aggregated with this 106 | Package via the so-called "undump" or "unexec" methods of producing a 107 | binary executable image, then distribution of such an image shall 108 | neither be construed as a distribution of this Package nor shall it 109 | fall under the restrictions of Paragraphs 3 and 4, provided that you do 110 | not represent such an executable image as a Standard Version of this 111 | Package. 112 | 113 | 7. C subroutines (or comparably compiled subroutines in other 114 | languages) supplied by you and linked into this Package in order to 115 | emulate subroutines and variables of the language defined by this 116 | Package shall not be considered part of this Package, but are the 117 | equivalent of input as in Paragraph 6, provided these subroutines do 118 | not change the language in any way that would cause it to fail the 119 | regression tests for the language. 120 | 121 | 8. Aggregation of this Package with a commercial distribution is always 122 | permitted provided that the use of this Package is embedded; that is, 123 | when no overt attempt is made to make this Package's interfaces visible 124 | to the end user of the commercial distribution. Such use shall not be 125 | construed as a distribution of this Package. 126 | 127 | 9. The name of the Copyright Holder may not be used to endorse or promote 128 | products derived from this software without specific prior written permission. 129 | 130 | 10. THIS PACKAGE IS PROVIDED "AS IS" AND WITHOUT ANY EXPRESS OR 131 | IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED 132 | WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE. 133 | 134 | The End 135 | --------------------------------------------------------------------------------