├── requirements.txt ├── MANIFEST.in ├── requirements-dev.txt ├── test ├── __init__.py ├── base_test_case.py ├── test_console_scripts.py ├── test_rotunicode.py └── test_utils.py ├── rotunicode ├── __init__.py ├── console_scripts.py ├── rotunicode.py └── utils.py ├── .coveragerc ├── setup.cfg ├── .gitattributes ├── .travis ├── run.sh └── install.sh ├── .travis.yml ├── .gitignore ├── tox.ini ├── setup.py ├── CONTRIBUTING.rst ├── README.rst ├── .pylintrc └── LICENSE /requirements.txt: -------------------------------------------------------------------------------- 1 | -e . 2 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include README.rst LICENSE 2 | 3 | recursive-include test * 4 | -------------------------------------------------------------------------------- /requirements-dev.txt: -------------------------------------------------------------------------------- 1 | -rrequirements.txt 2 | coveralls 3 | genty>=1.0.0 4 | pep8 5 | pylint 6 | tox 7 | -------------------------------------------------------------------------------- /test/__init__.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | from __future__ import absolute_import, unicode_literals 4 | -------------------------------------------------------------------------------- /rotunicode/__init__.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | from __future__ import absolute_import, unicode_literals 4 | from .rotunicode import RotUnicode 5 | from .utils import ruencode, rudecode 6 | -------------------------------------------------------------------------------- /.coveragerc: -------------------------------------------------------------------------------- 1 | [run] 2 | # Whether to measure branch coverage in addition to statement coverage. 
3 | branch = True 4 | # List of packages or directories, the source to measure during execution. 5 | source = rotunicode 6 | -------------------------------------------------------------------------------- /test/base_test_case.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | from __future__ import absolute_import, unicode_literals 4 | 5 | from unittest import skipIf, TestCase # pylint:disable=unused-import 6 | 7 | import six 8 | 9 | if six.PY3: 10 | # pylint:disable=no-member,maybe-no-member 11 | TestCase.assertItemsEqual = TestCase.assertCountEqual 12 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [bdist_wheel] 2 | # This flag says that the code is written to work on both Python 2 and Python 3 | # 3. If at all possible, it is good practice to do this. If you cannot, you 4 | # will need to generate wheels for each Python version that you support. 5 | universal=1 6 | 7 | [metadata] 8 | license_file=LICENSE 9 | 10 | [isort] 11 | known_first_party=test 12 | -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | # See GITATTRIBUTES(5) (``man gitattributes``). 2 | # - ``diff=`` declares that diff hunk headers and git-diff word diffs 3 | # (for ``git diff --word-diff`` and ``git diff --color-words``) should be 4 | # specially computed for the language grammar. This makes git-diff more 5 | # powerful and useful. 6 | *.py text eol=lf diff=python 7 | *.txt text eol=lf 8 | *.rst text eol=lf 9 | *.in text eol=lf 10 | *.ini text eol=lf 11 | -------------------------------------------------------------------------------- /.travis/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Originally from 4 | # . 
5 | 6 | set -e 7 | set -x 8 | set -o pipefail 9 | 10 | export PYENV_ROOT="$PWD/.pyenv" 11 | export PATH="$PYENV_ROOT/bin:$PATH" 12 | 13 | if [[ "$(uname -s)" == "Darwin" ]]; then 14 | eval "$(pyenv init -)" 15 | else 16 | if [[ "${TOX_ENV}" == "pypy" ]]; then 17 | eval "$(pyenv init -)" 18 | pyenv global "pypy${PYPY_VERSION}" 19 | fi 20 | fi 21 | source $PWD/.venv/bin/activate 22 | tox -e $TOX_ENV -- $TOX_FLAGS 23 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | sudo: false 2 | language: python 3 | 4 | cache: 5 | directories: 6 | - $HOME/.cache/pip 7 | 8 | before_cache: 9 | - rm -r -f $HOME/.cache/pip/log 10 | 11 | matrix: 12 | include: 13 | - python: 2.7 14 | env: TOX_ENV=py27 15 | - python: pypy 16 | env: TOX_ENV=pypy PYPY_VERSION='2.7-5.10.0' 17 | - python: 3.4 18 | env: TOX_ENV=py34 19 | - python: 3.5 20 | env: TOX_ENV=py35 21 | - python: 3.6 22 | env: TOX_ENV=py36 23 | - python: 3.7 24 | env: TOX_ENV=py37 25 | # Python3.7 isn't available on the standard Trusty build that Travis uses 26 | dist: xenial 27 | sudo: true 28 | - python: 2.7 29 | env: TOX_ENV=pep8 30 | - python: 3.6 31 | env: TOX_ENV=pep8 32 | - python: 2.7 33 | env: TOX_ENV=pylint 34 | - python: 3.6 35 | env: TOX_ENV=pylint 36 | - python: 2.7 37 | env: TOX_ENV=coverage 38 | - python: 3.6 39 | env: TOX_ENV=coverage 40 | 41 | # commands to install dependencies 42 | install: 43 | - ./.travis/install.sh 44 | # commands to run 45 | script: 46 | - ./.travis/run.sh 47 | after_success: 48 | - if [ "x$TOX_ENV" = "xcoverage" ]; then coveralls; fi 49 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Backup files 2 | *.~ 3 | .*.sw* 4 | 5 | # Byte-compiled / optimized / DLL files 6 | __pycache__/ 7 | *.py[cod] 8 | 9 | # C extensions 10 | *.so 11 | 12 
| # Packages 13 | *.egg 14 | *.egg-info 15 | dist 16 | build 17 | eggs 18 | parts 19 | bin 20 | var 21 | sdist 22 | develop-eggs 23 | .Python 24 | env/ 25 | downloads/ 26 | .installed.cfg 27 | lib 28 | lib64 29 | MANIFEST 30 | .eggs/ 31 | .pyenv/ 32 | .venv/ 33 | 34 | # PyInstaller 35 | # Usually these files are written by a python script from a template 36 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 37 | *.manifest 38 | *.spec 39 | 40 | # Installer logs 41 | pip-log.txt 42 | pip-delete-this-directory.txt 43 | 44 | # Unit test / coverage reports 45 | htmlcov/ 46 | .coverage 47 | .tox 48 | .cache 49 | nosetests.xml 50 | coverage.xml 51 | .coverage.* 52 | *,cover 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | 61 | # Sphinx documentation 62 | docs/_build/ 63 | 64 | # PyBuilder 65 | target/ 66 | 67 | # Mr Developer 68 | .mr.developer.cfg 69 | .project 70 | .pydevproject 71 | 72 | # IntelliJ 73 | .idea/ 74 | *.iml 75 | 76 | # Mac noise 77 | .DS_Store 78 | 79 | # Misc 80 | tokens.pk 81 | -------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | [tox] 2 | envlist = 3 | py27, 4 | pep8, 5 | pypy, 6 | py34, 7 | py35, 8 | py36, 9 | py37, 10 | pylint, 11 | coverage 12 | 13 | [testenv] 14 | deps = -rrequirements-dev.txt 15 | commands = {envpython} setup.py test 16 | 17 | [testenv:pep8] 18 | commands = 19 | pep8 rotunicode setup.py 20 | pep8 --ignore=E501 test 21 | 22 | [testenv:pylint] 23 | deps = -rrequirements-dev.txt 24 | commands = 25 | pylint --rcfile=.pylintrc rotunicode setup.py 26 | pylint --rcfile=.pylintrc --disable=C0301,C0411 test 27 | 28 | [testenv:coverage] 29 | commands = coverage run --rcfile=.coveragerc setup.py test 30 | 31 | [pep8] 32 | show-pep8 = True 33 | show-source = True 34 | 35 | [testenv:py36-build] 36 | description = Build the source and binary wheel 
packages for distribution. 37 | pypi_dist_dir = {toxinidir}/pypi-dist 38 | commands = 39 | rm -rf "{[testenv:py36-build]pypi_dist_dir}" 40 | {envpython} setup.py -vv \ 41 | sdist --formats=gztar --keep-temp --dist-dir="{[testenv:py36-build]pypi_dist_dir}" \ 42 | bdist_wheel --keep-temp --dist-dir="{[testenv:py36-build]pypi_dist_dir}" 43 | skip_install = True 44 | sitepackages = False 45 | recreate = True 46 | deps = 47 | wheel 48 | setuptools 49 | whitelist_externals = rm 50 | 51 | [testenv:py36-upload] 52 | description = Upload packages to PyPI. 53 | commands = 54 | twine upload --config-file="{toxinidir}/.pypirc" {posargs} {[testenv:py36-build]pypi_dist_dir}/* 55 | skip_install = True 56 | sitepackages = False 57 | recreate = True 58 | deps = 59 | twine 60 | -------------------------------------------------------------------------------- /.travis/install.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Originally from 4 | # . 5 | 6 | set -e 7 | set -x 8 | set -o pipefail 9 | 10 | git clean -f -d -X 11 | rm -r -f $PWD/.pyenv # Apparently `git-clean` won't remove other repositories. 
12 | 13 | export PYENV_ROOT="$PWD/.pyenv" 14 | export PATH="$PYENV_ROOT/bin:$PATH" 15 | 16 | if [[ "$(uname -s)" == 'Darwin' ]]; then 17 | brew update || brew update 18 | brew install pyenv 19 | brew outdated pyenv || brew upgrade pyenv 20 | 21 | if which -s pyenv; then 22 | eval "$(pyenv init -)" 23 | fi 24 | 25 | case "${TOX_ENV}" in 26 | py27) 27 | curl -O https://bootstrap.pypa.io/get-pip.py 28 | python get-pip.py --user 29 | ;; 30 | py34) 31 | pyenv install 3.4.2 32 | pyenv global 3.4.2 33 | ;; 34 | py35) 35 | pyenv install 3.5.0 36 | pyenv global 3.5.0 37 | ;; 38 | py36) 39 | pyenv install 3.6.0 40 | pyenv global 3.6.0 41 | ;; 42 | pypy) 43 | pyenv install "pypy${PYPY_VERSION}" 44 | pyenv global "pypy${PYPY_VERSION}" 45 | ;; 46 | esac 47 | pyenv rehash 48 | python -m pip install -U --user virtualenv 49 | else 50 | # pyenv installation to get specified pypy version (Travis may only have older version(s)) 51 | if [[ "${TOX_ENV}" == "pypy" ]]; then 52 | git clone https://github.com/yyuu/pyenv.git $PWD/.pyenv 53 | eval "$(pyenv init -)" 54 | pyenv install "pypy${PYPY_VERSION}" 55 | pyenv global "pypy${PYPY_VERSION}" 56 | fi 57 | pip install -U virtualenv 58 | fi 59 | 60 | python -m virtualenv $PWD/.venv 61 | source $PWD/.venv/bin/activate 62 | pip install -U tox 63 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from __future__ import absolute_import, unicode_literals 4 | 5 | from codecs import open # pylint:disable=redefined-builtin 6 | from os.path import dirname, join 7 | 8 | from setuptools import setup, find_packages 9 | 10 | 11 | CLASSIFIERS = [ 12 | 'Development Status :: 4 - Beta', 13 | 'Intended Audience :: Developers', 14 | 'License :: OSI Approved :: Apache Software License', 15 | 'Topic :: Software Development :: Testing', 16 | 'Programming Language :: Python', 17 | 'Programming Language :: 
def main():
    """Load the README and pass the package metadata to setuptools."""
    readme_path = join(dirname(__file__), 'README.rst')
    with open(readme_path, encoding='utf-8') as readme:
        readme_text = readme.read()

    # Collected up front so the call to ``setup`` stays a one-liner.
    metadata = dict(
        name='rotunicode',
        version='2.3.0',
        description='Python library for converting between a string of ASCII '
                    'and non-ASCII chars maintaining readability',
        long_description=readme_text,
        author='Box',
        author_email='oss@box.com',
        url='https://github.com/box/rotunicode',
        license=(
            'Apache Software License, Version 2.0, '
            'http://www.apache.org/licenses/LICENSE-2.0'
        ),
        packages=find_packages(exclude=['test']),
        install_requires=['six>=1.9.0'],
        tests_require=['genty>=1.0.0'],
        test_suite='test',
        zip_safe=False,
        entry_points={
            'console_scripts': [
                'rotunicode = rotunicode.console_scripts:main',
            ],
        },
        classifiers=CLASSIFIERS,
    )
    setup(**metadata)
genty, genty_args, genty_dataset 11 | from six.moves import map # pylint:disable=redefined-builtin 12 | 13 | from rotunicode import RotUnicode, console_scripts 14 | from rotunicode.utils import get_rotunicode_function_for_decode_argument 15 | from test.base_test_case import TestCase 16 | 17 | 18 | @genty 19 | class TestConsoleScripts(TestCase): 20 | """Tests for :mod:`rotunicode.console_scripts`.""" 21 | 22 | @classmethod 23 | def setUpClass(cls): 24 | super(TestConsoleScripts, cls).setUpClass() 25 | codecs.register(RotUnicode.search_function) 26 | 27 | @genty_dataset(False, True) 28 | def test_rotunicode(self, decode): 29 | original_lines = ['foo', 'bar', 'baz', '\u2345', 'foo\u2345baz'] 30 | original_lines_with_newlines = [line + '\n' for line in original_lines] 31 | anti_action = get_rotunicode_function_for_decode_argument(not decode) 32 | lines = list(map(anti_action, original_lines_with_newlines)) 33 | content = ''.join(lines) 34 | io_object = StringIO(content) 35 | result = console_scripts.rotunicode(io_object, decode=decode) 36 | self.assertIsInstance(result, Iterator) 37 | self.assertEqual(list(map(anti_action, result)), lines) 38 | 39 | @genty_dataset( 40 | ('ƒőő\n', 'foo'), 41 | genty_args('foo\n', 'ƒőő', decode=True), 42 | genty_args('ƒőő', 'foo', use_stdin=True), 43 | genty_args('foo', 'ƒőő', decode=True, use_stdin=True), 44 | genty_args('ƒőő\nƒőő', 'foo\nfoo', use_stdin=True), 45 | ('ƒőő\\ńƒőő\n', 'foo\\nfoo'), 46 | ('ƒőő\\ȕᎾ➁➄➅ƒőő\n', 'foo\\u0256foo'), 47 | genty_args('ƒőő\nƒőő\n', 'foo\\nfoo', should_parse_escape_sequences=True), 48 | genty_args('ƒőőɖƒőő\n', 'foo\\u0256foo', should_parse_escape_sequences=True), 49 | genty_args('foo\nfoo\n', 'ƒőő\\nƒőő', decode=True, should_parse_escape_sequences=True), 50 | genty_args('fooɖfoo\n', 'ƒőő\\u0256ƒőő', decode=True, should_parse_escape_sequences=True), 51 | ) 52 | def test_main(self, expected_string, string, decode=False, use_stdin=False, should_parse_escape_sequences=False): 53 | # 
def main(args=None):
    """Entry point for the ``rotunicode`` command line tool.

    Parses the command line, picks the input (an explicit string argument,
    a file given with ``-f``, or standard input, in that order of
    precedence), rotates it line by line and prints the result to stdout.

    :param args:
        Argument list to parse. Defaults to None, in which case
        :mod:`argparse` reads the process' command line.
    :type args:
        `list` of `unicode` or None
    :return:
        The process exit status; always 0.
    :rtype:
        `int`
    """
    # NOTE: adjacent string literals are concatenated verbatim, so every
    # fragment but the last must end with a space.
    parser = argparse.ArgumentParser(
        description=(
            'Rotate to Unicode. Convert ASCII characters in a string '
            'to non-ASCII characters while maintaining readability.'
        ),
    )
    parser.add_argument(
        '-d',
        '--decode',
        action='store_true',
        help='Decode a string previously encoded with rotunicode.',
    )
    parser.add_argument(
        '-e',
        dest='should_parse_escape_sequences',
        action='store_true',
        help=(
            'Like `echo -e`, use the backslash character for escape sequences,'
            ' including \\uxxxx. Only valid with an explicit string argument.'
        ),
    )
    input_group = parser.add_argument_group(
        title='Input',
        description=(
            'What to rotate to unicode. The following options are '
            'mutually exclusive.'
        ),
    )
    input_group.add_argument(
        '-f',
        '--file',
        action='store',
        type=argparse.FileType('r'),
        nargs='?',
        help=(
            'The stream to be rotated from ASCII to non-ASCII (or decoded, if '
            '-d is specified).'
        ),
    )
    input_group.add_argument(
        'string',
        type=safe_unicode,
        action='store',
        nargs='?',
        help=(
            'The string to be rotated from ASCII to non-ASCII (or decoded, if '
            '-d is specified).'
        ),
    )
    options = parser.parse_args(args)
    # Compare against None rather than truthiness: an explicitly passed empty
    # string is still an explicit argument and must not fall back to stdin.
    if options.string is not None:
        # Mirror `echo`: the string argument is emitted as one full line.
        input_string = options.string + '\n'
        if options.should_parse_escape_sequences:
            input_string = parse_escape_sequences(input_string)
        file_to_read = StringIO(input_string)
    else:
        file_to_read = options.file or sys.stdin
    for line in rotunicode(file_to_read, decode=options.decode):
        # Lines keep their own newline, so do not let print add another.
        print(line, end='')
    return 0


def rotunicode(io_object, decode=False):
    """Rotate ASCII <-> non-ASCII characters in a file.

    :param io_object:
        The file object to convert.
    :type io_object:
        :class:`io.TextIOWrapper`
    :param decode:
        If True, perform a rotunicode-decode (rotate from non-ASCII to ASCII).
        Defaults to False (rotate from ASCII to non-ASCII).
    :type decode:
        `bool`
    :return:
        Lazy iterator over the converted lines of the file; nothing is read
        from `io_object` until the result is iterated.
    :rtype:
        iterator of `unicode`
    """
    rotu_fn = get_rotunicode_function_for_decode_argument(decode=decode)
    # Each line is normalised to text first, then rotated.
    text_lines = map(safe_unicode, stream_file_lines(io_object))
    return map(rotu_fn, text_lines)
test_decoding_using_unsupported_error_types_raise_exception( 50 | self, 51 | error_type 52 | ): 53 | with self.assertRaises(UnicodeError): 54 | 'Hello World!'.decode('rotunicode', error_type) 55 | 56 | @skipIf( 57 | not six.PY2 or platform.python_implementation() == 'PyPy', 58 | 'Encoders must return bytes except in Python 2.', 59 | ) 60 | @genty_dataset( 61 | zero_length_byte_string=(b'', ''), 62 | zero_length_unicode_string=('', ''), 63 | byte_string=(b'Hello World!', 'Ĥȅľľő Ŵőŕľď!'), 64 | unicode_string=('Hello World!', 'Ĥȅľľő Ŵőŕľď!'), 65 | byte_string_with_unsupported_chars=( 66 | 'हेलो World!'.encode('utf-8'), 67 | 'हेलो Ŵőŕľď!', 68 | ), 69 | unidcode_string_with_unsupported_chars=('हेलो World!', 'हेलो Ŵőŕľď!'), 70 | ) 71 | def test_encode_returns_correct_string(self, source, target): 72 | self.assertEqual( 73 | target, 74 | source.encode('rotunicode'), 75 | ) 76 | 77 | @skipIf(six.PY3, 'Python 3 strings cannot be decoded.') 78 | @genty_dataset( 79 | zero_length_byte_string=(b'', ''), 80 | zero_length_unicode_string=('', ''), 81 | byte_string=('Ĥȅľľő Ŵőŕľď!'.encode('utf-8'), 'Hello World!'), 82 | unicode_string=('Ĥȅľľő Ŵőŕľď!', 'Hello World!'), 83 | byte_string_with_unsupported_chars=( 84 | 'हेलो Ŵőŕľď!'.encode('utf-8'), 85 | 'हेलो World!', 86 | ), 87 | unicode_string_with_unsupported_chars=('हेलो Ŵőŕľď!', 'हेलो World!'), 88 | ) 89 | def test_decode_returns_correct_string(self, source, target): 90 | self.assertEqual( 91 | target, 92 | source.decode('rotunicode'), 93 | ) 94 | -------------------------------------------------------------------------------- /CONTRIBUTING.rst: -------------------------------------------------------------------------------- 1 | Contributing 2 | ============ 3 | 4 | All contributions are welcome to this project. 
5 | 6 | Contributor License Agreement 7 | ----------------------------- 8 | 9 | Before a contribution can be merged into this project, please fill out 10 | the Contributor License Agreement (CLA) located at: 11 | 12 | http://box.github.io/cla 13 | 14 | To learn more about CLAs and why they are important to open source 15 | projects, please see the `Wikipedia 16 | entry <http://en.wikipedia.org/wiki/Contributor_License_Agreement>`_. 17 | 18 | How to contribute 19 | ----------------- 20 | 21 | - **File an issue** - if you found a bug, want to request an 22 | enhancement, or want to implement something (bug fix or feature). 23 | - **Send a pull request** - if you want to contribute code. Please be 24 | sure to file an issue first. 25 | 26 | Pull request best practices 27 | --------------------------- 28 | 29 | We want to accept your pull requests. Please follow these steps: 30 | 31 | Step 1: File an issue 32 | ~~~~~~~~~~~~~~~~~~~~~ 33 | 34 | Before writing any code, please file an issue stating the problem you 35 | want to solve or the feature you want to implement. This allows us to 36 | give you feedback before you spend any time writing code. There may be a 37 | known limitation that can't be addressed, or a bug that has already been 38 | fixed in a different way. The issue allows us to communicate and figure 39 | out if it's worth your time to write a bunch of code for the project. 40 | 41 | Step 2: Fork this repository in GitHub 42 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 43 | 44 | This will create your own copy of our repository. 45 | 46 | Step 3: Add the upstream source 47 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 48 | 49 | The upstream source is the project under the Box organization on GitHub. 50 | To add an upstream source for this project, type: 51 | 52 | .. code-block:: console 53 | 54 | git remote add upstream git@github.com:box/rotunicode.git 55 | 56 | This will come in useful later. 
57 | 58 | Step 4: Create a feature branch 59 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 60 | 61 | Create a branch with a descriptive name, such as ``add-search``. 62 | 63 | Step 5: Push your feature branch to your fork 64 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 65 | 66 | As you develop code, continue to push code to your remote feature 67 | branch. Please make sure to include the issue number you're addressing 68 | in your commit message, such as: 69 | 70 | .. code-block:: console 71 | 72 | git commit -am "Adding search (fixes #123)" 73 | 74 | This helps us out by allowing us to track which issue your commit 75 | relates to. 76 | 77 | Keep a separate feature branch for each issue you want to address. 78 | 79 | Step 6: Rebase 80 | ~~~~~~~~~~~~~~ 81 | 82 | Before sending a pull request, rebase against upstream, such as: 83 | 84 | .. code-block:: console 85 | 86 | git fetch upstream 87 | git rebase upstream/master 88 | 89 | This will add your changes on top of what's already in upstream, 90 | minimizing merge issues. 91 | 92 | Step 7: Run the tests 93 | ~~~~~~~~~~~~~~~~~~~~~ 94 | 95 | Make sure that all tests are passing before submitting a pull request. 96 | 97 | Step 8: Send the pull request 98 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 99 | 100 | Send the pull request from your feature branch to us. Be sure to include 101 | a description that lets us know what work you did. 102 | 103 | Keep in mind that we like to see one issue addressed per pull request, 104 | as this helps keep our git history clean and we can more easily track 105 | down issues. 
class RotUnicode(codecs.Codec):
    """
    Codec for converting between a string of ASCII and non-ASCII chars
    maintaining readability.

    >>> codecs.register(RotUnicode.search_function)
    >>> print(codecs.encode('Hello Frodo!', 'rotunicode'))
    Ĥȅľľő Ƒŕőďő!
    >>> print(codecs.decode('Ĥȅľľő Ƒŕőďő!', 'rotunicode'))
    Hello Frodo!

    Both directions produce text, not bytes, so use ``codecs.encode`` and
    ``codecs.decode`` as shown; the ``'...'.encode('rotunicode')`` spelling
    only works on Python 2.

    RotUnicode stands for rotate-to-unicode. Or rotten-unicode for those who
    have nightmares about Unicode. It was inspired by Rot13.
    """
    # pylint: disable=no-init
    # The base class does not define it.

    _codec_name = 'rotunicode'

    _lowercase = 'abcdefghijklmnopqrstuvwxyz'
    _uppercase = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
    _ascii_alphabet = _lowercase + _uppercase + '0123456789'
    # Position-for-position look-alikes of ``_ascii_alphabet``.
    _rot_unicode_alphabet = ('ȁƄćďȅƒġĥȉĵƙľḿńőҏqŕŝƭȕѵŵхŷż' +
                             'ȀβĆĎȄƑĜĤȈĴƘĽḾŃŐΡɊŔŜƬȔѴŴΧŶŻ' +
                             'Ꮎ➀➁➂➃➄➅➆➇➈')

    # Translation tables are keyed by code point, as ``translate`` expects.
    _encoding_table = dict(
        zip(
            (ord(c) for c in _ascii_alphabet),
            _rot_unicode_alphabet,
        ),
    )

    _decoding_table = dict(
        zip(
            (ord(c) for c in _rot_unicode_alphabet),
            (ord(c) for c in _ascii_alphabet),
        ),
    )

    # pylint:disable=arguments-differ
    @classmethod
    def encode(cls, string, errors='strict'):
        """Return the encoded version of a string.

        :param string:
            The input string to encode. Byte strings are read as UTF-8.
        :type string:
            `basestring`

        :param errors:
            The error handling scheme. Only 'strict' is supported.
        :type errors:
            `basestring`

        :return:
            Tuple of encoded string and the length of the input consumed.
        :rtype:
            `tuple` (`unicode`, `int`)

        :raises:
            :class:`UnicodeError` if `errors` is anything but 'strict'.
        """
        return cls._translate(string, errors, cls._encoding_table)

    @classmethod
    def decode(cls, string, errors='strict'):
        """Return the decoded version of a string.

        :param string:
            The input string to decode. Byte strings are read as UTF-8.
        :type string:
            `basestring`

        :param errors:
            The error handling scheme. Only 'strict' is supported.
        :type errors:
            `basestring`

        :return:
            Tuple of decoded string and the length of the input consumed.
        :rtype:
            `tuple` (`unicode`, `int`)

        :raises:
            :class:`UnicodeError` if `errors` is anything but 'strict'.
        """
        return cls._translate(string, errors, cls._decoding_table)
    # pylint:enable=arguments-differ

    @classmethod
    def _translate(cls, string, errors, table):
        """Shared body of :meth:`encode` and :meth:`decode`.

        :param string:
            The input string to convert.
        :type string:
            `basestring`
        :param errors:
            The error handling scheme. Only 'strict' is supported.
        :type errors:
            `basestring`
        :param table:
            Code point translation table to apply.
        :type table:
            `dict`

        :return:
            Tuple of converted string and the length of the input consumed.
        :rtype:
            `tuple` (`unicode`, `int`)
        """
        if errors != 'strict':
            raise UnicodeError('Unsupported error handling {0}'.format(errors))

        unicode_string = cls._ensure_unicode_string(string)
        # The consumed length refers to the input as it was passed in (bytes
        # for byte strings), not to the decoded text.
        return unicode_string.translate(table), len(string)

    @classmethod
    def search_function(cls, encoding):
        """Search function to find 'rotunicode' codec.

        Meant to be passed to :func:`codecs.register`.

        :param encoding:
            The codec name being looked up.
        :type encoding:
            `basestring`

        :return:
            The codec description for 'rotunicode', None for any other name.
        :rtype:
            :class:`codecs.CodecInfo` or None
        """
        if encoding == cls._codec_name:
            return codecs.CodecInfo(
                name=cls._codec_name,
                encode=cls.encode,
                decode=cls.decode,
            )
        return None

    @staticmethod
    def _ensure_unicode_string(string):
        """Returns a unicode string for string.

        :param string:
            The input string.
        :type string:
            `basestring`

        :returns:
            A unicode string.
        :rtype:
            `unicode`
        """
        # ``type(u'')`` is ``unicode`` on Python 2 and ``str`` on Python 3,
        # i.e. the text type, without needing a compatibility library.
        if not isinstance(string, type(u'')):
            string = string.decode('utf-8')
        return string
'ҏľȁȉń.txt', extension=False,), 47 | genty_args('plain.txt', 'ҏľȁȉń.ƭхƭ', extension=True), 48 | genty_args('two.ext.sions', 'ƭŵő.ȅхƭ.sions'), 49 | genty_args('two.ext.sions', 'ƭŵő.ȅхƭ.sions', extension=False), 50 | genty_args('two.ext.sions', 'ƭŵő.ȅхƭ.ŝȉőńŝ', extension=True), 51 | ) 52 | def test_ruencode_encodes_string_using_rotunicode( 53 | self, 54 | source, 55 | target, 56 | extension=None, 57 | ): 58 | encoded_source = ruencode(source) if extension is None else ruencode(source, extension=extension) 59 | self.assertEqual( 60 | target, 61 | encoded_source, 62 | ) 63 | 64 | def test_rudecode_decodes_string_using_rotunicode(self): 65 | self.assertEqual( 66 | 'Hello World!', 67 | rudecode('Ĥȅľľő Ŵőŕľď!'), 68 | ) 69 | 70 | @genty_dataset( 71 | ascii_byte_string=(b'plain', u'plain'), 72 | ascii_unicode_string=(u'plain', u'plain'), 73 | non_ascii_byte_string=(u'ƒøø'.encode('utf-8'), u'ƒøø'), 74 | non_ascii_unicode_string=(u'ƒøø', u'ƒøø'), 75 | non_string_object_with_unicode_method=(17, u'17'), 76 | undecodable_byte_string=(u'ƒøø'.encode('utf-16'), None, UnicodeDecodeError), 77 | non_string_object_without_unicode_method=(ClassWithoutStringMethods(), None, ValueError), 78 | ) 79 | def test_safe_unicode(self, string, expected_result, expected_exception_classes=()): 80 | 81 | def run(): 82 | return safe_unicode(string) 83 | 84 | if expected_exception_classes: 85 | with self.assertRaises(expected_exception_classes): 86 | run() 87 | else: 88 | self.assertEqual(run(), expected_result) 89 | 90 | @genty_dataset(False, True) 91 | def test_get_rotunicode_function_for_decode_argument(self, decode): 92 | expected_result = rudecode if decode else ruencode 93 | self.assertIs(get_rotunicode_function_for_decode_argument(decode=decode), expected_result) 94 | 95 | @genty_dataset( 96 | empty_string=('', ''), 97 | plain=('ƒøøbar', 'ƒøøbar'), 98 | trailing_backslash_character=('\\', None, ValueError), 99 | invalid_escape_sequence=('\\z', None, ValueError), 100 | 
two_backslash_characters=('\\\\', '\\'), 101 | mixed=('foo\\nƒøø\\t', 'foo\nƒøø\t'), 102 | uxxxx=('foo\\u0256', 'fooɖ'), 103 | ) 104 | def test_parse_escape_sequences(self, string, expected_result, expected_exception_classes=None): 105 | 106 | def run(): 107 | return parse_escape_sequences(string) 108 | 109 | if expected_exception_classes: 110 | with self.assertRaises(expected_exception_classes): 111 | run() 112 | else: 113 | self.assertEqual(run(), expected_result) 114 | 115 | def test_stream_file_lines(self): 116 | lines = ['foo', 'bar', 'baz', 'ƒøø', '', 'ƒoobarbaz'] 117 | lines_with_newlines = [line + '\n' for line in lines] 118 | content = ''.join(lines_with_newlines) 119 | io_object = StringIO(content) 120 | result = stream_file_lines(io_object) 121 | self.assertIsInstance(result, Iterator) 122 | self.assertEqual(list(result), lines_with_newlines) 123 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | rotunicode 2 | ========== 3 | 4 | .. image:: http://opensource.box.com/badges/active.svg 5 | :target: http://opensource.box.com/badges 6 | 7 | .. image:: https://travis-ci.org/box/rotunicode.png?branch=master 8 | :target: https://travis-ci.org/box/rotunicode 9 | 10 | .. image:: https://coveralls.io/repos/box/rotunicode/badge.png 11 | :target: https://coveralls.io/r/box/rotunicode 12 | 13 | .. image:: https://img.shields.io/pypi/v/rotunicode.svg 14 | :target: https://pypi.python.org/pypi/rotunicode 15 | 16 | .. image:: https://img.shields.io/pypi/dm/rotunicode.svg 17 | :target: https://pypi.python.org/pypi/rotunicode 18 | 19 | 20 | RotUnicode is a Python library that can convert a string containing ASCII 21 | characters to a string with non-ASCII characters without losing readability. 22 | 23 | .. code-block:: pycon 24 | 25 | >>> 'Hello World!'.encode('rotunicode') 26 | Ĥȅľľő Ŵőŕľď! 
27 | >>> 'Ĥȅľľő Ŵőŕľď!'.decode('rotunicode') 28 | Hello World! 29 | 30 | In the above example, the 'Hello World' string has all ASCII characters. 31 | Encoding it with RotUnicode gives you 'Ĥȅľľő Ŵőŕľď' which reads like 32 | 'Hello World' but has all non-ASCII characters. 33 | 34 | 35 | Why is this named RotUnicode? 36 | ----------------------------- 37 | 38 | RotUnicode stands for rotate-to-unicode. Or rotten-unicode for those who have 39 | nightmares about Unicode. It was inspired by Rot13. 40 | 41 | 42 | Supported Characters 43 | -------------------- 44 | 45 | RotUnicode converts lower case and upper case characters of the English 46 | alphabet and digits 0 to 9 to non-ASCII characters. All characters that are 47 | outside this range are left as is. 48 | 49 | .. code-block:: pycon 50 | 51 | >>> 'हेलो World!'.encode('rotunicode') 52 | हेलो Ŵőŕľď! 53 | >>> 'हेलो Ŵőŕľď!'.decode('rotunicode') 54 | हेलो World! 55 | 56 | 57 | Installation 58 | ------------ 59 | 60 | To install, simply: 61 | 62 | .. code-block:: console 63 | 64 | pip install rotunicode 65 | 66 | 67 | Use 68 | --- 69 | 70 | .. code-block:: pycon 71 | 72 | >>> from rotunicode import ruencode, rudecode 73 | >>> ruencode('Hello World!') 74 | Ĥȅľľő Ŵőŕľď! 75 | >>> rudecode('Ĥȅľľő Ŵőŕľď!') 76 | Hello World! 77 | 78 | 79 | As a Codec 80 | ---------- 81 | 82 | In Python 2, RotUnicode can also be used as a codec, but it must first 83 | be registered with the codecs library. This allows python to know what 84 | functions to call to encode or decode a string using RotUnicode. 85 | 86 | .. code-block:: pycon 87 | 88 | >>> import codecs 89 | >>> from rotunicode import RotUnicode 90 | >>> codecs.register(RotUnicode.search_function) 91 | >>> 'Hello World!'.encode('rotunicode') 92 | Ĥȅľľő Ŵőŕľď! 93 | 94 | 95 | Command Line 96 | ------------ 97 | 98 | Installing RotUnicode also includes a command line tool. 99 | 100 | .. code-block:: console 101 | 102 | $ rotunicode "Hello World!" 103 | Ĥȅľľő Ŵőŕľď!
104 | $ rotunicode -d "Ĥȅľľő Ŵőŕľď!" 105 | Hello World! 106 | $ echo "Hello World!" > hello.txt 107 | $ rotunicode -f hello.txt 108 | Ĥȅľľő Ŵőŕľď! 109 | $ cat hello.txt | rotunicode -f 110 | Ĥȅľľő Ŵőŕľď! 111 | 112 | 113 | Why should I use RotUnicode? 114 | ---------------------------- 115 | 116 | RotUnicode is extremely helpful in testing because it reduces the friction for 117 | developers to test with non-ASCII strings. Imagine for example that you have a 118 | class to represent a contact for your address book application: 119 | 120 | .. code-block:: python 121 | 122 | class Contact(object): 123 | 124 | def __init__(self, first_name, last_name): 125 | super(Contact, self).__init__() 126 | self.first_name = first_name 127 | self.last_name = last_name 128 | 129 | def display_name(self): 130 | return '{} {}'.format(self.first_name, self.last_name) 131 | 132 | Most developers would test this as follows: 133 | 134 | .. code-block:: python 135 | 136 | from unittest import TestCase 137 | from contact import Contact 138 | 139 | class ContactTests(TestCase): 140 | 141 | def test_display_name(self): 142 | contact = Contact('John', 'Doe') 143 | self.assertEqual('John Doe', contact.display_name()) 144 | 145 | This test is good. But it is going to miss catching problems in the code with 146 | non-ASCII characters. Requiring developers to remember how to type non-ASCII 147 | characters is not practical. With RotUnicode, this is super easy: 148 | 149 | ..
code-block:: python 150 | 151 | from unittest import TestCase 152 | from contact import Contact 153 | 154 | class ContactTests(TestCase): 155 | 156 | def test_display_name_with_ascii_name(self): 157 | contact = Contact(u'John', u'Doe') 158 | self.assertEqual(u'John Doe', contact.display_name()) 159 | 160 | def test_display_name_with_non_ascii_name(self): 161 | contact = Contact(ruencode(u'John'), ruencode(u'Doe')) 162 | self.assertEqual(ruencode(u'John Doe'), contact.display_name()) 163 | 164 | 165 | This is an example of a bug in Python 166 | (`issue18695 <https://bugs.python.org/issue18695>`_) with non-ASCII characters - 167 | 168 | .. code-block:: pycon 169 | 170 | >>> import os, errno 171 | >>> name = 'foo'.encode('rotunicode') 172 | >>> os.mkdir(name) 173 | >>> print(name) 174 | ƒőő 175 | >>> os.path.exists(name) 176 | True 177 | >>> os.statvfs(name) 178 | Traceback (most recent call last): 179 | File "<stdin>", line 1, in <module> 180 | UnicodeEncodeError: 'ascii' codec can't encode characters in position 0-2: 181 | ordinal not in range(128) 182 | 183 | 184 | Contribute 185 | ---------- 186 | 187 | See `CONTRIBUTING <https://github.com/box/rotunicode/blob/master/CONTRIBUTING.rst>`_. 188 | 189 | 190 | Setup 191 | ~~~~~ 192 | 193 | Create a virtual environment and install packages: 194 | 195 | .. code-block:: console 196 | 197 | mkvirtualenv rotunicode 198 | pip install -r requirements-dev.txt 199 | 200 | 201 | Testing 202 | ~~~~~~~ 203 | 204 | Run all tests using: 205 | 206 | .. code-block:: console 207 | 208 | tox 209 | 210 | The tox tests include code style checks via pep8 and pylint. 211 | 212 | The tox tests are configured to run on Python 2.7, 3.4, 3.5, 3.6, 3.7, 213 | and PyPy2.7 (version 5.10). 214 | 215 | 216 | Copyright and License 217 | --------------------- 218 | 219 | :: 220 | 221 | Copyright 2019 Box, Inc. All rights reserved. 222 | 223 | Licensed under the Apache License, Version 2.0 (the "License"); 224 | you may not use this file except in compliance with the License.
225 | You may obtain a copy of the License at 226 | 227 | http://www.apache.org/licenses/LICENSE-2.0 228 | 229 | Unless required by applicable law or agreed to in writing, software 230 | distributed under the License is distributed on an "AS IS" BASIS, 231 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 232 | See the License for the specific language governing permissions and 233 | limitations under the License. 234 | 235 | -------------------------------------------------------------------------------- /rotunicode/utils.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | from __future__ import absolute_import, unicode_literals 4 | 5 | from itertools import repeat, takewhile 6 | from json.decoder import scanstring 7 | from operator import methodcaller 8 | from os.path import splitext 9 | 10 | from six import binary_type, raise_from, text_type 11 | from six.moves import map # pylint:disable=redefined-builtin 12 | 13 | from .rotunicode import RotUnicode 14 | 15 | 16 | _ROT_UNICODE = RotUnicode() 17 | 18 | 19 | def safe_unicode(data): 20 | """ Helper to safely convert data (UTF-8 bytes or any other object) to unicode. 21 | Otherwise argparse barfs. """ 22 | if isinstance(data, binary_type): 23 | return data.decode('utf-8') 24 | return text_type(data) 25 | 26 | 27 | def ruencode(string, extension=False): 28 | """Encode a string using 'rotunicode' codec. 29 | 30 | :param string: 31 | The input string to encode. 32 | :type string: 33 | `basestring` 34 | 35 | :param extension: 36 | True if the entire input string should be encoded. 37 | False to split the input string using :func:`os.path.splitext` and 38 | encode only the file name portion keeping the extension as is. 39 | :type extension: 40 | `bool` 41 | 42 | :return: 43 | Encoded string.
44 | :rtype: 45 | `unicode` 46 | """ 47 | if extension: 48 | file_name = string 49 | file_ext = '' 50 | else: 51 | file_name, file_ext = splitext(string) 52 | 53 | encoded_value, _ = _ROT_UNICODE.encode(file_name) 54 | return encoded_value + file_ext 55 | 56 | 57 | def rudecode(string): 58 | """Decode a string using 'rotunicode' codec. 59 | 60 | :param string: 61 | The input string to decode. 62 | :type string: 63 | `basestring` 64 | 65 | :return: 66 | Decoded string. 67 | :rtype: 68 | `unicode` 69 | """ 70 | decoded_value, _ = _ROT_UNICODE.decode(string) 71 | return decoded_value 72 | 73 | 74 | _ROTUNICODE_FUNCTION_FOR_DECODE_ARGUMENT = { 75 | False: ruencode, 76 | True: rudecode, 77 | } 78 | 79 | 80 | def get_rotunicode_function_for_decode_argument(decode=False): 81 | """Return either `ruencode` or `rudecode`, depending on :param:`decode`. 82 | 83 | :param decode: 84 | (optional) If True, return `rudecode`. 85 | Defaults to False (return `ruencode`). 86 | :type decode: 87 | `bool` 88 | :return: 89 | Either `ruencode` or `rudecode`. 90 | :rtype: 91 | `callable` of (`basestring`) -> `unicode` 92 | """ 93 | return _ROTUNICODE_FUNCTION_FOR_DECODE_ARGUMENT[decode] 94 | 95 | 96 | def parse_escape_sequences(string): 97 | """Parse a string for possible escape sequences. 98 | 99 | Sample usage: 100 | >>> parse_escape_sequences('foo\\nbar') 101 | 'foo\nbar' 102 | >>> parse_escape_sequences('foo\\\\u0256') 103 | 'foo\\u0256' 104 | 105 | :param string: 106 | Any string. 107 | :type string: 108 | `basestring` 109 | :raises: 110 | :class:`ValueError` if a backslash character is found, but it doesn't 111 | form a proper escape sequence with the character(s) that follow. 112 | :return: 113 | The parsed string. Will parse the standard escape sequences, and also 114 | basic \\uxxxx escape sequences. 115 | \\uxxxxxxxxxx escape sequences are not currently supported. 
116 | :rtype: 117 | `unicode` 118 | """ 119 | string = safe_unicode(string) 120 | characters = [] 121 | i = 0 122 | string_len = len(string) 123 | while i < string_len: 124 | character = string[i] 125 | if character == '\\': 126 | # Figure out the size of the escape sequence. Most escape sequences 127 | # are two characters (e.g. '\\' and 'n'), with the sole exception 128 | # being \uxxxx escape sequences, which are six characters. 129 | if string[(i + 1):(i + 2)] == 'u': 130 | offset = 6 131 | else: 132 | offset = 2 133 | 134 | try: 135 | # `json.decoder.scanstring()` mostly does what we want, but it 136 | # also does some stuff that we don't want, like parsing quote 137 | # characters. This will mess us up. The iteration and scanning 138 | # within this loop is meant to isolate the escape sequences, so 139 | # that we'll always be calling it with something like 140 | # >>> scanstring('"\n"', 1) 141 | # or 142 | # >>> scanstring('"\u0256"', 1) 143 | # The 1 refers to the location of the first character after the 144 | # open quote character. 145 | json_string = '"' + string[i:(i + offset)] + '"' 146 | character = scanstring(json_string, 1)[0] 147 | characters.append(character) 148 | i += offset 149 | except ValueError: 150 | # If an exception was raised, raise a new `ValueError`. The 151 | # reason we don't re-raise the original exception is because, 152 | # in Python 3, it is a custom JSON `ValueError` subclass. We 153 | # don't want to raise a JSON error from a function that has 154 | # nothing to do with JSON, so we create a new `ValueError`. The 155 | # error message is also nonsensical to the caller, in all 156 | # cases. 157 | raise_from(ValueError(string), None) 158 | else: 159 | characters.append(character) 160 | i += 1 161 | return ''.join(characters) 162 | 163 | 164 | def stream_file_lines(io_object): 165 | """Stream the lines of a file. 166 | 167 | This is a more powerful version of `io.TextIOWrapper.readlines()`. 
For file 168 | streams / pipes such as `sys.stdin`, that method will hang until EOF has 169 | been read, preventing the caller from acting on any lines until the full 170 | content is available. 171 | 172 | Callers can get data from `io.TextIOWrapper.readline()` (which only hangs 173 | between reads of newline characters). This generator does that, and 174 | provides the user with an iterable stream of lines that can be iterated 175 | through at any time. 176 | 177 | The generator will automatically terminate when EOF has been reached, i.e. 178 | when `io.TextIOWrapper.readline()` returns the empty string. 179 | 180 | :param io_object: 181 | The file object to read lines from. 182 | :type io_object: 183 | :class:`io.TextIOWrapper` 184 | :return: 185 | Yield the lines of the file. The generator will terminate 186 | before it would have yielded an empty string. Each line will contain 187 | its terminating newline. 188 | :rtype: 189 | `generator` of `unicode` 190 | """ 191 | return takewhile( 192 | bool, 193 | map(methodcaller('readline'), repeat(io_object)), 194 | ) 195 | -------------------------------------------------------------------------------- /.pylintrc: -------------------------------------------------------------------------------- 1 | [MASTER] 2 | 3 | # Specify a configuration file. 4 | #rcfile= 5 | 6 | # Python code to execute, usually for sys.path manipulation such as 7 | # pygtk.require(). 8 | #init-hook= 9 | 10 | # Add files or directories to the blacklist. They should be base names, not 11 | # paths. 12 | #ignore=CVS 13 | 14 | # Pickle collected data for later comparisons. 15 | #persistent=yes 16 | 17 | # List of plugins (as comma separated values of python modules names) to load, 18 | # usually to register additional checkers.
19 | #load-plugins= 20 | 21 | 22 | [MESSAGES CONTROL] 23 | 24 | # Enable the message, report, category or checker with the given id(s). You can 25 | # either give multiple identifier separated by comma (,) or put this option 26 | # multiple time. See also the "--disable" option for examples. 27 | #enable= 28 | 29 | # Disable the message, report, category or checker with the given id(s). You 30 | # can either give multiple identifiers separated by comma (,) or put this 31 | # option multiple times (only on the command line, not in the configuration 32 | # file where it should appear only once).You can also use "--disable=all" to 33 | # disable everything first and then reenable specific checks. For example, if 34 | # you want to run only the similarities checker, you can use "--disable=all 35 | # --enable=similarities". If you want to run only the classes checker, but have 36 | # no Warning level messages displayed, use"--disable=all --enable=classes 37 | # --disable=W" 38 | 39 | # C0111 => Missing docstring 40 | disable=I, C0111 41 | 42 | 43 | [REPORTS] 44 | 45 | # Set the output format. Available formats are text, parseable, colorized, msvs 46 | # (visual studio) and html. You can also give a reporter class, eg 47 | # mypackage.mymodule.MyReporterClass. 48 | output-format=text 49 | 50 | # Put messages in a separate file for each module / package specified on the 51 | # command line instead of printing them on stdout. Reports (if any) will be 52 | # written in a file name "pylint_global.[txt|html]". 53 | files-output=no 54 | 55 | # Tells whether to display a full report or only the messages 56 | reports=no 57 | 58 | # Python expression which should return a note less than 10 (10 is the highest 59 | # note). You have access to the variables errors warning, statement which 60 | # respectively contain the number of errors / warnings messages and the total 61 | # number of statements analyzed. This is used by the global evaluation report 62 | # (RP0004). 
63 | evaluation=10.0 - ((float(5 * error + warning + refactor + convention) / statement) * 10) 64 | 65 | # Template used to display messages. This is a python new-style format string 66 | # used to format the message information. See doc for all details 67 | msg-template={module}:{line}:{column}: [{msg_id}({symbol}), {obj}] {msg} 68 | 69 | 70 | [BASIC] 71 | 72 | # List of builtins function names that should not be used, separated by a comma 73 | bad-functions=map,filter,apply,input 74 | 75 | # Regular expression which should only match correct module names 76 | module-rgx=(([a-z_][a-z0-9_]*)|([A-Z][a-zA-Z0-9]+))$ 77 | 78 | # Regular expression which should only match correct module level names 79 | const-rgx=(([A-Z_][A-Z0-9_]*)|(__.*__))$ 80 | 81 | # Regular expression which should only match correct class names 82 | class-rgx=[A-Z_][a-zA-Z0-9]+$ 83 | 84 | # Regular expression which should only match correct function names 85 | function-rgx=[a-z_][a-z0-9_]{2,150}$ 86 | 87 | # Regular expression which should only match correct method names 88 | method-rgx=[a-z_][a-z0-9_]{2,150}$ 89 | 90 | # Regular expression which should only match correct instance attribute names 91 | attr-rgx=[a-z_][a-z0-9_]{2,30}$ 92 | 93 | # Regular expression which should only match correct argument names 94 | argument-rgx=[a-z_][a-z0-9_]{2,30}$ 95 | 96 | # Regular expression which should only match correct variable names 97 | variable-rgx=[a-z_][a-z0-9_]{2,30}$ 98 | 99 | # Regular expression which should only match correct attribute names in class 100 | # bodies 101 | class-attribute-rgx=([A-Za-z_][A-Za-z0-9_]{2,30}|(__.*__))$ 102 | 103 | # Regular expression which should only match correct list comprehension / 104 | # generator expression variable names 105 | inlinevar-rgx=[A-Za-z_][A-Za-z0-9_]*$ 106 | 107 | # Good variable names which should always be accepted, separated by a comma 108 | good-names=i,j,k,ex,Run,_ 109 | 110 | # Bad variable names which should always be refused, separated by
a comma 111 | bad-names=foo,bar,baz,toto,tutu,tata 112 | 113 | # Regular expression which should only match function or class names that do 114 | # not require a docstring. 115 | no-docstring-rgx=__.*__ 116 | 117 | # Minimum line length for functions/classes that require docstrings, shorter 118 | # ones are exempt. 119 | docstring-min-length=-1 120 | 121 | 122 | [FORMAT] 123 | 124 | # Maximum number of characters on a single line. 125 | max-line-length=80 126 | 127 | # Regexp for a line that is allowed to be longer than the limit. 128 | ignore-long-lines=^\s*(# )??$ 129 | 130 | # Maximum number of lines in a module 131 | max-module-lines=1000 132 | 133 | # String used as indentation unit. This is usually " " (4 spaces) or "\t" (1 134 | # tab). 135 | indent-string=' ' 136 | 137 | 138 | [MISCELLANEOUS] 139 | 140 | # List of note tags to take in consideration, separated by a comma. 141 | notes=TODO 142 | 143 | 144 | [SIMILARITIES] 145 | 146 | # Minimum lines number of a similarity. 147 | min-similarity-lines=4 148 | 149 | # Ignore comments when computing similarities. 150 | ignore-comments=yes 151 | 152 | # Ignore docstrings when computing similarities. 153 | ignore-docstrings=yes 154 | 155 | # Ignore imports when computing similarities. 156 | ignore-imports=no 157 | 158 | 159 | [TYPECHECK] 160 | 161 | # Tells whether missing members accessed in mixin class should be ignored. A 162 | # mixin class is detected if its name ends with "mixin" (case insensitive). 163 | ignore-mixin-members=yes 164 | 165 | # List of classes names for which member attributes should not be checked 166 | # (useful for classes with attributes dynamically set). 167 | #ignored-classes=SQLObject 168 | 169 | # List of members which are set dynamically and missed by pylint inference 170 | # system, and so shouldn't trigger E0201 when accessed. Python regular 171 | # expressions are accepted. 
172 | generated-members=REQUEST,acl_users,aq_parent 173 | 174 | 175 | [VARIABLES] 176 | 177 | # Tells whether we should check for unused import in __init__ files. 178 | init-import=no 179 | 180 | # A regular expression matching the beginning of the name of dummy variables 181 | # (i.e. not used). 182 | dummy-variables-rgx=_$|dummy 183 | 184 | # List of additional names supposed to be defined in builtins. Remember that 185 | # you should avoid to define new builtins when possible. 186 | additional-builtins= 187 | 188 | 189 | [CLASSES] 190 | 191 | # List of method names used to declare (i.e. assign) instance attributes. 192 | defining-attr-methods=__init__,__new__,setUp 193 | 194 | # List of valid names for the first argument in a class method. 195 | valid-classmethod-first-arg=cls 196 | 197 | # List of valid names for the first argument in a metaclass class method. 198 | valid-metaclass-classmethod-first-arg=mcs 199 | 200 | 201 | [DESIGN] 202 | 203 | # Maximum number of arguments for function / method 204 | max-args=5 205 | 206 | # Argument names that match this expression will be ignored. Default to name 207 | # with leading underscore 208 | ignored-argument-names=_.* 209 | 210 | # Maximum number of locals for function / method body 211 | max-locals=15 212 | 213 | # Maximum number of return / yield for function / method body 214 | max-returns=6 215 | 216 | # Maximum number of branch for function / method body 217 | max-branches=12 218 | 219 | # Maximum number of statements in function / method body 220 | max-statements=50 221 | 222 | # Maximum number of parents for a class (see R0901). 223 | max-parents=7 224 | 225 | # Maximum number of attributes for a class (see R0902). 226 | max-attributes=7 227 | 228 | # Minimum number of public methods for a class (see R0903). 229 | min-public-methods=2 230 | 231 | # Maximum number of public methods for a class (see R0904). 
232 | max-public-methods=100 233 | 234 | 235 | [IMPORTS] 236 | 237 | # Deprecated modules which should not be used, separated by a comma 238 | deprecated-modules= 239 | 240 | # Create a graph of every (i.e. internal and external) dependencies in the 241 | # given file (report RP0402 must not be disabled) 242 | import-graph= 243 | 244 | # Create a graph of external dependencies in the given file (report RP0402 must 245 | # not be disabled) 246 | ext-import-graph= 247 | 248 | # Create a graph of internal dependencies in the given file (report RP0402 must 249 | # not be disabled) 250 | int-import-graph= 251 | 252 | 253 | [EXCEPTIONS] 254 | 255 | # Exceptions that will emit a warning when being caught. Defaults to 256 | # "Exception" 257 | overgeneral-exceptions=Exception 258 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | "License" shall mean the terms and conditions for use, reproduction, 9 | and distribution as defined by Sections 1 through 9 of this document. 10 | "Licensor" shall mean the copyright owner or entity authorized by 11 | the copyright owner that is granting the License. 12 | "Legal Entity" shall mean the union of the acting entity and all 13 | other entities that control, are controlled by, or are under common 14 | control with that entity. For the purposes of this definition, 15 | "control" means (i) the power, direct or indirect, to cause the 16 | direction or management of such entity, whether by contract or 17 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 18 | outstanding shares, or (iii) beneficial ownership of such entity. 
19 | "You" (or "Your") shall mean an individual or Legal Entity 20 | exercising permissions granted by this License. 21 | "Source" form shall mean the preferred form for making modifications, 22 | including but not limited to software source code, documentation 23 | source, and configuration files. 24 | "Object" form shall mean any form resulting from mechanical 25 | transformation or translation of a Source form, including but 26 | not limited to compiled object code, generated documentation, 27 | and conversions to other media types. 28 | "Work" shall mean the work of authorship, whether in Source or 29 | Object form, made available under the License, as indicated by a 30 | copyright notice that is included in or attached to the work 31 | (an example is provided in the Appendix below). 32 | "Derivative Works" shall mean any work, whether in Source or Object 33 | form, that is based on (or derived from) the Work and for which the 34 | editorial revisions, annotations, elaborations, or other modifications 35 | represent, as a whole, an original work of authorship. For the purposes 36 | of this License, Derivative Works shall not include works that remain 37 | separable from, or merely link (or bind by name) to the interfaces of, 38 | the Work and Derivative Works thereof. 39 | "Contribution" shall mean any work of authorship, including 40 | the original version of the Work and any modifications or additions 41 | to that Work or Derivative Works thereof, that is intentionally 42 | submitted to Licensor for inclusion in the Work by the copyright owner 43 | or by an individual or Legal Entity authorized to submit on behalf of 44 | the copyright owner. 
For the purposes of this definition, "submitted" 45 | means any form of electronic, verbal, or written communication sent 46 | to the Licensor or its representatives, including but not limited to 47 | communication on electronic mailing lists, source code control systems, 48 | and issue tracking systems that are managed by, or on behalf of, the 49 | Licensor for the purpose of discussing and improving the Work, but 50 | excluding communication that is conspicuously marked or otherwise 51 | designated in writing by the copyright owner as "Not a Contribution." 52 | "Contributor" shall mean Licensor and any individual or Legal Entity 53 | on behalf of whom a Contribution has been received by Licensor and 54 | subsequently incorporated within the Work. 55 | 56 | 2. Grant of Copyright License. Subject to the terms and conditions of 57 | this License, each Contributor hereby grants to You a perpetual, 58 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 59 | copyright license to reproduce, prepare Derivative Works of, 60 | publicly display, publicly perform, sublicense, and distribute the 61 | Work and such Derivative Works in Source or Object form. 62 | 63 | 3. Grant of Patent License. Subject to the terms and conditions of 64 | this License, each Contributor hereby grants to You a perpetual, 65 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 66 | (except as stated in this section) patent license to make, have made, 67 | use, offer to sell, sell, import, and otherwise transfer the Work, 68 | where such license applies only to those patent claims licensable 69 | by such Contributor that are necessarily infringed by their 70 | Contribution(s) alone or by combination of their Contribution(s) 71 | with the Work to which such Contribution(s) was submitted. 
If You 72 | institute patent litigation against any entity (including a 73 | cross-claim or counterclaim in a lawsuit) alleging that the Work 74 | or a Contribution incorporated within the Work constitutes direct 75 | or contributory patent infringement, then any patent licenses 76 | granted to You under this License for that Work shall terminate 77 | as of the date such litigation is filed. 78 | 79 | 4. Redistribution. You may reproduce and distribute copies of the 80 | Work or Derivative Works thereof in any medium, with or without 81 | modifications, and in Source or Object form, provided that You 82 | meet the following conditions: 83 | 84 | (a) You must give any other recipients of the Work or 85 | Derivative Works a copy of this License; and 86 | 87 | (b) You must cause any modified files to carry prominent notices 88 | stating that You changed the files; and 89 | 90 | (c) You must retain, in the Source form of any Derivative Works 91 | that You distribute, all copyright, patent, trademark, and 92 | attribution notices from the Source form of the Work, 93 | excluding those notices that do not pertain to any part of 94 | the Derivative Works; and 95 | 96 | (d) If the Work includes a "NOTICE" text file as part of its 97 | distribution, then any Derivative Works that You distribute must 98 | include a readable copy of the attribution notices contained 99 | within such NOTICE file, excluding those notices that do not 100 | pertain to any part of the Derivative Works, in at least one 101 | of the following places: within a NOTICE text file distributed 102 | as part of the Derivative Works; within the Source form or 103 | documentation, if provided along with the Derivative Works; or, 104 | within a display generated by the Derivative Works, if and 105 | wherever such third-party notices normally appear. The contents 106 | of the NOTICE file are for informational purposes only and 107 | do not modify the License. 
You may add Your own attribution 108 | notices within Derivative Works that You distribute, alongside 109 | or as an addendum to the NOTICE text from the Work, provided 110 | that such additional attribution notices cannot be construed 111 | as modifying the License. 112 | 113 | You may add Your own copyright statement to Your modifications and 114 | may provide additional or different license terms and conditions 115 | for use, reproduction, or distribution of Your modifications, or 116 | for any such Derivative Works as a whole, provided Your use, 117 | reproduction, and distribution of the Work otherwise complies with 118 | the conditions stated in this License. 119 | 120 | 5. Submission of Contributions. Unless You explicitly state otherwise, 121 | any Contribution intentionally submitted for inclusion in the Work 122 | by You to the Licensor shall be under the terms and conditions of 123 | this License, without any additional terms or conditions. 124 | Notwithstanding the above, nothing herein shall supersede or modify 125 | the terms of any separate license agreement you may have executed 126 | with Licensor regarding such Contributions. 127 | 128 | 6. Trademarks. This License does not grant permission to use the trade 129 | names, trademarks, service marks, or product names of the Licensor, 130 | except as required for reasonable and customary use in describing the 131 | origin of the Work and reproducing the content of the NOTICE file. 132 | 133 | 7. Disclaimer of Warranty. Unless required by applicable law or 134 | agreed to in writing, Licensor provides the Work (and each 135 | Contributor provides its Contributions) on an "AS IS" BASIS, 136 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 137 | implied, including, without limitation, any warranties or conditions 138 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 139 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 140 | appropriateness of using or redistributing the Work and assume any 141 | risks associated with Your exercise of permissions under this License. 142 | 143 | 8. Limitation of Liability. In no event and under no legal theory, 144 | whether in tort (including negligence), contract, or otherwise, 145 | unless required by applicable law (such as deliberate and grossly 146 | negligent acts) or agreed to in writing, shall any Contributor be 147 | liable to You for damages, including any direct, indirect, special, 148 | incidental, or consequential damages of any character arising as a 149 | result of this License or out of the use or inability to use the 150 | Work (including but not limited to damages for loss of goodwill, 151 | work stoppage, computer failure or malfunction, or any and all 152 | other commercial damages or losses), even if such Contributor 153 | has been advised of the possibility of such damages. 154 | 155 | 9. Accepting Warranty or Additional Liability. While redistributing 156 | the Work or Derivative Works thereof, You may choose to offer, 157 | and charge a fee for, acceptance of support, warranty, indemnity, 158 | or other liability obligations and/or rights consistent with this 159 | License. However, in accepting such obligations, You may act only 160 | on Your own behalf and on Your sole responsibility, not on behalf 161 | of any other Contributor, and only if You agree to indemnify, 162 | defend, and hold each Contributor harmless for any liability 163 | incurred by, or claims asserted against, such Contributor by reason 164 | of your accepting any such warranty or additional liability. 165 | 166 | END OF TERMS AND CONDITIONS 167 | --------------------------------------------------------------------------------