├── .coveragerc
├── .gitignore
├── .travis.yml
├── LICENSE
├── MANIFEST.in
├── README.rst
├── docs
├── conf.py
└── index.rst
├── setup.cfg
├── setup.py
├── tox.ini
└── webencodings
├── __init__.py
├── labels.py
├── mklabels.py
├── tests.py
└── x_user_defined.py
/.coveragerc:
--------------------------------------------------------------------------------
1 | [run]
2 | branch = True
3 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | *.pyc
2 | .tox
3 | *.egg-info
4 | .coverage
5 | docs/_build
6 | /dist
7 | htmlcov
8 |
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | language: python
2 | sudo: false
3 | cache: pip
4 |
5 | matrix:
6 | include:
7 | - python: 2.6
8 | env: TOXENV=py26
9 | - python: 2.7
10 | env: TOXENV=py27
11 | - python: 3.3
12 | env: TOXENV=py33
13 | - python: 3.4
14 | env: TOXENV=py34
15 | - python: 3.5
16 | env: TOXENV=py35
17 | - python: 3.6
18 | env: TOXENV=py36
19 | - python: pypy
20 | env: TOXENV=pypy
21 |
22 | install:
23 | - pip install -U tox
24 |
25 | script:
26 | - tox
27 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Copyright (c) 2012 by Simon Sapin.
2 |
3 | Some rights reserved.
4 |
5 | Redistribution and use in source and binary forms, with or without
6 | modification, are permitted provided that the following conditions are
7 | met:
8 |
9 | * Redistributions of source code must retain the above copyright
10 | notice, this list of conditions and the following disclaimer.
11 |
12 | * Redistributions in binary form must reproduce the above
13 | copyright notice, this list of conditions and the following
14 | disclaimer in the documentation and/or other materials provided
15 | with the distribution.
16 |
17 | * The names of the contributors may not be used to endorse or
18 | promote products derived from this software without specific
19 | prior written permission.
20 |
21 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25 | OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27 | LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29 | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32 |
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include LICENSE
2 |
--------------------------------------------------------------------------------
/README.rst:
--------------------------------------------------------------------------------
1 | python-webencodings
2 | ===================
3 |
4 | This is a Python implementation of the `WHATWG Encoding standard
5 | <https://encoding.spec.whatwg.org/>`_.
6 |
7 | * Latest documentation: http://packages.python.org/webencodings/
8 | * Source code and issue tracker:
9 | https://github.com/gsnedders/python-webencodings
10 | * PyPI releases: http://pypi.python.org/pypi/webencodings
11 | * License: BSD
12 | * Python 2.6+ and 3.3+
13 |
14 | In order to be compatible with legacy web content
15 | when interpreting something like ``Content-Type: text/html; charset=latin1``,
16 | tools need to use a particular set of aliases for encoding labels
17 | as well as some overriding rules.
18 | For example, ``US-ASCII`` and ``iso-8859-1`` on the web are actually
19 | aliases for ``windows-1252``, and a UTF-8 or UTF-16 BOM takes precedence
20 | over any other encoding declaration.
21 | The Encoding standard defines all such details so that implementations do
22 | not have to reverse-engineer each other.
23 |
24 | This module has encoding labels and BOM detection,
25 | but the actual implementation for encoders and decoders is Python’s.
26 |
--------------------------------------------------------------------------------
/docs/conf.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 | #
4 | # python-webencodings documentation build configuration file, created by
5 | # sphinx-quickstart on Sat Dec 22 21:53:21 2012.
6 | #
7 | # This file is execfile()d with the current directory set to its containing dir.
8 | #
9 | # Note that not all possible configuration values are present in this
10 | # autogenerated file.
11 | #
12 | # All configuration values have a default; values that are commented out
13 | # serve to show the default.
14 |
15 | import sys, os, re
16 |
17 | # If extensions (or modules to document with autodoc) are in another directory,
18 | # add these directories to sys.path here. If the directory is relative to the
19 | # documentation root, use os.path.abspath to make it absolute, like shown here.
20 | #sys.path.insert(0, os.path.abspath('.'))
21 |
22 | # -- General configuration -----------------------------------------------------
23 |
24 | # If your documentation needs a minimal Sphinx version, state it here.
25 | #needs_sphinx = '1.0'
26 |
27 | # Add any Sphinx extension module names here, as strings. They can be extensions
28 | # coming with Sphinx (named 'sphinx.ext.*') or your custom ones.
29 | extensions = ['sphinx.ext.autodoc', 'sphinx.ext.intersphinx', 'sphinx.ext.viewcode']
30 |
31 | # Add any paths that contain templates here, relative to this directory.
32 | templates_path = ['_templates']
33 |
34 | # The suffix of source filenames.
35 | source_suffix = '.rst'
36 |
37 | # The encoding of source files.
38 | #source_encoding = 'utf-8-sig'
39 |
40 | # The master toctree document.
41 | master_doc = 'index'
42 |
43 | # General information about the project.
44 | project = 'python-webencodings'
45 | copyright = '2012, Simon Sapin'
46 |
47 | # The version info for the project you're documenting, acts as replacement for
48 | # |version| and |release|, also used in various other places throughout the
49 | # built documents.
50 | #
51 | # The full version, including alpha/beta/rc tags.
52 | release = re.search("VERSION = '([^']+)'",
53 | open(os.path.join(os.path.dirname(__file__), os.pardir,
54 | 'webencodings', '__init__.py')).read().strip()
55 | ).group(1)
56 |
57 | # The short X.Y version.
58 | version = '.'.join(release.split('.')[:2])
59 |
60 | # The language for content autogenerated by Sphinx. Refer to documentation
61 | # for a list of supported languages.
62 | #language = None
63 |
64 | # There are two options for replacing |today|: either, you set today to some
65 | # non-false value, then it is used:
66 | #today = ''
67 | # Else, today_fmt is used as the format for a strftime call.
68 | #today_fmt = '%B %d, %Y'
69 |
70 | # List of patterns, relative to source directory, that match files and
71 | # directories to ignore when looking for source files.
72 | exclude_patterns = ['_build']
73 |
74 | # The reST default role (used for this markup: `text`) to use for all documents.
75 | #default_role = None
76 |
77 | # If true, '()' will be appended to :func: etc. cross-reference text.
78 | #add_function_parentheses = True
79 |
80 | # If true, the current module name will be prepended to all description
81 | # unit titles (such as .. function::).
82 | #add_module_names = True
83 |
84 | # If true, sectionauthor and moduleauthor directives will be shown in the
85 | # output. They are ignored by default.
86 | #show_authors = False
87 |
88 | # The name of the Pygments (syntax highlighting) style to use.
89 | pygments_style = 'sphinx'
90 |
91 | # A list of ignored prefixes for module index sorting.
92 | #modindex_common_prefix = []
93 |
94 |
95 | # -- Options for HTML output ---------------------------------------------------
96 |
97 | # The theme to use for HTML and HTML Help pages. See the documentation for
98 | # a list of builtin themes.
99 | html_theme = 'default'
100 |
101 | # Theme options are theme-specific and customize the look and feel of a theme
102 | # further. For a list of options available for each theme, see the
103 | # documentation.
104 | #html_theme_options = {}
105 |
106 | # Add any paths that contain custom themes here, relative to this directory.
107 | #html_theme_path = []
108 |
109 | # The name for this set of Sphinx documents. If None, it defaults to
110 | # " v documentation".
111 | #html_title = None
112 |
113 | # A shorter title for the navigation bar. Default is the same as html_title.
114 | #html_short_title = None
115 |
116 | # The name of an image file (relative to this directory) to place at the top
117 | # of the sidebar.
118 | #html_logo = None
119 |
120 | # The name of an image file (within the static path) to use as favicon of the
121 | # docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32
122 | # pixels large.
123 | #html_favicon = None
124 |
125 | # Add any paths that contain custom static files (such as style sheets) here,
126 | # relative to this directory. They are copied after the builtin static files,
127 | # so a file named "default.css" will overwrite the builtin "default.css".
128 | html_static_path = ['_static']
129 |
130 | # If not '', a 'Last updated on:' timestamp is inserted at every page bottom,
131 | # using the given strftime format.
132 | #html_last_updated_fmt = '%b %d, %Y'
133 |
134 | # If true, SmartyPants will be used to convert quotes and dashes to
135 | # typographically correct entities.
136 | #html_use_smartypants = True
137 |
138 | # Custom sidebar templates, maps document names to template names.
139 | #html_sidebars = {}
140 |
141 | # Additional templates that should be rendered to pages, maps page names to
142 | # template names.
143 | #html_additional_pages = {}
144 |
145 | # If false, no module index is generated.
146 | #html_domain_indices = True
147 |
148 | # If false, no index is generated.
149 | #html_use_index = True
150 |
151 | # If true, the index is split into individual pages for each letter.
152 | #html_split_index = False
153 |
154 | # If true, links to the reST sources are added to the pages.
155 | #html_show_sourcelink = True
156 |
157 | # If true, "Created using Sphinx" is shown in the HTML footer. Default is True.
158 | #html_show_sphinx = True
159 |
160 | # If true, "(C) Copyright ..." is shown in the HTML footer. Default is True.
161 | #html_show_copyright = True
162 |
163 | # If true, an OpenSearch description file will be output, and all pages will
164 | # contain a <link> tag referring to it. The value of this option must be the
165 | # base URL from which the finished HTML is served.
166 | #html_use_opensearch = ''
167 |
168 | # This is the file name suffix for HTML files (e.g. ".xhtml").
169 | #html_file_suffix = None
170 |
171 | # Output file base name for HTML help builder.
172 | htmlhelp_basename = 'python-webencodingsdoc'
173 |
174 |
175 | # -- Options for LaTeX output --------------------------------------------------
176 |
177 | latex_elements = {
178 | # The paper size ('letterpaper' or 'a4paper').
179 | #'papersize': 'letterpaper',
180 |
181 | # The font size ('10pt', '11pt' or '12pt').
182 | #'pointsize': '10pt',
183 |
184 | # Additional stuff for the LaTeX preamble.
185 | #'preamble': '',
186 | }
187 |
188 | # Grouping the document tree into LaTeX files. List of tuples
189 | # (source start file, target name, title, author, documentclass [howto/manual]).
190 | latex_documents = [
191 | ('index', 'python-webencodings.tex', 'python-webencodings Documentation',
192 | 'Simon Sapin', 'manual'),
193 | ]
194 |
195 | # The name of an image file (relative to this directory) to place at the top of
196 | # the title page.
197 | #latex_logo = None
198 |
199 | # For "manual" documents, if this is true, then toplevel headings are parts,
200 | # not chapters.
201 | #latex_use_parts = False
202 |
203 | # If true, show page references after internal links.
204 | #latex_show_pagerefs = False
205 |
206 | # If true, show URL addresses after external links.
207 | #latex_show_urls = False
208 |
209 | # Documents to append as an appendix to all manuals.
210 | #latex_appendices = []
211 |
212 | # If false, no module index is generated.
213 | #latex_domain_indices = True
214 |
215 |
216 | # -- Options for manual page output --------------------------------------------
217 |
218 | # One entry per manual page. List of tuples
219 | # (source start file, name, description, authors, manual section).
220 | man_pages = [
221 | ('index', 'python-webencodings', 'python-webencodings Documentation',
222 | ['Simon Sapin'], 1)
223 | ]
224 |
225 | # If true, show URL addresses after external links.
226 | #man_show_urls = False
227 |
228 |
229 | # -- Options for Texinfo output ------------------------------------------------
230 |
231 | # Grouping the document tree into Texinfo files. List of tuples
232 | # (source start file, target name, title, author,
233 | # dir menu entry, description, category)
234 | texinfo_documents = [
235 | ('index', 'python-webencodings', 'python-webencodings Documentation',
236 | 'Simon Sapin', 'python-webencodings', 'One line description of project.',
237 | 'Miscellaneous'),
238 | ]
239 |
240 | # Documents to append as an appendix to all manuals.
241 | #texinfo_appendices = []
242 |
243 | # If false, no module index is generated.
244 | #texinfo_domain_indices = True
245 |
246 | # How to display URL addresses: 'footnote', 'no', or 'inline'.
247 | #texinfo_show_urls = 'footnote'
248 |
249 |
250 | # Example configuration for intersphinx: refer to the Python standard library.
251 | intersphinx_mapping = {
252 | 'py': ('http://docs.python.org/3', None)
253 | }
254 |
--------------------------------------------------------------------------------
/docs/index.rst:
--------------------------------------------------------------------------------
1 | .. include:: ../README.rst
2 |
3 | .. toctree::
4 | :maxdepth: 2
5 |
6 |
7 | Byte order marks
8 | ----------------
9 |
10 | When decoding, for compatibility with deployed content,
11 | a `byte order mark <https://en.wikipedia.org/wiki/Byte_order_mark>`_
12 | (also known as BOM)
13 | is considered more authoritative than anything else.
14 | The corresponding U+FEFF code point is not part of the decoded output.
15 |
16 | Encoding never prepends a BOM,
17 | but the output can start with a BOM
18 | if the input starts with a U+FEFF code point.
19 | In that case encoding then decoding will not round-trip.
20 |
21 |
22 | Error handling
23 | --------------
24 |
25 | As in the stdlib, error handling for encoding defaults to ``strict``:
26 | raise an exception if there is an error.
27 |
28 | For decoding however the default is ``replace``, unlike the stdlib.
29 | Invalid bytes are decoded as ``�`` (U+FFFD, the replacement character).
30 | The reason is that when showing legacy content to the user,
31 | it might be better to succeed decoding only part of it rather than blow up.
32 | This is of course not the case in all situations:
33 | sometimes you want stuff to blow up so you can detect errors early.
34 |
35 |
36 | API
37 | ---
38 |
39 | .. module:: webencodings
40 |
41 | .. autofunction:: lookup
42 |
43 | .. autoclass:: Encoding()
44 |
45 | .. autodata:: UTF8
46 |
47 | .. autofunction:: decode
48 | .. autofunction:: encode
49 | .. autofunction:: iter_decode
50 | .. autofunction:: iter_encode
51 | .. autoclass:: IncrementalDecoder
52 | :members:
53 | .. autoclass:: IncrementalEncoder
54 | .. autofunction:: ascii_lower
55 |
--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
1 | [bdist_wheel]
2 | universal = 1
3 |
4 | [metadata]
5 | license_file = LICENSE
6 |
7 | [build_sphinx]
8 | source-dir = docs
9 | build-dir = docs/_build
10 | #all_files = 1
11 |
12 | [upload_sphinx] # Sphinx-PyPI-upload
13 | upload-dir = docs/_build/html
14 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | from setuptools import setup, find_packages
2 | import io
3 | from os import path
4 | import re
5 |
6 |
7 | VERSION = re.search("VERSION = '([^']+)'", io.open(
8 | path.join(path.dirname(__file__), 'webencodings', '__init__.py'),
9 | encoding='utf-8'
10 | ).read().strip()).group(1)
11 |
12 | LONG_DESCRIPTION = io.open(
13 | path.join(path.dirname(__file__), 'README.rst'),
14 | encoding='utf-8'
15 | ).read()
16 |
17 |
18 | setup(
19 | name='webencodings',
20 | version=VERSION,
21 | url='https://github.com/SimonSapin/python-webencodings',
22 | license='BSD',
23 | author='Simon Sapin',
24 | author_email='simon.sapin@exyr.org',
25 | maintainer='Geoffrey Sneddon',
26 | maintainer_email='me@gsnedders.com',
27 | description='Character encoding aliases for legacy web content',
28 | long_description=LONG_DESCRIPTION,
29 | classifiers=[
30 | 'Development Status :: 4 - Beta',
31 | 'Intended Audience :: Developers',
32 | 'License :: OSI Approved :: BSD License',
33 | 'Programming Language :: Python',
34 | 'Programming Language :: Python :: 2',
35 | 'Programming Language :: Python :: 2.6',
36 | 'Programming Language :: Python :: 2.7',
37 | 'Programming Language :: Python :: 3',
38 | 'Programming Language :: Python :: 3.3',
39 | 'Programming Language :: Python :: 3.4',
40 | 'Programming Language :: Python :: 3.5',
41 | 'Programming Language :: Python :: 3.6',
42 | 'Programming Language :: Python :: Implementation :: CPython',
43 | 'Programming Language :: Python :: Implementation :: PyPy',
44 | 'Topic :: Internet :: WWW/HTTP',
45 | ],
46 | packages=find_packages(),
47 | )
48 |
--------------------------------------------------------------------------------
/tox.ini:
--------------------------------------------------------------------------------
1 | [pytest]
2 | python_files=test*.py
3 |
4 | [tox]
5 | envlist = py26, py27, py33, py34, py35, py36, pypy
6 |
7 | [testenv]
8 | deps=pytest
9 | commands=py.test []
10 |
--------------------------------------------------------------------------------
/webencodings/__init__.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 | """
3 |
4 | webencodings
5 | ~~~~~~~~~~~~
6 |
7 | This is a Python implementation of the `WHATWG Encoding standard
8 | <https://encoding.spec.whatwg.org/>`. See README for details.
9 |
10 | :copyright: Copyright 2012 by Simon Sapin
11 | :license: BSD, see LICENSE for details.
12 |
13 | """
14 |
15 | from __future__ import unicode_literals
16 |
17 | import codecs
18 |
19 | from .labels import LABELS
20 |
21 |
22 | VERSION = '0.6-dev'
23 |
24 |
25 | # Some names in Encoding are not valid Python aliases. Remap these.
26 | PYTHON_NAMES = {
27 | 'iso-8859-8-i': 'iso-8859-8',
28 | 'x-mac-cyrillic': 'mac-cyrillic',
29 | 'macintosh': 'mac-roman',
30 | 'windows-874': 'cp874'}
31 |
32 | CACHE = {}
33 |
34 |
35 | def ascii_lower(string):
36 | r"""Transform (only) ASCII letters to lower case: A-Z is mapped to a-z.
37 |
38 | :param string: An Unicode string.
39 | :returns: A new Unicode string.
40 |
41 | This is used for `ASCII case-insensitive
42 | <https://encoding.spec.whatwg.org/#ascii-case-insensitive>`_
43 | matching of encoding labels.
44 | The same matching is also used, among other things,
45 | for `CSS keywords <https://drafts.csswg.org/css-values/#keywords>`_.
46 |
47 | This is different from the :meth:`~py:str.lower` method of Unicode strings
48 | which also affect non-ASCII characters,
49 | sometimes mapping them into the ASCII range:
50 |
51 | >>> keyword = u'Bac\N{KELVIN SIGN}ground'
52 | >>> assert keyword.lower() == u'background'
53 | >>> assert ascii_lower(keyword) != keyword.lower()
54 | >>> assert ascii_lower(keyword) == u'bac\N{KELVIN SIGN}ground'
55 |
56 | """
57 | # This turns out to be faster than unicode.translate()
58 | return string.encode('utf8').lower().decode('utf8')
59 |
60 |
61 | def lookup(label):
62 | """
63 | Look for an encoding by its label.
64 | This is the spec’s `get an encoding
65 | <https://encoding.spec.whatwg.org/#concept-encoding-get>`_ algorithm.
66 | Supported labels are listed there.
67 |
68 | :param label: A string.
69 | :returns:
70 | An :class:`Encoding` object, or :obj:`None` for an unknown label.
71 |
72 | """
73 | # Only strip ASCII whitespace: U+0009, U+000A, U+000C, U+000D, and U+0020.
74 | label = ascii_lower(label.strip('\t\n\f\r '))
75 | name = LABELS.get(label)
76 | if name is None:
77 | return None
78 | encoding = CACHE.get(name)
79 | if encoding is None:
80 | if name == 'x-user-defined':
81 | from .x_user_defined import codec_info
82 | else:
83 | python_name = PYTHON_NAMES.get(name, name)
84 | # Any python_name value that gets to here should be valid.
85 | codec_info = codecs.lookup(python_name)
86 | encoding = Encoding(name, codec_info)
87 | CACHE[name] = encoding
88 | return encoding
89 |
90 |
91 | def _get_encoding(encoding_or_label):
92 | """
93 | Accept either an encoding object or label.
94 |
95 | :param encoding: An :class:`Encoding` object or a label string.
96 | :returns: An :class:`Encoding` object.
97 | :raises: :exc:`~exceptions.LookupError` for an unknown label.
98 |
99 | """
100 | if hasattr(encoding_or_label, 'codec_info'):
101 | return encoding_or_label
102 |
103 | encoding = lookup(encoding_or_label)
104 | if encoding is None:
105 | raise LookupError('Unknown encoding label: %r' % encoding_or_label)
106 | return encoding
107 |
108 |
109 | class Encoding(object):
110 | """Reresents a character encoding such as UTF-8,
111 | that can be used for decoding or encoding.
112 |
113 | .. attribute:: name
114 |
115 | Canonical name of the encoding
116 |
117 | .. attribute:: codec_info
118 |
119 | The actual implementation of the encoding,
120 | a stdlib :class:`~codecs.CodecInfo` object.
121 | See :func:`codecs.register`.
122 |
123 | """
124 | def __init__(self, name, codec_info):
125 | self.name = name
126 | self.codec_info = codec_info
127 |
128 | def __repr__(self):
129 | return '<Encoding %s>' % self.name
130 |
131 |
132 | #: The UTF-8 encoding. Should be used for new content and formats.
133 | UTF8 = lookup('utf-8')
134 |
135 | _UTF16LE = lookup('utf-16le')
136 | _UTF16BE = lookup('utf-16be')
137 |
138 |
139 | def decode(input, fallback_encoding, errors='replace'):
140 | """
141 | Decode a single string.
142 |
143 | :param input: A byte string
144 | :param fallback_encoding:
145 | An :class:`Encoding` object or a label string.
146 | The encoding to use if :obj:`input` does not have a BOM.
147 | :param errors: Type of error handling. See :func:`codecs.register`.
148 | :raises: :exc:`~exceptions.LookupError` for an unknown encoding label.
149 | :return:
150 | A ``(output, encoding)`` tuple of an Unicode string
151 | and an :obj:`Encoding`.
152 |
153 | """
154 | # Fail early if `encoding` is an invalid label.
155 | fallback_encoding = _get_encoding(fallback_encoding)
156 | bom_encoding, input = _detect_bom(input)
157 | encoding = bom_encoding or fallback_encoding
158 | return encoding.codec_info.decode(input, errors)[0], encoding
159 |
160 |
161 | def _detect_bom(input):
162 | """Return (bom_encoding, input), with any BOM removed from the input."""
163 | if input.startswith(b'\xFF\xFE'):
164 | return _UTF16LE, input[2:]
165 | if input.startswith(b'\xFE\xFF'):
166 | return _UTF16BE, input[2:]
167 | if input.startswith(b'\xEF\xBB\xBF'):
168 | return UTF8, input[3:]
169 | return None, input
170 |
171 |
172 | def encode(input, encoding=UTF8, errors='strict'):
173 | """
174 | Encode a single string.
175 |
176 | :param input: An Unicode string.
177 | :param encoding: An :class:`Encoding` object or a label string.
178 | :param errors: Type of error handling. See :func:`codecs.register`.
179 | :raises: :exc:`~exceptions.LookupError` for an unknown encoding label.
180 | :return: A byte string.
181 |
182 | """
183 | return _get_encoding(encoding).codec_info.encode(input, errors)[0]
184 |
185 |
186 | def iter_decode(input, fallback_encoding, errors='replace'):
187 | """
188 | "Pull"-based decoder.
189 |
190 | :param input:
191 | An iterable of byte strings.
192 |
193 | The input is first consumed just enough to determine the encoding
194 | based on the presence of a BOM,
195 | then consumed on demand when the return value is.
196 | :param fallback_encoding:
197 | An :class:`Encoding` object or a label string.
198 | The encoding to use if :obj:`input` does not have a BOM.
199 | :param errors: Type of error handling. See :func:`codecs.register`.
200 | :raises: :exc:`~exceptions.LookupError` for an unknown encoding label.
201 | :returns:
202 | An ``(output, encoding)`` tuple.
203 | :obj:`output` is an iterable of Unicode strings,
204 | :obj:`encoding` is the :obj:`Encoding` that is being used.
205 |
206 | """
207 |
208 | decoder = IncrementalDecoder(fallback_encoding, errors)
209 | generator = _iter_decode_generator(input, decoder)
210 | encoding = next(generator)
211 | return generator, encoding
212 |
213 |
214 | def _iter_decode_generator(input, decoder):
215 | """Return a generator that first yields the :obj:`Encoding`,
216 | then yields output chunks as Unicode strings.
217 |
218 | """
219 | decode = decoder.decode
220 | input = iter(input)
221 | for chunck in input:
222 | output = decode(chunck)
223 | if output:
224 | assert decoder.encoding is not None
225 | yield decoder.encoding
226 | yield output
227 | break
228 | else:
229 | # Input exhausted without determining the encoding
230 | output = decode(b'', final=True)
231 | assert decoder.encoding is not None
232 | yield decoder.encoding
233 | if output:
234 | yield output
235 | return
236 |
237 | for chunck in input:
238 | output = decode(chunck)
239 | if output:
240 | yield output
241 | output = decode(b'', final=True)
242 | if output:
243 | yield output
244 |
245 |
246 | def iter_encode(input, encoding=UTF8, errors='strict'):
247 | """
248 | “Pull”-based encoder.
249 |
250 | :param input: An iterable of Unicode strings.
251 | :param encoding: An :class:`Encoding` object or a label string.
252 | :param errors: Type of error handling. See :func:`codecs.register`.
253 | :raises: :exc:`~exceptions.LookupError` for an unknown encoding label.
254 | :returns: An iterable of byte strings.
255 |
256 | """
257 | # Fail early if `encoding` is an invalid label.
258 | encode = IncrementalEncoder(encoding, errors).encode
259 | return _iter_encode_generator(input, encode)
260 |
261 |
262 | def _iter_encode_generator(input, encode):
263 | for chunck in input:
264 | output = encode(chunck)
265 | if output:
266 | yield output
267 | output = encode('', final=True)
268 | if output:
269 | yield output
270 |
271 |
272 | class IncrementalDecoder(object):
273 | """
274 | “Push”-based decoder.
275 |
276 | :param fallback_encoding:
277 | An :class:`Encoding` object or a label string.
278 | The encoding to use if :obj:`input` does not have a BOM.
279 | :param errors: Type of error handling. See :func:`codecs.register`.
280 | :raises: :exc:`~exceptions.LookupError` for an unknown encoding label.
281 |
282 | """
283 | def __init__(self, fallback_encoding, errors='replace'):
284 | # Fail early if `encoding` is an invalid label.
285 | self._fallback_encoding = _get_encoding(fallback_encoding)
286 | self._errors = errors
287 | self._buffer = b''
288 | self._decoder = None
289 | #: The actual :class:`Encoding` that is being used,
290 | #: or :obj:`None` if that is not determined yet.
291 | #: (Ie. if there is not enough input yet to determine
292 | #: if there is a BOM.)
293 | self.encoding = None # Not known yet.
294 |
295 | def decode(self, input, final=False):
296 | """Decode one chunk of the input.
297 |
298 | :param input: A byte string.
299 | :param final:
300 | Indicate that no more input is available.
301 | Must be :obj:`True` if this is the last call.
302 | :returns: An Unicode string.
303 |
304 | """
305 | decoder = self._decoder
306 | if decoder is not None:
307 | return decoder(input, final)
308 |
309 | input = self._buffer + input
310 | encoding, input = _detect_bom(input)
311 | if encoding is None:
312 | if len(input) < 3 and not final: # Not enough data yet.
313 | self._buffer = input
314 | return ''
315 | else: # No BOM
316 | encoding = self._fallback_encoding
317 | decoder = encoding.codec_info.incrementaldecoder(self._errors).decode
318 | self._decoder = decoder
319 | self.encoding = encoding
320 | return decoder(input, final)
321 |
322 |
323 | class IncrementalEncoder(object):
324 | """
325 | “Push”-based encoder.
326 |
327 | :param encoding: An :class:`Encoding` object or a label string.
328 | :param errors: Type of error handling. See :func:`codecs.register`.
329 | :raises: :exc:`~exceptions.LookupError` for an unknown encoding label.
330 |
331 | .. method:: encode(input, final=False)
332 |
333 | :param input: An Unicode string.
334 | :param final:
335 | Indicate that no more input is available.
336 | Must be :obj:`True` if this is the last call.
337 | :returns: A byte string.
338 |
339 | """
340 | def __init__(self, encoding=UTF8, errors='strict'):
341 | encoding = _get_encoding(encoding)
342 | self.encode = encoding.codec_info.incrementalencoder(errors).encode
343 |
--------------------------------------------------------------------------------
/webencodings/labels.py:
--------------------------------------------------------------------------------
1 | """
2 |
3 | webencodings.labels
4 | ~~~~~~~~~~~~~~~~~~~
5 |
6 | Map encoding labels to their name.
7 |
8 | :copyright: Copyright 2012 by Simon Sapin
9 | :license: BSD, see LICENSE for details.
10 |
11 | """
12 |
13 | # XXX Do not edit!
14 | # This file is automatically generated by mklabels.py
15 |
# Maps each lowercase encoding label to the canonical name of the encoding
# it identifies, as listed in the WHATWG encodings.json data file.
# NOTE(review): this file is generated by mklabels.py — hand-written comments
# here will be lost on the next regeneration.
LABELS = {
    # utf-8.
    'unicode-1-1-utf-8': 'utf-8',
    'utf-8': 'utf-8',
    'utf8': 'utf-8',
    # Legacy single-byte encodings.
    '866': 'ibm866',
    'cp866': 'ibm866',
    'csibm866': 'ibm866',
    'ibm866': 'ibm866',
    'csisolatin2': 'iso-8859-2',
    'iso-8859-2': 'iso-8859-2',
    'iso-ir-101': 'iso-8859-2',
    'iso8859-2': 'iso-8859-2',
    'iso88592': 'iso-8859-2',
    'iso_8859-2': 'iso-8859-2',
    'iso_8859-2:1987': 'iso-8859-2',
    'l2': 'iso-8859-2',
    'latin2': 'iso-8859-2',
    'csisolatin3': 'iso-8859-3',
    'iso-8859-3': 'iso-8859-3',
    'iso-ir-109': 'iso-8859-3',
    'iso8859-3': 'iso-8859-3',
    'iso88593': 'iso-8859-3',
    'iso_8859-3': 'iso-8859-3',
    'iso_8859-3:1988': 'iso-8859-3',
    'l3': 'iso-8859-3',
    'latin3': 'iso-8859-3',
    'csisolatin4': 'iso-8859-4',
    'iso-8859-4': 'iso-8859-4',
    'iso-ir-110': 'iso-8859-4',
    'iso8859-4': 'iso-8859-4',
    'iso88594': 'iso-8859-4',
    'iso_8859-4': 'iso-8859-4',
    'iso_8859-4:1988': 'iso-8859-4',
    'l4': 'iso-8859-4',
    'latin4': 'iso-8859-4',
    'csisolatincyrillic': 'iso-8859-5',
    'cyrillic': 'iso-8859-5',
    'iso-8859-5': 'iso-8859-5',
    'iso-ir-144': 'iso-8859-5',
    'iso8859-5': 'iso-8859-5',
    'iso88595': 'iso-8859-5',
    'iso_8859-5': 'iso-8859-5',
    'iso_8859-5:1988': 'iso-8859-5',
    'arabic': 'iso-8859-6',
    'asmo-708': 'iso-8859-6',
    'csiso88596e': 'iso-8859-6',
    'csiso88596i': 'iso-8859-6',
    'csisolatinarabic': 'iso-8859-6',
    'ecma-114': 'iso-8859-6',
    'iso-8859-6': 'iso-8859-6',
    'iso-8859-6-e': 'iso-8859-6',
    'iso-8859-6-i': 'iso-8859-6',
    'iso-ir-127': 'iso-8859-6',
    'iso8859-6': 'iso-8859-6',
    'iso88596': 'iso-8859-6',
    'iso_8859-6': 'iso-8859-6',
    'iso_8859-6:1987': 'iso-8859-6',
    'csisolatingreek': 'iso-8859-7',
    'ecma-118': 'iso-8859-7',
    'elot_928': 'iso-8859-7',
    'greek': 'iso-8859-7',
    'greek8': 'iso-8859-7',
    'iso-8859-7': 'iso-8859-7',
    'iso-ir-126': 'iso-8859-7',
    'iso8859-7': 'iso-8859-7',
    'iso88597': 'iso-8859-7',
    'iso_8859-7': 'iso-8859-7',
    'iso_8859-7:1987': 'iso-8859-7',
    'sun_eu_greek': 'iso-8859-7',
    'csiso88598e': 'iso-8859-8',
    'csisolatinhebrew': 'iso-8859-8',
    'hebrew': 'iso-8859-8',
    'iso-8859-8': 'iso-8859-8',
    'iso-8859-8-e': 'iso-8859-8',
    'iso-ir-138': 'iso-8859-8',
    'iso8859-8': 'iso-8859-8',
    'iso88598': 'iso-8859-8',
    'iso_8859-8': 'iso-8859-8',
    'iso_8859-8:1988': 'iso-8859-8',
    'visual': 'iso-8859-8',
    'csiso88598i': 'iso-8859-8-i',
    'iso-8859-8-i': 'iso-8859-8-i',
    'logical': 'iso-8859-8-i',
    'csisolatin6': 'iso-8859-10',
    'iso-8859-10': 'iso-8859-10',
    'iso-ir-157': 'iso-8859-10',
    'iso8859-10': 'iso-8859-10',
    'iso885910': 'iso-8859-10',
    'l6': 'iso-8859-10',
    'latin6': 'iso-8859-10',
    'iso-8859-13': 'iso-8859-13',
    'iso8859-13': 'iso-8859-13',
    'iso885913': 'iso-8859-13',
    'iso-8859-14': 'iso-8859-14',
    'iso8859-14': 'iso-8859-14',
    'iso885914': 'iso-8859-14',
    'csisolatin9': 'iso-8859-15',
    'iso-8859-15': 'iso-8859-15',
    'iso8859-15': 'iso-8859-15',
    'iso885915': 'iso-8859-15',
    'iso_8859-15': 'iso-8859-15',
    'l9': 'iso-8859-15',
    'iso-8859-16': 'iso-8859-16',
    'cskoi8r': 'koi8-r',
    'koi': 'koi8-r',
    'koi8': 'koi8-r',
    'koi8-r': 'koi8-r',
    'koi8_r': 'koi8-r',
    'koi8-u': 'koi8-u',
    'csmacintosh': 'macintosh',
    'mac': 'macintosh',
    'macintosh': 'macintosh',
    'x-mac-roman': 'macintosh',
    'dos-874': 'windows-874',
    'iso-8859-11': 'windows-874',
    'iso8859-11': 'windows-874',
    'iso885911': 'windows-874',
    'tis-620': 'windows-874',
    'windows-874': 'windows-874',
    'cp1250': 'windows-1250',
    'windows-1250': 'windows-1250',
    'x-cp1250': 'windows-1250',
    'cp1251': 'windows-1251',
    'windows-1251': 'windows-1251',
    'x-cp1251': 'windows-1251',
    # Note: the ASCII / ISO 8859-1 labels deliberately map to windows-1252,
    # as required for web compatibility.
    'ansi_x3.4-1968': 'windows-1252',
    'ascii': 'windows-1252',
    'cp1252': 'windows-1252',
    'cp819': 'windows-1252',
    'csisolatin1': 'windows-1252',
    'ibm819': 'windows-1252',
    'iso-8859-1': 'windows-1252',
    'iso-ir-100': 'windows-1252',
    'iso8859-1': 'windows-1252',
    'iso88591': 'windows-1252',
    'iso_8859-1': 'windows-1252',
    'iso_8859-1:1987': 'windows-1252',
    'l1': 'windows-1252',
    'latin1': 'windows-1252',
    'us-ascii': 'windows-1252',
    'windows-1252': 'windows-1252',
    'x-cp1252': 'windows-1252',
    'cp1253': 'windows-1253',
    'windows-1253': 'windows-1253',
    'x-cp1253': 'windows-1253',
    'cp1254': 'windows-1254',
    'csisolatin5': 'windows-1254',
    'iso-8859-9': 'windows-1254',
    'iso-ir-148': 'windows-1254',
    'iso8859-9': 'windows-1254',
    'iso88599': 'windows-1254',
    'iso_8859-9': 'windows-1254',
    'iso_8859-9:1989': 'windows-1254',
    'l5': 'windows-1254',
    'latin5': 'windows-1254',
    'windows-1254': 'windows-1254',
    'x-cp1254': 'windows-1254',
    'cp1255': 'windows-1255',
    'windows-1255': 'windows-1255',
    'x-cp1255': 'windows-1255',
    'cp1256': 'windows-1256',
    'windows-1256': 'windows-1256',
    'x-cp1256': 'windows-1256',
    'cp1257': 'windows-1257',
    'windows-1257': 'windows-1257',
    'x-cp1257': 'windows-1257',
    'cp1258': 'windows-1258',
    'windows-1258': 'windows-1258',
    'x-cp1258': 'windows-1258',
    'x-mac-cyrillic': 'x-mac-cyrillic',
    'x-mac-ukrainian': 'x-mac-cyrillic',
    # Legacy multi-byte Chinese (simplified) encodings.
    'chinese': 'gbk',
    'csgb2312': 'gbk',
    'csiso58gb231280': 'gbk',
    'gb2312': 'gbk',
    'gb_2312': 'gbk',
    'gb_2312-80': 'gbk',
    'gbk': 'gbk',
    'iso-ir-58': 'gbk',
    'x-gbk': 'gbk',
    'gb18030': 'gb18030',
    'hz-gb-2312': 'hz-gb-2312',
    # Legacy multi-byte Chinese (traditional) encodings.
    'big5': 'big5',
    'big5-hkscs': 'big5',
    'cn-big5': 'big5',
    'csbig5': 'big5',
    'x-x-big5': 'big5',
    # Legacy multi-byte Japanese encodings.
    'cseucpkdfmtjapanese': 'euc-jp',
    'euc-jp': 'euc-jp',
    'x-euc-jp': 'euc-jp',
    'csiso2022jp': 'iso-2022-jp',
    'iso-2022-jp': 'iso-2022-jp',
    'csshiftjis': 'shift_jis',
    'ms_kanji': 'shift_jis',
    'shift-jis': 'shift_jis',
    'shift_jis': 'shift_jis',
    'sjis': 'shift_jis',
    'windows-31j': 'shift_jis',
    'x-sjis': 'shift_jis',
    # Legacy multi-byte Korean encodings.
    'cseuckr': 'euc-kr',
    'csksc56011987': 'euc-kr',
    'euc-kr': 'euc-kr',
    'iso-ir-149': 'euc-kr',
    'korean': 'euc-kr',
    'ks_c_5601-1987': 'euc-kr',
    'ks_c_5601-1989': 'euc-kr',
    'ksc5601': 'euc-kr',
    'ksc_5601': 'euc-kr',
    'windows-949': 'euc-kr',
    'csiso2022kr': 'iso-2022-kr',
    'iso-2022-kr': 'iso-2022-kr',
    # Legacy miscellaneous encodings.
    'utf-16be': 'utf-16be',
    'utf-16': 'utf-16le',
    'utf-16le': 'utf-16le',
    'x-user-defined': 'x-user-defined',
}
232 |
--------------------------------------------------------------------------------
/webencodings/mklabels.py:
--------------------------------------------------------------------------------
1 | """
2 |
3 | webencodings.mklabels
4 | ~~~~~~~~~~~~~~~~~~~~~
5 |
6 | Regenerate the webencodings.labels module.
7 |
8 | :copyright: Copyright 2012 by Simon Sapin
9 | :license: BSD, see LICENSE for details.
10 |
11 | """
12 |
13 | import json
14 | try:
15 | from urllib import urlopen
16 | except ImportError:
17 | from urllib.request import urlopen
18 |
19 |
def assert_lower(string):
    """Pass *string* through unchanged, asserting it is already lowercase.

    Labels in the upstream encodings.json are expected to be lowercase
    already; anything else indicates a problem with the spec data.
    """
    lowered = string.lower()
    assert string == lowered
    return string
23 |
24 |
def generate(url):
    """Fetch the WHATWG encodings.json from *url* and return the full
    source text of the ``webencodings.labels`` module."""
    header = '''\
"""

    webencodings.labels
    ~~~~~~~~~~~~~~~~~~~

    Map encoding labels to their name.

    :copyright: Copyright 2012 by Simon Sapin
    :license: BSD, see LICENSE for details.

"""

# XXX Do not edit!
# This file is automatically generated by mklabels.py

LABELS = {
'''
    # Flatten the JSON categories into (label_repr, name_repr) pairs.
    # repr() quotes the strings for the generated source; .lstrip('u')
    # drops the u'' literal prefix when running under Python 2.
    pairs = []
    for category in json.loads(urlopen(url).read().decode('ascii')):
        for encoding in category['encodings']:
            name = repr(encoding['name']).lstrip('u')
            for label in encoding['labels']:
                pairs.append((repr(assert_lower(label)).lstrip('u'), name))
    # Pad each label so that the encoding names line up in one column.
    width = max(len(label) for label, _ in pairs)
    body = ''.join(
        '    %s:%s %s,\n' % (label, ' ' * (width - len(label)), name)
        for label, name in pairs)
    return header + body + '}'
56 |
57 |
# Script entry point: print the regenerated labels.py source to stdout so it
# can be redirected into webencodings/labels.py.
if __name__ == '__main__':
    print(generate('http://encoding.spec.whatwg.org/encodings.json'))
60 |
--------------------------------------------------------------------------------
/webencodings/tests.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 | """
3 |
4 | webencodings.tests
5 | ~~~~~~~~~~~~~~~~~~
6 |
7 | A basic test suite for Encoding.
8 |
9 | :copyright: Copyright 2012 by Simon Sapin
10 | :license: BSD, see LICENSE for details.
11 |
12 | """
13 |
14 | from __future__ import unicode_literals
15 |
16 | from . import (lookup, LABELS, decode, encode, iter_decode, iter_encode,
17 | IncrementalDecoder, IncrementalEncoder, UTF8)
18 |
19 |
def assert_raises(exception, function, *args, **kwargs):
    """Check that ``function(*args, **kwargs)`` raises *exception*.

    Any other exception propagates unchanged; a call that returns
    normally fails the test.
    """
    raised = False
    try:
        function(*args, **kwargs)
    except exception:
        raised = True
    if not raised:  # pragma: no cover
        raise AssertionError('Did not raise %s.' % exception)
27 |
28 |
def test_labels():
    """lookup() is ASCII-case-insensitive and strips ASCII white space."""
    assert lookup('utf-8').name == 'utf-8'
    assert lookup('Utf-8').name == 'utf-8'
    assert lookup('UTF-8').name == 'utf-8'
    assert lookup('utf8').name == 'utf-8'
    assert lookup('utf8 ').name == 'utf-8'
    assert lookup(' \r\nutf8\t').name == 'utf-8'
    assert lookup('u8') is None  # Python label.
    # U+00A0 (no-break space) is not ASCII white space, so it is not
    # stripped and the label does not match.  Written as an escape: the
    # previous plain-space version would have been stripped and matched,
    # defeating the assertion.
    assert lookup('utf-8\u00a0') is None  # Non-ASCII white space.

    assert lookup('US-ASCII').name == 'windows-1252'
    assert lookup('iso-8859-1').name == 'windows-1252'
    assert lookup('latin1').name == 'windows-1252'
    assert lookup('LATIN1').name == 'windows-1252'
    assert lookup('latin-1') is None  # Not a registered label.
    # Case folding is ASCII-only: U+0130 (LATIN CAPITAL LETTER I WITH DOT
    # ABOVE) must not lower-case to 'i' for label matching.
    assert lookup('LAT\u0130N1') is None  # ASCII-only case insensitivity.
46 |
47 |
def test_all_labels():
    """Every registered label round-trips empty input through every API."""
    for label in LABELS:
        assert decode(b'', label) == ('', lookup(label))
        assert encode('', label) == b''
        # Empty chunk streams of various lengths produce no output.
        for count in (0, 1, 12):
            chunks, _ = iter_decode([b''] * count, label)
            assert list(chunks) == []
            assert list(iter_encode([''] * count, label)) == []
        decoder = IncrementalDecoder(label)
        assert decoder.decode(b'') == ''
        assert decoder.decode(b'', final=True) == ''
        encoder = IncrementalEncoder(label)
        assert encoder.encode('') == b''
        assert encoder.encode('', final=True) == b''
    # Every canonical encoding name must itself be a valid label:
    for name in set(LABELS.values()):
        assert lookup(name).name == name
65 |
66 |
def test_invalid_label():
    """Every API entry point raises LookupError for an unknown label."""
    assert_raises(LookupError, decode, b'\xEF\xBB\xBF\xc3\xa9', 'invalid')
    assert_raises(LookupError, encode, 'é', 'invalid')
    for api in (iter_decode, iter_encode):
        assert_raises(LookupError, api, [], 'invalid')
    for api in (IncrementalDecoder, IncrementalEncoder):
        assert_raises(LookupError, api, 'invalid')
74 |
75 |
def test_decode():
    """decode() sniffs BOMs and otherwise uses the fallback encoding."""
    # (input bytes, fallback label, expected text, expected encoding name)
    cases = [
        (b'\x80', 'latin1', '€', 'latin1'),
        (b'\xc3\xa9', 'utf8', 'é', 'utf8'),
        # No BOM: the 'ascii' label resolves to windows-1252, which maps
        # the two UTF-8 bytes of 'é' to two characters (mojibake), not to
        # 'é' — the previous expected value could never match.
        (b'\xc3\xa9', 'ascii', 'Ã©', 'ascii'),
        # A BOM overrides the fallback encoding entirely.
        (b'\xEF\xBB\xBF\xc3\xa9', 'ascii', 'é', 'utf8'),  # UTF-8 with BOM
        (b'\xFE\xFF\x00\xe9', 'ascii', 'é', 'utf-16be'),  # UTF-16-BE BOM
        (b'\xFF\xFE\xe9\x00', 'ascii', 'é', 'utf-16le'),  # UTF-16-LE BOM
        (b'\xFE\xFF\xe9\x00', 'ascii', '\ue900', 'utf-16be'),
        (b'\xFF\xFE\x00\xe9', 'ascii', '\ue900', 'utf-16le'),
        # Without a BOM, the UTF-16 labels select the byte order; the
        # plain 'UTF-16' label defaults to little-endian.
        (b'\x00\xe9', 'UTF-16BE', 'é', 'utf-16be'),
        (b'\xe9\x00', 'UTF-16LE', 'é', 'utf-16le'),
        (b'\xe9\x00', 'UTF-16', 'é', 'utf-16le'),
        (b'\xe9\x00', 'UTF-16BE', '\ue900', 'utf-16be'),
        (b'\x00\xe9', 'UTF-16LE', '\ue900', 'utf-16le'),
        (b'\x00\xe9', 'UTF-16', '\ue900', 'utf-16le'),
    ]
    for input, fallback, output, encoding in cases:
        assert decode(input, fallback) == (output, lookup(encoding))
    # The fallback may also be an Encoding object instead of a label.
    assert decode(b'\x80', lookup('latin1')) == ('€', lookup('latin1'))
    assert decode(b'\xc3\xa9', UTF8) == ('é', lookup('utf8'))
96 |
97 |
def test_encode():
    """encode() maps text to bytes in the requested encoding."""
    # (text, label, expected bytes)
    cases = [
        ('é', 'latin1', b'\xe9'),
        ('é', 'utf8', b'\xc3\xa9'),
        ('é', 'utf-16', b'\xe9\x00'),  # plain utf-16 is little-endian
        ('é', 'utf-16le', b'\xe9\x00'),
        ('é', 'utf-16be', b'\x00\xe9'),
    ]
    for text, label, expected in cases:
        assert encode(text, label) == expected
105 |
106 |
def test_iter_decode():
    """iter_decode() handles chunked input, BOM sniffing and split bytes."""
    def iter_decode_to_string(input, fallback_encoding):
        # Join the lazily decoded output chunks into a single string.
        output, _encoding = iter_decode(input, fallback_encoding)
        return ''.join(output)
    assert iter_decode_to_string([], 'latin1') == ''
    assert iter_decode_to_string([b''], 'latin1') == ''
    assert iter_decode_to_string([b'\xe9'], 'latin1') == 'é'
    assert iter_decode_to_string([b'hello'], 'latin1') == 'hello'
    assert iter_decode_to_string([b'he', b'llo'], 'latin1') == 'hello'
    assert iter_decode_to_string([b'hell', b'o'], 'latin1') == 'hello'
    # No BOM: latin1 (i.e. windows-1252) maps each byte to one character,
    # so the two UTF-8 bytes of 'é' decode to mojibake, not 'é' — the
    # previous expected value could never match.
    assert iter_decode_to_string([b'\xc3\xa9'], 'latin1') == 'Ã©'
    # A UTF-8 BOM switches to UTF-8, even when split across chunks.
    assert iter_decode_to_string([b'\xEF\xBB\xBF\xc3\xa9'], 'latin1') == 'é'
    assert iter_decode_to_string([
        b'\xEF\xBB\xBF', b'\xc3', b'\xa9'], 'latin1') == 'é'
    # A truncated UTF-8 sequence at end of input becomes U+FFFD.
    assert iter_decode_to_string([
        b'\xEF\xBB\xBF', b'a', b'\xc3'], 'latin1') == 'a\uFFFD'
    assert iter_decode_to_string([
        b'', b'\xEF', b'', b'', b'\xBB\xBF\xc3', b'\xa9'], 'latin1') == 'é'
    # A lone UTF-8 BOM decodes to nothing.
    assert iter_decode_to_string([b'\xEF\xBB\xBF'], 'latin1') == ''
    # Two bytes of an incomplete BOM are plain windows-1252 characters.
    assert iter_decode_to_string([b'\xEF\xBB'], 'latin1') == 'ï»'
    # UTF-16 BOMs, whole and split across chunks.
    assert iter_decode_to_string([b'\xFE\xFF\x00\xe9'], 'latin1') == 'é'
    assert iter_decode_to_string([b'\xFF\xFE\xe9\x00'], 'latin1') == 'é'
    assert iter_decode_to_string([
        b'', b'\xFF', b'', b'', b'\xFE\xe9', b'\x00'], 'latin1') == 'é'
    assert iter_decode_to_string([
        b'', b'h\xe9', b'llo'], 'x-user-defined') == 'h\uF7E9llo'
133 |
134 |
def test_iter_encode():
    """iter_encode() concatenates encoded chunks, skipping empty ones."""
    # (input chunks, label, expected concatenated bytes)
    cases = [
        ([], 'latin1', b''),
        ([''], 'latin1', b''),
        (['é'], 'latin1', b'\xe9'),
        (['', 'é', '', ''], 'latin1', b'\xe9'),
        (['', 'é', '', ''], 'utf-16', b'\xe9\x00'),
        (['', 'é', '', ''], 'utf-16le', b'\xe9\x00'),
        (['', 'é', '', ''], 'utf-16be', b'\x00\xe9'),
        (['', 'h\uF7E9', '', 'llo'], 'x-user-defined', b'h\xe9llo'),
    ]
    for chunks, label, expected in cases:
        assert b''.join(iter_encode(chunks, label)) == expected
145 |
146 |
def test_x_user_defined():
    """x-user-defined round-trips both ASCII and high-byte data.

    The previous version immediately rebound *encoded*/*decoded* to
    b'aa'/'aa', leaving the non-ASCII pair dead code; exercise both pairs.
    """
    cases = [
        # High bytes map to the Private Use Area (U+F780-U+F7FF).
        (b'2,\x0c\x0b\x1aO\xd9#\xcb\x0f\xc9\xbbt\xcf\xa8\xca',
         '2,\x0c\x0b\x1aO\uf7d9#\uf7cb\x0f\uf7c9\uf7bbt\uf7cf\uf7a8\uf7ca'),
        # Pure ASCII is unchanged.
        (b'aa', 'aa'),
    ]
    for encoded, decoded in cases:
        assert decode(encoded, 'x-user-defined') == (
            decoded, lookup('x-user-defined'))
        assert encode(decoded, 'x-user-defined') == encoded
154 |
--------------------------------------------------------------------------------
/webencodings/x_user_defined.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 | """
3 |
4 | webencodings.x_user_defined
5 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~
6 |
7 | An implementation of the x-user-defined encoding.
8 |
9 | :copyright: Copyright 2012 by Simon Sapin
10 | :license: BSD, see LICENSE for details.
11 |
12 | """
13 |
14 | from __future__ import unicode_literals
15 |
16 | import codecs
17 |
18 |
19 | ### Codec APIs
20 |
class Codec(codecs.Codec):
    """Stateless x-user-defined codec built on the codecs charmap helpers."""

    def encode(self, input, errors='strict'):
        # Map each character through encoding_table; charmap_encode
        # returns an (encoded bytes, length consumed) pair.
        return codecs.charmap_encode(input, errors, encoding_table)

    def decode(self, input, errors='strict'):
        # Map each byte through decoding_table; charmap_decode returns
        # a (decoded text, length consumed) pair.
        return codecs.charmap_decode(input, errors, decoding_table)
28 |
29 |
class IncrementalEncoder(codecs.IncrementalEncoder):
    def encode(self, input, final=False):
        # Single-byte charmap encoding is stateless, so *final* needs no
        # special handling; only the encoded bytes (index [0]) are returned.
        return codecs.charmap_encode(input, self.errors, encoding_table)[0]
33 |
34 |
class IncrementalDecoder(codecs.IncrementalDecoder):
    def decode(self, input, final=False):
        # One byte always decodes to exactly one character, so no partial
        # sequences can straddle chunk boundaries and *final* is irrelevant;
        # only the decoded text (index [0]) is returned.
        return codecs.charmap_decode(input, self.errors, decoding_table)[0]
38 |
39 |
class StreamWriter(Codec, codecs.StreamWriter):
    # Stream API: encode() comes from Codec, buffering from StreamWriter.
    pass
42 |
43 |
class StreamReader(Codec, codecs.StreamReader):
    # Stream API: decode() comes from Codec, buffering from StreamReader.
    pass
46 |
47 |
48 | ### encodings module API
49 |
# CodecInfo in the format expected by codecs.lookup()/the encodings module:
# ties the stateless, incremental and stream APIs together under one name.
codec_info = codecs.CodecInfo(
    name='x-user-defined',
    encode=Codec().encode,
    decode=Codec().decode,
    incrementalencoder=IncrementalEncoder,
    incrementaldecoder=IncrementalDecoder,
    streamreader=StreamReader,
    streamwriter=StreamWriter,
)
59 |
60 |
### Decoding Table

# x-user-defined maps bytes 0x00-0x7F to the identical ASCII code points
# and bytes 0x80-0xFF to the Private Use Area range U+F780-U+F7FF.
# Build the 256-entry table programmatically instead of spelling out a
# 256-line literal; this is exactly the generator the literal version
# quoted in its comment:
#     for c in range(256): chr(c if c < 128 else c + 0xF700)
try:
    # Python 2: chr() cannot produce code points above 0xFF.
    _chr = unichr
except NameError:
    # Python 3: chr() covers the full Unicode range and unichr is gone.
    _chr = chr
decoding_table = ''.join(
    _chr(c if c < 128 else c + 0xF700) for c in range(256))
del _chr


### Encoding table

# Inverse mapping (character -> byte), in the fast internal format used by
# codecs.charmap_encode.
encoding_table = codecs.charmap_build(decoding_table)
326 |
--------------------------------------------------------------------------------