├── .coveragerc ├── .gitignore ├── .travis.yml ├── LICENSE ├── MANIFEST.in ├── README.rst ├── docs ├── conf.py └── index.rst ├── setup.cfg ├── setup.py ├── tox.ini └── webencodings ├── __init__.py ├── labels.py ├── mklabels.py ├── tests.py └── x_user_defined.py /.coveragerc: -------------------------------------------------------------------------------- 1 | [run] 2 | branch = True 3 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | .tox 3 | *.egg-info 4 | .coverage 5 | docs/_build 6 | /dist 7 | htmlcov 8 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | sudo: false 3 | cache: pip 4 | 5 | matrix: 6 | include: 7 | - python: 2.6 8 | env: TOXENV=py26 9 | - python: 2.7 10 | env: TOXENV=py27 11 | - python: 3.3 12 | env: TOXENV=py33 13 | - python: 3.4 14 | env: TOXENV=py34 15 | - python: 3.5 16 | env: TOXENV=py35 17 | - python: 3.6 18 | env: TOXENV=py36 19 | - python: pypy 20 | env: TOXENV=pypy 21 | 22 | install: 23 | - pip install -U tox 24 | 25 | script: 26 | - tox 27 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2012 by Simon Sapin. 2 | 3 | Some rights reserved. 4 | 5 | Redistribution and use in source and binary forms, with or without 6 | modification, are permitted provided that the following conditions are 7 | met: 8 | 9 | * Redistributions of source code must retain the above copyright 10 | notice, this list of conditions and the following disclaimer. 
11 | 12 | * Redistributions in binary form must reproduce the above 13 | copyright notice, this list of conditions and the following 14 | disclaimer in the documentation and/or other materials provided 15 | with the distribution. 16 | 17 | * The names of the contributors may not be used to endorse or 18 | promote products derived from this software without specific 19 | prior written permission. 20 | 21 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 22 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 23 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 24 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 25 | OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 26 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 27 | LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 28 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 29 | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 30 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 31 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 32 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include LICENSE 2 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | python-webencodings 2 | =================== 3 | 4 | This is a Python implementation of the `WHATWG Encoding standard 5 | `_. 
6 | 7 | * Latest documentation: http://packages.python.org/webencodings/ 8 | * Source code and issue tracker: 9 | https://github.com/gsnedders/python-webencodings 10 | * PyPI releases: http://pypi.python.org/pypi/webencodings 11 | * License: BSD 12 | * Python 2.6+ and 3.3+ 13 | 14 | In order to be compatible with legacy web content 15 | when interpreting something like ``Content-Type: text/html; charset=latin1``, 16 | tools need to use a particular set of aliases for encoding labels 17 | as well as some overriding rules. 18 | For example, ``US-ASCII`` and ``iso-8859-1`` on the web are actually 19 | aliases for ``windows-1252``, and an UTF-8 or UTF-16 BOM takes precedence 20 | over any other encoding declaration. 21 | The Encoding standard defines all such details so that implementations do 22 | not have to reverse-engineer each other. 23 | 24 | This module has encoding labels and BOM detection, 25 | but the actual implementation for encoders and decoders is Python’s. 26 | -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | # 4 | # python-webencodings documentation build configuration file, created by 5 | # sphinx-quickstart on Sat Dec 22 21:53:21 2012. 6 | # 7 | # This file is execfile()d with the current directory set to its containing dir. 8 | # 9 | # Note that not all possible configuration values are present in this 10 | # autogenerated file. 11 | # 12 | # All configuration values have a default; values that are commented out 13 | # serve to show the default. 14 | 15 | import sys, os, re 16 | 17 | # If extensions (or modules to document with autodoc) are in another directory, 18 | # add these directories to sys.path here. If the directory is relative to the 19 | # documentation root, use os.path.abspath to make it absolute, like shown here. 
20 | #sys.path.insert(0, os.path.abspath('.')) 21 | 22 | # -- General configuration ----------------------------------------------------- 23 | 24 | # If your documentation needs a minimal Sphinx version, state it here. 25 | #needs_sphinx = '1.0' 26 | 27 | # Add any Sphinx extension module names here, as strings. They can be extensions 28 | # coming with Sphinx (named 'sphinx.ext.*') or your custom ones. 29 | extensions = ['sphinx.ext.autodoc', 'sphinx.ext.intersphinx', 'sphinx.ext.viewcode'] 30 | 31 | # Add any paths that contain templates here, relative to this directory. 32 | templates_path = ['_templates'] 33 | 34 | # The suffix of source filenames. 35 | source_suffix = '.rst' 36 | 37 | # The encoding of source files. 38 | #source_encoding = 'utf-8-sig' 39 | 40 | # The master toctree document. 41 | master_doc = 'index' 42 | 43 | # General information about the project. 44 | project = 'python-webencodings' 45 | copyright = '2012, Simon Sapin' 46 | 47 | # The version info for the project you're documenting, acts as replacement for 48 | # |version| and |release|, also used in various other places throughout the 49 | # built documents. 50 | # 51 | # The full version, including alpha/beta/rc tags. 52 | release = re.search("VERSION = '([^']+)'", 53 | open(os.path.join(os.path.dirname(__file__), os.pardir, 54 | 'webencodings', '__init__.py')).read().strip() 55 | ).group(1) 56 | 57 | # The short X.Y version. 58 | version = '.'.join(release.split('.')[:2]) 59 | 60 | # The language for content autogenerated by Sphinx. Refer to documentation 61 | # for a list of supported languages. 62 | #language = None 63 | 64 | # There are two options for replacing |today|: either, you set today to some 65 | # non-false value, then it is used: 66 | #today = '' 67 | # Else, today_fmt is used as the format for a strftime call. 
68 | #today_fmt = '%B %d, %Y' 69 | 70 | # List of patterns, relative to source directory, that match files and 71 | # directories to ignore when looking for source files. 72 | exclude_patterns = ['_build'] 73 | 74 | # The reST default role (used for this markup: `text`) to use for all documents. 75 | #default_role = None 76 | 77 | # If true, '()' will be appended to :func: etc. cross-reference text. 78 | #add_function_parentheses = True 79 | 80 | # If true, the current module name will be prepended to all description 81 | # unit titles (such as .. function::). 82 | #add_module_names = True 83 | 84 | # If true, sectionauthor and moduleauthor directives will be shown in the 85 | # output. They are ignored by default. 86 | #show_authors = False 87 | 88 | # The name of the Pygments (syntax highlighting) style to use. 89 | pygments_style = 'sphinx' 90 | 91 | # A list of ignored prefixes for module index sorting. 92 | #modindex_common_prefix = [] 93 | 94 | 95 | # -- Options for HTML output --------------------------------------------------- 96 | 97 | # The theme to use for HTML and HTML Help pages. See the documentation for 98 | # a list of builtin themes. 99 | html_theme = 'default' 100 | 101 | # Theme options are theme-specific and customize the look and feel of a theme 102 | # further. For a list of options available for each theme, see the 103 | # documentation. 104 | #html_theme_options = {} 105 | 106 | # Add any paths that contain custom themes here, relative to this directory. 107 | #html_theme_path = [] 108 | 109 | # The name for this set of Sphinx documents. If None, it defaults to 110 | # " v documentation". 111 | #html_title = None 112 | 113 | # A shorter title for the navigation bar. Default is the same as html_title. 114 | #html_short_title = None 115 | 116 | # The name of an image file (relative to this directory) to place at the top 117 | # of the sidebar. 
118 | #html_logo = None 119 | 120 | # The name of an image file (within the static path) to use as favicon of the 121 | # docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 122 | # pixels large. 123 | #html_favicon = None 124 | 125 | # Add any paths that contain custom static files (such as style sheets) here, 126 | # relative to this directory. They are copied after the builtin static files, 127 | # so a file named "default.css" will overwrite the builtin "default.css". 128 | html_static_path = ['_static'] 129 | 130 | # If not '', a 'Last updated on:' timestamp is inserted at every page bottom, 131 | # using the given strftime format. 132 | #html_last_updated_fmt = '%b %d, %Y' 133 | 134 | # If true, SmartyPants will be used to convert quotes and dashes to 135 | # typographically correct entities. 136 | #html_use_smartypants = True 137 | 138 | # Custom sidebar templates, maps document names to template names. 139 | #html_sidebars = {} 140 | 141 | # Additional templates that should be rendered to pages, maps page names to 142 | # template names. 143 | #html_additional_pages = {} 144 | 145 | # If false, no module index is generated. 146 | #html_domain_indices = True 147 | 148 | # If false, no index is generated. 149 | #html_use_index = True 150 | 151 | # If true, the index is split into individual pages for each letter. 152 | #html_split_index = False 153 | 154 | # If true, links to the reST sources are added to the pages. 155 | #html_show_sourcelink = True 156 | 157 | # If true, "Created using Sphinx" is shown in the HTML footer. Default is True. 158 | #html_show_sphinx = True 159 | 160 | # If true, "(C) Copyright ..." is shown in the HTML footer. Default is True. 161 | #html_show_copyright = True 162 | 163 | # If true, an OpenSearch description file will be output, and all pages will 164 | # contain a tag referring to it. The value of this option must be the 165 | # base URL from which the finished HTML is served. 
166 | #html_use_opensearch = '' 167 | 168 | # This is the file name suffix for HTML files (e.g. ".xhtml"). 169 | #html_file_suffix = None 170 | 171 | # Output file base name for HTML help builder. 172 | htmlhelp_basename = 'python-webencodingsdoc' 173 | 174 | 175 | # -- Options for LaTeX output -------------------------------------------------- 176 | 177 | latex_elements = { 178 | # The paper size ('letterpaper' or 'a4paper'). 179 | #'papersize': 'letterpaper', 180 | 181 | # The font size ('10pt', '11pt' or '12pt'). 182 | #'pointsize': '10pt', 183 | 184 | # Additional stuff for the LaTeX preamble. 185 | #'preamble': '', 186 | } 187 | 188 | # Grouping the document tree into LaTeX files. List of tuples 189 | # (source start file, target name, title, author, documentclass [howto/manual]). 190 | latex_documents = [ 191 | ('index', 'python-webencodings.tex', 'python-webencodings Documentation', 192 | 'Simon Sapin', 'manual'), 193 | ] 194 | 195 | # The name of an image file (relative to this directory) to place at the top of 196 | # the title page. 197 | #latex_logo = None 198 | 199 | # For "manual" documents, if this is true, then toplevel headings are parts, 200 | # not chapters. 201 | #latex_use_parts = False 202 | 203 | # If true, show page references after internal links. 204 | #latex_show_pagerefs = False 205 | 206 | # If true, show URL addresses after external links. 207 | #latex_show_urls = False 208 | 209 | # Documents to append as an appendix to all manuals. 210 | #latex_appendices = [] 211 | 212 | # If false, no module index is generated. 213 | #latex_domain_indices = True 214 | 215 | 216 | # -- Options for manual page output -------------------------------------------- 217 | 218 | # One entry per manual page. List of tuples 219 | # (source start file, name, description, authors, manual section). 
220 | man_pages = [ 221 | ('index', 'python-webencodings', 'python-webencodings Documentation', 222 | ['Simon Sapin'], 1) 223 | ] 224 | 225 | # If true, show URL addresses after external links. 226 | #man_show_urls = False 227 | 228 | 229 | # -- Options for Texinfo output ------------------------------------------------ 230 | 231 | # Grouping the document tree into Texinfo files. List of tuples 232 | # (source start file, target name, title, author, 233 | # dir menu entry, description, category) 234 | texinfo_documents = [ 235 | ('index', 'python-webencodings', 'python-webencodings Documentation', 236 | 'Simon Sapin', 'python-webencodings', 'One line description of project.', 237 | 'Miscellaneous'), 238 | ] 239 | 240 | # Documents to append as an appendix to all manuals. 241 | #texinfo_appendices = [] 242 | 243 | # If false, no module index is generated. 244 | #texinfo_domain_indices = True 245 | 246 | # How to display URL addresses: 'footnote', 'no', or 'inline'. 247 | #texinfo_show_urls = 'footnote' 248 | 249 | 250 | # Example configuration for intersphinx: refer to the Python standard library. 251 | intersphinx_mapping = { 252 | 'py': ('http://docs.python.org/3', None) 253 | } 254 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | .. include:: ../README.rst 2 | 3 | .. toctree:: 4 | :maxdepth: 2 5 | 6 | 7 | Byte order marks 8 | ---------------- 9 | 10 | When decoding, for compatibility with deployed content, 11 | a `byte order mark `_ 12 | (also known as BOM) 13 | is considered more authoritative than anything else. 14 | The corresponding U+FEFF code point is not part of the decoded output. 15 | 16 | Encoding never prepends a BOM, 17 | but the output can start with a BOM 18 | if the input starts with a U+FEFF code point. 19 | In that case encoding then decoding will not round-trip.
20 | 21 | 22 | Error handling 23 | -------------- 24 | 25 | As in the stdlib, error handling for encoding defaults to ``strict``: 26 | raise an exception if there is an error. 27 | 28 | For decoding however the default is ``replace``, unlike the stdlib. 29 | Invalid bytes are decoded as ``�`` (U+FFFD, the replacement character). 30 | The reason is that when showing legacy content to the user, 31 | it might be better to succeed decoding only part of it rather than blow up. 32 | This is of course not the case in all situations: 33 | sometimes you want stuff to blow up so you can detect errors early. 34 | 35 | 36 | API 37 | --- 38 | 39 | .. module:: webencodings 40 | 41 | .. autofunction:: lookup 42 | 43 | .. autoclass:: Encoding() 44 | 45 | .. autodata:: UTF8 46 | 47 | .. autofunction:: decode 48 | .. autofunction:: encode 49 | .. autofunction:: iter_decode 50 | .. autofunction:: iter_encode 51 | .. autoclass:: IncrementalDecoder 52 | :members: 53 | .. autoclass:: IncrementalEncoder 54 | ..
autofunction:: ascii_lower 55 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [bdist_wheel] 2 | universal = 1 3 | 4 | [metadata] 5 | license_file = LICENSE 6 | 7 | [build_sphinx] 8 | source-dir = docs 9 | build-dir = docs/_build 10 | #all_files = 1 11 | 12 | [upload_sphinx] # Sphinx-PyPI-upload 13 | upload-dir = docs/_build/html 14 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | import io 3 | from os import path 4 | import re 5 | 6 | 7 | VERSION = re.search("VERSION = '([^']+)'", io.open( 8 | path.join(path.dirname(__file__), 'webencodings', '__init__.py'), 9 | encoding='utf-8' 10 | ).read().strip()).group(1) 11 | 12 | LONG_DESCRIPTION = io.open( 13 | path.join(path.dirname(__file__), 'README.rst'), 14 | encoding='utf-8' 15 | ).read() 16 | 17 | 18 | setup( 19 | name='webencodings', 20 | version=VERSION, 21 | url='https://github.com/SimonSapin/python-webencodings', 22 | license='BSD', 23 | author='Simon Sapin', 24 | author_email='simon.sapin@exyr.org', 25 | maintainer='Geoffrey Sneddon', 26 | maintainer_email='me@gsnedders.com', 27 | description='Character encoding aliases for legacy web content', 28 | long_description=LONG_DESCRIPTION, 29 | classifiers=[ 30 | 'Development Status :: 4 - Beta', 31 | 'Intended Audience :: Developers', 32 | 'License :: OSI Approved :: BSD License', 33 | 'Programming Language :: Python', 34 | 'Programming Language :: Python :: 2', 35 | 'Programming Language :: Python :: 2.6', 36 | 'Programming Language :: Python :: 2.7', 37 | 'Programming Language :: Python :: 3', 38 | 'Programming Language :: Python :: 3.3', 39 | 'Programming Language :: Python :: 3.4', 40 | 'Programming Language :: Python :: 3.5', 41 | 'Programming Language :: 
Python :: 3.6', 42 | 'Programming Language :: Python :: Implementation :: CPython', 43 | 'Programming Language :: Python :: Implementation :: PyPy', 44 | 'Topic :: Internet :: WWW/HTTP', 45 | ], 46 | packages=find_packages(), 47 | ) 48 | -------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | [pytest] 2 | python_files=test*.py 3 | 4 | [tox] 5 | envlist = py26, py27, py33, py34, py35, py36, pypy 6 | 7 | [testenv] 8 | deps=pytest 9 | commands=py.test [] 10 | -------------------------------------------------------------------------------- /webencodings/__init__.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | """ 3 | 4 | webencodings 5 | ~~~~~~~~~~~~ 6 | 7 | This is a Python implementation of the `WHATWG Encoding standard 8 | `. See README for details. 9 | 10 | :copyright: Copyright 2012 by Simon Sapin 11 | :license: BSD, see LICENSE for details. 12 | 13 | """ 14 | 15 | from __future__ import unicode_literals 16 | 17 | import codecs 18 | 19 | from .labels import LABELS 20 | 21 | 22 | VERSION = '0.6-dev' 23 | 24 | 25 | # Some names in Encoding are not valid Python aliases. Remap these. 26 | PYTHON_NAMES = { 27 | 'iso-8859-8-i': 'iso-8859-8', 28 | 'x-mac-cyrillic': 'mac-cyrillic', 29 | 'macintosh': 'mac-roman', 30 | 'windows-874': 'cp874'} 31 | 32 | CACHE = {} 33 | 34 | 35 | def ascii_lower(string): 36 | r"""Transform (only) ASCII letters to lower case: A-Z is mapped to a-z. 37 | 38 | :param string: An Unicode string. 39 | :returns: A new Unicode string. 40 | 41 | This is used for `ASCII case-insensitive 42 | `_ 43 | matching of encoding labels. 44 | The same matching is also used, among other things, 45 | for `CSS keywords `_. 
46 | 47 | This is different from the :meth:`~py:str.lower` method of Unicode strings 48 | which also affects non-ASCII characters, 49 | sometimes mapping them into the ASCII range: 50 | 51 | >>> keyword = u'Bac\N{KELVIN SIGN}ground' 52 | >>> assert keyword.lower() == u'background' 53 | >>> assert ascii_lower(keyword) != keyword.lower() 54 | >>> assert ascii_lower(keyword) == u'bac\N{KELVIN SIGN}ground' 55 | 56 | """ 57 | # This turns out to be faster than unicode.translate() 58 | return string.encode('utf8').lower().decode('utf8') 59 | 60 | 61 | def lookup(label): 62 | """ 63 | Look for an encoding by its label. 64 | This is the spec’s `get an encoding 65 | `_ algorithm. 66 | Supported labels are listed there. 67 | 68 | :param label: A string. 69 | :returns: 70 | An :class:`Encoding` object, or :obj:`None` for an unknown label. 71 | 72 | """ 73 | # Only strip ASCII whitespace: U+0009, U+000A, U+000C, U+000D, and U+0020. 74 | label = ascii_lower(label.strip('\t\n\f\r ')) 75 | name = LABELS.get(label) 76 | if name is None: 77 | return None 78 | encoding = CACHE.get(name) 79 | if encoding is None: 80 | if name == 'x-user-defined': 81 | from .x_user_defined import codec_info 82 | else: 83 | python_name = PYTHON_NAMES.get(name, name) 84 | # Any python_name value that gets to here should be valid. 85 | codec_info = codecs.lookup(python_name) 86 | encoding = Encoding(name, codec_info) 87 | CACHE[name] = encoding 88 | return encoding 89 | 90 | 91 | def _get_encoding(encoding_or_label): 92 | """ 93 | Accept either an encoding object or label. 94 | 95 | :param encoding_or_label: An :class:`Encoding` object or a label string. 96 | :returns: An :class:`Encoding` object. 97 | :raises: :exc:`~exceptions.LookupError` for an unknown label.
98 | 99 | """ 100 | if hasattr(encoding_or_label, 'codec_info'): 101 | return encoding_or_label 102 | 103 | encoding = lookup(encoding_or_label) 104 | if encoding is None: 105 | raise LookupError('Unknown encoding label: %r' % encoding_or_label) 106 | return encoding 107 | 108 | 109 | class Encoding(object): 110 | """Reresents a character encoding such as UTF-8, 111 | that can be used for decoding or encoding. 112 | 113 | .. attribute:: name 114 | 115 | Canonical name of the encoding 116 | 117 | .. attribute:: codec_info 118 | 119 | The actual implementation of the encoding, 120 | a stdlib :class:`~codecs.CodecInfo` object. 121 | See :func:`codecs.register`. 122 | 123 | """ 124 | def __init__(self, name, codec_info): 125 | self.name = name 126 | self.codec_info = codec_info 127 | 128 | def __repr__(self): 129 | return '' % self.name 130 | 131 | 132 | #: The UTF-8 encoding. Should be used for new content and formats. 133 | UTF8 = lookup('utf-8') 134 | 135 | _UTF16LE = lookup('utf-16le') 136 | _UTF16BE = lookup('utf-16be') 137 | 138 | 139 | def decode(input, fallback_encoding, errors='replace'): 140 | """ 141 | Decode a single string. 142 | 143 | :param input: A byte string 144 | :param fallback_encoding: 145 | An :class:`Encoding` object or a label string. 146 | The encoding to use if :obj:`input` does note have a BOM. 147 | :param errors: Type of error handling. See :func:`codecs.register`. 148 | :raises: :exc:`~exceptions.LookupError` for an unknown encoding label. 149 | :return: 150 | A ``(output, encoding)`` tuple of an Unicode string 151 | and an :obj:`Encoding`. 152 | 153 | """ 154 | # Fail early if `encoding` is an invalid label. 
155 | fallback_encoding = _get_encoding(fallback_encoding) 156 | bom_encoding, input = _detect_bom(input) 157 | encoding = bom_encoding or fallback_encoding 158 | return encoding.codec_info.decode(input, errors)[0], encoding 159 | 160 | 161 | def _detect_bom(input): 162 | """Return (bom_encoding, input), with any BOM removed from the input.""" 163 | if input.startswith(b'\xFF\xFE'): 164 | return _UTF16LE, input[2:] 165 | if input.startswith(b'\xFE\xFF'): 166 | return _UTF16BE, input[2:] 167 | if input.startswith(b'\xEF\xBB\xBF'): 168 | return UTF8, input[3:] 169 | return None, input 170 | 171 | 172 | def encode(input, encoding=UTF8, errors='strict'): 173 | """ 174 | Encode a single string. 175 | 176 | :param input: An Unicode string. 177 | :param encoding: An :class:`Encoding` object or a label string. 178 | :param errors: Type of error handling. See :func:`codecs.register`. 179 | :raises: :exc:`~exceptions.LookupError` for an unknown encoding label. 180 | :return: A byte string. 181 | 182 | """ 183 | return _get_encoding(encoding).codec_info.encode(input, errors)[0] 184 | 185 | 186 | def iter_decode(input, fallback_encoding, errors='replace'): 187 | """ 188 | "Pull"-based decoder. 189 | 190 | :param input: 191 | An iterable of byte strings. 192 | 193 | The input is first consumed just enough to determine the encoding 194 | based on the presence of a BOM, 195 | then consumed on demand when the return value is. 196 | :param fallback_encoding: 197 | An :class:`Encoding` object or a label string. 198 | The encoding to use if :obj:`input` does not have a BOM. 199 | :param errors: Type of error handling. See :func:`codecs.register`. 200 | :raises: :exc:`~exceptions.LookupError` for an unknown encoding label. 201 | :returns: 202 | An ``(output, encoding)`` tuple. 203 | :obj:`output` is an iterable of Unicode strings, 204 | :obj:`encoding` is the :obj:`Encoding` that is being used.
205 | 206 | """ 207 | 208 | decoder = IncrementalDecoder(fallback_encoding, errors) 209 | generator = _iter_decode_generator(input, decoder) 210 | encoding = next(generator) 211 | return generator, encoding 212 | 213 | 214 | def _iter_decode_generator(input, decoder): 215 | """Return a generator that first yields the :obj:`Encoding`, 216 | then yields output chunks as Unicode strings. 217 | 218 | """ 219 | decode = decoder.decode 220 | input = iter(input) 221 | for chunck in input: 222 | output = decode(chunck) 223 | if output: 224 | assert decoder.encoding is not None 225 | yield decoder.encoding 226 | yield output 227 | break 228 | else: 229 | # Input exhausted without determining the encoding 230 | output = decode(b'', final=True) 231 | assert decoder.encoding is not None 232 | yield decoder.encoding 233 | if output: 234 | yield output 235 | return 236 | 237 | for chunck in input: 238 | output = decode(chunck) 239 | if output: 240 | yield output 241 | output = decode(b'', final=True) 242 | if output: 243 | yield output 244 | 245 | 246 | def iter_encode(input, encoding=UTF8, errors='strict'): 247 | """ 248 | “Pull”-based encoder. 249 | 250 | :param input: An iterable of Unicode strings. 251 | :param encoding: An :class:`Encoding` object or a label string. 252 | :param errors: Type of error handling. See :func:`codecs.register`. 253 | :raises: :exc:`~exceptions.LookupError` for an unknown encoding label. 254 | :returns: An iterable of byte strings. 255 | 256 | """ 257 | # Fail early if `encoding` is an invalid label. 258 | encode = IncrementalEncoder(encoding, errors).encode 259 | return _iter_encode_generator(input, encode) 260 | 261 | 262 | def _iter_encode_generator(input, encode): 263 | for chunck in input: 264 | output = encode(chunck) 265 | if output: 266 | yield output 267 | output = encode('', final=True) 268 | if output: 269 | yield output 270 | 271 | 272 | class IncrementalDecoder(object): 273 | """ 274 | “Push”-based decoder.
275 | 276 | :param fallback_encoding: 277 | An :class:`Encoding` object or a label string. 278 | The encoding to use if :obj:`input` does not have a BOM. 279 | :param errors: Type of error handling. See :func:`codecs.register`. 280 | :raises: :exc:`~exceptions.LookupError` for an unknown encoding label. 281 | 282 | """ 283 | def __init__(self, fallback_encoding, errors='replace'): 284 | # Fail early if `encoding` is an invalid label. 285 | self._fallback_encoding = _get_encoding(fallback_encoding) 286 | self._errors = errors 287 | self._buffer = b'' 288 | self._decoder = None 289 | #: The actual :class:`Encoding` that is being used, 290 | #: or :obj:`None` if that is not determined yet. 291 | #: (Ie. if there is not enough input yet to determine 292 | #: if there is a BOM.) 293 | self.encoding = None # Not known yet. 294 | 295 | def decode(self, input, final=False): 296 | """Decode one chunk of the input. 297 | 298 | :param input: A byte string. 299 | :param final: 300 | Indicate that no more input is available. 301 | Must be :obj:`True` if this is the last call. 302 | :returns: An Unicode string. 303 | 304 | """ 305 | decoder = self._decoder 306 | if decoder is not None: 307 | return decoder(input, final) 308 | 309 | input = self._buffer + input 310 | encoding, input = _detect_bom(input) 311 | if encoding is None: 312 | if len(input) < 3 and not final: # Not enough data yet. 313 | self._buffer = input 314 | return '' 315 | else: # No BOM 316 | encoding = self._fallback_encoding 317 | decoder = encoding.codec_info.incrementaldecoder(self._errors).decode 318 | self._decoder = decoder 319 | self.encoding = encoding 320 | return decoder(input, final) 321 | 322 | 323 | class IncrementalEncoder(object): 324 | """ 325 | “Push”-based encoder. 326 | 327 | :param encoding: An :class:`Encoding` object or a label string. 328 | :param errors: Type of error handling. See :func:`codecs.register`. 329 | :raises: :exc:`~exceptions.LookupError` for an unknown encoding label.
330 | 331 | .. method:: encode(input, final=False) 332 | 333 | :param input: An Unicode string. 334 | :param final: 335 | Indicate that no more input is available. 336 | Must be :obj:`True` if this is the last call. 337 | :returns: A byte string. 338 | 339 | """ 340 | def __init__(self, encoding=UTF8, errors='strict'): 341 | encoding = _get_encoding(encoding) 342 | self.encode = encoding.codec_info.incrementalencoder(errors).encode 343 | -------------------------------------------------------------------------------- /webencodings/labels.py: -------------------------------------------------------------------------------- 1 | """ 2 | 3 | webencodings.labels 4 | ~~~~~~~~~~~~~~~~~~~ 5 | 6 | Map encoding labels to their name. 7 | 8 | :copyright: Copyright 2012 by Simon Sapin 9 | :license: BSD, see LICENSE for details. 10 | 11 | """ 12 | 13 | # XXX Do not edit! 14 | # This file is automatically generated by mklabels.py 15 | 16 | LABELS = { 17 | 'unicode-1-1-utf-8': 'utf-8', 18 | 'utf-8': 'utf-8', 19 | 'utf8': 'utf-8', 20 | '866': 'ibm866', 21 | 'cp866': 'ibm866', 22 | 'csibm866': 'ibm866', 23 | 'ibm866': 'ibm866', 24 | 'csisolatin2': 'iso-8859-2', 25 | 'iso-8859-2': 'iso-8859-2', 26 | 'iso-ir-101': 'iso-8859-2', 27 | 'iso8859-2': 'iso-8859-2', 28 | 'iso88592': 'iso-8859-2', 29 | 'iso_8859-2': 'iso-8859-2', 30 | 'iso_8859-2:1987': 'iso-8859-2', 31 | 'l2': 'iso-8859-2', 32 | 'latin2': 'iso-8859-2', 33 | 'csisolatin3': 'iso-8859-3', 34 | 'iso-8859-3': 'iso-8859-3', 35 | 'iso-ir-109': 'iso-8859-3', 36 | 'iso8859-3': 'iso-8859-3', 37 | 'iso88593': 'iso-8859-3', 38 | 'iso_8859-3': 'iso-8859-3', 39 | 'iso_8859-3:1988': 'iso-8859-3', 40 | 'l3': 'iso-8859-3', 41 | 'latin3': 'iso-8859-3', 42 | 'csisolatin4': 'iso-8859-4', 43 | 'iso-8859-4': 'iso-8859-4', 44 | 'iso-ir-110': 'iso-8859-4', 45 | 'iso8859-4': 'iso-8859-4', 46 | 'iso88594': 'iso-8859-4', 47 | 'iso_8859-4': 'iso-8859-4', 48 | 'iso_8859-4:1988': 'iso-8859-4', 49 | 'l4': 'iso-8859-4', 50 | 'latin4': 'iso-8859-4', 51 | 
'csisolatincyrillic': 'iso-8859-5', 52 | 'cyrillic': 'iso-8859-5', 53 | 'iso-8859-5': 'iso-8859-5', 54 | 'iso-ir-144': 'iso-8859-5', 55 | 'iso8859-5': 'iso-8859-5', 56 | 'iso88595': 'iso-8859-5', 57 | 'iso_8859-5': 'iso-8859-5', 58 | 'iso_8859-5:1988': 'iso-8859-5', 59 | 'arabic': 'iso-8859-6', 60 | 'asmo-708': 'iso-8859-6', 61 | 'csiso88596e': 'iso-8859-6', 62 | 'csiso88596i': 'iso-8859-6', 63 | 'csisolatinarabic': 'iso-8859-6', 64 | 'ecma-114': 'iso-8859-6', 65 | 'iso-8859-6': 'iso-8859-6', 66 | 'iso-8859-6-e': 'iso-8859-6', 67 | 'iso-8859-6-i': 'iso-8859-6', 68 | 'iso-ir-127': 'iso-8859-6', 69 | 'iso8859-6': 'iso-8859-6', 70 | 'iso88596': 'iso-8859-6', 71 | 'iso_8859-6': 'iso-8859-6', 72 | 'iso_8859-6:1987': 'iso-8859-6', 73 | 'csisolatingreek': 'iso-8859-7', 74 | 'ecma-118': 'iso-8859-7', 75 | 'elot_928': 'iso-8859-7', 76 | 'greek': 'iso-8859-7', 77 | 'greek8': 'iso-8859-7', 78 | 'iso-8859-7': 'iso-8859-7', 79 | 'iso-ir-126': 'iso-8859-7', 80 | 'iso8859-7': 'iso-8859-7', 81 | 'iso88597': 'iso-8859-7', 82 | 'iso_8859-7': 'iso-8859-7', 83 | 'iso_8859-7:1987': 'iso-8859-7', 84 | 'sun_eu_greek': 'iso-8859-7', 85 | 'csiso88598e': 'iso-8859-8', 86 | 'csisolatinhebrew': 'iso-8859-8', 87 | 'hebrew': 'iso-8859-8', 88 | 'iso-8859-8': 'iso-8859-8', 89 | 'iso-8859-8-e': 'iso-8859-8', 90 | 'iso-ir-138': 'iso-8859-8', 91 | 'iso8859-8': 'iso-8859-8', 92 | 'iso88598': 'iso-8859-8', 93 | 'iso_8859-8': 'iso-8859-8', 94 | 'iso_8859-8:1988': 'iso-8859-8', 95 | 'visual': 'iso-8859-8', 96 | 'csiso88598i': 'iso-8859-8-i', 97 | 'iso-8859-8-i': 'iso-8859-8-i', 98 | 'logical': 'iso-8859-8-i', 99 | 'csisolatin6': 'iso-8859-10', 100 | 'iso-8859-10': 'iso-8859-10', 101 | 'iso-ir-157': 'iso-8859-10', 102 | 'iso8859-10': 'iso-8859-10', 103 | 'iso885910': 'iso-8859-10', 104 | 'l6': 'iso-8859-10', 105 | 'latin6': 'iso-8859-10', 106 | 'iso-8859-13': 'iso-8859-13', 107 | 'iso8859-13': 'iso-8859-13', 108 | 'iso885913': 'iso-8859-13', 109 | 'iso-8859-14': 'iso-8859-14', 110 | 'iso8859-14': 
'iso-8859-14', 111 | 'iso885914': 'iso-8859-14', 112 | 'csisolatin9': 'iso-8859-15', 113 | 'iso-8859-15': 'iso-8859-15', 114 | 'iso8859-15': 'iso-8859-15', 115 | 'iso885915': 'iso-8859-15', 116 | 'iso_8859-15': 'iso-8859-15', 117 | 'l9': 'iso-8859-15', 118 | 'iso-8859-16': 'iso-8859-16', 119 | 'cskoi8r': 'koi8-r', 120 | 'koi': 'koi8-r', 121 | 'koi8': 'koi8-r', 122 | 'koi8-r': 'koi8-r', 123 | 'koi8_r': 'koi8-r', 124 | 'koi8-u': 'koi8-u', 125 | 'csmacintosh': 'macintosh', 126 | 'mac': 'macintosh', 127 | 'macintosh': 'macintosh', 128 | 'x-mac-roman': 'macintosh', 129 | 'dos-874': 'windows-874', 130 | 'iso-8859-11': 'windows-874', 131 | 'iso8859-11': 'windows-874', 132 | 'iso885911': 'windows-874', 133 | 'tis-620': 'windows-874', 134 | 'windows-874': 'windows-874', 135 | 'cp1250': 'windows-1250', 136 | 'windows-1250': 'windows-1250', 137 | 'x-cp1250': 'windows-1250', 138 | 'cp1251': 'windows-1251', 139 | 'windows-1251': 'windows-1251', 140 | 'x-cp1251': 'windows-1251', 141 | 'ansi_x3.4-1968': 'windows-1252', 142 | 'ascii': 'windows-1252', 143 | 'cp1252': 'windows-1252', 144 | 'cp819': 'windows-1252', 145 | 'csisolatin1': 'windows-1252', 146 | 'ibm819': 'windows-1252', 147 | 'iso-8859-1': 'windows-1252', 148 | 'iso-ir-100': 'windows-1252', 149 | 'iso8859-1': 'windows-1252', 150 | 'iso88591': 'windows-1252', 151 | 'iso_8859-1': 'windows-1252', 152 | 'iso_8859-1:1987': 'windows-1252', 153 | 'l1': 'windows-1252', 154 | 'latin1': 'windows-1252', 155 | 'us-ascii': 'windows-1252', 156 | 'windows-1252': 'windows-1252', 157 | 'x-cp1252': 'windows-1252', 158 | 'cp1253': 'windows-1253', 159 | 'windows-1253': 'windows-1253', 160 | 'x-cp1253': 'windows-1253', 161 | 'cp1254': 'windows-1254', 162 | 'csisolatin5': 'windows-1254', 163 | 'iso-8859-9': 'windows-1254', 164 | 'iso-ir-148': 'windows-1254', 165 | 'iso8859-9': 'windows-1254', 166 | 'iso88599': 'windows-1254', 167 | 'iso_8859-9': 'windows-1254', 168 | 'iso_8859-9:1989': 'windows-1254', 169 | 'l5': 'windows-1254', 170 | 
'latin5': 'windows-1254', 171 | 'windows-1254': 'windows-1254', 172 | 'x-cp1254': 'windows-1254', 173 | 'cp1255': 'windows-1255', 174 | 'windows-1255': 'windows-1255', 175 | 'x-cp1255': 'windows-1255', 176 | 'cp1256': 'windows-1256', 177 | 'windows-1256': 'windows-1256', 178 | 'x-cp1256': 'windows-1256', 179 | 'cp1257': 'windows-1257', 180 | 'windows-1257': 'windows-1257', 181 | 'x-cp1257': 'windows-1257', 182 | 'cp1258': 'windows-1258', 183 | 'windows-1258': 'windows-1258', 184 | 'x-cp1258': 'windows-1258', 185 | 'x-mac-cyrillic': 'x-mac-cyrillic', 186 | 'x-mac-ukrainian': 'x-mac-cyrillic', 187 | 'chinese': 'gbk', 188 | 'csgb2312': 'gbk', 189 | 'csiso58gb231280': 'gbk', 190 | 'gb2312': 'gbk', 191 | 'gb_2312': 'gbk', 192 | 'gb_2312-80': 'gbk', 193 | 'gbk': 'gbk', 194 | 'iso-ir-58': 'gbk', 195 | 'x-gbk': 'gbk', 196 | 'gb18030': 'gb18030', 197 | 'hz-gb-2312': 'hz-gb-2312', 198 | 'big5': 'big5', 199 | 'big5-hkscs': 'big5', 200 | 'cn-big5': 'big5', 201 | 'csbig5': 'big5', 202 | 'x-x-big5': 'big5', 203 | 'cseucpkdfmtjapanese': 'euc-jp', 204 | 'euc-jp': 'euc-jp', 205 | 'x-euc-jp': 'euc-jp', 206 | 'csiso2022jp': 'iso-2022-jp', 207 | 'iso-2022-jp': 'iso-2022-jp', 208 | 'csshiftjis': 'shift_jis', 209 | 'ms_kanji': 'shift_jis', 210 | 'shift-jis': 'shift_jis', 211 | 'shift_jis': 'shift_jis', 212 | 'sjis': 'shift_jis', 213 | 'windows-31j': 'shift_jis', 214 | 'x-sjis': 'shift_jis', 215 | 'cseuckr': 'euc-kr', 216 | 'csksc56011987': 'euc-kr', 217 | 'euc-kr': 'euc-kr', 218 | 'iso-ir-149': 'euc-kr', 219 | 'korean': 'euc-kr', 220 | 'ks_c_5601-1987': 'euc-kr', 221 | 'ks_c_5601-1989': 'euc-kr', 222 | 'ksc5601': 'euc-kr', 223 | 'ksc_5601': 'euc-kr', 224 | 'windows-949': 'euc-kr', 225 | 'csiso2022kr': 'iso-2022-kr', 226 | 'iso-2022-kr': 'iso-2022-kr', 227 | 'utf-16be': 'utf-16be', 228 | 'utf-16': 'utf-16le', 229 | 'utf-16le': 'utf-16le', 230 | 'x-user-defined': 'x-user-defined', 231 | } 232 | -------------------------------------------------------------------------------- 
/webencodings/mklabels.py: -------------------------------------------------------------------------------- 1 | """ 2 | 3 | webencodings.mklabels 4 | ~~~~~~~~~~~~~~~~~~~~~ 5 | 6 | Regenarate the webencodings.labels module. 7 | 8 | :copyright: Copyright 2012 by Simon Sapin 9 | :license: BSD, see LICENSE for details. 10 | 11 | """ 12 | 13 | import json 14 | try: 15 | from urllib import urlopen 16 | except ImportError: 17 | from urllib.request import urlopen 18 | 19 | 20 | def assert_lower(string): 21 | assert string == string.lower() 22 | return string 23 | 24 | 25 | def generate(url): 26 | parts = ['''\ 27 | """ 28 | 29 | webencodings.labels 30 | ~~~~~~~~~~~~~~~~~~~ 31 | 32 | Map encoding labels to their name. 33 | 34 | :copyright: Copyright 2012 by Simon Sapin 35 | :license: BSD, see LICENSE for details. 36 | 37 | """ 38 | 39 | # XXX Do not edit! 40 | # This file is automatically generated by mklabels.py 41 | 42 | LABELS = { 43 | '''] 44 | labels = [ 45 | (repr(assert_lower(label)).lstrip('u'), 46 | repr(encoding['name']).lstrip('u')) 47 | for category in json.loads(urlopen(url).read().decode('ascii')) 48 | for encoding in category['encodings'] 49 | for label in encoding['labels']] 50 | max_len = max(len(label) for label, name in labels) 51 | parts.extend( 52 | ' %s:%s %s,\n' % (label, ' ' * (max_len - len(label)), name) 53 | for label, name in labels) 54 | parts.append('}') 55 | return ''.join(parts) 56 | 57 | 58 | if __name__ == '__main__': 59 | print(generate('http://encoding.spec.whatwg.org/encodings.json')) 60 | -------------------------------------------------------------------------------- /webencodings/tests.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | """ 3 | 4 | webencodings.tests 5 | ~~~~~~~~~~~~~~~~~~ 6 | 7 | A basic test suite for Encoding. 8 | 9 | :copyright: Copyright 2012 by Simon Sapin 10 | :license: BSD, see LICENSE for details. 
11 | 12 | """ 13 | 14 | from __future__ import unicode_literals 15 | 16 | from . import (lookup, LABELS, decode, encode, iter_decode, iter_encode, 17 | IncrementalDecoder, IncrementalEncoder, UTF8) 18 | 19 | 20 | def assert_raises(exception, function, *args, **kwargs): 21 | try: 22 | function(*args, **kwargs) 23 | except exception: 24 | return 25 | else: # pragma: no cover 26 | raise AssertionError('Did not raise %s.' % exception) 27 | 28 | 29 | def test_labels(): 30 | assert lookup('utf-8').name == 'utf-8' 31 | assert lookup('Utf-8').name == 'utf-8' 32 | assert lookup('UTF-8').name == 'utf-8' 33 | assert lookup('utf8').name == 'utf-8' 34 | assert lookup('utf8').name == 'utf-8' 35 | assert lookup('utf8 ').name == 'utf-8' 36 | assert lookup(' \r\nutf8\t').name == 'utf-8' 37 | assert lookup('u8') is None # Python label. 38 | assert lookup('utf-8 ') is None # Non-ASCII white space. 39 | 40 | assert lookup('US-ASCII').name == 'windows-1252' 41 | assert lookup('iso-8859-1').name == 'windows-1252' 42 | assert lookup('latin1').name == 'windows-1252' 43 | assert lookup('LATIN1').name == 'windows-1252' 44 | assert lookup('latin-1') is None 45 | assert lookup('LATİN1') is None # ASCII-only case insensitivity. 
46 | 47 | 48 | def test_all_labels(): 49 | for label in LABELS: 50 | assert decode(b'', label) == ('', lookup(label)) 51 | assert encode('', label) == b'' 52 | for repeat in [0, 1, 12]: 53 | output, _ = iter_decode([b''] * repeat, label) 54 | assert list(output) == [] 55 | assert list(iter_encode([''] * repeat, label)) == [] 56 | decoder = IncrementalDecoder(label) 57 | assert decoder.decode(b'') == '' 58 | assert decoder.decode(b'', final=True) == '' 59 | encoder = IncrementalEncoder(label) 60 | assert encoder.encode('') == b'' 61 | assert encoder.encode('', final=True) == b'' 62 | # All encoding names are valid labels too: 63 | for name in set(LABELS.values()): 64 | assert lookup(name).name == name 65 | 66 | 67 | def test_invalid_label(): 68 | assert_raises(LookupError, decode, b'\xEF\xBB\xBF\xc3\xa9', 'invalid') 69 | assert_raises(LookupError, encode, 'é', 'invalid') 70 | assert_raises(LookupError, iter_decode, [], 'invalid') 71 | assert_raises(LookupError, iter_encode, [], 'invalid') 72 | assert_raises(LookupError, IncrementalDecoder, 'invalid') 73 | assert_raises(LookupError, IncrementalEncoder, 'invalid') 74 | 75 | 76 | def test_decode(): 77 | assert decode(b'\x80', 'latin1') == ('€', lookup('latin1')) 78 | assert decode(b'\x80', lookup('latin1')) == ('€', lookup('latin1')) 79 | assert decode(b'\xc3\xa9', 'utf8') == ('é', lookup('utf8')) 80 | assert decode(b'\xc3\xa9', UTF8) == ('é', lookup('utf8')) 81 | assert decode(b'\xc3\xa9', 'ascii') == ('é', lookup('ascii')) 82 | assert decode(b'\xEF\xBB\xBF\xc3\xa9', 'ascii') == ('é', lookup('utf8')) # UTF-8 with BOM 83 | 84 | assert decode(b'\xFE\xFF\x00\xe9', 'ascii') == ('é', lookup('utf-16be')) # UTF-16-BE with BOM 85 | assert decode(b'\xFF\xFE\xe9\x00', 'ascii') == ('é', lookup('utf-16le')) # UTF-16-LE with BOM 86 | assert decode(b'\xFE\xFF\xe9\x00', 'ascii') == ('\ue900', lookup('utf-16be')) 87 | assert decode(b'\xFF\xFE\x00\xe9', 'ascii') == ('\ue900', lookup('utf-16le')) 88 | 89 | assert decode(b'\x00\xe9', 
'UTF-16BE') == ('é', lookup('utf-16be')) 90 | assert decode(b'\xe9\x00', 'UTF-16LE') == ('é', lookup('utf-16le')) 91 | assert decode(b'\xe9\x00', 'UTF-16') == ('é', lookup('utf-16le')) 92 | 93 | assert decode(b'\xe9\x00', 'UTF-16BE') == ('\ue900', lookup('utf-16be')) 94 | assert decode(b'\x00\xe9', 'UTF-16LE') == ('\ue900', lookup('utf-16le')) 95 | assert decode(b'\x00\xe9', 'UTF-16') == ('\ue900', lookup('utf-16le')) 96 | 97 | 98 | def test_encode(): 99 | assert encode('é', 'latin1') == b'\xe9' 100 | assert encode('é', 'utf8') == b'\xc3\xa9' 101 | assert encode('é', 'utf8') == b'\xc3\xa9' 102 | assert encode('é', 'utf-16') == b'\xe9\x00' 103 | assert encode('é', 'utf-16le') == b'\xe9\x00' 104 | assert encode('é', 'utf-16be') == b'\x00\xe9' 105 | 106 | 107 | def test_iter_decode(): 108 | def iter_decode_to_string(input, fallback_encoding): 109 | output, _encoding = iter_decode(input, fallback_encoding) 110 | return ''.join(output) 111 | assert iter_decode_to_string([], 'latin1') == '' 112 | assert iter_decode_to_string([b''], 'latin1') == '' 113 | assert iter_decode_to_string([b'\xe9'], 'latin1') == 'é' 114 | assert iter_decode_to_string([b'hello'], 'latin1') == 'hello' 115 | assert iter_decode_to_string([b'he', b'llo'], 'latin1') == 'hello' 116 | assert iter_decode_to_string([b'hell', b'o'], 'latin1') == 'hello' 117 | assert iter_decode_to_string([b'\xc3\xa9'], 'latin1') == 'é' 118 | assert iter_decode_to_string([b'\xEF\xBB\xBF\xc3\xa9'], 'latin1') == 'é' 119 | assert iter_decode_to_string([ 120 | b'\xEF\xBB\xBF', b'\xc3', b'\xa9'], 'latin1') == 'é' 121 | assert iter_decode_to_string([ 122 | b'\xEF\xBB\xBF', b'a', b'\xc3'], 'latin1') == 'a\uFFFD' 123 | assert iter_decode_to_string([ 124 | b'', b'\xEF', b'', b'', b'\xBB\xBF\xc3', b'\xa9'], 'latin1') == 'é' 125 | assert iter_decode_to_string([b'\xEF\xBB\xBF'], 'latin1') == '' 126 | assert iter_decode_to_string([b'\xEF\xBB'], 'latin1') == 'ï»' 127 | assert iter_decode_to_string([b'\xFE\xFF\x00\xe9'], 'latin1') == 'é' 
128 | assert iter_decode_to_string([b'\xFF\xFE\xe9\x00'], 'latin1') == 'é' 129 | assert iter_decode_to_string([ 130 | b'', b'\xFF', b'', b'', b'\xFE\xe9', b'\x00'], 'latin1') == 'é' 131 | assert iter_decode_to_string([ 132 | b'', b'h\xe9', b'llo'], 'x-user-defined') == 'h\uF7E9llo' 133 | 134 | 135 | def test_iter_encode(): 136 | assert b''.join(iter_encode([], 'latin1')) == b'' 137 | assert b''.join(iter_encode([''], 'latin1')) == b'' 138 | assert b''.join(iter_encode(['é'], 'latin1')) == b'\xe9' 139 | assert b''.join(iter_encode(['', 'é', '', ''], 'latin1')) == b'\xe9' 140 | assert b''.join(iter_encode(['', 'é', '', ''], 'utf-16')) == b'\xe9\x00' 141 | assert b''.join(iter_encode(['', 'é', '', ''], 'utf-16le')) == b'\xe9\x00' 142 | assert b''.join(iter_encode(['', 'é', '', ''], 'utf-16be')) == b'\x00\xe9' 143 | assert b''.join(iter_encode([ 144 | '', 'h\uF7E9', '', 'llo'], 'x-user-defined')) == b'h\xe9llo' 145 | 146 | 147 | def test_x_user_defined(): 148 | encoded = b'2,\x0c\x0b\x1aO\xd9#\xcb\x0f\xc9\xbbt\xcf\xa8\xca' 149 | decoded = '2,\x0c\x0b\x1aO\uf7d9#\uf7cb\x0f\uf7c9\uf7bbt\uf7cf\uf7a8\uf7ca' 150 | encoded = b'aa' 151 | decoded = 'aa' 152 | assert decode(encoded, 'x-user-defined') == (decoded, lookup('x-user-defined')) 153 | assert encode(decoded, 'x-user-defined') == encoded 154 | -------------------------------------------------------------------------------- /webencodings/x_user_defined.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | """ 3 | 4 | webencodings.x_user_defined 5 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~ 6 | 7 | An implementation of the x-user-defined encoding. 8 | 9 | :copyright: Copyright 2012 by Simon Sapin 10 | :license: BSD, see LICENSE for details. 
11 | 12 | """ 13 | 14 | from __future__ import unicode_literals 15 | 16 | import codecs 17 | 18 | 19 | ### Codec APIs 20 | 21 | class Codec(codecs.Codec): 22 | 23 | def encode(self, input, errors='strict'): 24 | return codecs.charmap_encode(input, errors, encoding_table) 25 | 26 | def decode(self, input, errors='strict'): 27 | return codecs.charmap_decode(input, errors, decoding_table) 28 | 29 | 30 | class IncrementalEncoder(codecs.IncrementalEncoder): 31 | def encode(self, input, final=False): 32 | return codecs.charmap_encode(input, self.errors, encoding_table)[0] 33 | 34 | 35 | class IncrementalDecoder(codecs.IncrementalDecoder): 36 | def decode(self, input, final=False): 37 | return codecs.charmap_decode(input, self.errors, decoding_table)[0] 38 | 39 | 40 | class StreamWriter(Codec, codecs.StreamWriter): 41 | pass 42 | 43 | 44 | class StreamReader(Codec, codecs.StreamReader): 45 | pass 46 | 47 | 48 | ### encodings module API 49 | 50 | codec_info = codecs.CodecInfo( 51 | name='x-user-defined', 52 | encode=Codec().encode, 53 | decode=Codec().decode, 54 | incrementalencoder=IncrementalEncoder, 55 | incrementaldecoder=IncrementalDecoder, 56 | streamreader=StreamReader, 57 | streamwriter=StreamWriter, 58 | ) 59 | 60 | 61 | ### Decoding Table 62 | 63 | # Python 3: 64 | # for c in range(256): print(' %r' % chr(c if c < 128 else c + 0xF700)) 65 | decoding_table = ( 66 | '\x00' 67 | '\x01' 68 | '\x02' 69 | '\x03' 70 | '\x04' 71 | '\x05' 72 | '\x06' 73 | '\x07' 74 | '\x08' 75 | '\t' 76 | '\n' 77 | '\x0b' 78 | '\x0c' 79 | '\r' 80 | '\x0e' 81 | '\x0f' 82 | '\x10' 83 | '\x11' 84 | '\x12' 85 | '\x13' 86 | '\x14' 87 | '\x15' 88 | '\x16' 89 | '\x17' 90 | '\x18' 91 | '\x19' 92 | '\x1a' 93 | '\x1b' 94 | '\x1c' 95 | '\x1d' 96 | '\x1e' 97 | '\x1f' 98 | ' ' 99 | '!' 100 | '"' 101 | '#' 102 | '$' 103 | '%' 104 | '&' 105 | "'" 106 | '(' 107 | ')' 108 | '*' 109 | '+' 110 | ',' 111 | '-' 112 | '.' 
113 | '/' 114 | '0' 115 | '1' 116 | '2' 117 | '3' 118 | '4' 119 | '5' 120 | '6' 121 | '7' 122 | '8' 123 | '9' 124 | ':' 125 | ';' 126 | '<' 127 | '=' 128 | '>' 129 | '?' 130 | '@' 131 | 'A' 132 | 'B' 133 | 'C' 134 | 'D' 135 | 'E' 136 | 'F' 137 | 'G' 138 | 'H' 139 | 'I' 140 | 'J' 141 | 'K' 142 | 'L' 143 | 'M' 144 | 'N' 145 | 'O' 146 | 'P' 147 | 'Q' 148 | 'R' 149 | 'S' 150 | 'T' 151 | 'U' 152 | 'V' 153 | 'W' 154 | 'X' 155 | 'Y' 156 | 'Z' 157 | '[' 158 | '\\' 159 | ']' 160 | '^' 161 | '_' 162 | '`' 163 | 'a' 164 | 'b' 165 | 'c' 166 | 'd' 167 | 'e' 168 | 'f' 169 | 'g' 170 | 'h' 171 | 'i' 172 | 'j' 173 | 'k' 174 | 'l' 175 | 'm' 176 | 'n' 177 | 'o' 178 | 'p' 179 | 'q' 180 | 'r' 181 | 's' 182 | 't' 183 | 'u' 184 | 'v' 185 | 'w' 186 | 'x' 187 | 'y' 188 | 'z' 189 | '{' 190 | '|' 191 | '}' 192 | '~' 193 | '\x7f' 194 | '\uf780' 195 | '\uf781' 196 | '\uf782' 197 | '\uf783' 198 | '\uf784' 199 | '\uf785' 200 | '\uf786' 201 | '\uf787' 202 | '\uf788' 203 | '\uf789' 204 | '\uf78a' 205 | '\uf78b' 206 | '\uf78c' 207 | '\uf78d' 208 | '\uf78e' 209 | '\uf78f' 210 | '\uf790' 211 | '\uf791' 212 | '\uf792' 213 | '\uf793' 214 | '\uf794' 215 | '\uf795' 216 | '\uf796' 217 | '\uf797' 218 | '\uf798' 219 | '\uf799' 220 | '\uf79a' 221 | '\uf79b' 222 | '\uf79c' 223 | '\uf79d' 224 | '\uf79e' 225 | '\uf79f' 226 | '\uf7a0' 227 | '\uf7a1' 228 | '\uf7a2' 229 | '\uf7a3' 230 | '\uf7a4' 231 | '\uf7a5' 232 | '\uf7a6' 233 | '\uf7a7' 234 | '\uf7a8' 235 | '\uf7a9' 236 | '\uf7aa' 237 | '\uf7ab' 238 | '\uf7ac' 239 | '\uf7ad' 240 | '\uf7ae' 241 | '\uf7af' 242 | '\uf7b0' 243 | '\uf7b1' 244 | '\uf7b2' 245 | '\uf7b3' 246 | '\uf7b4' 247 | '\uf7b5' 248 | '\uf7b6' 249 | '\uf7b7' 250 | '\uf7b8' 251 | '\uf7b9' 252 | '\uf7ba' 253 | '\uf7bb' 254 | '\uf7bc' 255 | '\uf7bd' 256 | '\uf7be' 257 | '\uf7bf' 258 | '\uf7c0' 259 | '\uf7c1' 260 | '\uf7c2' 261 | '\uf7c3' 262 | '\uf7c4' 263 | '\uf7c5' 264 | '\uf7c6' 265 | '\uf7c7' 266 | '\uf7c8' 267 | '\uf7c9' 268 | '\uf7ca' 269 | '\uf7cb' 270 | '\uf7cc' 271 | '\uf7cd' 272 | '\uf7ce' 
273 | '\uf7cf' 274 | '\uf7d0' 275 | '\uf7d1' 276 | '\uf7d2' 277 | '\uf7d3' 278 | '\uf7d4' 279 | '\uf7d5' 280 | '\uf7d6' 281 | '\uf7d7' 282 | '\uf7d8' 283 | '\uf7d9' 284 | '\uf7da' 285 | '\uf7db' 286 | '\uf7dc' 287 | '\uf7dd' 288 | '\uf7de' 289 | '\uf7df' 290 | '\uf7e0' 291 | '\uf7e1' 292 | '\uf7e2' 293 | '\uf7e3' 294 | '\uf7e4' 295 | '\uf7e5' 296 | '\uf7e6' 297 | '\uf7e7' 298 | '\uf7e8' 299 | '\uf7e9' 300 | '\uf7ea' 301 | '\uf7eb' 302 | '\uf7ec' 303 | '\uf7ed' 304 | '\uf7ee' 305 | '\uf7ef' 306 | '\uf7f0' 307 | '\uf7f1' 308 | '\uf7f2' 309 | '\uf7f3' 310 | '\uf7f4' 311 | '\uf7f5' 312 | '\uf7f6' 313 | '\uf7f7' 314 | '\uf7f8' 315 | '\uf7f9' 316 | '\uf7fa' 317 | '\uf7fb' 318 | '\uf7fc' 319 | '\uf7fd' 320 | '\uf7fe' 321 | '\uf7ff' 322 | ) 323 | 324 | ### Encoding table 325 | encoding_table = codecs.charmap_build(decoding_table) 326 | --------------------------------------------------------------------------------