├── .coveragerc ├── .gitignore ├── .travis.yml ├── LICENSE ├── MANIFEST.in ├── README.rst ├── docs ├── conf.py └── index.rst ├── setup.cfg ├── setup.py ├── tox.ini └── webencodings ├── __init__.py ├── labels.py ├── mklabels.py ├── tests.py └── x_user_defined.py /.coveragerc: -------------------------------------------------------------------------------- 1 | [run] 2 | branch = True 3 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | .tox 3 | *.egg-info 4 | .coverage 5 | docs/_build 6 | /dist 7 | htmlcov 8 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | sudo: false 3 | cache: pip 4 | 5 | matrix: 6 | include: 7 | - python: 2.6 8 | env: TOXENV=py26 9 | - python: 2.7 10 | env: TOXENV=py27 11 | - python: 3.3 12 | env: TOXENV=py33 13 | - python: 3.4 14 | env: TOXENV=py34 15 | - python: 3.5 16 | env: TOXENV=py35 17 | - python: 3.6 18 | env: TOXENV=py36 19 | - python: pypy 20 | env: TOXENV=pypy 21 | 22 | install: 23 | - pip install -U tox 24 | 25 | script: 26 | - tox 27 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2012 by Simon Sapin. 2 | 3 | Some rights reserved. 4 | 5 | Redistribution and use in source and binary forms, with or without 6 | modification, are permitted provided that the following conditions are 7 | met: 8 | 9 | * Redistributions of source code must retain the above copyright 10 | notice, this list of conditions and the following disclaimer. 
11 | 12 | * Redistributions in binary form must reproduce the above 13 | copyright notice, this list of conditions and the following 14 | disclaimer in the documentation and/or other materials provided 15 | with the distribution. 16 | 17 | * The names of the contributors may not be used to endorse or 18 | promote products derived from this software without specific 19 | prior written permission. 20 | 21 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 22 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 23 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 24 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 25 | OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 26 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 27 | LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 28 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 29 | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 30 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 31 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 32 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include LICENSE 2 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | python-webencodings 2 | =================== 3 | 4 | This is a Python implementation of the `WHATWG Encoding standard 5 | `_. 
6 | 7 | * Latest documentation: http://packages.python.org/webencodings/ 8 | * Source code and issue tracker: 9 | https://github.com/gsnedders/python-webencodings 10 | * PyPI releases: http://pypi.python.org/pypi/webencodings 11 | * License: BSD 12 | * Python 2.6+ and 3.3+ 13 | 14 | In order to be compatible with legacy web content 15 | when interpreting something like ``Content-Type: text/html; charset=latin1``, 16 | tools need to use a particular set of aliases for encoding labels 17 | as well as some overriding rules. 18 | For example, ``US-ASCII`` and ``iso-8859-1`` on the web are actually 19 | aliases for ``windows-1252``, and an UTF-8 or UTF-16 BOM takes precedence 20 | over any other encoding declaration. 21 | The Encoding standard defines all such details so that implementations do 22 | not have to reverse-engineer each other. 23 | 24 | This module has encoding labels and BOM detection, 25 | but the actual implementation for encoders and decoders is Python’s. 26 | -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | # 4 | # python-webencodings documentation build configuration file, created by 5 | # sphinx-quickstart on Sat Dec 22 21:53:21 2012. 6 | # 7 | # This file is execfile()d with the current directory set to its containing dir. 8 | # 9 | # Note that not all possible configuration values are present in this 10 | # autogenerated file. 11 | # 12 | # All configuration values have a default; values that are commented out 13 | # serve to show the default. 14 | 15 | import sys, os, re 16 | 17 | # If extensions (or modules to document with autodoc) are in another directory, 18 | # add these directories to sys.path here. If the directory is relative to the 19 | # documentation root, use os.path.abspath to make it absolute, like shown here. 
20 | #sys.path.insert(0, os.path.abspath('.')) 21 | 22 | # -- General configuration ----------------------------------------------------- 23 | 24 | # If your documentation needs a minimal Sphinx version, state it here. 25 | #needs_sphinx = '1.0' 26 | 27 | # Add any Sphinx extension module names here, as strings. They can be extensions 28 | # coming with Sphinx (named 'sphinx.ext.*') or your custom ones. 29 | extensions = ['sphinx.ext.autodoc', 'sphinx.ext.intersphinx', 'sphinx.ext.viewcode'] 30 | 31 | # Add any paths that contain templates here, relative to this directory. 32 | templates_path = ['_templates'] 33 | 34 | # The suffix of source filenames. 35 | source_suffix = '.rst' 36 | 37 | # The encoding of source files. 38 | #source_encoding = 'utf-8-sig' 39 | 40 | # The master toctree document. 41 | master_doc = 'index' 42 | 43 | # General information about the project. 44 | project = 'python-webencodings' 45 | copyright = '2012, Simon Sapin' 46 | 47 | # The version info for the project you're documenting, acts as replacement for 48 | # |version| and |release|, also used in various other places throughout the 49 | # built documents. 50 | # 51 | # The full version, including alpha/beta/rc tags. 52 | release = re.search("VERSION = '([^']+)'", 53 | open(os.path.join(os.path.dirname(__file__), os.pardir, 54 | 'webencodings', '__init__.py')).read().strip() 55 | ).group(1) 56 | 57 | # The short X.Y version. 58 | version = '.'.join(release.split('.')[:2]) 59 | 60 | # The language for content autogenerated by Sphinx. Refer to documentation 61 | # for a list of supported languages. 62 | #language = None 63 | 64 | # There are two options for replacing |today|: either, you set today to some 65 | # non-false value, then it is used: 66 | #today = '' 67 | # Else, today_fmt is used as the format for a strftime call. 
68 | #today_fmt = '%B %d, %Y' 69 | 70 | # List of patterns, relative to source directory, that match files and 71 | # directories to ignore when looking for source files. 72 | exclude_patterns = ['_build'] 73 | 74 | # The reST default role (used for this markup: `text`) to use for all documents. 75 | #default_role = None 76 | 77 | # If true, '()' will be appended to :func: etc. cross-reference text. 78 | #add_function_parentheses = True 79 | 80 | # If true, the current module name will be prepended to all description 81 | # unit titles (such as .. function::). 82 | #add_module_names = True 83 | 84 | # If true, sectionauthor and moduleauthor directives will be shown in the 85 | # output. They are ignored by default. 86 | #show_authors = False 87 | 88 | # The name of the Pygments (syntax highlighting) style to use. 89 | pygments_style = 'sphinx' 90 | 91 | # A list of ignored prefixes for module index sorting. 92 | #modindex_common_prefix = [] 93 | 94 | 95 | # -- Options for HTML output --------------------------------------------------- 96 | 97 | # The theme to use for HTML and HTML Help pages. See the documentation for 98 | # a list of builtin themes. 99 | html_theme = 'default' 100 | 101 | # Theme options are theme-specific and customize the look and feel of a theme 102 | # further. For a list of options available for each theme, see the 103 | # documentation. 104 | #html_theme_options = {} 105 | 106 | # Add any paths that contain custom themes here, relative to this directory. 107 | #html_theme_path = [] 108 | 109 | # The name for this set of Sphinx documents. If None, it defaults to 110 | # " v documentation". 111 | #html_title = None 112 | 113 | # A shorter title for the navigation bar. Default is the same as html_title. 114 | #html_short_title = None 115 | 116 | # The name of an image file (relative to this directory) to place at the top 117 | # of the sidebar. 
118 | #html_logo = None 119 | 120 | # The name of an image file (within the static path) to use as favicon of the 121 | # docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 122 | # pixels large. 123 | #html_favicon = None 124 | 125 | # Add any paths that contain custom static files (such as style sheets) here, 126 | # relative to this directory. They are copied after the builtin static files, 127 | # so a file named "default.css" will overwrite the builtin "default.css". 128 | html_static_path = ['_static'] 129 | 130 | # If not '', a 'Last updated on:' timestamp is inserted at every page bottom, 131 | # using the given strftime format. 132 | #html_last_updated_fmt = '%b %d, %Y' 133 | 134 | # If true, SmartyPants will be used to convert quotes and dashes to 135 | # typographically correct entities. 136 | #html_use_smartypants = True 137 | 138 | # Custom sidebar templates, maps document names to template names. 139 | #html_sidebars = {} 140 | 141 | # Additional templates that should be rendered to pages, maps page names to 142 | # template names. 143 | #html_additional_pages = {} 144 | 145 | # If false, no module index is generated. 146 | #html_domain_indices = True 147 | 148 | # If false, no index is generated. 149 | #html_use_index = True 150 | 151 | # If true, the index is split into individual pages for each letter. 152 | #html_split_index = False 153 | 154 | # If true, links to the reST sources are added to the pages. 155 | #html_show_sourcelink = True 156 | 157 | # If true, "Created using Sphinx" is shown in the HTML footer. Default is True. 158 | #html_show_sphinx = True 159 | 160 | # If true, "(C) Copyright ..." is shown in the HTML footer. Default is True. 161 | #html_show_copyright = True 162 | 163 | # If true, an OpenSearch description file will be output, and all pages will 164 | # contain a tag referring to it. The value of this option must be the 165 | # base URL from which the finished HTML is served. 
166 | #html_use_opensearch = '' 167 | 168 | # This is the file name suffix for HTML files (e.g. ".xhtml"). 169 | #html_file_suffix = None 170 | 171 | # Output file base name for HTML help builder. 172 | htmlhelp_basename = 'python-webencodingsdoc' 173 | 174 | 175 | # -- Options for LaTeX output -------------------------------------------------- 176 | 177 | latex_elements = { 178 | # The paper size ('letterpaper' or 'a4paper'). 179 | #'papersize': 'letterpaper', 180 | 181 | # The font size ('10pt', '11pt' or '12pt'). 182 | #'pointsize': '10pt', 183 | 184 | # Additional stuff for the LaTeX preamble. 185 | #'preamble': '', 186 | } 187 | 188 | # Grouping the document tree into LaTeX files. List of tuples 189 | # (source start file, target name, title, author, documentclass [howto/manual]). 190 | latex_documents = [ 191 | ('index', 'python-webencodings.tex', 'python-webencodings Documentation', 192 | 'Simon Sapin', 'manual'), 193 | ] 194 | 195 | # The name of an image file (relative to this directory) to place at the top of 196 | # the title page. 197 | #latex_logo = None 198 | 199 | # For "manual" documents, if this is true, then toplevel headings are parts, 200 | # not chapters. 201 | #latex_use_parts = False 202 | 203 | # If true, show page references after internal links. 204 | #latex_show_pagerefs = False 205 | 206 | # If true, show URL addresses after external links. 207 | #latex_show_urls = False 208 | 209 | # Documents to append as an appendix to all manuals. 210 | #latex_appendices = [] 211 | 212 | # If false, no module index is generated. 213 | #latex_domain_indices = True 214 | 215 | 216 | # -- Options for manual page output -------------------------------------------- 217 | 218 | # One entry per manual page. List of tuples 219 | # (source start file, name, description, authors, manual section). 
220 | man_pages = [ 221 | ('index', 'python-webencodings', 'python-webencodings Documentation', 222 | ['Simon Sapin'], 1) 223 | ] 224 | 225 | # If true, show URL addresses after external links. 226 | #man_show_urls = False 227 | 228 | 229 | # -- Options for Texinfo output ------------------------------------------------ 230 | 231 | # Grouping the document tree into Texinfo files. List of tuples 232 | # (source start file, target name, title, author, 233 | # dir menu entry, description, category) 234 | texinfo_documents = [ 235 | ('index', 'python-webencodings', 'python-webencodings Documentation', 236 | 'Simon Sapin', 'python-webencodings', 'One line description of project.', 237 | 'Miscellaneous'), 238 | ] 239 | 240 | # Documents to append as an appendix to all manuals. 241 | #texinfo_appendices = [] 242 | 243 | # If false, no module index is generated. 244 | #texinfo_domain_indices = True 245 | 246 | # How to display URL addresses: 'footnote', 'no', or 'inline'. 247 | #texinfo_show_urls = 'footnote' 248 | 249 | 250 | # Example configuration for intersphinx: refer to the Python standard library. 251 | intersphinx_mapping = { 252 | 'py': ('http://docs.python.org/3', None) 253 | } 254 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | .. include:: ../README.rst 2 | 3 | .. toctree:: 4 | :maxdepth: 2 5 | 6 | 7 | Byte order marks 8 | ---------------- 9 | 10 | When decoding, for compatibility with deployed content, 11 | a `byte order mark `_ 12 | (also known as BOM) 13 | is considered more authoritative than anything else. 14 | The corresponding U+FEFF code point is not part of the decoded output. 15 | 16 | Encoding never prepends a BOM, 17 | but the output can start with a BOM 18 | if the input starts with a U+FEFF code point. 19 | In that case encoding then decoding will not round-trip.
20 | 21 | 22 | Error handling 23 | -------------- 24 | 25 | As in the stdlib, error handling for encoding defaults to ``strict``: 26 | raise an exception if there is an error. 27 | 28 | For decoding however the default is ``replace``, unlike the stdlib. 29 | Invalid bytes are decoded as ``�`` (U+FFFD, the replacement character). 30 | The reason is that when showing legacy content to the user, 31 | it might be better to succeed decoding only part of it rather than blow up. 32 | This is of course not the case in all situations: 33 | sometimes you want stuff to blow up so you can detect errors early. 34 | 35 | 36 | API 37 | --- 38 | 39 | .. module:: webencodings 40 | 41 | .. autofunction:: lookup 42 | 43 | .. autoclass:: Encoding() 44 | 45 | .. autodata:: UTF8 46 | 47 | .. autofunction:: decode 48 | .. autofunction:: encode 49 | .. autofunction:: iter_decode 50 | .. autofunction:: iter_encode 51 | .. autoclass:: IncrementalDecoder 52 | :members: 53 | .. autoclass:: IncrementalEncoder 54 | ..
autofunction:: ascii_lower 55 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [bdist_wheel] 2 | universal = 1 3 | 4 | [metadata] 5 | license_file = LICENSE 6 | 7 | [build_sphinx] 8 | source-dir = docs 9 | build-dir = docs/_build 10 | #all_files = 1 11 | 12 | [upload_sphinx] # Sphinx-PyPI-upload 13 | upload-dir = docs/_build/html 14 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | import io 3 | from os import path 4 | import re 5 | 6 | 7 | VERSION = re.search("VERSION = '([^']+)'", io.open( 8 | path.join(path.dirname(__file__), 'webencodings', '__init__.py'), 9 | encoding='utf-8' 10 | ).read().strip()).group(1) 11 | 12 | LONG_DESCRIPTION = io.open( 13 | path.join(path.dirname(__file__), 'README.rst'), 14 | encoding='utf-8' 15 | ).read() 16 | 17 | 18 | setup( 19 | name='webencodings', 20 | version=VERSION, 21 | url='https://github.com/SimonSapin/python-webencodings', 22 | license='BSD', 23 | author='Simon Sapin', 24 | author_email='simon.sapin@exyr.org', 25 | maintainer='Geoffrey Sneddon', 26 | maintainer_email='me@gsnedders.com', 27 | description='Character encoding aliases for legacy web content', 28 | long_description=LONG_DESCRIPTION, 29 | classifiers=[ 30 | 'Development Status :: 4 - Beta', 31 | 'Intended Audience :: Developers', 32 | 'License :: OSI Approved :: BSD License', 33 | 'Programming Language :: Python', 34 | 'Programming Language :: Python :: 2', 35 | 'Programming Language :: Python :: 2.6', 36 | 'Programming Language :: Python :: 2.7', 37 | 'Programming Language :: Python :: 3', 38 | 'Programming Language :: Python :: 3.3', 39 | 'Programming Language :: Python :: 3.4', 40 | 'Programming Language :: Python :: 3.5', 41 | 'Programming Language :: 
Python :: 3.6', 42 | 'Programming Language :: Python :: Implementation :: CPython', 43 | 'Programming Language :: Python :: Implementation :: PyPy', 44 | 'Topic :: Internet :: WWW/HTTP', 45 | ], 46 | packages=find_packages(), 47 | ) 48 | -------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | [pytest] 2 | python_files=test*.py 3 | 4 | [tox] 5 | envlist = py26, py27, py33, py34, py35, py36, pypy 6 | 7 | [testenv] 8 | deps=pytest 9 | commands=py.test [] 10 | -------------------------------------------------------------------------------- /webencodings/__init__.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | """ 3 | 4 | webencodings 5 | ~~~~~~~~~~~~ 6 | 7 | This is a Python implementation of the `WHATWG Encoding standard 8 | `. See README for details. 9 | 10 | :copyright: Copyright 2012 by Simon Sapin 11 | :license: BSD, see LICENSE for details. 12 | 13 | """ 14 | 15 | from __future__ import unicode_literals 16 | 17 | import codecs 18 | 19 | from .labels import LABELS 20 | 21 | 22 | VERSION = '0.6-dev' 23 | 24 | 25 | # Some names in Encoding are not valid Python aliases. Remap these. 26 | PYTHON_NAMES = { 27 | 'iso-8859-8-i': 'iso-8859-8', 28 | 'x-mac-cyrillic': 'mac-cyrillic', 29 | 'macintosh': 'mac-roman', 30 | 'windows-874': 'cp874'} 31 | 32 | CACHE = {} 33 | 34 | 35 | def ascii_lower(string): 36 | r"""Transform (only) ASCII letters to lower case: A-Z is mapped to a-z. 37 | 38 | :param string: An Unicode string. 39 | :returns: A new Unicode string. 40 | 41 | This is used for `ASCII case-insensitive 42 | `_ 43 | matching of encoding labels. 44 | The same matching is also used, among other things, 45 | for `CSS keywords `_. 
46 | 47 | This is different from the :meth:`~py:str.lower` method of Unicode strings 48 | which also affects non-ASCII characters, 49 | sometimes mapping them into the ASCII range: 50 | 51 | >>> keyword = u'Bac\N{KELVIN SIGN}ground' 52 | >>> assert keyword.lower() == u'background' 53 | >>> assert ascii_lower(keyword) != keyword.lower() 54 | >>> assert ascii_lower(keyword) == u'bac\N{KELVIN SIGN}ground' 55 | 56 | """ 57 | # This turns out to be faster than unicode.translate() 58 | return string.encode('utf8').lower().decode('utf8') 59 | 60 | 61 | def lookup(label): 62 | """ 63 | Look for an encoding by its label. 64 | This is the spec’s `get an encoding 65 | `_ algorithm. 66 | Supported labels are listed there. 67 | 68 | :param label: A string. 69 | :returns: 70 | An :class:`Encoding` object, or :obj:`None` for an unknown label. 71 | 72 | """ 73 | # Only strip ASCII whitespace: U+0009, U+000A, U+000C, U+000D, and U+0020. 74 | label = ascii_lower(label.strip('\t\n\f\r ')) 75 | name = LABELS.get(label) 76 | if name is None: 77 | return None 78 | encoding = CACHE.get(name) 79 | if encoding is None: 80 | if name == 'x-user-defined': 81 | from .x_user_defined import codec_info 82 | else: 83 | python_name = PYTHON_NAMES.get(name, name) 84 | # Any python_name value that gets to here should be valid. 85 | codec_info = codecs.lookup(python_name) 86 | encoding = Encoding(name, codec_info) 87 | CACHE[name] = encoding 88 | return encoding 89 | 90 | 91 | def _get_encoding(encoding_or_label): 92 | """ 93 | Accept either an encoding object or label. 94 | 95 | :param encoding_or_label: An :class:`Encoding` object or a label string. 96 | :returns: An :class:`Encoding` object. 97 | :raises: :exc:`~exceptions.LookupError` for an unknown label.
98 | 99 | """ 100 | if hasattr(encoding_or_label, 'codec_info'): 101 | return encoding_or_label 102 | 103 | encoding = lookup(encoding_or_label) 104 | if encoding is None: 105 | raise LookupError('Unknown encoding label: %r' % encoding_or_label) 106 | return encoding 107 | 108 | 109 | class Encoding(object): 110 | """Reresents a character encoding such as UTF-8, 111 | that can be used for decoding or encoding. 112 | 113 | .. attribute:: name 114 | 115 | Canonical name of the encoding 116 | 117 | .. attribute:: codec_info 118 | 119 | The actual implementation of the encoding, 120 | a stdlib :class:`~codecs.CodecInfo` object. 121 | See :func:`codecs.register`. 122 | 123 | """ 124 | def __init__(self, name, codec_info): 125 | self.name = name 126 | self.codec_info = codec_info 127 | 128 | def __repr__(self): 129 | return '' % self.name 130 | 131 | 132 | #: The UTF-8 encoding. Should be used for new content and formats. 133 | UTF8 = lookup('utf-8') 134 | 135 | _UTF16LE = lookup('utf-16le') 136 | _UTF16BE = lookup('utf-16be') 137 | 138 | 139 | def decode(input, fallback_encoding, errors='replace'): 140 | """ 141 | Decode a single string. 142 | 143 | :param input: A byte string 144 | :param fallback_encoding: 145 | An :class:`Encoding` object or a label string. 146 | The encoding to use if :obj:`input` does note have a BOM. 147 | :param errors: Type of error handling. See :func:`codecs.register`. 148 | :raises: :exc:`~exceptions.LookupError` for an unknown encoding label. 149 | :return: 150 | A ``(output, encoding)`` tuple of an Unicode string 151 | and an :obj:`Encoding`. 152 | 153 | """ 154 | # Fail early if `encoding` is an invalid label. 
155 | fallback_encoding = _get_encoding(fallback_encoding) 156 | bom_encoding, input = _detect_bom(input) 157 | encoding = bom_encoding or fallback_encoding 158 | return encoding.codec_info.decode(input, errors)[0], encoding 159 | 160 | 161 | def _detect_bom(input): 162 | """Return (bom_encoding, input), with any BOM removed from the input.""" 163 | if input.startswith(b'\xFF\xFE'): 164 | return _UTF16LE, input[2:] 165 | if input.startswith(b'\xFE\xFF'): 166 | return _UTF16BE, input[2:] 167 | if input.startswith(b'\xEF\xBB\xBF'): 168 | return UTF8, input[3:] 169 | return None, input 170 | 171 | 172 | def encode(input, encoding=UTF8, errors='strict'): 173 | """ 174 | Encode a single string. 175 | 176 | :param input: An Unicode string. 177 | :param encoding: An :class:`Encoding` object or a label string. 178 | :param errors: Type of error handling. See :func:`codecs.register`. 179 | :raises: :exc:`~exceptions.LookupError` for an unknown encoding label. 180 | :return: A byte string. 181 | 182 | """ 183 | return _get_encoding(encoding).codec_info.encode(input, errors)[0] 184 | 185 | 186 | def iter_decode(input, fallback_encoding, errors='replace'): 187 | """ 188 | "Pull"-based decoder. 189 | 190 | :param input: 191 | An iterable of byte strings. 192 | 193 | The input is first consumed just enough to determine the encoding 194 | based on the presence of a BOM, 195 | then consumed on demand when the return value is. 196 | :param fallback_encoding: 197 | An :class:`Encoding` object or a label string. 198 | The encoding to use if :obj:`input` does not have a BOM. 199 | :param errors: Type of error handling. See :func:`codecs.register`. 200 | :raises: :exc:`~exceptions.LookupError` for an unknown encoding label. 201 | :returns: 202 | An ``(output, encoding)`` tuple. 203 | :obj:`output` is an iterable of Unicode strings, 204 | :obj:`encoding` is the :obj:`Encoding` that is being used.
205 | 206 | """ 207 | 208 | decoder = IncrementalDecoder(fallback_encoding, errors) 209 | generator = _iter_decode_generator(input, decoder) 210 | encoding = next(generator) 211 | return generator, encoding 212 | 213 | 214 | def _iter_decode_generator(input, decoder): 215 | """Return a generator that first yields the :obj:`Encoding`, 216 | then yields output chunks as Unicode strings. 217 | 218 | """ 219 | decode = decoder.decode 220 | input = iter(input) 221 | for chunck in input: 222 | output = decode(chunck) 223 | if output: 224 | assert decoder.encoding is not None 225 | yield decoder.encoding 226 | yield output 227 | break 228 | else: 229 | # Input exhausted without determining the encoding 230 | output = decode(b'', final=True) 231 | assert decoder.encoding is not None 232 | yield decoder.encoding 233 | if output: 234 | yield output 235 | return 236 | 237 | for chunck in input: 238 | output = decode(chunck) 239 | if output: 240 | yield output 241 | output = decode(b'', final=True) 242 | if output: 243 | yield output 244 | 245 | 246 | def iter_encode(input, encoding=UTF8, errors='strict'): 247 | """ 248 | “Pull”-based encoder. 249 | 250 | :param input: An iterable of Unicode strings. 251 | :param encoding: An :class:`Encoding` object or a label string. 252 | :param errors: Type of error handling. See :func:`codecs.register`. 253 | :raises: :exc:`~exceptions.LookupError` for an unknown encoding label. 254 | :returns: An iterable of byte strings. 255 | 256 | """ 257 | # Fail early if `encoding` is an invalid label. 258 | encode = IncrementalEncoder(encoding, errors).encode 259 | return _iter_encode_generator(input, encode) 260 | 261 | 262 | def _iter_encode_generator(input, encode): 263 | for chunck in input: 264 | output = encode(chunck) 265 | if output: 266 | yield output 267 | output = encode('', final=True) 268 | if output: 269 | yield output 270 | 271 | 272 | class IncrementalDecoder(object): 273 | """ 274 | “Push”-based decoder.
275 | 276 | :param fallback_encoding: 277 | An :class:`Encoding` object or a label string. 278 | The encoding to use if :obj:`input` does not have a BOM. 279 | :param errors: Type of error handling. See :func:`codecs.register`. 280 | :raises: :exc:`~exceptions.LookupError` for an unknown encoding label. 281 | 282 | """ 283 | def __init__(self, fallback_encoding, errors='replace'): 284 | # Fail early if `encoding` is an invalid label. 285 | self._fallback_encoding = _get_encoding(fallback_encoding) 286 | self._errors = errors 287 | self._buffer = b'' 288 | self._decoder = None 289 | #: The actual :class:`Encoding` that is being used, 290 | #: or :obj:`None` if that is not determined yet. 291 | #: (Ie. if there is not enough input yet to determine 292 | #: if there is a BOM.) 293 | self.encoding = None # Not known yet. 294 | 295 | def decode(self, input, final=False): 296 | """Decode one chunk of the input. 297 | 298 | :param input: A byte string. 299 | :param final: 300 | Indicate that no more input is available. 301 | Must be :obj:`True` if this is the last call. 302 | :returns: An Unicode string. 303 | 304 | """ 305 | decoder = self._decoder 306 | if decoder is not None: 307 | return decoder(input, final) 308 | 309 | input = self._buffer + input 310 | encoding, input = _detect_bom(input) 311 | if encoding is None: 312 | if len(input) < 3 and not final: # Not enough data yet. 313 | self._buffer = input 314 | return '' 315 | else: # No BOM 316 | encoding = self._fallback_encoding 317 | decoder = encoding.codec_info.incrementaldecoder(self._errors).decode 318 | self._decoder = decoder 319 | self.encoding = encoding 320 | return decoder(input, final) 321 | 322 | 323 | class IncrementalEncoder(object): 324 | """ 325 | “Push”-based encoder. 326 | 327 | :param encoding: An :class:`Encoding` object or a label string. 328 | :param errors: Type of error handling. See :func:`codecs.register`. 329 | :raises: :exc:`~exceptions.LookupError` for an unknown encoding label.
330 | 331 | .. method:: encode(input, final=False) 332 | 333 | :param input: An Unicode string. 334 | :param final: 335 | Indicate that no more input is available. 336 | Must be :obj:`True` if this is the last call. 337 | :returns: A byte string. 338 | 339 | """ 340 | def __init__(self, encoding=UTF8, errors='strict'): 341 | encoding = _get_encoding(encoding) 342 | self.encode = encoding.codec_info.incrementalencoder(errors).encode 343 | -------------------------------------------------------------------------------- /webencodings/labels.py: -------------------------------------------------------------------------------- 1 | """ 2 | 3 | webencodings.labels 4 | ~~~~~~~~~~~~~~~~~~~ 5 | 6 | Map encoding labels to their name. 7 | 8 | :copyright: Copyright 2012 by Simon Sapin 9 | :license: BSD, see LICENSE for details. 10 | 11 | """ 12 | 13 | # XXX Do not edit! 14 | # This file is automatically generated by mklabels.py 15 | 16 | LABELS = { 17 | 'unicode-1-1-utf-8': 'utf-8', 18 | 'utf-8': 'utf-8', 19 | 'utf8': 'utf-8', 20 | '866': 'ibm866', 21 | 'cp866': 'ibm866', 22 | 'csibm866': 'ibm866', 23 | 'ibm866': 'ibm866', 24 | 'csisolatin2': 'iso-8859-2', 25 | 'iso-8859-2': 'iso-8859-2', 26 | 'iso-ir-101': 'iso-8859-2', 27 | 'iso8859-2': 'iso-8859-2', 28 | 'iso88592': 'iso-8859-2', 29 | 'iso_8859-2': 'iso-8859-2', 30 | 'iso_8859-2:1987': 'iso-8859-2', 31 | 'l2': 'iso-8859-2', 32 | 'latin2': 'iso-8859-2', 33 | 'csisolatin3': 'iso-8859-3', 34 | 'iso-8859-3': 'iso-8859-3', 35 | 'iso-ir-109': 'iso-8859-3', 36 | 'iso8859-3': 'iso-8859-3', 37 | 'iso88593': 'iso-8859-3', 38 | 'iso_8859-3': 'iso-8859-3', 39 | 'iso_8859-3:1988': 'iso-8859-3', 40 | 'l3': 'iso-8859-3', 41 | 'latin3': 'iso-8859-3', 42 | 'csisolatin4': 'iso-8859-4', 43 | 'iso-8859-4': 'iso-8859-4', 44 | 'iso-ir-110': 'iso-8859-4', 45 | 'iso8859-4': 'iso-8859-4', 46 | 'iso88594': 'iso-8859-4', 47 | 'iso_8859-4': 'iso-8859-4', 48 | 'iso_8859-4:1988': 'iso-8859-4', 49 | 'l4': 'iso-8859-4', 50 | 'latin4': 'iso-8859-4', 51 | 
'csisolatincyrillic': 'iso-8859-5', 52 | 'cyrillic': 'iso-8859-5', 53 | 'iso-8859-5': 'iso-8859-5', 54 | 'iso-ir-144': 'iso-8859-5', 55 | 'iso8859-5': 'iso-8859-5', 56 | 'iso88595': 'iso-8859-5', 57 | 'iso_8859-5': 'iso-8859-5', 58 | 'iso_8859-5:1988': 'iso-8859-5', 59 | 'arabic': 'iso-8859-6', 60 | 'asmo-708': 'iso-8859-6', 61 | 'csiso88596e': 'iso-8859-6', 62 | 'csiso88596i': 'iso-8859-6', 63 | 'csisolatinarabic': 'iso-8859-6', 64 | 'ecma-114': 'iso-8859-6', 65 | 'iso-8859-6': 'iso-8859-6', 66 | 'iso-8859-6-e': 'iso-8859-6', 67 | 'iso-8859-6-i': 'iso-8859-6', 68 | 'iso-ir-127': 'iso-8859-6', 69 | 'iso8859-6': 'iso-8859-6', 70 | 'iso88596': 'iso-8859-6', 71 | 'iso_8859-6': 'iso-8859-6', 72 | 'iso_8859-6:1987': 'iso-8859-6', 73 | 'csisolatingreek': 'iso-8859-7', 74 | 'ecma-118': 'iso-8859-7', 75 | 'elot_928': 'iso-8859-7', 76 | 'greek': 'iso-8859-7', 77 | 'greek8': 'iso-8859-7', 78 | 'iso-8859-7': 'iso-8859-7', 79 | 'iso-ir-126': 'iso-8859-7', 80 | 'iso8859-7': 'iso-8859-7', 81 | 'iso88597': 'iso-8859-7', 82 | 'iso_8859-7': 'iso-8859-7', 83 | 'iso_8859-7:1987': 'iso-8859-7', 84 | 'sun_eu_greek': 'iso-8859-7', 85 | 'csiso88598e': 'iso-8859-8', 86 | 'csisolatinhebrew': 'iso-8859-8', 87 | 'hebrew': 'iso-8859-8', 88 | 'iso-8859-8': 'iso-8859-8', 89 | 'iso-8859-8-e': 'iso-8859-8', 90 | 'iso-ir-138': 'iso-8859-8', 91 | 'iso8859-8': 'iso-8859-8', 92 | 'iso88598': 'iso-8859-8', 93 | 'iso_8859-8': 'iso-8859-8', 94 | 'iso_8859-8:1988': 'iso-8859-8', 95 | 'visual': 'iso-8859-8', 96 | 'csiso88598i': 'iso-8859-8-i', 97 | 'iso-8859-8-i': 'iso-8859-8-i', 98 | 'logical': 'iso-8859-8-i', 99 | 'csisolatin6': 'iso-8859-10', 100 | 'iso-8859-10': 'iso-8859-10', 101 | 'iso-ir-157': 'iso-8859-10', 102 | 'iso8859-10': 'iso-8859-10', 103 | 'iso885910': 'iso-8859-10', 104 | 'l6': 'iso-8859-10', 105 | 'latin6': 'iso-8859-10', 106 | 'iso-8859-13': 'iso-8859-13', 107 | 'iso8859-13': 'iso-8859-13', 108 | 'iso885913': 'iso-8859-13', 109 | 'iso-8859-14': 'iso-8859-14', 110 | 'iso8859-14': 
'iso-8859-14', 111 | 'iso885914': 'iso-8859-14', 112 | 'csisolatin9': 'iso-8859-15', 113 | 'iso-8859-15': 'iso-8859-15', 114 | 'iso8859-15': 'iso-8859-15', 115 | 'iso885915': 'iso-8859-15', 116 | 'iso_8859-15': 'iso-8859-15', 117 | 'l9': 'iso-8859-15', 118 | 'iso-8859-16': 'iso-8859-16', 119 | 'cskoi8r': 'koi8-r', 120 | 'koi': 'koi8-r', 121 | 'koi8': 'koi8-r', 122 | 'koi8-r': 'koi8-r', 123 | 'koi8_r': 'koi8-r', 124 | 'koi8-u': 'koi8-u', 125 | 'csmacintosh': 'macintosh', 126 | 'mac': 'macintosh', 127 | 'macintosh': 'macintosh', 128 | 'x-mac-roman': 'macintosh', 129 | 'dos-874': 'windows-874', 130 | 'iso-8859-11': 'windows-874', 131 | 'iso8859-11': 'windows-874', 132 | 'iso885911': 'windows-874', 133 | 'tis-620': 'windows-874', 134 | 'windows-874': 'windows-874', 135 | 'cp1250': 'windows-1250', 136 | 'windows-1250': 'windows-1250', 137 | 'x-cp1250': 'windows-1250', 138 | 'cp1251': 'windows-1251', 139 | 'windows-1251': 'windows-1251', 140 | 'x-cp1251': 'windows-1251', 141 | 'ansi_x3.4-1968': 'windows-1252', 142 | 'ascii': 'windows-1252', 143 | 'cp1252': 'windows-1252', 144 | 'cp819': 'windows-1252', 145 | 'csisolatin1': 'windows-1252', 146 | 'ibm819': 'windows-1252', 147 | 'iso-8859-1': 'windows-1252', 148 | 'iso-ir-100': 'windows-1252', 149 | 'iso8859-1': 'windows-1252', 150 | 'iso88591': 'windows-1252', 151 | 'iso_8859-1': 'windows-1252', 152 | 'iso_8859-1:1987': 'windows-1252', 153 | 'l1': 'windows-1252', 154 | 'latin1': 'windows-1252', 155 | 'us-ascii': 'windows-1252', 156 | 'windows-1252': 'windows-1252', 157 | 'x-cp1252': 'windows-1252', 158 | 'cp1253': 'windows-1253', 159 | 'windows-1253': 'windows-1253', 160 | 'x-cp1253': 'windows-1253', 161 | 'cp1254': 'windows-1254', 162 | 'csisolatin5': 'windows-1254', 163 | 'iso-8859-9': 'windows-1254', 164 | 'iso-ir-148': 'windows-1254', 165 | 'iso8859-9': 'windows-1254', 166 | 'iso88599': 'windows-1254', 167 | 'iso_8859-9': 'windows-1254', 168 | 'iso_8859-9:1989': 'windows-1254', 169 | 'l5': 'windows-1254', 170 | 
'latin5': 'windows-1254', 171 | 'windows-1254': 'windows-1254', 172 | 'x-cp1254': 'windows-1254', 173 | 'cp1255': 'windows-1255', 174 | 'windows-1255': 'windows-1255', 175 | 'x-cp1255': 'windows-1255', 176 | 'cp1256': 'windows-1256', 177 | 'windows-1256': 'windows-1256', 178 | 'x-cp1256': 'windows-1256', 179 | 'cp1257': 'windows-1257', 180 | 'windows-1257': 'windows-1257', 181 | 'x-cp1257': 'windows-1257', 182 | 'cp1258': 'windows-1258', 183 | 'windows-1258': 'windows-1258', 184 | 'x-cp1258': 'windows-1258', 185 | 'x-mac-cyrillic': 'x-mac-cyrillic', 186 | 'x-mac-ukrainian': 'x-mac-cyrillic', 187 | 'chinese': 'gbk', 188 | 'csgb2312': 'gbk', 189 | 'csiso58gb231280': 'gbk', 190 | 'gb2312': 'gbk', 191 | 'gb_2312': 'gbk', 192 | 'gb_2312-80': 'gbk', 193 | 'gbk': 'gbk', 194 | 'iso-ir-58': 'gbk', 195 | 'x-gbk': 'gbk', 196 | 'gb18030': 'gb18030', 197 | 'hz-gb-2312': 'hz-gb-2312', 198 | 'big5': 'big5', 199 | 'big5-hkscs': 'big5', 200 | 'cn-big5': 'big5', 201 | 'csbig5': 'big5', 202 | 'x-x-big5': 'big5', 203 | 'cseucpkdfmtjapanese': 'euc-jp', 204 | 'euc-jp': 'euc-jp', 205 | 'x-euc-jp': 'euc-jp', 206 | 'csiso2022jp': 'iso-2022-jp', 207 | 'iso-2022-jp': 'iso-2022-jp', 208 | 'csshiftjis': 'shift_jis', 209 | 'ms_kanji': 'shift_jis', 210 | 'shift-jis': 'shift_jis', 211 | 'shift_jis': 'shift_jis', 212 | 'sjis': 'shift_jis', 213 | 'windows-31j': 'shift_jis', 214 | 'x-sjis': 'shift_jis', 215 | 'cseuckr': 'euc-kr', 216 | 'csksc56011987': 'euc-kr', 217 | 'euc-kr': 'euc-kr', 218 | 'iso-ir-149': 'euc-kr', 219 | 'korean': 'euc-kr', 220 | 'ks_c_5601-1987': 'euc-kr', 221 | 'ks_c_5601-1989': 'euc-kr', 222 | 'ksc5601': 'euc-kr', 223 | 'ksc_5601': 'euc-kr', 224 | 'windows-949': 'euc-kr', 225 | 'csiso2022kr': 'iso-2022-kr', 226 | 'iso-2022-kr': 'iso-2022-kr', 227 | 'utf-16be': 'utf-16be', 228 | 'utf-16': 'utf-16le', 229 | 'utf-16le': 'utf-16le', 230 | 'x-user-defined': 'x-user-defined', 231 | } 232 | -------------------------------------------------------------------------------- 
/webencodings/mklabels.py: -------------------------------------------------------------------------------- 1 | """ 2 | 3 | webencodings.mklabels 4 | ~~~~~~~~~~~~~~~~~~~~~ 5 | 6 | Regenarate the webencodings.labels module. 7 | 8 | :copyright: Copyright 2012 by Simon Sapin 9 | :license: BSD, see LICENSE for details. 10 | 11 | """ 12 | 13 | import json 14 | try: 15 | from urllib import urlopen 16 | except ImportError: 17 | from urllib.request import urlopen 18 | 19 | 20 | def assert_lower(string): 21 | assert string == string.lower() 22 | return string 23 | 24 | 25 | def generate(url): 26 | parts = ['''\ 27 | """ 28 | 29 | webencodings.labels 30 | ~~~~~~~~~~~~~~~~~~~ 31 | 32 | Map encoding labels to their name. 33 | 34 | :copyright: Copyright 2012 by Simon Sapin 35 | :license: BSD, see LICENSE for details. 36 | 37 | """ 38 | 39 | # XXX Do not edit! 40 | # This file is automatically generated by mklabels.py 41 | 42 | LABELS = { 43 | '''] 44 | labels = [ 45 | (repr(assert_lower(label)).lstrip('u'), 46 | repr(encoding['name']).lstrip('u')) 47 | for category in json.loads(urlopen(url).read().decode('ascii')) 48 | for encoding in category['encodings'] 49 | for label in encoding['labels']] 50 | max_len = max(len(label) for label, name in labels) 51 | parts.extend( 52 | ' %s:%s %s,\n' % (label, ' ' * (max_len - len(label)), name) 53 | for label, name in labels) 54 | parts.append('}') 55 | return ''.join(parts) 56 | 57 | 58 | if __name__ == '__main__': 59 | print(generate('http://encoding.spec.whatwg.org/encodings.json')) 60 | -------------------------------------------------------------------------------- /webencodings/tests.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | """ 3 | 4 | webencodings.tests 5 | ~~~~~~~~~~~~~~~~~~ 6 | 7 | A basic test suite for Encoding. 8 | 9 | :copyright: Copyright 2012 by Simon Sapin 10 | :license: BSD, see LICENSE for details. 
11 | 12 | """ 13 | 14 | from __future__ import unicode_literals 15 | 16 | from . import (lookup, LABELS, decode, encode, iter_decode, iter_encode, 17 | IncrementalDecoder, IncrementalEncoder, UTF8) 18 | 19 | 20 | def assert_raises(exception, function, *args, **kwargs): 21 | try: 22 | function(*args, **kwargs) 23 | except exception: 24 | return 25 | else: # pragma: no cover 26 | raise AssertionError('Did not raise %s.' % exception) 27 | 28 | 29 | def test_labels(): 30 | assert lookup('utf-8').name == 'utf-8' 31 | assert lookup('Utf-8').name == 'utf-8' 32 | assert lookup('UTF-8').name == 'utf-8' 33 | assert lookup('utf8').name == 'utf-8' 34 | assert lookup('utf8').name == 'utf-8' 35 | assert lookup('utf8 ').name == 'utf-8' 36 | assert lookup(' \r\nutf8\t').name == 'utf-8' 37 | assert lookup('u8') is None # Python label. 38 | assert lookup('utf-8 ') is None # Non-ASCII white space. 39 | 40 | assert lookup('US-ASCII').name == 'windows-1252' 41 | assert lookup('iso-8859-1').name == 'windows-1252' 42 | assert lookup('latin1').name == 'windows-1252' 43 | assert lookup('LATIN1').name == 'windows-1252' 44 | assert lookup('latin-1') is None 45 | assert lookup('LATİN1') is None # ASCII-only case insensitivity. 
46 | 47 | 48 | def test_all_labels(): 49 | for label in LABELS: 50 | assert decode(b'', label) == ('', lookup(label)) 51 | assert encode('', label) == b'' 52 | for repeat in [0, 1, 12]: 53 | output, _ = iter_decode([b''] * repeat, label) 54 | assert list(output) == [] 55 | assert list(iter_encode([''] * repeat, label)) == [] 56 | decoder = IncrementalDecoder(label) 57 | assert decoder.decode(b'') == '' 58 | assert decoder.decode(b'', final=True) == '' 59 | encoder = IncrementalEncoder(label) 60 | assert encoder.encode('') == b'' 61 | assert encoder.encode('', final=True) == b'' 62 | # All encoding names are valid labels too: 63 | for name in set(LABELS.values()): 64 | assert lookup(name).name == name 65 | 66 | 67 | def test_invalid_label(): 68 | assert_raises(LookupError, decode, b'\xEF\xBB\xBF\xc3\xa9', 'invalid') 69 | assert_raises(LookupError, encode, 'é', 'invalid') 70 | assert_raises(LookupError, iter_decode, [], 'invalid') 71 | assert_raises(LookupError, iter_encode, [], 'invalid') 72 | assert_raises(LookupError, IncrementalDecoder, 'invalid') 73 | assert_raises(LookupError, IncrementalEncoder, 'invalid') 74 | 75 | 76 | def test_decode(): 77 | assert decode(b'\x80', 'latin1') == ('€', lookup('latin1')) 78 | assert decode(b'\x80', lookup('latin1')) == ('€', lookup('latin1')) 79 | assert decode(b'\xc3\xa9', 'utf8') == ('é', lookup('utf8')) 80 | assert decode(b'\xc3\xa9', UTF8) == ('é', lookup('utf8')) 81 | assert decode(b'\xc3\xa9', 'ascii') == ('é', lookup('ascii')) 82 | assert decode(b'\xEF\xBB\xBF\xc3\xa9', 'ascii') == ('é', lookup('utf8')) # UTF-8 with BOM 83 | 84 | assert decode(b'\xFE\xFF\x00\xe9', 'ascii') == ('é', lookup('utf-16be')) # UTF-16-BE with BOM 85 | assert decode(b'\xFF\xFE\xe9\x00', 'ascii') == ('é', lookup('utf-16le')) # UTF-16-LE with BOM 86 | assert decode(b'\xFE\xFF\xe9\x00', 'ascii') == ('\ue900', lookup('utf-16be')) 87 | assert decode(b'\xFF\xFE\x00\xe9', 'ascii') == ('\ue900', lookup('utf-16le')) 88 | 89 | assert decode(b'\x00\xe9', 
'UTF-16BE') == ('é', lookup('utf-16be')) 90 | assert decode(b'\xe9\x00', 'UTF-16LE') == ('é', lookup('utf-16le')) 91 | assert decode(b'\xe9\x00', 'UTF-16') == ('é', lookup('utf-16le')) 92 | 93 | assert decode(b'\xe9\x00', 'UTF-16BE') == ('\ue900', lookup('utf-16be')) 94 | assert decode(b'\x00\xe9', 'UTF-16LE') == ('\ue900', lookup('utf-16le')) 95 | assert decode(b'\x00\xe9', 'UTF-16') == ('\ue900', lookup('utf-16le')) 96 | 97 | 98 | def test_encode(): 99 | assert encode('é', 'latin1') == b'\xe9' 100 | assert encode('é', 'utf8') == b'\xc3\xa9' 101 | assert encode('é', 'utf8') == b'\xc3\xa9' 102 | assert encode('é', 'utf-16') == b'\xe9\x00' 103 | assert encode('é', 'utf-16le') == b'\xe9\x00' 104 | assert encode('é', 'utf-16be') == b'\x00\xe9' 105 | 106 | 107 | def test_iter_decode(): 108 | def iter_decode_to_string(input, fallback_encoding): 109 | output, _encoding = iter_decode(input, fallback_encoding) 110 | return ''.join(output) 111 | assert iter_decode_to_string([], 'latin1') == '' 112 | assert iter_decode_to_string([b''], 'latin1') == '' 113 | assert iter_decode_to_string([b'\xe9'], 'latin1') == 'é' 114 | assert iter_decode_to_string([b'hello'], 'latin1') == 'hello' 115 | assert iter_decode_to_string([b'he', b'llo'], 'latin1') == 'hello' 116 | assert iter_decode_to_string([b'hell', b'o'], 'latin1') == 'hello' 117 | assert iter_decode_to_string([b'\xc3\xa9'], 'latin1') == 'é' 118 | assert iter_decode_to_string([b'\xEF\xBB\xBF\xc3\xa9'], 'latin1') == 'é' 119 | assert iter_decode_to_string([ 120 | b'\xEF\xBB\xBF', b'\xc3', b'\xa9'], 'latin1') == 'é' 121 | assert iter_decode_to_string([ 122 | b'\xEF\xBB\xBF', b'a', b'\xc3'], 'latin1') == 'a\uFFFD' 123 | assert iter_decode_to_string([ 124 | b'', b'\xEF', b'', b'', b'\xBB\xBF\xc3', b'\xa9'], 'latin1') == 'é' 125 | assert iter_decode_to_string([b'\xEF\xBB\xBF'], 'latin1') == '' 126 | assert iter_decode_to_string([b'\xEF\xBB'], 'latin1') == 'ï»' 127 | assert iter_decode_to_string([b'\xFE\xFF\x00\xe9'], 'latin1') == 'é' 
128 | assert iter_decode_to_string([b'\xFF\xFE\xe9\x00'], 'latin1') == 'é' 129 | assert iter_decode_to_string([ 130 | b'', b'\xFF', b'', b'', b'\xFE\xe9', b'\x00'], 'latin1') == 'é' 131 | assert iter_decode_to_string([ 132 | b'', b'h\xe9', b'llo'], 'x-user-defined') == 'h\uF7E9llo' 133 | 134 | 135 | def test_iter_encode(): 136 | assert b''.join(iter_encode([], 'latin1')) == b'' 137 | assert b''.join(iter_encode([''], 'latin1')) == b'' 138 | assert b''.join(iter_encode(['é'], 'latin1')) == b'\xe9' 139 | assert b''.join(iter_encode(['', 'é', '', ''], 'latin1')) == b'\xe9' 140 | assert b''.join(iter_encode(['', 'é', '', ''], 'utf-16')) == b'\xe9\x00' 141 | assert b''.join(iter_encode(['', 'é', '', ''], 'utf-16le')) == b'\xe9\x00' 142 | assert b''.join(iter_encode(['', 'é', '', ''], 'utf-16be')) == b'\x00\xe9' 143 | assert b''.join(iter_encode([ 144 | '', 'h\uF7E9', '', 'llo'], 'x-user-defined')) == b'h\xe9llo' 145 | 146 | 147 | def test_x_user_defined(): 148 | encoded = b'2,\x0c\x0b\x1aO\xd9#\xcb\x0f\xc9\xbbt\xcf\xa8\xca' 149 | decoded = '2,\x0c\x0b\x1aO\uf7d9#\uf7cb\x0f\uf7c9\uf7bbt\uf7cf\uf7a8\uf7ca' 150 | encoded = b'aa' 151 | decoded = 'aa' 152 | assert decode(encoded, 'x-user-defined') == (decoded, lookup('x-user-defined')) 153 | assert encode(decoded, 'x-user-defined') == encoded 154 | -------------------------------------------------------------------------------- /webencodings/x_user_defined.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | """ 3 | 4 | webencodings.x_user_defined 5 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~ 6 | 7 | An implementation of the x-user-defined encoding. 8 | 9 | :copyright: Copyright 2012 by Simon Sapin 10 | :license: BSD, see LICENSE for details. 
11 | 12 | """ 13 | 14 | from __future__ import unicode_literals 15 | 16 | import codecs 17 | 18 | 19 | ### Codec APIs 20 | 21 | class Codec(codecs.Codec): 22 | 23 | def encode(self, input, errors='strict'): 24 | return codecs.charmap_encode(input, errors, encoding_table) 25 | 26 | def decode(self, input, errors='strict'): 27 | return codecs.charmap_decode(input, errors, decoding_table) 28 | 29 | 30 | class IncrementalEncoder(codecs.IncrementalEncoder): 31 | def encode(self, input, final=False): 32 | return codecs.charmap_encode(input, self.errors, encoding_table)[0] 33 | 34 | 35 | class IncrementalDecoder(codecs.IncrementalDecoder): 36 | def decode(self, input, final=False): 37 | return codecs.charmap_decode(input, self.errors, decoding_table)[0] 38 | 39 | 40 | class StreamWriter(Codec, codecs.StreamWriter): 41 | pass 42 | 43 | 44 | class StreamReader(Codec, codecs.StreamReader): 45 | pass 46 | 47 | 48 | ### encodings module API 49 | 50 | codec_info = codecs.CodecInfo( 51 | name='x-user-defined', 52 | encode=Codec().encode, 53 | decode=Codec().decode, 54 | incrementalencoder=IncrementalEncoder, 55 | incrementaldecoder=IncrementalDecoder, 56 | streamreader=StreamReader, 57 | streamwriter=StreamWriter, 58 | ) 59 | 60 | 61 | ### Decoding Table 62 | 63 | # Python 3: 64 | # for c in range(256): print(' %r' % chr(c if c < 128 else c + 0xF700)) 65 | decoding_table = ( 66 | '\x00' 67 | '\x01' 68 | '\x02' 69 | '\x03' 70 | '\x04' 71 | '\x05' 72 | '\x06' 73 | '\x07' 74 | '\x08' 75 | '\t' 76 | '\n' 77 | '\x0b' 78 | '\x0c' 79 | '\r' 80 | '\x0e' 81 | '\x0f' 82 | '\x10' 83 | '\x11' 84 | '\x12' 85 | '\x13' 86 | '\x14' 87 | '\x15' 88 | '\x16' 89 | '\x17' 90 | '\x18' 91 | '\x19' 92 | '\x1a' 93 | '\x1b' 94 | '\x1c' 95 | '\x1d' 96 | '\x1e' 97 | '\x1f' 98 | ' ' 99 | '!' 100 | '"' 101 | '#' 102 | '$' 103 | '%' 104 | '&' 105 | "'" 106 | '(' 107 | ')' 108 | '*' 109 | '+' 110 | ',' 111 | '-' 112 | '.' 
113 | '/' 114 | '0' 115 | '1' 116 | '2' 117 | '3' 118 | '4' 119 | '5' 120 | '6' 121 | '7' 122 | '8' 123 | '9' 124 | ':' 125 | ';' 126 | '<' 127 | '=' 128 | '>' 129 | '?' 130 | '@' 131 | 'A' 132 | 'B' 133 | 'C' 134 | 'D' 135 | 'E' 136 | 'F' 137 | 'G' 138 | 'H' 139 | 'I' 140 | 'J' 141 | 'K' 142 | 'L' 143 | 'M' 144 | 'N' 145 | 'O' 146 | 'P' 147 | 'Q' 148 | 'R' 149 | 'S' 150 | 'T' 151 | 'U' 152 | 'V' 153 | 'W' 154 | 'X' 155 | 'Y' 156 | 'Z' 157 | '[' 158 | '\\' 159 | ']' 160 | '^' 161 | '_' 162 | '`' 163 | 'a' 164 | 'b' 165 | 'c' 166 | 'd' 167 | 'e' 168 | 'f' 169 | 'g' 170 | 'h' 171 | 'i' 172 | 'j' 173 | 'k' 174 | 'l' 175 | 'm' 176 | 'n' 177 | 'o' 178 | 'p' 179 | 'q' 180 | 'r' 181 | 's' 182 | 't' 183 | 'u' 184 | 'v' 185 | 'w' 186 | 'x' 187 | 'y' 188 | 'z' 189 | '{' 190 | '|' 191 | '}' 192 | '~' 193 | '\x7f' 194 | '\uf780' 195 | '\uf781' 196 | '\uf782' 197 | '\uf783' 198 | '\uf784' 199 | '\uf785' 200 | '\uf786' 201 | '\uf787' 202 | '\uf788' 203 | '\uf789' 204 | '\uf78a' 205 | '\uf78b' 206 | '\uf78c' 207 | '\uf78d' 208 | '\uf78e' 209 | '\uf78f' 210 | '\uf790' 211 | '\uf791' 212 | '\uf792' 213 | '\uf793' 214 | '\uf794' 215 | '\uf795' 216 | '\uf796' 217 | '\uf797' 218 | '\uf798' 219 | '\uf799' 220 | '\uf79a' 221 | '\uf79b' 222 | '\uf79c' 223 | '\uf79d' 224 | '\uf79e' 225 | '\uf79f' 226 | '\uf7a0' 227 | '\uf7a1' 228 | '\uf7a2' 229 | '\uf7a3' 230 | '\uf7a4' 231 | '\uf7a5' 232 | '\uf7a6' 233 | '\uf7a7' 234 | '\uf7a8' 235 | '\uf7a9' 236 | '\uf7aa' 237 | '\uf7ab' 238 | '\uf7ac' 239 | '\uf7ad' 240 | '\uf7ae' 241 | '\uf7af' 242 | '\uf7b0' 243 | '\uf7b1' 244 | '\uf7b2' 245 | '\uf7b3' 246 | '\uf7b4' 247 | '\uf7b5' 248 | '\uf7b6' 249 | '\uf7b7' 250 | '\uf7b8' 251 | '\uf7b9' 252 | '\uf7ba' 253 | '\uf7bb' 254 | '\uf7bc' 255 | '\uf7bd' 256 | '\uf7be' 257 | '\uf7bf' 258 | '\uf7c0' 259 | '\uf7c1' 260 | '\uf7c2' 261 | '\uf7c3' 262 | '\uf7c4' 263 | '\uf7c5' 264 | '\uf7c6' 265 | '\uf7c7' 266 | '\uf7c8' 267 | '\uf7c9' 268 | '\uf7ca' 269 | '\uf7cb' 270 | '\uf7cc' 271 | '\uf7cd' 272 | '\uf7ce' 
273 | '\uf7cf' 274 | '\uf7d0' 275 | '\uf7d1' 276 | '\uf7d2' 277 | '\uf7d3' 278 | '\uf7d4' 279 | '\uf7d5' 280 | '\uf7d6' 281 | '\uf7d7' 282 | '\uf7d8' 283 | '\uf7d9' 284 | '\uf7da' 285 | '\uf7db' 286 | '\uf7dc' 287 | '\uf7dd' 288 | '\uf7de' 289 | '\uf7df' 290 | '\uf7e0' 291 | '\uf7e1' 292 | '\uf7e2' 293 | '\uf7e3' 294 | '\uf7e4' 295 | '\uf7e5' 296 | '\uf7e6' 297 | '\uf7e7' 298 | '\uf7e8' 299 | '\uf7e9' 300 | '\uf7ea' 301 | '\uf7eb' 302 | '\uf7ec' 303 | '\uf7ed' 304 | '\uf7ee' 305 | '\uf7ef' 306 | '\uf7f0' 307 | '\uf7f1' 308 | '\uf7f2' 309 | '\uf7f3' 310 | '\uf7f4' 311 | '\uf7f5' 312 | '\uf7f6' 313 | '\uf7f7' 314 | '\uf7f8' 315 | '\uf7f9' 316 | '\uf7fa' 317 | '\uf7fb' 318 | '\uf7fc' 319 | '\uf7fd' 320 | '\uf7fe' 321 | '\uf7ff' 322 | ) 323 | 324 | ### Encoding table 325 | encoding_table = codecs.charmap_build(decoding_table) 326 | --------------------------------------------------------------------------------