├── .bumpversion.cfg ├── .gitignore ├── .pyup.yml ├── .travis.yml ├── LICENSE ├── README.rst ├── html5lib_truncation ├── __init__.py ├── filters.py ├── shortcuts.py └── utils.py ├── setup.cfg ├── setup.py ├── tests ├── __init__.py ├── conftest.py ├── test_filters.py └── test_utils.py └── tox.ini /.bumpversion.cfg: -------------------------------------------------------------------------------- 1 | [bumpversion] 2 | files = setup.py html5lib_truncation/__init__.py 3 | commit = True 4 | tag = True 5 | current_version = 0.1.0 6 | 7 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Created by https://www.gitignore.io 2 | 3 | ### Python ### 4 | # Byte-compiled / optimized / DLL files 5 | __pycache__/ 6 | *.py[cod] 7 | 8 | # C extensions 9 | *.so 10 | 11 | # Distribution / packaging 12 | .Python 13 | env/ 14 | build/ 15 | develop-eggs/ 16 | dist/ 17 | downloads/ 18 | eggs/ 19 | .eggs/ 20 | lib/ 21 | lib64/ 22 | parts/ 23 | sdist/ 24 | var/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .coverage 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | 47 | # Translations 48 | *.mo 49 | *.pot 50 | 51 | # Django stuff: 52 | *.log 53 | 54 | # Sphinx documentation 55 | docs/_build/ 56 | 57 | # PyBuilder 58 | target/ 59 | 60 | -------------------------------------------------------------------------------- /.pyup.yml: -------------------------------------------------------------------------------- 1 | # autogenerated pyup.io config file 2 | # see https://pyup.io/docs/configuration/ for all available options 3 | 4 | update: insecure 5 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | python: 3 | - "2.7" 4 | - "3.3" 5 | - "3.4" 6 | - "pypy" 7 | install: 8 | - "pip install ." 9 | - "pip install pytest pytest-cov pytest-pep8 coveralls" 10 | script: "py.test" 11 | after_success: "coveralls" 12 | branches: 13 | only: 14 | - master 15 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | Copyright (c) 2015 Jiangge Zhang 3 | 4 | Permission is hereby granted, free of charge, to any person obtaining a copy 5 | of this software and associated documentation files (the "Software"), to deal 6 | in the Software without restriction, including without limitation the rights 7 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 8 | copies of the Software, and to permit persons to whom the Software is 9 | furnished to do so, subject to the following conditions: 10 | 11 | The above copyright notice and this permission notice shall be included in all 12 | copies or substantial portions of the Software. 13 | 14 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 15 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 16 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 17 | IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, 18 | DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR 19 | OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE 20 | OR OTHER DEALINGS IN THE SOFTWARE. 21 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | |Build Status| |Coverage Status| |PyPI Version| |Wheel Status| 2 | 3 | html5lib-truncation 4 | =================== 5 | 6 | ``html5lib-truncation`` is a html5lib_ filter implementation, which can 7 | truncate HTML to specific length in display, but never breaks HTML tags. 8 | 9 | There is a shortcut function, the simplest way to use it: 10 | 11 | .. code-block:: python 12 | 13 | >>> from html5lib_truncation import truncate_html 14 | >>> 15 | >>> html = u'

A very very long link

' 16 | >>> truncate_html(html, 8) 17 | u'

A very' 18 | >>> truncate_html(html, 8, break_words=True) 19 | u'

A very ve' 20 | >>> truncate_html(html, 20, end='...') 21 | u'

A very very...' 22 | >>> truncate_html(html, 20, end='...', break_words=True) 23 | u'

A very very lon...' 24 | 25 | 26 | .. _html5lib: https://github.com/html5lib/html5lib-python 27 | 28 | 29 | Installation 30 | ------------ 31 | 32 | :: 33 | 34 | pip install html5lib-truncation 35 | 36 | Don't forget to put it into your ``requirements.txt`` or ``setup.py``. 37 | 38 | 39 | API Overview 40 | ------------ 41 | 42 | The core API of html5lib-truncation is the filter: 43 | 44 | .. code-block:: python 45 | 46 | import html5lib 47 | from html5lib_truncation import TruncationFilter 48 | 49 | etree = html5lib.parse(u'

A very very long link

') 50 | walker = html5lib.getTreeWalker('etree') 51 | 52 | stream = walker(etree) 53 | stream = TruncationFilter(stream, 20, end='...', break_words=True) 54 | 55 | serializer = html5lib.serializer.HTMLSerializer() 56 | serialized = serializer.serialize(stream) 57 | 58 | print(u''.join(serialized).strip()) 59 | 60 | The output is ``

A very very lon...``. 61 | 62 | 63 | Issues 64 | ------ 65 | 66 | If you want to report bugs or other issues, please create issues on 67 | `GitHub Issues `_. 68 | 69 | 70 | Contributes 71 | ----------- 72 | 73 | You can send a pull reueqst on 74 | `GitHub `_. 75 | 76 | .. |Build Status| image:: https://img.shields.io/travis/tonyseek/html5lib-truncation.svg?style=flat 77 | :target: https://travis-ci.org/tonyseek/html5lib-truncation 78 | :alt: Build Status 79 | .. |Coverage Status| image:: https://img.shields.io/coveralls/tonyseek/html5lib-truncation.svg?style=flat 80 | :target: https://coveralls.io/r/tonyseek/html5lib-truncation 81 | :alt: Coverage Status 82 | .. |Wheel Status| image:: https://img.shields.io/pypi/wheel/html5lib-truncation.svg?style=flat 83 | :target: https://warehouse.python.org/project/html5lib-truncation 84 | :alt: Wheel Status 85 | .. |PyPI Version| image:: https://img.shields.io/pypi/v/html5lib-truncation.svg?style=flat 86 | :target: https://pypi.python.org/pypi/html5lib-truncation 87 | :alt: PyPI Version 88 | -------------------------------------------------------------------------------- /html5lib_truncation/__init__.py: -------------------------------------------------------------------------------- 1 | from .filters import TruncationFilter 2 | from .shortcuts import truncate_html 3 | 4 | __all__ = ['TruncationFilter', 'truncate_html'] 5 | __version__ = '0.1.0' 6 | -------------------------------------------------------------------------------- /html5lib_truncation/filters.py: -------------------------------------------------------------------------------- 1 | from __future__ import unicode_literals 2 | 3 | from .utils import truncate_sentence 4 | 5 | 6 | class TruncationFilter(object): 7 | """The filter of html5lib for truncating documents. 8 | 9 | Just like the other filters defined in :mod:`html5lib.filters`, this 10 | class's instances could be used with filter-wrapped or raw stream. 11 | 12 | :param source: The source stream. 13 | :param max_chars: The maximum characters on display after truncated. 14 | :param break_words: ``True`` if breaking words is allowed. It is defaults 15 | to ``False`` which means last broken word will be 16 | dropped in truncating. 17 | :param end: The end characters such as a ellipsis ``"..."``. 18 | """ 19 | 20 | def __init__(self, source, max_chars, break_words=False, end=''): 21 | self.source = source 22 | self.max_chars = max_chars 23 | self.break_words = break_words 24 | self.end = end 25 | 26 | def __iter__(self): 27 | return TruncationIterator(self) 28 | 29 | def __getattr__(self, name): 30 | return getattr(self.source, name) 31 | 32 | 33 | class TruncationIterator(object): 34 | """The truncation iterator. It is stateful.""" 35 | 36 | def __init__(self, master): 37 | self.master = master 38 | self.source = iter(master.source) 39 | self.total_tags = 0 40 | self.total_chars = 0 41 | 42 | @property 43 | def overflow(self): 44 | return self.total_chars + len(self.master.end) > self.master.max_chars 45 | 46 | @property 47 | def all_tags_closed(self): 48 | return self.total_tags <= 0 49 | 50 | def __iter__(self): 51 | return self 52 | 53 | def __next__(self): 54 | token = next(self.source) 55 | 56 | if token['type'] == 'StartTag': 57 | self.total_tags += 1 58 | 59 | if token['type'] == 'EndTag': 60 | self.total_tags -= 1 61 | 62 | if token['type'] == 'Characters': 63 | self.total_chars += len(token['data']) 64 | if self.overflow: 65 | token = dict(token) 66 | overflow_chars = ( 67 | self.total_chars - self.master.max_chars + 68 | len(self.master.end)) 69 | token['data'] = truncate_sentence( 70 | text=token['data'], 71 | max_chars=len(token['data']) - overflow_chars, 72 | break_words=self.master.break_words, 73 | padding=len(self.master.end)) 74 | if token['data']: 75 | token['data'] += self.master.end 76 | 77 | if self.overflow and self.all_tags_closed: 78 | raise StopIteration 79 | 80 | return token 81 | 82 | # For compatible with Python 2.x 83 | next = __next__ 84 | -------------------------------------------------------------------------------- /html5lib_truncation/shortcuts.py: -------------------------------------------------------------------------------- 1 | from __future__ import unicode_literals 2 | 3 | import html5lib 4 | 5 | from .filters import TruncationFilter 6 | 7 | 8 | def truncate_html(html, *args, **kwargs): 9 | """Truncates HTML string. 10 | 11 | :param html: The HTML string or parsed element tree (with 12 | :func:`html5lib.parse`). 13 | :param kwargs: Similar with :class:`.filters.TruncationFilter`. 14 | 15 | :return: The truncated HTML string. 16 | """ 17 | if hasattr(html, 'getchildren'): 18 | etree = html 19 | else: 20 | etree = html5lib.parse(html) 21 | 22 | walker = html5lib.getTreeWalker('etree') 23 | 24 | stream = walker(etree) 25 | stream = TruncationFilter(stream, *args, **kwargs) 26 | 27 | serializer = html5lib.serializer.HTMLSerializer() 28 | serialized = serializer.serialize(stream) 29 | 30 | return u''.join(serialized).strip() 31 | -------------------------------------------------------------------------------- /html5lib_truncation/utils.py: -------------------------------------------------------------------------------- 1 | from __future__ import unicode_literals 2 | 3 | 4 | def truncate_sentence(text, max_chars, break_words=False, padding=0): 5 | """Truncates a sentence. 6 | 7 | :param max_chars: The maximum characters of truncated sentence. 8 | :param break_words: If you wish to truncate given sentence strictly even 9 | if it breaks a word, set it to ``True``. It defaults 10 | to ``False`` which means truncating given sentence 11 | shorter but never breaking words. 12 | :param padding: The padding size for truncating. It is usually used to 13 | keep spaces for some ending characters such as ``"..."``. 14 | :return: The truncated sentence. 15 | """ 16 | if break_words: 17 | return text[:-abs(max_chars - len(text)) - padding] 18 | 19 | words = [] 20 | for word in text.split(): 21 | predicted_len = ( 22 | sum(map(len, words)) + # length of words 23 | len(word) + # length of next word 24 | len(words) - 1 + # length of spaces 25 | padding) 26 | if predicted_len >= max_chars: 27 | break 28 | words.append(word) 29 | return ' '.join(words) 30 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [pytest] 2 | addopts = --cov html5lib_truncation --pep8 3 | pep8ignore = 4 | docs/conf.py ALL 5 | docs/_themes/* ALL 6 | [bdist_wheel] 7 | universal = 1 8 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | 4 | with open('README.rst') as readme: 5 | next(readme) 6 | long_description = ''.join(readme).strip() 7 | 8 | 9 | setup( 10 | name='html5lib-truncation', 11 | version='0.1.0', 12 | author='Jiangge Zhang', 13 | author_email='tonyseek@gmail.com', 14 | description='Truncating HTML with html5lib filter', 15 | long_description=long_description, 16 | url='https://github.com/tonyseek/html5lib-truncation', 17 | license='MIT', 18 | classifiers=[ 19 | 'Development Status :: 3 - Alpha', 20 | 'License :: OSI Approved :: MIT License', 21 | 'Operating System :: OS Independent', 22 | 'Programming Language :: Python', 23 | 'Programming Language :: Python :: 2.7', 24 | 'Programming Language :: Python :: 3.3', 25 | 'Programming Language :: Python :: 3.4', 26 | 'Programming Language :: Python :: Implementation :: PyPy', 27 | ], 28 | packages=find_packages(), 29 | platforms=['Any'], 30 | install_requires=['html5lib'], 31 | ) 32 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tonyseek/html5lib-truncation/b5551e345e583d04dbdf6b97dc2a43a266eec8d6/tests/__init__.py -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | from __future__ import unicode_literals 2 | 3 | import pytest 4 | import html5lib 5 | 6 | 7 | html = '''

Return a truncated copy of the string. The length is specified 8 | with the first parameter which defaults to 9 | 255. If the second parameter is 10 | true the filter 11 | will cut the text at length. Otherwise it will discard the last word. If 12 | the text was in fact truncated it will append an ellipsis sign 13 | ("..."). If you 14 | want a different ellipsis sign than 15 | "..." you can specify it using the third 16 | parameter.

''' 17 | 18 | 19 | @pytest.fixture 20 | def etree(): 21 | return html5lib.parse(html) 22 | -------------------------------------------------------------------------------- /tests/test_filters.py: -------------------------------------------------------------------------------- 1 | from __future__ import unicode_literals 2 | 3 | from html5lib import getTreeWalker 4 | from html5lib_truncation import truncate_html, TruncationFilter 5 | 6 | 7 | result_a = ( 8 | '

Return a truncated copy of the string. The length is specified with ' 9 | 'the first parameter which \n' 10 | '\n' 11 | ' ' 12 | ' ' 13 | '\n' 14 | '') 15 | result_b = ( 16 | '

Return a truncated copy of the string. The length is specified\n' 17 | 'with the first parameter which defa \n' 18 | '\n' 19 | ' ' 20 | ' ' 21 | '\n' 22 | '') 23 | result_c = ( 24 | '

Return a truncated copy of the string. The length is specified with ' 25 | 'the first parameter... \n' 26 | '\n' 27 | ' ' 28 | ' ' 29 | '\n' 30 | '') 31 | result_d = ( 32 | '

Return a truncated copy of the string. The length is specified\n' 33 | 'with the first parameter whic... \n' 34 | '\n' 35 | ' ' 36 | ' ' 37 | '\n' 38 | '') 39 | 40 | 41 | def test_truncation(etree): 42 | assert truncate_html(etree, 98) == result_a 43 | assert truncate_html(etree, 98, break_words=True) == result_b 44 | assert truncate_html(etree, 98, end='...') == result_c 45 | assert truncate_html(etree, 98, end='...', break_words=True) == result_d 46 | 47 | 48 | def test_truncation_with_string(): 49 | assert truncate_html(result_a, 98) == result_a 50 | 51 | 52 | def test_iterable(etree): 53 | walker = getTreeWalker('etree') 54 | stream = walker(etree) 55 | stream = TruncationFilter(stream, 98, end='...') 56 | 57 | assert stream.tree is etree 58 | 59 | iterator = iter(stream) 60 | assert iterator is not stream 61 | assert iter(iterator) is iterator 62 | -------------------------------------------------------------------------------- /tests/test_utils.py: -------------------------------------------------------------------------------- 1 | from __future__ import unicode_literals 2 | 3 | from html5lib_truncation.utils import truncate_sentence 4 | 5 | 6 | def test_truncate_sentence(): 7 | s = 'Three Rings for the Elven-kings under the sky' 8 | 9 | assert truncate_sentence(s, 18) == 'Three Rings for' 10 | assert truncate_sentence(s, 18, break_words=True) == 'Three Rings for th' 11 | assert truncate_sentence(s, 18, break_words=False) == 'Three Rings for' 12 | 13 | assert truncate_sentence(s, 18, break_words=True, padding=9) == 'Three Rin' 14 | assert truncate_sentence(s, 18, break_words=False, padding=9) == 'Three' 15 | -------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | [tox] 2 | envlist = py27,py33,py34,pypy 3 | [testenv] 4 | deps = 5 | pytest 6 | pytest-cov 7 | pytest-pep8 8 | commands = 9 | py.test 10 | --------------------------------------------------------------------------------