├── .bumpversion.cfg
├── .gitignore
├── .pyup.yml
├── .travis.yml
├── LICENSE
├── README.rst
├── html5lib_truncation
    ├── __init__.py
    ├── filters.py
    ├── shortcuts.py
    └── utils.py
├── setup.cfg
├── setup.py
├── tests
    ├── __init__.py
    ├── conftest.py
    ├── test_filters.py
    └── test_utils.py
└── tox.ini


/.bumpversion.cfg:
--------------------------------------------------------------------------------
1 | [bumpversion]
2 | files = setup.py html5lib_truncation/__init__.py
3 | commit = True
4 | tag = True
5 | current_version = 0.1.0
6 | 
7 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | # Created by https://www.gitignore.io
 2 | 
 3 | ### Python ###
 4 | # Byte-compiled / optimized / DLL files
 5 | __pycache__/
 6 | *.py[cod]
 7 | 
 8 | # C extensions
 9 | *.so
10 | 
11 | # Distribution / packaging
12 | .Python
13 | env/
14 | build/
15 | develop-eggs/
16 | dist/
17 | downloads/
18 | eggs/
19 | .eggs/
20 | lib/
21 | lib64/
22 | parts/
23 | sdist/
24 | var/
25 | *.egg-info/
26 | .installed.cfg
27 | *.egg
28 | 
29 | # PyInstaller
30 | #  Usually these files are written by a python script from a template
31 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
32 | *.manifest
33 | *.spec
34 | 
35 | # Installer logs
36 | pip-log.txt
37 | pip-delete-this-directory.txt
38 | 
39 | # Unit test / coverage reports
40 | htmlcov/
41 | .tox/
42 | .coverage
43 | .cache
44 | nosetests.xml
45 | coverage.xml
46 | 
47 | # Translations
48 | *.mo
49 | *.pot
50 | 
51 | # Django stuff:
52 | *.log
53 | 
54 | # Sphinx documentation
55 | docs/_build/
56 | 
57 | # PyBuilder
58 | target/
59 | 
60 | 


--------------------------------------------------------------------------------
/.pyup.yml:
--------------------------------------------------------------------------------
1 | # autogenerated pyup.io config file 
2 | # see https://pyup.io/docs/configuration/ for all available options
3 | 
4 | update: insecure
5 | 


--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
 1 | language: python
 2 | python:
 3 |   - "2.7"
 4 |   - "3.3"
 5 |   - "3.4"
 6 |   - "pypy"
 7 | install:
 8 |   - "pip install ."
 9 |   - "pip install pytest pytest-cov pytest-pep8 coveralls"
10 | script: "py.test"
11 | after_success: "coveralls"
12 | branches:
13 |   only:
14 |     - master
15 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | The MIT License (MIT)
 2 | Copyright (c) 2015 Jiangge Zhang
 3 | 
 4 | Permission is hereby granted, free of charge, to any person obtaining a copy
 5 | of this software and associated documentation files (the "Software"), to deal
 6 | in the Software without restriction, including without limitation the rights
 7 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 8 | copies of the Software, and to permit persons to whom the Software is
 9 | furnished to do so, subject to the following conditions:
10 | 
11 | The above copyright notice and this permission notice shall be included in all
12 | copies or substantial portions of the Software.
13 | 
14 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
17 | IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
18 | DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
19 | OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE
20 | OR OTHER DEALINGS IN THE SOFTWARE.
21 | 


--------------------------------------------------------------------------------
/README.rst:
--------------------------------------------------------------------------------
 1 | |Build Status| |Coverage Status| |PyPI Version| |Wheel Status|
 2 | 
 3 | html5lib-truncation
 4 | ===================
 5 | 
 6 | ``html5lib-truncation`` is an html5lib_ filter implementation that
 7 | truncates HTML to a specific display length without ever breaking HTML tags.
 8 | 
 9 | The simplest way to use it is the shortcut function:
10 | 
11 | .. code-block:: python
12 | 
13 |     >>> from html5lib_truncation import truncate_html
14 |     >>>
15 |     >>> html = u'<p>A <a href="#">very very long link</a></p>'
16 |     >>> truncate_html(html, 8)
17 |     u'<p>A <a href=#>very</a>'
18 |     >>> truncate_html(html, 8, break_words=True)
19 |     u'<p>A <a href=#>very ve</a>'
20 |     >>> truncate_html(html, 20, end='...')
21 |     u'<p>A <a href=#>very very...</a>'
22 |     >>> truncate_html(html, 20, end='...', break_words=True)
23 |     u'<p>A <a href=#>very very lon...</a>'
24 | 
25 | 
26 | .. _html5lib: https://github.com/html5lib/html5lib-python
27 | 
28 | 
29 | Installation
30 | ------------
31 | 
32 | ::
33 | 
34 |     pip install html5lib-truncation
35 | 
36 | Don't forget to put it into your ``requirements.txt`` or ``setup.py``.
37 | 
38 | 
39 | API Overview
40 | ------------
41 | 
42 | The core API of html5lib-truncation is the filter:
43 | 
44 | .. code-block:: python
45 | 
46 |     import html5lib
47 |     from html5lib_truncation import TruncationFilter
48 | 
49 |     etree = html5lib.parse(u'<p>A <a href="#">very very long link</a></p>')
50 |     walker = html5lib.getTreeWalker('etree')
51 | 
52 |     stream = walker(etree)
53 |     stream = TruncationFilter(stream, 20, end='...', break_words=True)
54 | 
55 |     serializer = html5lib.serializer.HTMLSerializer()
56 |     serialized = serializer.serialize(stream)
57 | 
58 |     print(u''.join(serialized).strip())
59 | 
60 | The output is ``<p>A <a href=#>very very lon...</a>``.
61 | 
62 | 
63 | Issues
64 | ------
65 | 
66 | If you want to report bugs or other issues, please create issues on
67 | `GitHub Issues <https://github.com/tonyseek/html5lib-truncation/issues>`_.
68 | 
69 | 
70 | Contributing
71 | ------------
72 | 
73 | You can send a pull request on
74 | `GitHub <https://github.com/tonyseek/html5lib-truncation/pulls>`_.
75 | 
76 | .. |Build Status| image:: https://img.shields.io/travis/tonyseek/html5lib-truncation.svg?style=flat
77 |    :target: https://travis-ci.org/tonyseek/html5lib-truncation
78 |    :alt: Build Status
79 | .. |Coverage Status| image:: https://img.shields.io/coveralls/tonyseek/html5lib-truncation.svg?style=flat
80 |    :target: https://coveralls.io/r/tonyseek/html5lib-truncation
81 |    :alt: Coverage Status
82 | .. |Wheel Status| image:: https://img.shields.io/pypi/wheel/html5lib-truncation.svg?style=flat
83 |    :target: https://warehouse.python.org/project/html5lib-truncation
84 |    :alt: Wheel Status
85 | .. |PyPI Version| image:: https://img.shields.io/pypi/v/html5lib-truncation.svg?style=flat
86 |    :target: https://pypi.python.org/pypi/html5lib-truncation
87 |    :alt: PyPI Version
88 | 


--------------------------------------------------------------------------------
/html5lib_truncation/__init__.py:
--------------------------------------------------------------------------------
1 | from .filters import TruncationFilter
2 | from .shortcuts import truncate_html
3 | 
4 | __all__ = ['TruncationFilter', 'truncate_html']
5 | __version__ = '0.1.0'
6 | 


--------------------------------------------------------------------------------
/html5lib_truncation/filters.py:
--------------------------------------------------------------------------------
 1 | from __future__ import unicode_literals
 2 | 
 3 | from .utils import truncate_sentence
 4 | 
 5 | 
 6 | class TruncationFilter(object):
 7 |     """The filter of html5lib for truncating documents.
 8 | 
 9 |     Just like the other filters defined in :mod:`html5lib.filters`, this
10 |     class's instances could be used with filter-wrapped or raw stream.
11 | 
12 |     :param source: The source stream.
13 |     :param max_chars: The maximum characters on display after truncated.
14 |     :param break_words: ``True`` if breaking words is allowed. It is defaults
15 |                         to ``False`` which means last broken word will be
16 |                         dropped in truncating.
17 |     :param end: The end characters such as a ellipsis ``"..."``.
18 |     """
19 | 
20 |     def __init__(self, source, max_chars, break_words=False, end=''):
21 |         self.source = source
22 |         self.max_chars = max_chars
23 |         self.break_words = break_words
24 |         self.end = end
25 | 
26 |     def __iter__(self):
27 |         return TruncationIterator(self)
28 | 
29 |     def __getattr__(self, name):
30 |         return getattr(self.source, name)
31 | 
32 | 
class TruncationIterator(object):
    """The truncation iterator. It is stateful.

    It pulls tokens from ``master.source``, counts open tags and the
    characters seen so far, and shortens ``Characters`` tokens once the
    character budget of the master filter is exceeded.  Iteration ends after
    the token that leaves no tag open while the budget is exceeded.

    :param master: The object providing ``source``, ``max_chars``,
                   ``break_words`` and ``end``.
    """

    def __init__(self, master):
        self.master = master
        self.source = iter(master.source)
        self.total_tags = 0
        self.total_chars = 0
        # Set once the final token has been handed out, so that later calls
        # keep raising StopIteration instead of consuming the source again.
        self._finished = False

    @property
    def overflow(self):
        """``True`` once the seen characters plus ``end`` exceed the budget."""
        return self.total_chars + len(self.master.end) > self.master.max_chars

    @property
    def all_tags_closed(self):
        """``True`` while no start tag is waiting for its end tag."""
        return self.total_tags <= 0

    def __iter__(self):
        return self

    def __next__(self):
        """Return the next (possibly truncated) token.

        :raises StopIteration: when the source is exhausted, or after the
                               token that closed the last open tag in the
                               overflow state has been returned.
        """
        if self._finished:
            raise StopIteration

        token = next(self.source)
        token_type = token['type']

        if token_type == 'StartTag':
            self.total_tags += 1
        elif token_type == 'EndTag':
            self.total_tags -= 1
        elif token_type == 'Characters':
            self.total_chars += len(token['data'])
            if self.overflow:
                # Copy before editing so the source's token is not mutated.
                token = dict(token)
                # NOTE: room for ``end`` is reserved here and once more via
                # ``padding`` below, so the visible text may come out
                # ``len(end)`` characters shorter than ``max_chars``.  This
                # is kept as is to preserve the existing output.
                overflow_chars = (
                    self.total_chars - self.master.max_chars +
                    len(self.master.end))
                token['data'] = truncate_sentence(
                    text=token['data'],
                    max_chars=len(token['data']) - overflow_chars,
                    break_words=self.master.break_words,
                    padding=len(self.master.end))
                # Only a token that still shows some text gets the ending.
                if token['data']:
                    token['data'] += self.master.end

        # Stop *after* this token rather than instead of it: raising here
        # would swallow the token that has just closed the last open tag.
        if self.overflow and self.all_tags_closed:
            self._finished = True

        return token

    # For compatibility with Python 2.x
    next = __next__
84 | 


--------------------------------------------------------------------------------
/html5lib_truncation/shortcuts.py:
--------------------------------------------------------------------------------
 1 | from __future__ import unicode_literals
 2 | 
 3 | import html5lib
 4 | 
 5 | from .filters import TruncationFilter
 6 | 
 7 | 
 8 | def truncate_html(html, *args, **kwargs):
 9 |     """Truncates HTML string.
10 | 
11 |     :param html: The HTML string or parsed element tree (with
12 |                  :func:`html5lib.parse`).
13 |     :param kwargs: Similar with :class:`.filters.TruncationFilter`.
14 | 
15 |     :return: The truncated HTML string.
16 |     """
17 |     if hasattr(html, 'getchildren'):
18 |         etree = html
19 |     else:
20 |         etree = html5lib.parse(html)
21 | 
22 |     walker = html5lib.getTreeWalker('etree')
23 | 
24 |     stream = walker(etree)
25 |     stream = TruncationFilter(stream, *args, **kwargs)
26 | 
27 |     serializer = html5lib.serializer.HTMLSerializer()
28 |     serialized = serializer.serialize(stream)
29 | 
30 |     return u''.join(serialized).strip()
31 | 


--------------------------------------------------------------------------------
/html5lib_truncation/utils.py:
--------------------------------------------------------------------------------
 1 | from __future__ import unicode_literals
 2 | 
 3 | 
 4 | def truncate_sentence(text, max_chars, break_words=False, padding=0):
 5 |     """Truncates a sentence.
 6 | 
 7 |     :param max_chars: The maximum characters of truncated sentence.
 8 |     :param break_words: If you wish to truncate given sentence strictly even
 9 |                         if it breaks a word, set it to ``True``. It defaults
10 |                         to ``False`` which means truncating given sentence
11 |                         shorter but never breaking words.
12 |     :param padding: The padding size for truncating. It is usually used to
13 |                     keep spaces for some ending characters such as ``"..."``.
14 |     :return: The truncated sentence.
15 |     """
16 |     if break_words:
17 |         return text[:-abs(max_chars - len(text)) - padding]
18 | 
19 |     words = []
20 |     for word in text.split():
21 |         predicted_len = (
22 |             sum(map(len, words)) +  # length of words
23 |             len(word) +  # length of next word
24 |             len(words) - 1 +  # length of spaces
25 |             padding)
26 |         if predicted_len >= max_chars:
27 |             break
28 |         words.append(word)
29 |     return ' '.join(words)
30 | 


--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
1 | [pytest]
2 | addopts = --cov html5lib_truncation --pep8
3 | pep8ignore =
4 |     docs/conf.py ALL
5 |     docs/_themes/* ALL
6 | [bdist_wheel]
7 | universal = 1
8 | 


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | from setuptools import setup, find_packages
 2 | 
 3 | 
# Use the README as the long description of the package.
with open('README.rst') as readme:
    # Skip the first line of the README (the status badge substitutions).
    next(readme)
    long_description = ''.join(readme).strip()


# Package metadata.  NOTE: ``.bumpversion.cfg`` lists this file, so the
# ``version`` literal below is presumably rewritten by bumpversion.
setup(
    name='html5lib-truncation',
    version='0.1.0',
    author='Jiangge Zhang',
    author_email='tonyseek@gmail.com',
    description='Truncating HTML with html5lib filter',
    long_description=long_description,
    url='https://github.com/tonyseek/html5lib-truncation',
    license='MIT',
    classifiers=[
        'Development Status :: 3 - Alpha',
        'License :: OSI Approved :: MIT License',
        'Operating System :: OS Independent',
        'Programming Language :: Python',
        'Programming Language :: Python :: 2.7',
        'Programming Language :: Python :: 3.3',
        'Programming Language :: Python :: 3.4',
        'Programming Language :: Python :: Implementation :: PyPy',
    ],
    packages=find_packages(),
    platforms=['Any'],
    install_requires=['html5lib'],
)
32 | 


--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tonyseek/html5lib-truncation/b5551e345e583d04dbdf6b97dc2a43a266eec8d6/tests/__init__.py


--------------------------------------------------------------------------------
/tests/conftest.py:
--------------------------------------------------------------------------------
 1 | from __future__ import unicode_literals
 2 | 
 3 | import pytest
 4 | import html5lib
 5 | 
 6 | 
# Sample HTML paragraph, with nested ``<tt>``/``<span>`` markup, that the
# ``etree`` fixture parses.
html = '''<p>Return a truncated copy of the string. The length is specified
with the first parameter which defaults to <tt class="docutils literal">
<span class="pre">255</span></tt>. If the second parameter is
<tt class="docutils literal"><span class="pre">true</span></tt> the filter
will cut the text at length. Otherwise it will discard the last word. If
the text was in fact truncated it will append an ellipsis sign
(<tt class="docutils literal"><span class="pre">"..."</span></tt>). If you
want a different ellipsis sign than <tt class="docutils literal">
<span class="pre">"..."</span></tt> you can specify it using the third
parameter.</p>'''
17 | 
18 | 
@pytest.fixture
def etree():
    """Provide the sample ``html`` document parsed with html5lib."""
    parsed_document = html5lib.parse(html)
    return parsed_document
22 | 


--------------------------------------------------------------------------------
/tests/test_filters.py:
--------------------------------------------------------------------------------
 1 | from __future__ import unicode_literals
 2 | 
 3 | from html5lib import getTreeWalker
 4 | from html5lib_truncation import truncate_html, TruncationFilter
 5 | 
 6 | 
# Expected outputs of ``truncate_html`` for the sample document at 98
# characters, as asserted in ``test_truncation``:
#   result_a -- default options
#   result_b -- break_words=True
#   result_c -- end='...'
#   result_d -- end='...' and break_words=True
result_a = (
    '<p>Return a truncated copy of the string. The length is specified with '
    'the first parameter which <tt class="docutils literal">\n'
    '<span class=pre></span></tt>\n'
    '<tt class="docutils literal"><span class=pre></span></tt> '
    '<tt class="docutils literal"><span class=pre></span></tt> '
    '<tt class="docutils literal">\n'
    '<span class=pre></span></tt>')
result_b = (
    '<p>Return a truncated copy of the string. The length is specified\n'
    'with the first parameter which defa <tt class="docutils literal">\n'
    '<span class=pre></span></tt>\n'
    '<tt class="docutils literal"><span class=pre></span></tt> '
    '<tt class="docutils literal"><span class=pre></span></tt> '
    '<tt class="docutils literal">\n'
    '<span class=pre></span></tt>')
result_c = (
    '<p>Return a truncated copy of the string. The length is specified with '
    'the first parameter... <tt class="docutils literal">\n'
    '<span class=pre></span></tt>\n'
    '<tt class="docutils literal"><span class=pre></span></tt> '
    '<tt class="docutils literal"><span class=pre></span></tt> '
    '<tt class="docutils literal">\n'
    '<span class=pre></span></tt>')
result_d = (
    '<p>Return a truncated copy of the string. The length is specified\n'
    'with the first parameter whic... <tt class="docutils literal">\n'
    '<span class=pre></span></tt>\n'
    '<tt class="docutils literal"><span class=pre></span></tt> '
    '<tt class="docutils literal"><span class=pre></span></tt> '
    '<tt class="docutils literal">\n'
    '<span class=pre></span></tt>')
39 | 
40 | 
def test_truncation(etree):
    """Check every ``break_words``/``end`` combination at 98 characters."""
    expectations = (
        (dict(), result_a),
        (dict(break_words=True), result_b),
        (dict(end='...'), result_c),
        (dict(end='...', break_words=True), result_d),
    )
    for options, expected in expectations:
        assert truncate_html(etree, 98, **options) == expected
46 | 
47 | 
def test_truncation_with_string():
    """Truncating the expected output string again yields the same string."""
    truncated = truncate_html(result_a, 98)
    assert truncated == result_a
50 | 
51 | 
def test_iterable(etree):
    """Check attribute delegation and the iterator protocol of the filter."""
    tree_walker = getTreeWalker('etree')
    filtered = TruncationFilter(tree_walker(etree), 98, end='...')

    # Attributes the filter does not define come from the wrapped stream.
    assert filtered.tree is etree

    # The filter hands out a separate iterator, which is its own iterator.
    iterator = iter(filtered)
    assert iterator is not filtered
    assert iter(iterator) is iterator
62 | 


--------------------------------------------------------------------------------
/tests/test_utils.py:
--------------------------------------------------------------------------------
 1 | from __future__ import unicode_literals
 2 | 
 3 | from html5lib_truncation.utils import truncate_sentence
 4 | 
 5 | 
 6 | def test_truncate_sentence():
 7 |     s = 'Three Rings for the Elven-kings under the sky'
 8 | 
 9 |     assert truncate_sentence(s, 18) == 'Three Rings for'
10 |     assert truncate_sentence(s, 18, break_words=True) == 'Three Rings for th'
11 |     assert truncate_sentence(s, 18, break_words=False) == 'Three Rings for'
12 | 
13 |     assert truncate_sentence(s, 18, break_words=True, padding=9) == 'Three Rin'
14 |     assert truncate_sentence(s, 18, break_words=False, padding=9) == 'Three'
15 | 


--------------------------------------------------------------------------------
/tox.ini:
--------------------------------------------------------------------------------
 1 | [tox]
 2 | envlist = py27,py33,py34,pypy
 3 | [testenv]
 4 | deps =
 5 |     pytest
 6 |     pytest-cov
 7 |     pytest-pep8
 8 | commands =
 9 |     py.test
10 | 


--------------------------------------------------------------------------------