├── .flake8
├── .gitattributes
├── .github
    └── workflows
    │   ├── python-package.yml
    │   └── python-publish.yml
├── .gitignore
├── .travis.yml
├── LICENSE
├── Makefile
├── README.md
├── doc
    ├── __init__.py
    └── source
    │   ├── __init__.py
    │   ├── api.rst
    │   ├── conf.py
    │   └── index.rst
├── pyproject.toml
├── readability
    ├── __init__.py
    ├── browser.py
    ├── cleaners.py
    ├── debug.py
    ├── encoding.py
    ├── htmls.py
    └── readability.py
├── requirements-dev.txt
├── setup.py
├── tests
    ├── __init__.py
    ├── samples
    │   ├── si-game.sample.html
    │   ├── summary-keep-all-images.sample.html
    │   ├── the-hurricane-rubin-carter-denzel-washington.html
    │   ├── too-many-images.sample.html
    │   └── utf-8-kanji.sample.html
    └── test_article_only.py
└── tox.ini


/.flake8:
--------------------------------------------------------------------------------
1 | [flake8]
2 | ignore = E501, W503 


--------------------------------------------------------------------------------
/.gitattributes:
--------------------------------------------------------------------------------
1 | tests/samples/* linguist-vendored
2 | 


--------------------------------------------------------------------------------
/.github/workflows/python-package.yml:
--------------------------------------------------------------------------------
 1 | # This workflow will install Python dependencies, run tests and lint with a variety of Python versions
 2 | # For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python
 3 | 
 4 | name: Python package
 5 | 
 6 | on:
 7 |   push:
 8 |     branches: [ "master" ]
 9 |   pull_request:
10 |     branches: [ "master" ]
11 | 
12 | jobs:
13 |   build:
14 | 
15 |     runs-on: ubuntu-latest
16 |     strategy:
17 |       fail-fast: false
18 |       matrix:
19 |         python-version: ["3.8", "3.9", "3.10", "3.11", "3.12", "3.13"]
20 | 
21 |     steps:
22 |     - uses: actions/checkout@v4
23 |     - name: Set up Python ${{ matrix.python-version }}
24 |       uses: actions/setup-python@v3
25 |       with:
26 |         python-version: ${{ matrix.python-version }}
27 |     - name: Install dependencies
28 |       run: |
29 |         python -m pip install --upgrade pip
30 |         python -m pip install flake8 pytest
31 |         if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
32 |     - name: Lint with flake8
33 |       run: |
34 |         # stop the build if there are Python syntax errors or undefined names
35 |         flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
36 |         # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
37 |         flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
38 |     - name: Test with pytest
39 |       run: |
40 |         pytest
41 | 


--------------------------------------------------------------------------------
/.github/workflows/python-publish.yml:
--------------------------------------------------------------------------------
 1 | # This workflow will upload a Python Package using Twine when a release is created
 2 | # For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python#publishing-to-package-registries
 3 | 
 4 | # This workflow uses actions that are not certified by GitHub.
 5 | # They are provided by a third-party and are governed by
 6 | # separate terms of service, privacy policy, and support
 7 | # documentation.
 8 | 
 9 | name: Upload Python Package
10 | 
11 | on:
12 |   release:
13 |     types: [published]
14 | 
15 | permissions:
16 |   contents: read
17 | 
18 | jobs:
19 |   deploy:
20 | 
21 |     runs-on: ubuntu-latest
22 | 
23 |     steps:
24 |     - uses: actions/checkout@v3
25 |     - name: Set up Python
26 |       uses: actions/setup-python@v3
27 |       with:
28 |         python-version: '3.x'
29 |     - name: Install dependencies
30 |       run: |
31 |         python -m pip install --upgrade pip
32 |         pip install build
33 |     - name: Build package
34 |       run: python -m build
35 |     - name: Publish package
36 |       uses: pypa/gh-action-pypi-publish@27b31702a0e7fc50959f5ad993c78deac1bdfc29
37 |       with:
38 |         user: __token__
39 |         password: ${{ secrets.PYPI_API_TOKEN }}
40 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | *.pyc
 2 | __pycache__
 3 | *.egg-info
 4 | /build
 5 | /dist
 6 | /bin
 7 | /include
 8 | /lib
 9 | /local
10 | /man
11 | nosetests.xml
12 | .coverage
13 | .tox
14 | .idea
15 | .cache
16 | /.noseids
17 | /.venv
18 | /poetry.lock


--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
 1 | language: python
 2 | os: linux
 3 | cache: pip
 4 | 
 5 | matrix:
 6 |   include:
 7 |     - name: "Python 3.8 on Linux"
 8 |       dist: xenial
 9 |       python: 3.8
10 |     - name: "Python 3.9 Nightly on Linux"
11 |       dist: bionic
12 |       python: nightly
13 |     - name: "Pypy 3 on Linux"
14 |       python: pypy3
15 |   allow_failures:
16 |     - python: nightly
17 |     - python: pypy3
18 |     - os: osx
19 | 
20 | install:
21 |   - if [ $PIP ]; then true; else PIP=pip3; fi
22 |   - travis_retry $PIP install -U pip wheel tox-travis pytest-cov codecov
23 |   - travis_retry $PIP install -U -r requirements.txt -e ".[test]"
24 | 
25 | script:
26 |   - tox
27 | 
28 | after_success:
29 |   - codecov
30 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
  1 |                                  Apache License
  2 |                            Version 2.0, January 2004
  3 |                         http://www.apache.org/licenses/
  4 | 
  5 |    TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
  6 | 
  7 |    1. Definitions.
  8 | 
  9 |       "License" shall mean the terms and conditions for use, reproduction,
 10 |       and distribution as defined by Sections 1 through 9 of this document.
 11 | 
 12 |       "Licensor" shall mean the copyright owner or entity authorized by
 13 |       the copyright owner that is granting the License.
 14 | 
 15 |       "Legal Entity" shall mean the union of the acting entity and all
 16 |       other entities that control, are controlled by, or are under common
 17 |       control with that entity. For the purposes of this definition,
 18 |       "control" means (i) the power, direct or indirect, to cause the
 19 |       direction or management of such entity, whether by contract or
 20 |       otherwise, or (ii) ownership of fifty percent (50%) or more of the
 21 |       outstanding shares, or (iii) beneficial ownership of such entity.
 22 | 
 23 |       "You" (or "Your") shall mean an individual or Legal Entity
 24 |       exercising permissions granted by this License.
 25 | 
 26 |       "Source" form shall mean the preferred form for making modifications,
 27 |       including but not limited to software source code, documentation
 28 |       source, and configuration files.
 29 | 
 30 |       "Object" form shall mean any form resulting from mechanical
 31 |       transformation or translation of a Source form, including but
 32 |       not limited to compiled object code, generated documentation,
 33 |       and conversions to other media types.
 34 | 
 35 |       "Work" shall mean the work of authorship, whether in Source or
 36 |       Object form, made available under the License, as indicated by a
 37 |       copyright notice that is included in or attached to the work
 38 |       (an example is provided in the Appendix below).
 39 | 
 40 |       "Derivative Works" shall mean any work, whether in Source or Object
 41 |       form, that is based on (or derived from) the Work and for which the
 42 |       editorial revisions, annotations, elaborations, or other modifications
 43 |       represent, as a whole, an original work of authorship. For the purposes
 44 |       of this License, Derivative Works shall not include works that remain
 45 |       separable from, or merely link (or bind by name) to the interfaces of,
 46 |       the Work and Derivative Works thereof.
 47 | 
 48 |       "Contribution" shall mean any work of authorship, including
 49 |       the original version of the Work and any modifications or additions
 50 |       to that Work or Derivative Works thereof, that is intentionally
 51 |       submitted to Licensor for inclusion in the Work by the copyright owner
 52 |       or by an individual or Legal Entity authorized to submit on behalf of
 53 |       the copyright owner. For the purposes of this definition, "submitted"
 54 |       means any form of electronic, verbal, or written communication sent
 55 |       to the Licensor or its representatives, including but not limited to
 56 |       communication on electronic mailing lists, source code control systems,
 57 |       and issue tracking systems that are managed by, or on behalf of, the
 58 |       Licensor for the purpose of discussing and improving the Work, but
 59 |       excluding communication that is conspicuously marked or otherwise
 60 |       designated in writing by the copyright owner as "Not a Contribution."
 61 | 
 62 |       "Contributor" shall mean Licensor and any individual or Legal Entity
 63 |       on behalf of whom a Contribution has been received by Licensor and
 64 |       subsequently incorporated within the Work.
 65 | 
 66 |    2. Grant of Copyright License. Subject to the terms and conditions of
 67 |       this License, each Contributor hereby grants to You a perpetual,
 68 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 69 |       copyright license to reproduce, prepare Derivative Works of,
 70 |       publicly display, publicly perform, sublicense, and distribute the
 71 |       Work and such Derivative Works in Source or Object form.
 72 | 
 73 |    3. Grant of Patent License. Subject to the terms and conditions of
 74 |       this License, each Contributor hereby grants to You a perpetual,
 75 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 76 |       (except as stated in this section) patent license to make, have made,
 77 |       use, offer to sell, sell, import, and otherwise transfer the Work,
 78 |       where such license applies only to those patent claims licensable
 79 |       by such Contributor that are necessarily infringed by their
 80 |       Contribution(s) alone or by combination of their Contribution(s)
 81 |       with the Work to which such Contribution(s) was submitted. If You
 82 |       institute patent litigation against any entity (including a
 83 |       cross-claim or counterclaim in a lawsuit) alleging that the Work
 84 |       or a Contribution incorporated within the Work constitutes direct
 85 |       or contributory patent infringement, then any patent licenses
 86 |       granted to You under this License for that Work shall terminate
 87 |       as of the date such litigation is filed.
 88 | 
 89 |    4. Redistribution. You may reproduce and distribute copies of the
 90 |       Work or Derivative Works thereof in any medium, with or without
 91 |       modifications, and in Source or Object form, provided that You
 92 |       meet the following conditions:
 93 | 
 94 |       (a) You must give any other recipients of the Work or
 95 |           Derivative Works a copy of this License; and
 96 | 
 97 |       (b) You must cause any modified files to carry prominent notices
 98 |           stating that You changed the files; and
 99 | 
100 |       (c) You must retain, in the Source form of any Derivative Works
101 |           that You distribute, all copyright, patent, trademark, and
102 |           attribution notices from the Source form of the Work,
103 |           excluding those notices that do not pertain to any part of
104 |           the Derivative Works; and
105 | 
106 |       (d) If the Work includes a "NOTICE" text file as part of its
107 |           distribution, then any Derivative Works that You distribute must
108 |           include a readable copy of the attribution notices contained
109 |           within such NOTICE file, excluding those notices that do not
110 |           pertain to any part of the Derivative Works, in at least one
111 |           of the following places: within a NOTICE text file distributed
112 |           as part of the Derivative Works; within the Source form or
113 |           documentation, if provided along with the Derivative Works; or,
114 |           within a display generated by the Derivative Works, if and
115 |           wherever such third-party notices normally appear. The contents
116 |           of the NOTICE file are for informational purposes only and
117 |           do not modify the License. You may add Your own attribution
118 |           notices within Derivative Works that You distribute, alongside
119 |           or as an addendum to the NOTICE text from the Work, provided
120 |           that such additional attribution notices cannot be construed
121 |           as modifying the License.
122 | 
123 |       You may add Your own copyright statement to Your modifications and
124 |       may provide additional or different license terms and conditions
125 |       for use, reproduction, or distribution of Your modifications, or
126 |       for any such Derivative Works as a whole, provided Your use,
127 |       reproduction, and distribution of the Work otherwise complies with
128 |       the conditions stated in this License.
129 | 
130 |    5. Submission of Contributions. Unless You explicitly state otherwise,
131 |       any Contribution intentionally submitted for inclusion in the Work
132 |       by You to the Licensor shall be under the terms and conditions of
133 |       this License, without any additional terms or conditions.
134 |       Notwithstanding the above, nothing herein shall supersede or modify
135 |       the terms of any separate license agreement you may have executed
136 |       with Licensor regarding such Contributions.
137 | 
138 |    6. Trademarks. This License does not grant permission to use the trade
139 |       names, trademarks, service marks, or product names of the Licensor,
140 |       except as required for reasonable and customary use in describing the
141 |       origin of the Work and reproducing the content of the NOTICE file.
142 | 
143 |    7. Disclaimer of Warranty. Unless required by applicable law or
144 |       agreed to in writing, Licensor provides the Work (and each
145 |       Contributor provides its Contributions) on an "AS IS" BASIS,
146 |       WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 |       implied, including, without limitation, any warranties or conditions
148 |       of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 |       PARTICULAR PURPOSE. You are solely responsible for determining the
150 |       appropriateness of using or redistributing the Work and assume any
151 |       risks associated with Your exercise of permissions under this License.
152 | 
153 |    8. Limitation of Liability. In no event and under no legal theory,
154 |       whether in tort (including negligence), contract, or otherwise,
155 |       unless required by applicable law (such as deliberate and grossly
156 |       negligent acts) or agreed to in writing, shall any Contributor be
157 |       liable to You for damages, including any direct, indirect, special,
158 |       incidental, or consequential damages of any character arising as a
159 |       result of this License or out of the use or inability to use the
160 |       Work (including but not limited to damages for loss of goodwill,
161 |       work stoppage, computer failure or malfunction, or any and all
162 |       other commercial damages or losses), even if such Contributor
163 |       has been advised of the possibility of such damages.
164 | 
165 |    9. Accepting Warranty or Additional Liability. While redistributing
166 |       the Work or Derivative Works thereof, You may choose to offer,
167 |       and charge a fee for, acceptance of support, warranty, indemnity,
168 |       or other liability obligations and/or rights consistent with this
169 |       License. However, in accepting such obligations, You may act only
170 |       on Your own behalf and on Your sole responsibility, not on behalf
171 |       of any other Contributor, and only if You agree to indemnify,
172 |       defend, and hold each Contributor harmless for any liability
173 |       incurred by, or claims asserted against, such Contributor by reason
174 |       of your accepting any such warranty or additional liability.
175 | 
176 |    END OF TERMS AND CONDITIONS
177 | 
178 |    APPENDIX: How to apply the Apache License to your work.
179 | 
180 |       To apply the Apache License to your work, attach the following
181 |       boilerplate notice, with the fields enclosed by brackets "[]"
182 |       replaced with your own identifying information. (Don't include
183 |       the brackets!)  The text should be enclosed in the appropriate
184 |       comment syntax for the file format. We also recommend that a
185 |       file or class name and description of purpose be included on the
186 |       same "printed page" as the copyright notice for easier
187 |       identification within third-party archives.
188 | 
189 |    Copyright [yyyy] [name of copyright owner]
190 | 
191 |    Licensed under the Apache License, Version 2.0 (the "License");
192 |    you may not use this file except in compliance with the License.
193 |    You may obtain a copy of the License at
194 | 
195 |        http://www.apache.org/licenses/LICENSE-2.0
196 | 
197 |    Unless required by applicable law or agreed to in writing, software
198 |    distributed under the License is distributed on an "AS IS" BASIS,
199 |    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 |    See the License for the specific language governing permissions and
201 |    limitations under the License.
202 | 


--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
 1 | # Makefile to help automate tasks
 2 | WD := $(shell pwd)
 3 | PY := .venv/bin/python
 4 | PIP := .venv/bin/pip
 5 | PEP8 := .venv/bin/pep8
 6 | NOSE := .venv/bin/nosetests
 7 | TWINE := .venv/bin/twine
 8 | 
 9 | # ###########
10 | # Tests rule!
11 | # ###########
12 | .PHONY: test
13 | test: venv develop $(NOSE)
14 | 	$(NOSE) --with-id -s tests
15 | 
16 | $(NOSE): setup
17 | 
18 | # #######
19 | # INSTALL
20 | # #######
21 | .PHONY: all
22 | all: setup develop
23 | 
24 | venv: .venv/bin/python
25 | 
26 | setup: venv
27 | 	$(PIP) install -r requirements-dev.txt | grep -v "already satisfied" || true
28 | 
29 | .venv/bin/python:
30 | 	test -d .venv || which python3 && python3 -m venv .venv || virtualenv .venv
31 | 
32 | .PHONY: clean
33 | clean:
34 | 	rm -rf .venv
35 | 
36 | develop: .venv/lib/python*/site-packages/readability-lxml.egg-link
37 | 
38 | .venv/lib/python*/site-packages/readability-lxml.egg-link:
39 | 	$(PY) setup.py develop
40 | 
41 | 
42 | # ###########
43 | # Development
44 | # ###########
45 | .PHONY: clean_all
46 | clean_all: clean_venv
47 | 
48 | .PHONY: build
49 | build:
50 | 	poetry build
51 | 
52 | # ###########
53 | # Deploy
54 | # ###########
55 | .PHONY: dist
56 | dist:
57 | 	$(PY) -m pip install wheel
58 | 	$(PY) setup.py sdist bdist_wheel
59 | 	$(TWINE) check dist/*
60 | 
61 | .PHONY: upload
62 | upload:
63 | 	$(TWINE) upload dist/*
64 | 
65 | .PHONY: bump
66 | bump:
67 | 	$(EDITOR) readability/__init__.py
68 | 	$(eval VERSION := $(shell grep "__version__" readability/__init__.py | cut -d'"' -f2))
69 | 	# fix first occurrence of version in pyproject.toml
70 | 	sed -i '0,/version = ".*"/s//version = "$(VERSION)"/' pyproject.toml
71 | 	git commit -m "Bump version to $(VERSION)" pyproject.toml readability/__init__.py
72 | 	git tag $(VERSION)
73 | 	git push --tags
74 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | [![PyPI version](https://img.shields.io/pypi/v/readability-lxml.svg)](https://pypi.python.org/pypi/readability-lxml)
 2 | 
 3 | # python-readability
 4 | 
 5 | Given an HTML document, extract and clean up the main body text and title.
 6 | 
 7 | This is a Python port of a Ruby port of [arc90's Readability project](https://web.archive.org/web/20130519040221/http://www.readability.com/).
 8 | 
 9 | ## Installation
10 | 
11 | It's easy using `pip`, just run:
12 | 
13 | ```bash
14 | $ pip install readability-lxml
15 | ```
16 | 
17 | As an alternative, you may also use conda to install, just run:
18 | 
19 | ```bash
20 | $ conda install -c conda-forge readability-lxml
21 | ```
22 | 
23 | ## Usage
24 | 
25 | ```python
26 | >>> import requests
27 | >>> from readability import Document
28 | 
29 | >>> response = requests.get('http://example.com')
30 | >>> doc = Document(response.content)
31 | >>> doc.title()
32 | 'Example Domain'
33 | 
34 | >>> doc.summary()
35 | """<html><body><div><body id="readabilityBody">\n<div>\n    <h1>Example Domain</h1>\n
36 | <p>This domain is established to be used for illustrative examples in documents. You may
37 | use this\n    domain in examples without prior coordination or asking for permission.</p>
38 | \n    <p><a href="http://www.iana.org/domains/example">More information...</a></p>\n</div>
39 | \n</body>\n</div></body></html>"""
40 | ```
41 | 
42 | ## Change Log
43 | - 0.8.4 Better CJK support, thanks @cdhigh
44 | - 0.8.3.1 Support for python 3.8 - 3.13
45 | - 0.8.3 We can now save all images via keep_all_images=True (default is to save 1 main image), thanks @botlabsDev
46 | - 0.8.2 Added article author(s) (thanks @mattblaha)
47 | - 0.8.1 Fixed processing of non-ascii HTMLs via regexps.
48 | - 0.8 Replaced XHTML output with HTML5 output in summary() call.
49 | - 0.7.1 Support for Python 3.7 . Fixed a slowdown when processing documents with lots of spaces.
50 | - 0.7 Improved HTML5 tags handling. Fixed stripping unwanted HTML nodes (only first matching node was removed before).
51 | - 0.6 Finally a release which supports Python versions 2.6, 2.7, 3.3 - 3.6
52 | - 0.5 Preparing a release to support Python versions 2.6, 2.7, 3.3 and 3.4
53 | - 0.4 Added Videos loading and allowed more images per paragraph
54 | - 0.3 Added Document.encoding, positive\_keywords and negative\_keywords
55 | 
56 | ## Licensing
57 | 
58 | This code is under [the Apache License 2.0](http://www.apache.org/licenses/LICENSE-2.0) license.
59 | 
60 | ## Thanks to
61 | 
62 | - Latest [readability.js](https://github.com/MHordecki/readability-redux/blob/master/readability/readability.js)
63 | - Ruby port by starrhorne and iterationlabs
64 | - [Python port](https://github.com/gfxmonk/python-readability) by gfxmonk
65 | - [Decruft effort](https://web.archive.org/web/20110214150709/https://www.minvolai.com/blog/decruft-arc90s-readability-in-python/) to move to lxml
66 | - "BR to P" fix from readability.js which improves quality for smaller texts
67 | - Github users contributions.
68 | 


--------------------------------------------------------------------------------
/doc/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/buriy/python-readability/c8d8011f3d4c69d7667a52395237e56e66af8ea4/doc/__init__.py


--------------------------------------------------------------------------------
/doc/source/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/buriy/python-readability/c8d8011f3d4c69d7667a52395237e56e66af8ea4/doc/source/__init__.py


--------------------------------------------------------------------------------
/doc/source/api.rst:
--------------------------------------------------------------------------------
 1 | Reference
 2 | =========
 3 | 
 4 | .. automodule:: readability
 5 |     :members:
 6 |     :show-inheritance:
 7 | 
 8 | .. automodule:: readability.browser
 9 |     :members:
10 |     :show-inheritance:
11 | 
12 | .. automodule:: readability.cleaners
13 |     :members:
14 |     :show-inheritance:
15 | 
16 | .. automodule:: readability.debug
17 |     :members:
18 |     :show-inheritance:
19 | 
20 | .. automodule:: readability.encoding
21 |     :members:
22 |     :show-inheritance:
23 | 
24 | .. automodule:: readability.htmls
25 |     :members:
26 |     :show-inheritance:
27 | 
28 | .. automodule:: readability.readability
29 |     :members:
30 |     :show-inheritance:
31 | 


--------------------------------------------------------------------------------
/doc/source/conf.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python3
  2 | #
  3 | # readability documentation build configuration file, created by
  4 | # sphinx-quickstart on Thu Mar 23 16:29:38 2017.
  5 | #
  6 | # This file is execfile()d with the current directory set to its
  7 | # containing dir.
  8 | #
  9 | # Note that not all possible configuration values are present in this
 10 | # autogenerated file.
 11 | #
 12 | # All configuration values have a default; values that are commented out
 13 | # serve to show the default.
 14 | 
 15 | # If extensions (or modules to document with autodoc) are in another directory,
 16 | # add these directories to sys.path here. If the directory is relative to the
 17 | # documentation root, use os.path.abspath to make it absolute, like shown here.
 18 | #
 19 | import os
 20 | import sys
 21 | 
 22 | sys.path.insert(0, os.path.abspath("../.."))
 23 | 
 24 | import readability
 25 | 
 26 | # -- General configuration ------------------------------------------------
 27 | 
 28 | # If your documentation needs a minimal Sphinx version, state it here.
 29 | #
 30 | # needs_sphinx = '1.0'
 31 | 
 32 | # Add any Sphinx extension module names here, as strings. They can be
 33 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
 34 | # ones.
 35 | extensions = [
 36 |     "sphinx.ext.autodoc",
 37 |     "sphinx.ext.doctest",
 38 |     "sphinx.ext.intersphinx",
 39 |     "sphinx.ext.todo",
 40 |     "myst_parser",
 41 | ]
 42 | 
 43 | # Add any paths that contain templates here, relative to this directory.
 44 | templates_path = ["_templates"]
 45 | 
 46 | # The suffix(es) of source filenames.
 47 | # You can specify multiple suffix as a list of string:
 48 | #
 49 | source_suffix = [".rst", ".md"]
 50 | 
 51 | # The master toctree document.
 52 | master_doc = "index"
 53 | 
 54 | # General information about the project.
 55 | project = "readability"
 56 | copyright = "2020, Yuri Baburov"
 57 | author = "Yuri Baburov"
 58 | 
 59 | # The version info for the project you're documenting, acts as replacement for
 60 | # |version| and |release|, also used in various other places throughout the
 61 | # built documents.
 62 | 
 63 | # The short X.Y version.
 64 | version = readability.__version__
 65 | 
 66 | # The full version, including alpha/beta/rc tags.
 67 | release = readability.__version__
 68 | 
 69 | # The language for content autogenerated by Sphinx. Refer to documentation
 70 | # for a list of supported languages.
 71 | #
 72 | # This is also used if you do content translation via gettext catalogs.
 73 | # Usually you set "language" from the command line for these cases.
 74 | language = "en"
 75 | 
 76 | # List of patterns, relative to source directory, that match files and
 77 | # directories to ignore when looking for source files.
 78 | # This patterns also effect to html_static_path and html_extra_path
 79 | exclude_patterns = []
 80 | 
 81 | # The name of the Pygments (syntax highlighting) style to use.
 82 | pygments_style = "sphinx"
 83 | 
 84 | # If true, `todo` and `todoList` produce output, else they produce nothing.
 85 | todo_include_todos = False
 86 | 
 87 | 
 88 | # -- Options for HTML output ----------------------------------------------
 89 | 
 90 | # The theme to use for HTML and HTML Help pages.  See the documentation for
 91 | # a list of builtin themes.
 92 | #
 93 | html_theme = "sphinx_rtd_theme"
 94 | 
 95 | # Theme options are theme-specific and customize the look and feel of a theme
 96 | # further.  For a list of options available for each theme, see the
 97 | # documentation.
 98 | #
 99 | # html_theme_options = {}
100 | 
101 | # Add any paths that contain custom static files (such as style sheets) here,
102 | # relative to this directory. They are copied after the builtin static files,
103 | # so a file named "default.css" will overwrite the builtin "default.css".
104 | html_static_path = []  #'_static']
105 | 
106 | 
107 | # -- Options for HTMLHelp output ------------------------------------------
108 | 
109 | # Output file base name for HTML help builder.
110 | htmlhelp_basename = "readabilitydoc"
111 | 
112 | 
113 | # -- Options for LaTeX output ---------------------------------------------
114 | 
115 | latex_elements = {
116 |     # The paper size ('letterpaper' or 'a4paper').
117 |     #
118 |     # 'papersize': 'letterpaper',
119 |     # The font size ('10pt', '11pt' or '12pt').
120 |     #
121 |     # 'pointsize': '10pt',
122 |     # Additional stuff for the LaTeX preamble.
123 |     #
124 |     # 'preamble': '',
125 |     # Latex figure (float) alignment
126 |     #
127 |     # 'figure_align': 'htbp',
128 | }
129 | 
130 | # Grouping the document tree into LaTeX files. List of tuples
131 | # (source start file, target name, title,
132 | #  author, documentclass [howto, manual, or own class]).
133 | latex_documents = [(master_doc, "readability.tex", "Readability Documentation", "Yuri Baburov", "manual")]
134 | 
135 | 
136 | # -- Options for manual page output ---------------------------------------
137 | 
138 | # One entry per manual page. List of tuples
139 | # (source start file, name, description, authors, manual section).
140 | man_pages = [(master_doc, "readability", "readability Documentation", [author], 1)]
141 | 
142 | 
143 | # -- Options for Texinfo output -------------------------------------------
144 | 
145 | # Grouping the document tree into Texinfo files. List of tuples
146 | # (source start file, target name, title, author,
147 | #  dir menu entry, description, category)
148 | texinfo_documents = [
149 |     (
150 |         master_doc,
151 |         "readability",
152 |         "Readability Documentation",
153 |         author,
154 |         "readability",
155 |         "One line description of project.",
156 |         "Miscellaneous",
157 |     )
158 | ]
159 | 
160 | 
161 | intersphinx_mapping = {
162 |     "python": ("https://docs.python.org/3", None),
163 | }
164 | 


--------------------------------------------------------------------------------
/doc/source/index.rst:
--------------------------------------------------------------------------------
 1 | .. include:: ../../README.rst
 2 | 
 3 | .. toctree::
 4 |     :maxdepth: 2
 5 | 
 6 |     api
 7 | 
 8 | Indices and tables
 9 | ------------------
10 | 
11 | * :ref:`genindex`
12 | * :ref:`modindex`
13 | * :ref:`search`
14 | 


--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
 1 | [tool.poetry]
 2 | name = "readability-lxml"
 3 | version = "0.8.4.1"
 4 | description = "fast html to text parser (article readability tool) with python 3 support"
 5 | authors = ["Yuri Baburov <burchik@gmail.com>"]
 6 | license = "Apache License 2.0"
 7 | readme = "README.md"
 8 | packages = [
 9 |     { include = "readability" },
10 | ]
11 | 
12 | [tool.poetry.dependencies]
13 | python = ">=3.8.2,<3.14"
14 | chardet = "^5.2.0"
15 | cssselect = [
16 |     { version = "~1.2", markers = "python_version < '3.9'" },
17 |     { version = "~1.3", markers = "python_version >= '3.9'" }
18 | ]
19 | lxml = {extras = ["html-clean"], version = "^5.4.0"}
20 | lxml-html-clean = {markers = "python_version < \"3.11\"", version = "^0.4.2"}
21 | 
22 | 
23 | [build-system]
24 | requires = ["poetry-core"]
25 | build-backend = "poetry.core.masonry.api"
26 | 


--------------------------------------------------------------------------------
/readability/__init__.py:
--------------------------------------------------------------------------------
1 | __version__ = "0.8.4.1"
2 | 
3 | from .readability import Document
4 | 


--------------------------------------------------------------------------------
/readability/browser.py:
--------------------------------------------------------------------------------
 1 | def open_in_browser(html):
 2 |     """
 3 |     Open the HTML document in a web browser, saving it to a temporary
 4 |     file to open it.  Note that this does not delete the file after
 5 |     use.  This is mainly meant for debugging.
 6 |     """
 7 |     import os
 8 |     import webbrowser
 9 |     import tempfile
10 | 
11 |     handle, fn = tempfile.mkstemp(suffix=".html")
12 |     f = os.fdopen(handle, "wb")
13 |     try:
14 |         f.write(b"<meta charset='UTF-8' />")
15 |         f.write(html.encode("utf-8"))
16 |     finally:
17 |         # we leak the file itself here, but we should at least close it
18 |         f.close()
19 |     url = "file://" + fn.replace(os.path.sep, "/")
20 |     webbrowser.open(url)
21 |     return url
22 | 


--------------------------------------------------------------------------------
/readability/cleaners.py:
--------------------------------------------------------------------------------
 1 | # strip out a set of nuisance html attributes that can mess up rendering in RSS feeds
 2 | import re
 3 | try:
 4 |     from lxml.html.clean import Cleaner
 5 | except ImportError:
 6 |     from lxml_html_clean import Cleaner
 7 | 
 8 | bad_attrs = ["width", "height", "style", "[-a-z]*color", "background[-a-z]*", "on*"]
 9 | single_quoted = "'[^']+'"
10 | double_quoted = '"[^"]+"'
11 | non_space = "[^ \"'>]+"
12 | htmlstrip = re.compile(
13 |     "<"  # open
14 |     "([^>]+) "  # prefix
15 |     "(?:%s) *" % ("|".join(bad_attrs),)
16 |     + "= *(?:%s|%s|%s)"  # undesirable attributes
17 |     % (non_space, single_quoted, double_quoted)
18 |     + "([^>]*)"  # value  # postfix
19 |     ">",  # end
20 |     re.I,
21 | )
22 | 
23 | 
24 | def clean_attributes(html):
25 |     while htmlstrip.search(html):
26 |         html = htmlstrip.sub("<\\1\\2>", html)
27 |     return html
28 | 
29 | 
30 | def normalize_spaces(s):
31 |     if not s:
32 |         return ""
33 |     """replace any sequence of whitespace
34 |     characters with a single space"""
35 |     return " ".join(s.split())
36 | 
37 | 
38 | html_cleaner = Cleaner(
39 |     scripts=True,
40 |     javascript=True,
41 |     comments=True,
42 |     style=True,
43 |     links=True,
44 |     meta=False,
45 |     add_nofollow=False,
46 |     page_structure=False,
47 |     processing_instructions=True,
48 |     embedded=False,
49 |     frames=False,
50 |     forms=False,
51 |     annoying_tags=False,
52 |     remove_tags=None,
53 |     remove_unknown_tags=False,
54 |     safe_attrs_only=False,
55 | )
56 | 


--------------------------------------------------------------------------------
/readability/debug.py:
--------------------------------------------------------------------------------
 1 | import re
 2 | 
 3 | 
 4 | # FIXME: use with caution, can leak memory
 5 | uids = {}
 6 | uids_document = None
 7 | 
 8 | 
 9 | def describe_node(node):
10 |     global uids
11 |     if node is None:
12 |         return ""
13 |     if not hasattr(node, "tag"):
14 |         return "[%s]" % type(node)
15 |     name = node.tag
16 |     if node.get("id", ""):
17 |         name += "#" + node.get("id")
18 |     if node.get("class", "").strip():
19 |         name += "." + ".".join(node.get("class").split())
20 |     if name[:4] in ["div#", "div."]:
21 |         name = name[3:]
22 |     if name in ["tr", "td", "div", "p"]:
23 |         uid = uids.get(node)
24 |         if uid is None:
25 |             uid = uids[node] = len(uids) + 1
26 |         name += "{%02d}" % uid
27 |     return name
28 | 
29 | 
30 | def describe(node, depth=1):
31 |     global uids, uids_document
32 |     doc = node.getroottree().getroot()
33 |     if doc != uids_document:
34 |         uids = {}
35 |         uids_document = doc
36 | 
37 |     # return repr(NodeRepr(node))
38 |     parent = ""
39 |     if depth and node.getparent() is not None:
40 |         parent = describe(node.getparent(), depth=depth - 1) + ">"
41 |     return parent + describe_node(node)
42 | 
43 | 
44 | RE_COLLAPSE_WHITESPACES = re.compile(r"\s+", re.U)
45 | 
46 | 
47 | def text_content(elem, length=40):
48 |     content = RE_COLLAPSE_WHITESPACES.sub(" ", elem.text_content().replace("\r", ""))
49 |     if len(content) < length:
50 |         return content
51 |     return content[:length] + "..."
52 | 


--------------------------------------------------------------------------------
/readability/encoding.py:
--------------------------------------------------------------------------------
 1 | import re
 2 | try:
 3 |     import cchardet as chardet
 4 | except ImportError:
 5 |     import chardet
 6 | 
 7 | 
 8 | RE_CHARSET = re.compile(r'<meta.*?charset=["\']*(.+?)["\'>]', flags=re.I)
 9 | RE_PRAGMA = re.compile(r'<meta.*?content=["\']*;?charset=(.+?)["\'>]', flags=re.I)
10 | RE_XML = re.compile(r'^<\?xml.*?encoding=["\']*(.+?)["\'>]')
11 | 
12 | CHARSETS = {
13 |     "big5": "big5hkscs",
14 |     "gb2312": "gb18030",
15 |     "ascii": "utf-8",
16 |     "maccyrillic": "cp1251",
17 |     "win1251": "cp1251",
18 |     "win-1251": "cp1251",
19 |     "windows-1251": "cp1251",
20 | }
21 | 
22 | 
23 | def fix_charset(encoding):
24 |     """Overrides encoding when charset declaration
25 |        or charset determination is a subset of a larger
26 |        charset.  Created because of issues with Chinese websites"""
27 |     encoding = encoding.lower()
28 |     return CHARSETS.get(encoding, encoding)
29 | 
30 | 
31 | def get_encoding(page):
32 |     # Regex for XML and HTML Meta charset declaration
33 |     declared_encodings = (
34 |         RE_CHARSET.findall(page) + RE_PRAGMA.findall(page) + RE_XML.findall(page)
35 |     )
36 | 
37 |     # Try any declared encodings
38 |     for declared_encoding in declared_encodings:
39 |         try:
40 |             # Python3 only
41 |             # declared_encoding will actually be bytes but .decode() only
42 |             # accepts `str` type. Decode blindly with ascii because no one should
43 |             # ever use non-ascii characters in the name of an encoding.
44 |             declared_encoding = declared_encoding.decode("ascii", "replace")
45 | 
46 |             encoding = fix_charset(declared_encoding)
47 |             # Now let's decode the page
48 |             page.decode(encoding)
49 |             # It worked!
50 |             return encoding
51 |         except UnicodeDecodeError:
52 |             pass
53 | 
54 |     # Fallback to chardet if declared encodings fail
55 |     # Remove all HTML tags, and leave only text for chardet
56 |     text = re.sub(r'(\s*</?[^>]*>)+\s*', ' ', page).strip()
57 |     enc = 'utf-8'
58 |     if len(text) < 10:
59 |         return enc  # can't guess
60 |     res = chardet.detect(text)
61 |     enc = res["encoding"] or "utf-8"
62 |     # print '->', enc, "%.2f" % res['confidence']
63 |     enc = fix_charset(enc)
64 |     return enc
65 | 


--------------------------------------------------------------------------------
/readability/htmls.py:
--------------------------------------------------------------------------------
  1 | from lxml.html import tostring
  2 | import lxml.html
  3 | import re
  4 | 
  5 | from .cleaners import normalize_spaces, clean_attributes
  6 | from .encoding import get_encoding
  7 | 
  8 | utf8_parser = lxml.html.HTMLParser(encoding="utf-8")
  9 | 
 10 | 
 11 | def build_doc(page):
 12 |     if isinstance(page, str):
 13 |         encoding = None
 14 |         decoded_page = page
 15 |     else:
 16 |         encoding = get_encoding(page) or "utf-8"
 17 |         decoded_page = page.decode(encoding, "replace")
 18 | 
 19 |     # XXX: we have to do .decode and .encode even for utf-8 pages to remove bad characters
 20 |     doc = lxml.html.document_fromstring(
 21 |         decoded_page.encode("utf-8", "replace"), parser=utf8_parser
 22 |     )
 23 |     return doc, encoding
 24 | 
 25 | 
 26 | def js_re(src, pattern, flags, repl):
 27 |     return re.compile(pattern, flags).sub(src, repl.replace("$", "\\"))
 28 | 
 29 | 
 30 | def normalize_entities(cur_title):
 31 |     entities = {
 32 |         "\u2014": "-",
 33 |         "\u2013": "-",
 34 |         "&mdash;": "-",
 35 |         "&ndash;": "-",
 36 |         "\u00A0": " ",
 37 |         "\u00AB": '"',
 38 |         "\u00BB": '"',
 39 |         "&quot;": '"',
 40 |     }
 41 |     for c, r in entities.items():
 42 |         if c in cur_title:
 43 |             cur_title = cur_title.replace(c, r)
 44 | 
 45 |     return cur_title
 46 | 
 47 | 
 48 | def norm_title(title):
 49 |     return normalize_entities(normalize_spaces(title))
 50 | 
 51 | 
 52 | def get_title(doc):
 53 |     title = doc.find(".//title")
 54 |     if title is None or title.text is None or len(title.text) == 0:
 55 |         return "[no-title]"
 56 | 
 57 |     return norm_title(title.text)
 58 | 
 59 | 
 60 | def get_author(doc):
 61 |     author = doc.find(".//meta[@name='author']")
 62 |     if author is None or 'content' not in author.keys() or \
 63 |        len(author.get('content')) == 0:
 64 |         return "[no-author]"
 65 | 
 66 |     return author.get('content')
 67 | 
 68 | 
 69 | def add_match(collection, text, orig):
 70 |     text = norm_title(text)
 71 |     if len(text.split()) >= 2 and len(text) >= 15:
 72 |         if text.replace('"', "") in orig.replace('"', ""):
 73 |             collection.add(text)
 74 | 
 75 | 
 76 | TITLE_CSS_HEURISTICS = [
 77 |     "#title",
 78 |     "#head",
 79 |     "#heading",
 80 |     ".pageTitle",
 81 |     ".news_title",
 82 |     ".title",
 83 |     ".head",
 84 |     ".heading",
 85 |     ".contentheading",
 86 |     ".small_header_red",
 87 | ]
 88 | 
 89 | 
 90 | def shorten_title(doc):
 91 |     title = doc.find(".//title")
 92 |     if title is None or title.text is None or len(title.text) == 0:
 93 |         return ""
 94 | 
 95 |     title = orig = norm_title(title.text)
 96 | 
 97 |     candidates = set()
 98 | 
 99 |     for item in [".//h1", ".//h2", ".//h3"]:
100 |         for e in list(doc.iterfind(item)):
101 |             if e.text:
102 |                 add_match(candidates, e.text, orig)
103 |             if e.text_content():
104 |                 add_match(candidates, e.text_content(), orig)
105 | 
106 |     for item in TITLE_CSS_HEURISTICS:
107 |         for e in doc.cssselect(item):
108 |             if e.text:
109 |                 add_match(candidates, e.text, orig)
110 |             if e.text_content():
111 |                 add_match(candidates, e.text_content(), orig)
112 | 
113 |     cjk = re.compile('[\u4e00-\u9fff]+')
114 | 
115 |     if candidates:
116 |         title = sorted(candidates, key=len)[-1]
117 |     else:
118 |         for delimiter in [" | ", " - ", " :: ", " / "]:
119 |             if delimiter in title:
120 |                 parts = orig.split(delimiter)
121 |                 p0 = parts[0]
122 |                 pl = parts[-1]
123 |                 if (len(p0.split()) >= 4) or (len(p0) >= 4 and cjk.search(p0)):
124 |                     title = p0
125 |                     break
126 |                 elif (len(pl.split()) >= 4) or (len(pl) >= 4 and cjk.search(pl)):
127 |                     title = pl
128 |                     break
129 |         else:
130 |             if ": " in title:
131 |                 p1 = orig.split(": ")[-1]
132 |                 if (len(p1.split()) >= 4) or (len(p1) >= 4 and cjk.search(p1)):
133 |                     title = p1
134 |                 else:
135 |                     title = orig.split(": ", 1)[1]
136 | 
137 |     if cjk.search(title):
138 |         if not (4 <= len(title) < 100):  # Allow length >= 4, cap at 100
139 |             return orig
140 |     elif not 15 < len(title) < 150:
141 |         return orig
142 | 
143 |     return title
144 | 
145 | 
146 | # is it necessary? Cleaner from LXML is initialized correctly in cleaners.py
147 | def get_body(doc):
148 |     for elem in doc.xpath(".//script | .//link | .//style"):
149 |         elem.drop_tree()
150 |     # tostring() always return utf-8 encoded string
151 |     # FIXME: isn't better to use tounicode?
152 |     raw_html = tostring(doc.body or doc)
153 |     if isinstance(raw_html, bytes):
154 |         raw_html = raw_html.decode()
155 |     cleaned = clean_attributes(raw_html)
156 |     try:
157 |         # BeautifulSoup(cleaned) #FIXME do we really need to try loading it?
158 |         return cleaned
159 |     except Exception:  # FIXME find the equivalent lxml error
160 |         # logging.error("cleansing broke html content: %s\n---------\n%s" % (raw_html, cleaned))
161 |         return raw_html
162 | 


--------------------------------------------------------------------------------
/readability/readability.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | import logging
  3 | import re
  4 | import sys
  5 | import urllib.request
  6 | import urllib.parse
  7 | import urllib.error
  8 | 
  9 | from lxml.etree import tostring
 10 | from lxml.etree import tounicode
 11 | from lxml.etree import _ElementTree
 12 | from lxml.html import document_fromstring
 13 | from lxml.html import fragment_fromstring
 14 | from lxml.html import HtmlElement
 15 | 
 16 | from .cleaners import clean_attributes
 17 | from .cleaners import html_cleaner
 18 | from .htmls import build_doc
 19 | from .htmls import get_body
 20 | from .htmls import get_title
 21 | from .htmls import get_author
 22 | from .htmls import shorten_title
 23 | from .debug import describe, text_content
 24 | 
 25 | 
 26 | log = logging.getLogger("readability.readability")
 27 | 
 28 | REGEXES = {
 29 |     "unlikelyCandidatesRe": re.compile(
 30 |         r"combx|comment|community|disqus|extra|foot|header|menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup|tweet|twitter",
 31 |         re.I,
 32 |     ),
 33 |     "okMaybeItsACandidateRe": re.compile(r"and|article|body|column|main|shadow", re.I),
 34 |     "positiveRe": re.compile(
 35 |         r"article|body|content|entry|hentry|main|page|pagination|post|text|blog|story",
 36 |         re.I,
 37 |     ),
 38 |     "negativeRe": re.compile(
 39 |         r"combx|comment|com-|contact|foot|footer|footnote|masthead|media|meta|outbrain|promo|related|scroll|shoutbox|sidebar|sponsor|shopping|tags|tool|widget",
 40 |         re.I,
 41 |     ),
 42 |     "divToPElementsRe": re.compile(
 43 |         r"<(a|blockquote|dl|div|img|ol|p|pre|table|ul)", re.I
 44 |     ),
 45 |     # 'replaceBrsRe': re.compile(r'(<br[^>]*>[ \n\r\t]*){2,}',re.I),
 46 |     # 'replaceFontsRe': re.compile(r'<(\/?)font[^>]*>',re.I),
 47 |     # 'trimRe': re.compile(r'^\s+|\s+$/'),
 48 |     # 'normalizeRe': re.compile(r'\s{2,}/'),
 49 |     # 'killBreaksRe': re.compile(r'(<br\s*\/?>(\s|&nbsp;?)*){1,}/'),
 50 |     "videoRe": re.compile(r"https?:\/\/(www\.)?(youtube|vimeo)\.com", re.I),
 51 |     # skipFootnoteLink:      /^\s*(\[?[a-z0-9]{1,2}\]?|^|edit|citation needed)\s*$/i,
 52 | }
 53 | 
 54 | 
 55 | class Unparseable(ValueError):
 56 |     pass
 57 | 
 58 | 
 59 | def to_int(x):
 60 |     if not x:
 61 |         return None
 62 |     x = x.strip()
 63 |     if x.endswith("px"):
 64 |         return int(x[:-2])
 65 |     if x.endswith("em"):
 66 |         return int(x[:-2]) * 12
 67 |     return int(x)
 68 | 
 69 | 
 70 | def clean(text):
 71 |     # Many spaces make the following regexes run forever
 72 |     text = re.sub(r"\s{255,}", " " * 255, text)
 73 |     text = re.sub(r"\s*\n\s*", "\n", text)
 74 |     text = re.sub(r"\t|[ \t]{2,}", " ", text)
 75 |     return text.strip()
 76 | 
 77 | 
 78 | def text_length(i):
 79 |     return len(clean(i.text_content() or ""))
 80 | 
 81 | 
 82 | def compile_pattern(elements):
 83 |     if not elements:
 84 |         return None
 85 |     elif isinstance(elements, re.Pattern):
 86 |         return elements
 87 |     elif isinstance(elements, (str, bytes)):
 88 |         if isinstance(elements, bytes):
 89 |             elements = str(elements, "utf-8")
 90 |         elements = elements.split(",")
 91 |     if isinstance(elements, (list, tuple)):
 92 |         return re.compile("|".join([re.escape(x.strip()) for x in elements]), re.U)
 93 |     else:
 94 |         raise Exception(f"Unknown type for the pattern: {type(elements)}")
 95 |         # assume string or string like object
 96 | 
 97 | 
 98 | class Document:
 99 |     """Class to build a etree document out of html."""
100 | 
101 |     def __init__(
102 |         self,
103 |         input,
104 |         positive_keywords=None,
105 |         negative_keywords=None,
106 |         url=None,
107 |         min_text_length=25,
108 |         retry_length=250,
109 |         xpath=False,
110 |         handle_failures="discard",
111 |     ):
112 |         """Generate the document
113 | 
114 |         :param input: string of the html content.
115 |         :param positive_keywords: regex, list or comma-separated string of patterns in classes and ids
116 |         :param negative_keywords: regex, list or comma-separated string in classes and ids
117 |         :param min_text_length: Tunable. Set to a higher value for more precise detection of longer texts.
118 |         :param retry_length: Tunable. Set to a lower value for better detection of very small texts.
119 |         :param xpath: If set to True, adds x="..." attribute to each HTML node,
120 |         containing xpath path pointing to original document path (allows to
121 |         reconstruct selected summary in original document).
122 |         :param handle_failures: Parameter passed to `lxml` for handling failure during exception.
123 |         Support options = ["discard", "ignore", None]
124 | 
125 |         Examples:
126 |             positive_keywords=["news-item", "block"]
127 |             positive_keywords=["news-item, block"]
128 |             positive_keywords=re.compile("news|block")
129 |             negative_keywords=["mysidebar", "related", "ads"]
130 | 
131 |         The Document class is not re-enterable.
132 |         It is designed to create a new Document() for each HTML file to process it.
133 | 
134 |         API methods:
135 |         .title() -- full title
136 |         .short_title() -- cleaned up title
137 |         .content() -- full content
138 |         .summary() -- cleaned up content
139 |         """
140 |         self.input = input
141 |         self.html = None
142 |         self.encoding = None
143 |         self.positive_keywords = compile_pattern(positive_keywords)
144 |         self.negative_keywords = compile_pattern(negative_keywords)
145 |         self.url = url
146 |         self.min_text_length = min_text_length
147 |         self.retry_length = retry_length
148 |         self.xpath = xpath
149 |         self.handle_failures = handle_failures
150 | 
151 |     def _html(self, force=False):
152 |         if force or self.html is None:
153 |             self.html = self._parse(self.input)
154 |             if self.xpath:
155 |                 root = self.html.getroottree()
156 |                 for i in self.html.getiterator():
157 |                     # print root.getpath(i)
158 |                     i.attrib["x"] = root.getpath(i)
159 |         return self.html
160 | 
161 |     def _parse(self, input):
162 |         if isinstance(input, (_ElementTree, HtmlElement)):
163 |             doc = input
164 |             self.encoding = 'utf-8'
165 |         else:
166 |             doc, self.encoding = build_doc(input)
167 |         doc = html_cleaner.clean_html(doc)
168 |         base_href = self.url
169 |         if base_href:
170 |             # trying to guard against bad links like <a href="http://[http://...">
171 |             try:
172 |                 # such support is added in lxml 3.3.0
173 |                 doc.make_links_absolute(
174 |                     base_href,
175 |                     resolve_base_href=True,
176 |                     handle_failures=self.handle_failures,
177 |                 )
178 |             except TypeError:  # make_links_absolute() got an unexpected keyword argument 'handle_failures'
179 |                 # then we have lxml < 3.3.0
180 |                 # please upgrade to lxml >= 3.3.0 if you're failing here!
181 |                 doc.make_links_absolute(
182 |                     base_href,
183 |                     resolve_base_href=True,
184 |                     handle_failures=self.handle_failures,
185 |                 )
186 |         else:
187 |             doc.resolve_base_href(handle_failures=self.handle_failures)
188 |         return doc
189 | 
190 |     def content(self):
191 |         """Returns document body"""
192 |         return get_body(self._html(True))
193 | 
194 |     def title(self):
195 |         """Returns document title"""
196 |         return get_title(self._html(True))
197 | 
198 |     def author(self):
199 |         """Returns document author"""
200 |         return get_author(self._html(True))
201 | 
202 |     def short_title(self):
203 |         """Returns cleaned up document title"""
204 |         return shorten_title(self._html(True))
205 | 
206 |     def get_clean_html(self):
207 |         """
208 |         An internal method, which can be overridden in subclasses, for example,
209 |         to disable or to improve DOM-to-text conversion in .summary() method
210 |         """
211 |         return clean_attributes(tounicode(self.html, method="html"))
212 | 
213 |     def summary(self, html_partial=False, keep_all_images=False):
214 |         """
215 |         Given a HTML file, extracts the text of the article.
216 | 
217 |         :param html_partial: return only the div of the document, don't wrap
218 |                              in html and body tags.
219 |         :param keep_all_images: Keep all images in summary.
220 | 
221 |         Warning: It mutates internal DOM representation of the HTML document,
222 |         so it is better to call other API methods before this one.
223 |         """
224 |         try:
225 |             ruthless = True
226 |             while True:
227 |                 self._html(True)
228 |                 for i in self.tags(self.html, "script", "style"):
229 |                     i.drop_tree()
230 |                 for i in self.tags(self.html, "body"):
231 |                     i.set("id", "readabilityBody")
232 |                 if ruthless:
233 |                     self.remove_unlikely_candidates()
234 |                 self.transform_misused_divs_into_paragraphs()
235 |                 candidates = self.score_paragraphs()
236 | 
237 |                 best_candidate = self.select_best_candidate(candidates)
238 | 
239 |                 if best_candidate:
240 |                     article = self.get_article(
241 |                         candidates, best_candidate, html_partial=html_partial
242 |                     )
243 |                 else:
244 |                     if ruthless:
245 |                         log.info("ruthless removal did not work. ")
246 |                         ruthless = False
247 |                         log.debug(
248 |                                 "ended up stripping too much - "
249 |                                 "going for a safer _parse"
250 |                         )
251 |                         # try again
252 |                         continue
253 |                     else:
254 |                         log.debug(
255 |                                 "Ruthless and lenient parsing did not work. "
256 |                                 "Returning raw html"
257 |                         )
258 |                         article = self.html.find("body")
259 |                         if article is None:
260 |                             article = self.html
261 |                 cleaned_article = self.sanitize(article, candidates, keep_all_images)
262 | 
263 |                 article_length = len(cleaned_article or "")
264 |                 retry_length = self.retry_length
265 |                 of_acceptable_length = article_length >= retry_length
266 |                 if ruthless and not of_acceptable_length:
267 |                     ruthless = False
268 |                     # Loop through and try again.
269 |                     continue
270 |                 else:
271 |                     return cleaned_article
272 |         except Exception as e:
273 |             log.exception("error getting summary: ")
274 |             raise Unparseable(str(e)).with_traceback(sys.exc_info()[2])
275 | 
276 |     def get_article(self, candidates, best_candidate, html_partial=False):
277 |         # Now that we have the top candidate, look through its siblings for
278 |         # content that might also be related.
279 |         # Things like preambles, content split by ads that we removed, etc.
280 |         sibling_score_threshold = max([10, best_candidate["content_score"] * 0.2])
281 |         # create a new html document with a html->body->div
282 |         if html_partial:
283 |             output = fragment_fromstring("<div/>")
284 |         else:
285 |             output = document_fromstring("<div/>")
286 |         best_elem = best_candidate["elem"]
287 |         parent = best_elem.getparent()
288 |         siblings = parent.getchildren() if parent is not None else [best_elem]
289 |         for sibling in siblings:
290 |             # in lxml there no concept of simple text
291 |             # if isinstance(sibling, NavigableString): continue
292 |             append = False
293 |             if sibling is best_elem:
294 |                 append = True
295 |             sibling_key = sibling  # HashableElement(sibling)
296 |             if (
297 |                 sibling_key in candidates
298 |                 and candidates[sibling_key]["content_score"] >= sibling_score_threshold
299 |             ):
300 |                 append = True
301 | 
302 |             if sibling.tag == "p":
303 |                 link_density = self.get_link_density(sibling)
304 |                 node_content = sibling.text or ""
305 |                 node_length = len(node_content)
306 | 
307 |                 if node_length > 80 and link_density < 0.25:
308 |                     append = True
309 |                 elif (
310 |                     node_length <= 80
311 |                     and link_density == 0
312 |                     and re.search(r"\.( |$)", node_content)
313 |                 ):
314 |                     append = True
315 | 
316 |             if append:
317 |                 # We don't want to append directly to output, but the div
318 |                 # in html->body->div
319 |                 if html_partial:
320 |                     output.append(sibling)
321 |                 else:
322 |                     output.getchildren()[0].getchildren()[0].append(sibling)
323 |         # if output is not None:
324 |         #    output.append(best_elem)
325 |         return output
326 | 
327 |     def select_best_candidate(self, candidates):
328 |         if not candidates:
329 |             return None
330 | 
331 |         sorted_candidates = sorted(
332 |             candidates.values(), key=lambda x: x["content_score"], reverse=True
333 |         )
334 |         for candidate in sorted_candidates[:5]:
335 |             elem = candidate["elem"]
336 |             log.debug("Top 5 : {:6.3f} {}".format(candidate["content_score"], describe(elem)))
337 | 
338 |         best_candidate = sorted_candidates[0]
339 |         return best_candidate
340 | 
341 |     def get_link_density(self, elem):
342 |         link_length = 0
343 |         for i in elem.findall(".//a"):
344 |             link_length += text_length(i)
345 |         # if len(elem.findall(".//div") or elem.findall(".//p")):
346 |         #    link_length = link_length
347 |         total_length = text_length(elem)
348 |         return float(link_length) / max(total_length, 1)
349 | 
350 |     def score_paragraphs(self):
351 |         MIN_LEN = self.min_text_length
352 |         candidates = {}
353 |         ordered = []
354 |         for elem in self.tags(self._html(), "p", "pre", "td"):
355 |             parent_node = elem.getparent()
356 |             if parent_node is None:
357 |                 continue
358 |             grand_parent_node = parent_node.getparent()
359 | 
360 |             inner_text = clean(elem.text_content() or "")
361 |             inner_text_len = len(inner_text)
362 | 
363 |             # If this paragraph is less than 25 characters
364 |             # don't even count it.
365 |             if inner_text_len < MIN_LEN:
366 |                 continue
367 | 
368 |             if parent_node not in candidates:
369 |                 candidates[parent_node] = self.score_node(parent_node)
370 |                 ordered.append(parent_node)
371 | 
372 |             if grand_parent_node is not None and grand_parent_node not in candidates:
373 |                 candidates[grand_parent_node] = self.score_node(grand_parent_node)
374 |                 ordered.append(grand_parent_node)
375 | 
376 |             content_score = 1
377 |             content_score += len(inner_text.split(","))
378 |             content_score += min((inner_text_len / 100), 3)
379 |             # if elem not in candidates:
380 |             #    candidates[elem] = self.score_node(elem)
381 | 
382 |             # WTF? candidates[elem]['content_score'] += content_score
383 |             candidates[parent_node]["content_score"] += content_score
384 |             if grand_parent_node is not None:
385 |                 candidates[grand_parent_node]["content_score"] += content_score / 2.0
386 | 
387 |         # Scale the final candidates score based on link density. Good content
388 |         # should have a relatively small link density (5% or less) and be
389 |         # mostly unaffected by this operation.
390 |         for elem in ordered:
391 |             candidate = candidates[elem]
392 |             ld = self.get_link_density(elem)
393 |             score = candidate["content_score"]
394 |             log.debug(
395 |                 "Branch %6.3f %s link density %.3f -> %6.3f"
396 |                 % (score, describe(elem), ld, score * (1 - ld))
397 |             )
398 |             candidate["content_score"] *= 1 - ld
399 | 
400 |         return candidates
401 | 
402 |     def class_weight(self, e):
403 |         weight = 0
404 |         for feature in [e.get("class", None), e.get("id", None)]:
405 |             if feature:
406 |                 if REGEXES["negativeRe"].search(feature):
407 |                     weight -= 25
408 | 
409 |                 if REGEXES["positiveRe"].search(feature):
410 |                     weight += 25
411 | 
412 |                 if self.positive_keywords and self.positive_keywords.search(feature):
413 |                     weight += 25
414 | 
415 |                 if self.negative_keywords and self.negative_keywords.search(feature):
416 |                     weight -= 25
417 | 
418 |         if self.positive_keywords and self.positive_keywords.match("tag-" + e.tag):
419 |             weight += 25
420 | 
421 |         if self.negative_keywords and self.negative_keywords.match("tag-" + e.tag):
422 |             weight -= 25
423 | 
424 |         return weight
425 | 
426 |     def score_node(self, elem):
427 |         content_score = self.class_weight(elem)
428 |         name = elem.tag.lower()
429 |         if name in ["div", "article"]:
430 |             content_score += 5
431 |         elif name in ["pre", "td", "blockquote"]:
432 |             content_score += 3
433 |         elif name in ["address", "ol", "ul", "dl", "dd", "dt", "li", "form", "aside"]:
434 |             content_score -= 3
435 |         elif name in [
436 |             "h1",
437 |             "h2",
438 |             "h3",
439 |             "h4",
440 |             "h5",
441 |             "h6",
442 |             "th",
443 |             "header",
444 |             "footer",
445 |             "nav",
446 |         ]:
447 |             content_score -= 5
448 |         return {"content_score": content_score, "elem": elem}
449 | 
450 |     def remove_unlikely_candidates(self):
451 |         for elem in self.html.findall(".//*"):
452 |             s = "{} {}".format(elem.get("class", ""), elem.get("id", ""))
453 |             if len(s) < 2:
454 |                 continue
455 |             if (
456 |                 REGEXES["unlikelyCandidatesRe"].search(s)
457 |                 and (not REGEXES["okMaybeItsACandidateRe"].search(s))
458 |                 and elem.tag not in ["html", "body"]
459 |             ):
460 |                 log.debug("Removing unlikely candidate - %s" % describe(elem))
461 |                 elem.drop_tree()
462 | 
463 |     def transform_misused_divs_into_paragraphs(self):
464 |         for elem in self.tags(self.html, "div"):
465 |             # transform <div>s that do not contain other block elements into
466 |             # <p>s
467 |             # FIXME: The current implementation ignores all descendants that
468 |             # are not direct children of elem
469 |             # This results in incorrect results in case there is an <img>
470 |             # buried within an <a> for example
471 |             if not REGEXES["divToPElementsRe"].search(
472 |                 str(b"".join(tostring(s, encoding='utf-8') for s in elem))
473 |                 # str(b"".join(map(tostring_, list(elem))))
474 |             ):
475 |                 # log.debug("Altering %s to p" % (describe(elem)))
476 |                 elem.tag = "p"
477 |                 # print "Fixed element "+describe(elem)
478 | 
479 |         for elem in self.tags(self.html, "div"):
480 |             if elem.text and elem.text.strip():
481 |                 p = fragment_fromstring("<p/>")
482 |                 p.text = elem.text
483 |                 elem.text = None
484 |                 elem.insert(0, p)
485 |                 # print "Appended "+tounicode(p)+" to "+describe(elem)
486 | 
487 |             for pos, child in reversed(list(enumerate(elem))):
488 |                 if child.tail and child.tail.strip():
489 |                     p = fragment_fromstring("<p/>")
490 |                     p.text = child.tail
491 |                     child.tail = None
492 |                     elem.insert(pos + 1, p)
493 |                     # print "Inserted "+tounicode(p)+" to "+describe(elem)
494 |                 if child.tag == "br":
495 |                     # print 'Dropped <br> at '+describe(elem)
496 |                     child.drop_tree()
497 | 
498 |     def tags(self, node, *tag_names):
499 |         for tag_name in tag_names:
500 |             yield from node.findall(".//%s" % tag_name)
501 | 
502 |     def reverse_tags(self, node, *tag_names):
503 |         for tag_name in tag_names:
504 |             yield from reversed(node.findall(".//%s" % tag_name))
505 | 
506 |     def sanitize(self, node, candidates, keep_all_images=False):
507 |         MIN_LEN = self.min_text_length
508 |         for header in self.tags(node, "h1", "h2", "h3", "h4", "h5", "h6"):
509 |             if self.class_weight(header) < 0 or self.get_link_density(header) > 0.33:
510 |                 header.drop_tree()
511 | 
512 |         for elem in self.tags(node, "form", "textarea"):
513 |             elem.drop_tree()
514 | 
515 |         for elem in self.tags(node, "iframe"):
516 |             if "src" in elem.attrib and REGEXES["videoRe"].search(elem.attrib["src"]):
517 |                 elem.text = "VIDEO"  # ADD content to iframe text node to force <iframe></iframe> proper output
518 |             else:
519 |                 elem.drop_tree()
520 | 
521 |         allowed = {}
522 |         # Conditionally clean <table>s, <ul>s, and <div>s
523 |         for el in self.reverse_tags(
524 |             node, "table", "ul", "div", "aside", "header", "footer", "section"
525 |         ):
526 |             if el in allowed:
527 |                 continue
528 |             weight = self.class_weight(el)
529 |             if el in candidates:
530 |                 content_score = candidates[el]["content_score"]
531 |                 # print '!',el, '-> %6.3f' % content_score
532 |             else:
533 |                 content_score = 0
534 |             tag = el.tag
535 | 
536 |             if weight + content_score < 0:
537 |                 log.debug(
538 |                     "Removed %s with score %6.3f and weight %-3s"
539 |                     % (describe(el), content_score, weight,)
540 |                 )
541 |                 el.drop_tree()
542 |             elif el.text_content().count(",") < 10:
543 |                 counts = {}
544 |                 for kind in ["p", "img", "li", "a", "embed", "input"]:
545 |                     counts[kind] = len(el.findall(".//%s" % kind))
546 |                 counts["li"] -= 100
547 |                 counts["input"] -= len(el.findall('.//input[@type="hidden"]'))
548 | 
549 |                 # Count the text length excluding any surrounding whitespace
550 |                 content_length = text_length(el)
551 |                 link_density = self.get_link_density(el)
552 |                 parent_node = el.getparent()
553 |                 if parent_node is not None:
554 |                     if parent_node in candidates:
555 |                         content_score = candidates[parent_node]["content_score"]
556 |                     else:
557 |                         content_score = 0
558 |                 # if parent_node is not None:
559 |                 # pweight = self.class_weight(parent_node) + content_score
560 |                 # pname = describe(parent_node)
561 |                 # else:
562 |                 # pweight = 0
563 |                 # pname = "no parent"
564 |                 to_remove = False
565 |                 reason = ""
566 | 
567 |                 if keep_all_images and el.tag == 'div' and counts["img"] >= 1:
568 |                     continue
569 |                 if counts["p"] and counts["img"] > 1 + counts["p"] * 1.3:
570 |                     reason = "too many images (%s)" % counts["img"]
571 |                     to_remove = True
572 |                 elif counts["li"] > counts["p"] and tag not in ("ol", "ul"):
573 |                     reason = "more <li>s than <p>s"
574 |                     to_remove = True
575 |                 elif counts["input"] > (counts["p"] / 3):
576 |                     reason = "less than 3x <p>s than <input>s"
577 |                     to_remove = True
578 |                 elif content_length < MIN_LEN and counts["img"] == 0:
579 |                     reason = (
580 |                         "too short content length %s without a single image"
581 |                         % content_length
582 |                     )
583 |                     to_remove = True
584 |                 elif content_length < MIN_LEN and counts["img"] > 2:
585 |                     reason = (
586 |                         "too short content length %s and too many images"
587 |                         % content_length
588 |                     )
589 |                     to_remove = True
590 |                 elif weight < 25 and link_density > 0.2:
591 |                     reason = "too many links {:.3f} for its weight {}".format(
592 |                         link_density,
593 |                         weight,
594 |                     )
595 |                     to_remove = True
596 |                 elif weight >= 25 and link_density > 0.5:
597 |                     reason = "too many links {:.3f} for its weight {}".format(
598 |                         link_density,
599 |                         weight,
600 |                     )
601 |                     to_remove = True
602 |                 elif (counts["embed"] == 1 and content_length < 75) or counts[
603 |                     "embed"
604 |                 ] > 1:
605 |                     reason = (
606 |                         "<embed>s with too short content length, or too many <embed>s"
607 |                     )
608 |                     to_remove = True
609 |                 elif not content_length:
610 |                     reason = "no content"
611 |                     to_remove = True
612 |                     #                if el.tag == 'div' and counts['img'] >= 1 and to_remove:
613 |                     #                    imgs = el.findall('.//img')
614 |                     #                    valid_img = False
615 |                     #                    log.debug(tounicode(el))
616 |                     #                    for img in imgs:
617 |                     #
618 |                     #                        height = img.get('height')
619 |                     #                        text_length = img.get('text_length')
620 |                     #                        log.debug ("height %s text_length %s" %(repr(height), repr(text_length)))
621 |                     #                        if to_int(height) >= 100 or to_int(text_length) >= 100:
622 |                     #                            valid_img = True
623 |                     #                            log.debug("valid image" + tounicode(img))
624 |                     #                            break
625 |                     #                    if valid_img:
626 |                     #                        to_remove = False
627 |                     #                        log.debug("Allowing %s" %el.text_content())
628 |                     #                        for desnode in self.tags(el, "table", "ul", "div"):
629 |                     #                            allowed[desnode] = True
630 | 
631 |                     # find x non empty preceding and succeeding siblings
632 |                     i, j = 0, 0
633 |                     x = 1
634 |                     siblings = []
635 |                     for sib in el.itersiblings():
636 |                         # log.debug(sib.text_content())
637 |                         sib_content_length = text_length(sib)
638 |                         if sib_content_length:
639 |                             i = +1
640 |                             siblings.append(sib_content_length)
641 |                             if i == x:
642 |                                 break
643 |                     for sib in el.itersiblings(preceding=True):
644 |                         # log.debug(sib.text_content())
645 |                         sib_content_length = text_length(sib)
646 |                         if sib_content_length:
647 |                             j = +1
648 |                             siblings.append(sib_content_length)
649 |                             if j == x:
650 |                                 break
651 |                     # log.debug(str_(siblings))
652 |                     if siblings and sum(siblings) > 1000:
653 |                         to_remove = False
654 |                         log.debug("Allowing %s" % describe(el))
655 |                         for desnode in self.tags(el, "table", "ul", "div", "section"):
656 |                             allowed[desnode] = True
657 | 
658 |                 if to_remove:
659 |                     log.debug(
660 |                         "Removed %6.3f %s with weight %s cause it has %s."
661 |                         % (content_score, describe(el), weight, reason)
662 |                     )
663 |                     # print tounicode(el)
664 |                     # log.debug("pname %s pweight %.3f" %(pname, pweight))
665 |                     el.drop_tree()
666 |                 else:
667 |                     log.debug(
668 |                         "Not removing %s of length %s: %s"
669 |                         % (describe(el), content_length, text_content(el))
670 |                     )
671 | 
672 |         self.html = node
673 |         return self.get_clean_html()
674 | 
675 | 
676 | def main():
677 |     VERBOSITY = {1: logging.WARNING, 2: logging.INFO, 3: logging.DEBUG}
678 | 
679 |     from optparse import OptionParser
680 | 
681 |     parser = OptionParser(usage="%prog: [options] [file]")
682 |     parser.add_option("-v", "--verbose", action="count", default=0)
683 |     parser.add_option(
684 |         "-b", "--browser", default=None, action="store_true", help="open in browser"
685 |     )
686 |     parser.add_option(
687 |         "-l", "--log", default=None, help="save logs into file (appended)"
688 |     )
689 |     parser.add_option(
690 |         "-u", "--url", default=None, help="use URL instead of a local file"
691 |     )
692 |     parser.add_option("-x", "--xpath", default=None, help="add original xpath")
693 |     parser.add_option(
694 |         "-p",
695 |         "--positive-keywords",
696 |         default=None,
697 |         help="positive keywords (comma-separated)",
698 |         action="store",
699 |     )
700 |     parser.add_option(
701 |         "-n",
702 |         "--negative-keywords",
703 |         default=None,
704 |         help="negative keywords (comma-separated)",
705 |         action="store",
706 |     )
707 |     (options, args) = parser.parse_args()
708 | 
709 |     if options.verbose:
710 |         logging.basicConfig(
711 |             level=VERBOSITY[options.verbose],
712 |             filename=options.log,
713 |             format="%(asctime)s: %(levelname)s: %(message)s (at %(filename)s: %(lineno)d)",
714 |         )
715 | 
716 |     if not (len(args) == 1 or options.url):
717 |         parser.print_help()
718 |         sys.exit(1)
719 | 
720 |     file = None
721 |     if options.url:
722 |         headers = {"User-Agent": "Mozilla/5.0"}
723 |         request = urllib.request.Request(options.url, None, headers)
724 |         file = urllib.request.urlopen(request)
725 |     else:
726 |         file = open(args[0])
727 |     try:
728 |         doc = Document(
729 |             file.read(),
730 |             url=options.url,
731 |             positive_keywords=options.positive_keywords,
732 |             negative_keywords=options.negative_keywords,
733 |         )
734 |         if options.browser:
735 |             from .browser import open_in_browser
736 | 
737 |             result = "<h2>" + doc.short_title() + "</h2><br/>" + doc.summary()
738 |             open_in_browser(result)
739 |         else:
740 |             result = "Title:" + doc.short_title() + "\n" + doc.summary()
741 |             print(result)
742 |     finally:
743 |         file.close()
744 | 
745 | 
746 | if __name__ == "__main__":
747 |     main()
748 | 


--------------------------------------------------------------------------------
/requirements-dev.txt:
--------------------------------------------------------------------------------
1 | nose
2 | twine
3 | flake8


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | import codecs
 4 | import os
 5 | import re
 6 | from setuptools import setup
 7 | 
 8 | speed_deps = [
 9 |      "cchardet",
10 | ]
11 | 
12 | extras = {
13 |     'speed': speed_deps,
14 | }
15 | 
16 | # Adapted from https://github.com/pypa/pip/blob/master/setup.py
17 | def find_version(*file_paths):
18 |     here = os.path.abspath(os.path.dirname(__file__))
19 | 
20 |     # Intentionally *not* adding an encoding option to open, See:
21 |     #   https://github.com/pypa/virtualenv/issues/201#issuecomment-3145690
22 |     with codecs.open(os.path.join(here, *file_paths), "r") as fp:
23 |         version_file = fp.read()
24 |         version_match = re.search(
25 |             r"^__version__ = ['\"]([^'\"]*)['\"]", version_file, re.M,
26 |         )
27 |         if version_match:
28 |             return version_match.group(1)
29 | 
30 |     raise RuntimeError("Unable to find version string.")
31 | 
32 | 
33 | setup(
34 |     name="readability-lxml",
35 |     version=find_version("readability", "__init__.py"),
36 |     author="Yuri Baburov",
37 |     author_email="burchik@gmail.com",
38 |     description="fast html to text parser (article readability tool) with python 3 support",
39 |     test_suite="tests.test_article_only",
40 |     long_description=open("README.md").read(),
41 |     long_description_content_type="text/markdown",
42 |     license="Apache License 2.0",
43 |     url="http://github.com/buriy/python-readability",
44 |     packages=["readability"],
45 |     install_requires=[
46 |         "chardet",
47 |         "lxml[html_clean]",
48 |         "lxml-html-clean; python_version < '3.11'",
49 |         "cssselect"
50 |     ],
51 |     extras_require=extras,
52 |     classifiers=[
53 |         "Environment :: Web Environment",
54 |         "Intended Audience :: Developers",
55 |         "Operating System :: OS Independent",
56 |         "Topic :: Text Processing :: Indexing",
57 |         "Topic :: Utilities",
58 |         "Topic :: Internet",
59 |         "Topic :: Software Development :: Libraries :: Python Modules",
60 |         "Programming Language :: Python",
61 |         "Programming Language :: Python :: 3",
62 |         "Programming Language :: Python :: 3.8",
63 |         "Programming Language :: Python :: 3.9",
64 |         "Programming Language :: Python :: 3.10",
65 |         "Programming Language :: Python :: 3.11",
66 |         "Programming Language :: Python :: 3.12",
67 |         "Programming Language :: Python :: 3.13",
68 |         "Programming Language :: Python :: Implementation :: PyPy",
69 |     ],
70 | )
71 | 


--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/buriy/python-readability/c8d8011f3d4c69d7667a52395237e56e66af8ea4/tests/__init__.py


--------------------------------------------------------------------------------
/tests/samples/si-game.sample.html:
--------------------------------------------------------------------------------
  1 | <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
  2 |    <html>
  3 |    <head>
  4 |    <meta http-equiv="content-type" content="text/html; charset=iso-8859-1">
  5 |    <a href="/baseball/mlb/teams/tigers/">
  6 |       <title>Detroit Tigers vs. Kansas City Royals - Preview - April 16, 2012</title></a><meta name="description" content="Tigers-Royals preview for game played on April 16, 2012">
  7 |    <meta name="keywords" content="Detroit Tigers, Kansas City Royals, preview, mlb, baseball, si.com">
  8 |    <script type="text/javascript">
  9 |    var SPORTID = "MLB";
 10 |    var PATH = "/baseball/mlb/scoreboards/2012/04/16/";
 11 |    var FEEDNAME = "scoreboard.dat";
 12 |    isViewcast = true;
 13 |    var searchString = document.location.href;
 14 |    </script>
 15 |    <link rel="stylesheet" type="text/css" href="http://i.cdn.turner.com/si/.e/css/pkg/global_41/129.css"/>
 16 | <script type="text/javascript" language="JavaScript" src="http://i.cdn.turner.com/si/.e/js/4.1/global/lib/jquery-1.5.2.min.js"></script>
 17 | <script language="JavaScript" type="text/javascript" src="http://i.cdn.turner.com/si/.e/js/pkg/global/593.js"></script>
 18 | <script src="http://img.timeinc.net/shared/static/js/tii_ads.js"></script><script>var adConfig=new TiiAdConfig('3475.si2');adConfig.setRevSciTracking(true);</script>
 19 | 
 20 | <!--[if IE 9]>
 21 | <link rel="stylesheet" type="text/css" href="http://i.cdn.turner.com/si/.e/css/4.1/ie9.css" />
 22 | <![endif]-->
 23 | <link rel="stylesheet" type="text/css" href="http://i.cdn.turner.com/si/.element/css/4.1/gameflash.css"/>
 24 | <link rel="stylesheet" type="text/css" href="http://i.cdn.turner.com/si/.element/css/4.1/miniscores.css"/>
 25 | <script language="javascript" type="text/javascript">
 26 | 	function hidediv() {
 27 | 			if (document.getElementById) { // DOM3 = IE5, NS6
 28 | 					document.getElementById('cnngCommentsBox').className = 'cnngCommentsBoxOff';
 29 | 			}
 30 | 			else {
 31 | 					if (document.layers) { // Netscape 4
 32 | 							document.cnngCommentsBox.className = 'cnngCommentsBoxOff';
 33 | 					}
 34 | 					else { // IE 4
 35 | 							document.all.cnngCommentsBox.className = 'cnngCommentsBoxOff';
 36 | 					}
 37 | 			}
 38 | 	}
 39 | 	function showdiv() {
 40 | 			if (document.getElementById) { // DOM3 = IE5, NS6
 41 | 					document.getElementById('cnngCommentsBox').className = 'cnngCommentsBox';
 42 | 			}
 43 | 			else {
 44 | 					if (document.layers) { // Netscape 4
 45 | 							document.cnngCommentsBox.className = 'cnngCommentsBox';
 46 | 					}
 47 | 					else { // IE 4
 48 | 							document.all.cnngCommentsBox.className = 'cnngCommentsBox';
 49 | 					}
 50 | 			}
 51 | 	}
 52 | function siVideoBegin(cvpInstance, videoId) {  }
 53 | function siVideoPlay(cvpInstance, videoId) {
 54 | 	var cvpData = cvpInstance.getContentEntry(videoId);
 55 | 	var cvpObject = window.JSON.parse(cvpData);
 56 | 	jQuery('#cnnCVPRecapDetails').show();
 57 | 	jQuery('#cvpHeadline').html(cvpObject.headline);
 58 | 	jQuery('#cvpDescription').html(cvpObject.description);
 59 | 	jQuery('#cvpSource').html(cvpObject.source);
 60 | }
 61 | 
 62 | function siVideoPlayHead(cvpInstance, playheadTime, totalDuration) { }
 63 | 
 64 | function siVideoAdStarted(cvpInstance, videoId) { }
 65 | 
 66 | function siVideoTrackingAdCountdown(seconds) { }
 67 | 
 68 | function siVideoComplete(cvpInstance, videoId) { }
 69 | 
 70 | function siVideoPause(cvpInstance, videoId, paused) { }
 71 | 
 72 | function siVideoSeek() { }
 73 | </script>
 74 | <script language="JavaScript" src="/.element/js/4.1/ads/sasd_ads.js"></script>
 75 | <script src="http://i.cdn.turner.com/si/.element/js/4.1/global/lib/iframe_ad_factory.js"></script><script>iframeAdFactory.url = '/si_adspaces/4.0/iframe.html';
 76 | window.setInterval(function(){ iframeAdFactory.refresh() }, 45000);</script>
 77 | 
 78 | 
 79 | <script type="text/javascript">
 80 | var adFactory = new TiiAdFactory(adConfig, "mlb/gameflashpage");
 81 | iframeAdFactory.queryString = 'TiiAdConfig=3475.si2&adConfigPairs=' + '&TiiAdFactory=' + encodeURIComponent('mlb/gameflashpage') + '&adFactoryPairs=' + '&paramPairs=' + encodeURIComponent('sport=mlb');
 82 | if (TiiAdsIsDebugMode()) { iframeAdFactory.queryString += '&debugads=y'; }
 83 | </script>
 84 | <link rel="stylesheet" type="text/css" href="http://z.cdn.turner.com/si/.element/css/4.1/gameflash_mlb.css"/>
 85 | <script type="text/javascript" src="http://z.cdn.turner.com/si/.element/js/4.1/global/lib/jquery-1.4.2.min.js"></script>
 86 | 
 87 | <link rel="stylesheet" type="text/css" href="http://z.cdn.turner.com/si/.element/ssi/scoreboards/4.2/css/scoreticker-master.css"/>
 88 | <script type="text/javascript" src="http://z.cdn.turner.com/si/.element/ssi/gameflash/4.2/football/nfl/js/jquery.jsonp-2.1.4.min.js"></script>
 89 | <script type="text/javascript" src="http://z.cdn.turner.com/si/.element/ssi/scoreboards/4.2/js/scoreticker-master.js"></script>
 90 | <script type="text/javascript" src="http://z.cdn.turner.com/si/.element/ssi/scoreboards/4.2/js/scoreticker-mlb.js"></script>
 91 | 
 92 | 
 93 | 
 94 | 
 95 | 
 96 |    </head>
 97 |    <body>
 98 |    <!--[if IE 6]><div class="ie"><div class="ie6"><![endif]--><!--[if IE 7]><div class="ie"><div class="ie7"><![endif]--><!--[if
 99 |    IE 8]><div class="ie"><div class="ie8"><![endif]-->
100 |    <div class="cnnPage">
101 |    		
102 |    <!-- start contentHeader-->
103 |    <style>
104 | DIV.cnnSearch { padding:5px 0; }
105 | DIV.cnnSearch DIV.cnnRight { padding:4px 0; }
106 | DIV.cnnSearch DIV.cnnLeft { margin:0;padding:0; }
107 | DIV.cnnSearch DIV.cnnLeft LI { float:left;margin:0;padding:0 5px 0 0; }
108 | DIV.cnnSearch DIV.cnnLeft LI A { display:block;margin:0;padding:0; }
109 | DIV.cnnSearch DIV.cnnLeft LI IMG { vertical-align:bottom; }
110 | DIV.cnnSearch DIV.cnnLeft LI DL { margin:0;padding:0;position:relative;z-index:999999; }
111 | DIV.cnnSearch DIV.cnnLeft LI DT { margin:0;padding:0; }
112 | DIV.cnnSearch DIV.cnnLeft LI DD { left:-999em;margin:0;padding:0 3px 0 1px;position:absolute;top:23px; }
113 | DIV.cnnSearch DIV.cnnLeft LI DL.cnnOver DD,
114 | DIV.cnnSearch DIV.cnnLeft LI DL:hover DD { left:auto; }
115 | 
116 | DIV.cnnBanner { height:auto; }
117 | DIV.cnnBannerSection DIV.cnnLeft { width:auto; }
118 | DIV.cnnBannerSection DIV.cnnLeft A { display:inline;height:auto;width:auto; }
119 | 
120 | DIV.cnnBanner { background:transparent url('http://i.cdn.turner.com/si/.element/img/4.1/sect/global/topper.gif') no-repeat top right;position:relative;text-align:left;width:1000px; }
121 | .ie6 DIV.cnnBanner { width:1000px; }
122 | DIV.cnnBanner DIV IMG { display:block; }
123 | DIV.cnnBannerSection { height:99px;position:absolute;left:243px;top:0px;width:757px; }
124 | DIV.cnnBannerSection TD.col0 { display:none; }
125 | DIV.cnnBannerSection DIV.cnn_border { display:none; }
126 | DIV.cnnBannerSection IMG { display:inline;float:left; }
127 | DIV.cnnBannerSection DIV.cnnLeft { float:left; }
128 | DIV.cnnBannerSection DIV.cnnLeft IMG { float:none; }
129 | DIV.cnnBannerSection DIV.cnnRight { float:right;margin:8px 6px 0 0; }
130 | DIV.cnnBannerSection DIV.cnn_header { color:#000;font:bold 50px georgia;line-height:58px;padding:6px 10px 0 0; }
131 | DIV.cnnBannerSection DIV.cnn_header SPAN { font-size:10px;color:#ccc; }
132 | DIV.cnnBannerSection DIV.cnn_header A { color:#000; }
133 | DIV.cnnBannerSection DIV.cnn_header UL { color:#ccc;float:right;font-size:10px;line-height:12px;margin-top:36px; }
134 | .ie DIV.cnnBannerSection DIV.cnn_header UL { margin-top:-21px; }
135 | DIV.cnnBannerSection DIV.cnn_header UL LI { border-left:1px solid #ccc;float:left;padding:0 4px; }
136 | DIV.cnnBannerSection DIV.cnn_header UL LI#cnnItem0 { border:0; }
137 | DIV.cnnBannerSection DIV.cnn_header UL LI#cnnItem2 DIV.cnn_more { font:normal 9px arial; }
138 | DIV.cnnBannerSection DIV.cnn_header UL LI#cnnItem2 DIV.cnn_more A { font:normal 9px arial; }
139 | DIV.cnnBannerSection DIV.cnn_header UL LI DIV.cnn_rollover { background-image:url('http://i.cdn.turner.com/si/.e1d/img/4.0/global/pixels/blank_pixel.gif');display:none;padding:10px 0 9px 0;left:103px;position:absolute;width:654px; }
140 | .ie DIV.cnnBannerSection DIV.cnn_header UL LI DIV.cnn_rollover { top:55px; }
141 | DIV.cnnBannerSection DIV.cnn_header UL LI.cnnOver .cnn_rollover,
142 | DIV.cnnBannerSection DIV.cnn_header UL LI:hover .cnn_rollover { display:block; }
143 | DIV.cnnBannerSection DIV.cnn_more { color:#2e373c;font-size:10px;padding:2px 0 0 0; }
144 | DIV.cnnBannerSection DIV.cnn_more A { color:#fff;font-weight:bold; }
145 | DIV.cnnBannerSection DIV.cnn_more A:hover { color:#e7e7e7; }
146 | DIV.cnnBannerSection DIV.cnn_more DIV { display:none;color:#ccc;line-height:12px; }
147 | DIV.cnnBannerSection DIV.cnn_more DIV SPAN A { font:9px arial;font-weight:normal; }
148 | DIV.cnnBannerSection DIV.cnn_header DIV.cnn_more A { font-family:arial; }
149 | 
150 | DIV.cnnGameScores { background:#6f7f8b;border-bottom:11px solid #384d5e; }
151 | </style>
152 | 
153 | <!-- start personalize -->
154 | <div class="cnnPersonalize"><div><div><script>cnn_writePresonalizeBar();</script></div></div></div>
155 | <!-- end personalize -->
156 | 
157 | <!-- start searchbar -->
158 | <div class="cnnSearch">
159 | 	<div class="cnnLeft"><ul>
160 | 	<li class="cnnItem0" id="cnnCM1"><dl><script type="text/javascript">
161 | /* script for 50/50 split */
162 | /*var min=1;
163 | var max=2;
164 | x = Math.floor(Math.random() * (max - min + 1)) + min;
165 | if(x/2 == 1) {
166 |   document.write('<dt><a href="https://subscription.si.com/storefront/subscribe-to-sports-illustrated/link/1006340.html" target="_blank" rel="nofollow" id="cnn_cm_subscribe0"><img src="http://i.cdn.turner.com/si/2011_images/cm/WS11_btn_champ_STL.png" alt="Get the Cardinals Championship Package" title="Get the Cardinals Championship Package"/></a></dt>');
167 |   document.write('<dd><a href="https://subscription.si.com/storefront/subscribe-to-sports-illustrated/link/1006340.html" target="_blank" rel="nofollow" id="cnn_cm_subscribe1"><img src="http://i.cdn.turner.com/si/2011_images/cm/WS11_dropdown_STL.png" alt="Get the Cardinals Championship Package" title="Get the Cardinals Championship Package"/></a></dd>');
168 | } else {
169 |   document.write('<dt><a href="https://subscription.si.com/storefront/subscribe-to-sports-illustrated/link/1007180.html" target="_blank" rel="nofollow" id="cnn_cm_subscribe0"><img src="http://i.cdn.turner.com/si/2011_images/cm/EA-N4S-TheRun-btn.png" alt="Get Need for Speed 12 FREE" title="Get Need for Speed 12 FREE"/></a></dt>');
170 |   document.write('<dd><a href="https://subscription.si.com/storefront/subscribe-to-sports-illustrated/link/1007180.html" target="_blank" rel="nofollow" id="cnn_cm_subscribe1"><img src="http://i.cdn.turner.com/si/2011_images/cm/EA-N4S-TheRun-SI-dropdown.jpg" alt="Get Need for Speed 12 FREE" title="Get Need for Speed 12 FREE"/></a></dd>');
171 | }
172 | */
173 | </script>
174 | 
175 | 
176 | <!--Kentucky-->
177 | <dt><a href="https://subscription.si.com/storefront/subscribe-to-sports-illustrated/link/1009459.html" target="_blank" rel="nofollow" id="cnn_cm_subscribe0"><img src="http://i.cdn.turner.com/si/2012_images/cm/si-btn-champ-kentucky.png" alt="Get the Wildcats Championship Package" title="Get the Wildcats Championship Package"/></a></dt>
178 | <dd><a href="https://subscription.si.com/storefront/subscribe-to-sports-illustrated/link/1009459.html" target="_blank" rel="nofollow" id="cnn_cm_subscribe1"><img src="http://i.cdn.turner.com/si/2012_images/cm/si-dropdown-kentucky.png" alt="Get the Wildcats Championship Package" title="Get the Wildcats Championship Package"/></a></dd>
179 | 
180 | <!--original generic sub buttons, changed on 10.26.11 for world series-->
181 | <!--<dt><a href="https://subscription.si.com/storefront/subscribe-to-sports-illustrated/link/1005085.html" target="_blank" rel="nofollow" id="cnn_cm_subscribe0"><img src="http://i.cdn.turner.com/si/2011_images/cm/si-btn-EA-MADDEN12.png" alt="Get EA Sports Madden NFL 12 Free!" title="Get EA Sports Madden NFL 12 Free!"/></a></dt>
182 | <dd><a href="https://subscription.si.com/storefront/subscribe-to-sports-illustrated/link/1005085.html" target="_blank" rel="nofollow" id="cnn_cm_subscribe1"><img src="http://i.cdn.turner.com/si/2011_images/cm/si-dropdown-EA-MADDEN12.jpg" alt="Get EA Sports Madden NFL 12 Free!" title="Get EA Sports Madden NFL 12 Free!"/></a></dd>
183 | -->
184 | 
185 | <script><!--
186 | /*
187 | if (cnnPage.isHomepage) {
188 | 	var button = $e('cnn_cm_subscribe0');
189 | 	button.href = 'https://subscription.si.com/storefront/subscribe-to-sports-illustrated/link/1004340.html';
190 | 	button = $e('cnn_cm_subscribe1');
191 | 	button.href = 'https://subscription.si.com/storefront/subscribe-to-sports-illustrated/link/1004340.html';
192 | }
193 | */
194 | //--></script>
195 | </dl></a></li>
196 | 	<li class="cnnItem1"><dl><script type="text/javascript">
197 | var min=1;
198 | var max=2;
199 | x = Math.floor(Math.random() * (max - min + 1)) + min;
200 | /*turning off 50/50 for now*/
201 | /*if(x/2 == 1) {
202 |   document.write('<dt><a href="https://subscription.si.com/storefront/subscribe-to-sports-illustrated/link/1002346.html" target="_blank" rel="nofollow" id="cnn_cm_subscribe2"><img src="http://i.cdn.turner.com/si/.element/img/4.1/global/cm/button_subscribe_si_red.png" alt="Subscribe to SI" title="Subscribe to SI"/></a></dt>');
203 |   document.write('<dd><a href="https://subscription.si.com/storefront/subscribe-to-sports-illustrated/link/1002346.html" target="_blank" rel="nofollow" id="cnn_cm_subscribe4"><img src="http://i.cdn.turner.com/si/.element/img/4.1/global/cm/dropdown_subscribe_si_red.png" alt="Subscribe to SI" title="Subscribe to SI"/></a></dd>');
204 | } else {*/
205 |   document.write('<dt><a href="https://subscription.si.com/storefront/subscribe-to-sports-illustrated/link/1002346.html" target="_blank" rel="nofollow" id="cnn_cm_subscribe2"><img src="http://i.cdn.turner.com/si/.element/img/4.1/global/cm/button_subscribe_si_red.png" alt="Subscribe to SI" title="Subscribe to SI"/></a></dt>');
206 |   document.write('<dd style="margin-left:-79px"><a href="https://subscription.si.com/storefront/subscribe-to-sports-illustrated/link/1002346.html" target="_blank" rel="nofollow" id="cnn_cm_subscribe4"><img src="http://i.cdn.turner.com/si/2012_images/cm/bn_2osi16579_290x162_v1.png" alt="Subscribe to SI" title="Subscribe to SI"/></a></dd>');
207 | //}
208 | </script>
209 | </dl></li>
210 | 	<li class="cnnItem2"><dl><!--Default ROS
211 | <a href="https://subscription.si.com/storefront/subscribe-to-sports-illustrated/link/1001406.html" target="_blank" rel="nofollow" id="cnn_cm_subscribe3"><img src="http://i.cdn.turner.com/si/2012_images/cm/si-btn3_170x30_sigift.png" alt="Give the Gift of SI" title="Give the Gift of SI"/></a>
212 | -->
213 | 
214 | <script type="text/javascript">
215 | /*var min=1;
216 | var max=2;
217 | x = Math.floor(Math.random() * (max - min + 1)) + min;
218 | if(x/2 == 1) {
219 |   document.write('<dt><a href="https://subscription.si.com/storefront/Give-the-Gift-of-Sports-Illustrated/site/si-donor0411jacket.html?xid=sirosheader&link=1001406" target="_blank" rel="nofollow" id="cnn_cm_subscribe0"><img src="http://i.cdn.turner.com/si/2011_images/cm/170x30.png" alt="Give the Gift of SI" title="Give the Gift of SI"/></a></dt>');
220 |   document.write('<dd><a href="https://subscription.si.com/storefront/Give-the-Gift-of-Sports-Illustrated/site/si-donor0411jacket.html?xid=sirosheader&link=1001406" target="_blank" rel="nofollow" id="cnn_cm_subscribe1"><img src="http://i.cdn.turner.com/si/2011_images/cm/170X110.jpg" alt="Give the Gift of SI" title="Give the Gift of SI"/></a></dd>');
221 | } else {
222 |   document.write('<dt><a href="http://www.si.com/swim2012" target="_blank" rel="nofollow" id="cnn_cm_subscribe0"><img src="http://i.cdn.turner.com/si/2012_images/cm/si-btn_swim.png" alt="Swimsuit 2012" title="Swimsuit 2012"/></a></dt>');
223 |   document.write('<dd><a href="http://www.si.com/swim2012" target="_blank" rel="nofollow" id="cnn_cm_subscribe1"><img src="http://i.cdn.turner.com/si/2012_images/cm/SWIM_2012_dropdown.png" alt="Swimsuit 2012" title="Swimsuit 2012"/></a></dd>');
224 | */
225 | </script>
226 | 
227 | <!--MLB2K 2012-->
228 | <dt><a href="https://subscription.si.com/storefront/subscribe-to-sports-illustrated/link/1009469.html" target="_blank" rel="nofollow" id="cnn_cm_subscribe0"><img src="http://i.cdn.turner.com/si/2012_images/cm/si-btn3_MLB2K12.png" alt="Get MLB 2K 12 FREE" title="Get MLB 2K 12 FREE"/></a></dt>
229 | <dd><a href="https://subscription.si.com/storefront/subscribe-to-sports-illustrated/link/1009469.html" target="_blank" rel="nofollow" id="cnn_cm_subscribe1"><img src="http://i.cdn.turner.com/si/2012_images/cm/si-MLB2K12-dropdown.jpg" alt="Get MLB 2K 12 FREE" title="Get MLB 2K 12 FREE"/></a></dd>
230 | 
231 | <!--swimsuit 2012-->
232 | <!--
233 | <dt><a href="http://sportsillustrated.cnn.com/swim2012" target="_blank" rel="nofollow" id="cnn_cm_subscribe0"><img src="http://i.cdn.turner.com/si/2012_images/cm/si-btn_swim.png" alt="Swimsuit 2012" title="Swimsuit 2012"/></a></dt>
234 | <dd><a href="http://sportsillustrated.cnn.com/swim2012" target="_blank" rel="nofollow" id="cnn_cm_subscribe1"><img src="http://i.cdn.turner.com/si/2012_images/cm/SWIM_2012_dropdown.png" alt="Swimsuit 2012" title="Swimsuit 2012"/></a></dd>
235 | -->
236 | </dl></li>
237 | </ul>
238 | </div>
239 | 	<div class="cnnRight"><form method="get" action="http://sportsillustrated.cnn.com/search/" name="cm_search"><input type="text" name="text" class="cnnLeft"/><input type="image" src="http://i.cdn.turner.com/si/.element/img/4.1/global/search.gif" alt="Search" title="Search" class="cnnRight"/></form></div>
240 | </div>
241 | <!-- end searchbar -->
242 | 
243 | <!-- start banner -->
244 | <div class="cnnBanner">
245 | 	<div><a href="/"><img src="http://i.cdn.turner.com/si/.element/img/4.1/sect/global/logo2.png" alt="SI.com Home" title="SI.com Home"/></a>
246 | </div>
247 | 	<div class="cnnBannerSection">
248 | 		<div class="cnnLeft"><a href="/baseball/mlb/"><img src="http://i.cdn.turner.com/si/.element/img/4.0/sect/baseball/mlb/icon.jpg"/></a></div>
249 | 		<div class="cnn_header"><a href="/baseball/mlb/">MLB GAMEFLASH</a></div>
250 | 		<div class="cnn_more" style="font-size:9px;"><a href="/baseball/mlb/scoreboards/today/">Scores</a> | <a href="/baseball/mlb/teams/">Teams</a> | <a href="/baseball/mlb/players/">Players</a> | <a href="/fantasy/player_news/mlb/">Player News</a> | <a href="/baseball/mlb/standings/">Standings</a> | <a href="/baseball/mlb/probables/today/">Probables</a> | <a href="/baseball/mlb/schedules/weekly/today/">Schedules</a> | <a href="/baseball/mlb/stats/">Stats</a> | <a href="/baseball/mlb/transactions/">Transactions</a> | <a href="/baseball/mlb/injuries/">Injuries</a> | <a href="http://www.ticketcity.com/mlb-tickets.html " target="_blank" rel="nofollow">Tickets</a> | <a href="http://mlb.mlb.com/mlb/subscriptions/index.jsp?product=si&vbID=simlbtv_test" target="_blank" rel="nofollow">MLB.TV</a>
251 | </div>
252 | 	</div>
253 | </div>
254 | <div class="cnnClear"></div>
255 | <!-- end banner -->
256 | 
257 | <style>
258 | /*
259 | DIV.cnnTopnav LI A { color:#000;display:block;padding:0 16px 0 16px!important; }
260 | DIV.cnnTopnav LI A { color:#000;display:block;padding:0 23px 0 22px!important; }
261 | */
262 | DIV.cnnTopnav LI A { color:#000;display:block;padding:0 11px 0 11px!important; }
263 | DIV.cnnTopnav LI.cnnFirst { padding-left:0px; }
264 | </style>
265 | <div class="cnnTopnav">
266 | 	<ul>
267 | 		<li class="cnnFirst"><a href="/extramustard/?eref=sinav">EXTRA MUSTARD</a></li>
268 | 		<li><a href="http://www.fannation.com/?eref=sinav">FANNATION</a></li>
269 | 		<li><a href="/multimedia/photo_gallery/?eref=sinav">PHOTOS</a></li>
270 | 		<li><a href="/swimsuit/?eref=sinav">SWIMSUIT</a></li>
271 | 		<li><a href="/fantasy/?eref=sinav">FANTASY</a></li>
272 | 		<li><a href="/magazine/sportsman/?eref=sinav">SPORTSMAN</a></li>
273 | 		<li><a href="http://www.sportsillustratedeverywhere.com/">MAGAZINE</a></li>
274 | 		<li><a href="/sifk/?eref=sinav">SI KIDS</a></li>
275 | 		<li><a href="/highschool/?eref=sinav">HIGH SCHOOL</a></li>
276 | 		<li><a href="/behindthemic/?eref=sinav">BEHIND THE MIC</a></li>
277 | 		<li><a href="http://www.twackle.com/" target="_blank" rel="nofollow">TWACKLE</a></li>
278 | 		<!--<li><a href="http://www.maxpreps.com/national/national.htm?eref=sinav" target="_blank" rel="nofollow">MAXPREPS</a></li>-->
279 | 	</ul>
280 | </div>
281 | <!-- end topnav -->
282 | 
283 | <style>
284 | 	.ie6 #cnnBotnav LI#cnnBotnav0 { width:49px; } /* NFL */
285 | 	.ie6 #cnnBotnav LI#cnnBotnav1 { width:150px; } /* COLLEGE FOOTBALL */
286 | 	.ie6 #cnnBotnav LI#cnnBotnav2 { width:50px; } /* MLB */
287 | 	.ie6 #cnnBotnav LI#cnnBotnav3 { width:51px; } /* NBA */
288 | 	.ie6 #cnnBotnav LI#cnnBotnav4 { width:101px; } /* COLLEGE BB */
289 | 	.ie6 #cnnBotnav LI#cnnBotnav5 { width:58px; } /* GOLF */
290 | 	.ie6 #cnnBotnav LI#cnnBotnav6 { width:50px; } /* NHL */
291 | 	.ie6 #cnnBotnav LI#cnnBotnav7 { width:74px; } /* RACING */
292 | 	.ie6 #cnnBotnav LI#cnnBotnav8 { width:74px; } /* SOCCER */
293 | 	.ie6 #cnnBotnav LI#cnnBotnav9 { width:121px; } /* MMA & BOXING */
294 | 	.ie6 #cnnBotnav LI#cnnBotnav11 { width:73px; } /* TENNIS */
295 | 	.ie6 #cnnBotnav LI#cnnBotnav12 { width:63px; } /* MORE */
296 | 	.ie6 #cnnBotnav LI#cnnBotnav13 { width:74px; } /* VIDEO */
297 | 	#cnnBotnav LI#cnnBotnav0 STRONG { width:49px; } /* NFL */
298 | 	#cnnBotnav LI#cnnBotnav1 STRONG { width:150px; } /* COLLEGE FOOTBALL */
299 | 	#cnnBotnav LI#cnnBotnav2 STRONG { width:50px; } /* MLB */
300 | 	#cnnBotnav LI#cnnBotnav3 STRONG { width:51px; } /* NBA */
301 | 	#cnnBotnav LI#cnnBotnav4 STRONG { width:101px; } /* COLLEGE BB */
302 | 	#cnnBotnav LI#cnnBotnav5 STRONG { width:58px; } /* GOLF */
303 | 	#cnnBotnav LI#cnnBotnav6 STRONG { width:50px; } /* NHL */
304 | 	#cnnBotnav LI#cnnBotnav7 STRONG { width:74px; } /* RACING */
305 | 	#cnnBotnav LI#cnnBotnav8 STRONG { width:74px; } /* SOCCER */
306 | 	#cnnBotnav LI#cnnBotnav9 STRONG { width:121px; } /* MMA & BOXING */
307 | 	#cnnBotnav LI#cnnBotnav11 STRONG { width:73px; } /* TENNIS */
308 | 	#cnnBotnav LI#cnnBotnav12 STRONG { width:63px; } /* MORE */
309 | 	#cnnBotnav LI#cnnBotnav13 STRONG { width:74px; } /* VIDEO */
310 | 
311 | /* realignment */
312 | 	#cnnBotnav LI#cnnBotnav11:hover UL,
313 | 	#cnnBotnav LI#cnnBotnav11 LI.cnnOver UL { margin-left:0; } /* width of subnav minus width of TENNIS minus width of MORE minus 2 lines */
314 | 	#cnnBotnav LI#cnnBotnav12:hover UL,
315 | 	#cnnBotnav LI#cnnBotnav12 LI.cnnOver UL { margin-left:-41px; } /* width of subnav minus width of MORE minus 1 line */
316 | 	#cnnBotnav LI#cnnBotnav13:hover UL,
317 | 	#cnnBotnav LI#cnnBotnav13 LI.cnnOver UL { margin-left:-93px; width:168px; } /* width of subnav minus width of MORE minus 1 line */
318 | 	#cnnBotnav LI#cnnBotnav13 UL LI { width:168px; }
319 | </style>
320 | <!-- start botnav -->
321 | <div class="cnnBotnav">
322 | 	<div>
323 | 		<ul id="cnnBotnav" style="height:29px;overflow:hidden;">
324 | 			<li id="cnnBotnav0" nav="nfl">
325 | 				<a href="/football/nfl/?eref=sinav"><strong>NFL</strong></a>
326 | 			</li>
327 | 			<li id="cnnBotnav1" nav="ncaaf">
328 | 				<a href="/football/ncaa/?eref=sinav"><strong>COLLEGE FOOTBALL</strong></a>
329 | 			</li>
330 | 			<li id="cnnBotnav2" nav="mlb">
331 | 				<a href="/baseball/mlb/?eref=sinav"><strong>MLB</strong></a>
332 | 			</li>
333 | 			<li id="cnnBotnav3" nav="nba">
334 | 				<a href="/basketball/nba/?eref=sinav"><strong>NBA</strong></a>
335 | 			</li>
336 | 			<li id="cnnBotnav4" nav="ncaabb">
337 | 				<a href="/basketball/ncaa/?eref=sinav"><strong>COLLEGE BB</strong></a>
338 | 			</li>
339 | 			<li id="cnnBotnav5" nav="golf">
340 | 				<a href="http://www.golf.com/?eref=sinav"><strong>GOLF</strong></a>
341 | 			</li>
342 | 			<li id="cnnBotnav6" nav="nhl">
343 | 				<a href="/hockey/nhl/?eref=sinav"><strong>NHL</strong></a>
344 | 			</li>
345 | 			<li id="cnnBotnav7" nav="racing">
346 | 				<a href="/racing/?eref=sinav"><strong>RACING</strong></a>
347 | 			</li>
348 | 			<li id="cnnBotnav8" nav="soccer">
349 | 				<a href="/soccer/?eref=sinav"><strong>SOCCER</strong></a>
350 | 			</li>
351 | 			<li id="cnnBotnav9" nav="boxmma">
352 | 				<a href="/mma/?eref=sinav"><strong>MMA &amp; BOXING</strong></a>
353 | 			</li>
354 | 			<li id="cnnBotnav11" nav="tennis">
355 | 				<a href="/tennis/?eref=sinav"><strong>TENNIS</strong></a>
356 | 			</li>
357 | 			<li id="cnnBotnav12" nav="more">
358 | 				<a href="/more/?eref=sinav"><strong>MORE</strong></a>
359 | 			</li>
360 | 			<li id="cnnBotnav13" nav="video">
361 | 				<a href="/video/?eref=sinav"><strong>VIDEO</strong></a>
362 | 			</li>
363 | 		</ul>
364 | 	</div>
365 | </div>
366 | <!-- end botnav -->
367 | 
368 | 
369 | <div class="cnnViewerAd"><script type="text/javascript">iframeAdFactory.getAd('i_728x90', 728, 90, new Array('728x90','101x1'), true);</script></div>
370 | 
371 | <!-- start scoreboard ticker -->
372 | <div id="scoreticker" class="stMLB">
373 | 
374 | 	<div id="stScrollWrap">
375 | 		<a href="" class="stScrollControl left disabled"></a>
376 | 		<a href="" class="stScrollControl right"></a>
377 | 		<div id="stScroller"></div>
378 | 	</div>
379 | 	
380 | </div>
381 | <!-- end scoreboard ticker -->
382 | 
383 |    <!-- end contentHeader-->
384 |    
385 |    <!-- start scoreboard -->
386 |    <div class="cnngScoreboardNoLastPlay">
387 |       <div class="cnngScoreboard">
388 |          <div class="cnnLeft">
389 |             <div>&nbsp;
390 |                						
391 |             </div>
392 |             <table border="0" cellpadding="0" cellspacing="0">
393 |                <tr class="cnnRow0">
394 |                   <td class="cnnCol0">&nbsp;</td>
395 |                   <td class="cnnCol1">1</td>
396 |                   <td class="cnnCol2">2</td>
397 |                   <td class="cnnCol3">3</td>
398 |                   <td class="cnnCol4">4</td>
399 |                   <td class="cnnCol5">5</td>
400 |                   <td class="cnnCol6">6</td>
401 |                   <td class="cnnCol7">7</td>
402 |                   <td class="cnnCol8">8</td>
403 |                   <td class="cnnCol9">9</td>
404 |                   <td class="cnnColR">R</td>
405 |                   <td class="cnnColH">H</td>
406 |                   <td class="cnnColE">E</td>
407 |                </tr>
408 |                <tr class="cnnRow1">
409 |                   <td class="cnnCol0"><a href="/baseball/mlb/teams/tigers/">TIGERS</a></td>
410 |                   <td class="cnnCol1">&nbsp;</td>
411 |                   <td class="cnnCol2">&nbsp;</td>
412 |                   <td class="cnnCol3">&nbsp;</td>
413 |                   <td class="cnnCol4">&nbsp;</td>
414 |                   <td class="cnnCol5">&nbsp;</td>
415 |                   <td class="cnnCol6">&nbsp;</td>
416 |                   <td class="cnnCol7">&nbsp;</td>
417 |                   <td class="cnnCol8">&nbsp;</td>
418 |                   <td class="cnnCol9">&nbsp;</td>
419 |                   <td class="cnnColR">&nbsp;</td>
420 |                   <td class="cnnColH">&nbsp;</td>
421 |                   <td class="cnnColE">&nbsp;</td>
422 |                </tr>
423 |                <tr class="cnnRow2">
424 |                   <td class="cnnCol0"><a href="/baseball/mlb/teams/royals/">ROYALS</a></td>
425 |                   <td class="cnnCol1">&nbsp;</td>
426 |                   <td class="cnnCol2">&nbsp;</td>
427 |                   <td class="cnnCol3">&nbsp;</td>
428 |                   <td class="cnnCol4">&nbsp;</td>
429 |                   <td class="cnnCol5">&nbsp;</td>
430 |                   <td class="cnnCol6">&nbsp;</td>
431 |                   <td class="cnnCol7">&nbsp;</td>
432 |                   <td class="cnnCol8">&nbsp;</td>
433 |                   <td class="cnnCol9">&nbsp;</td>
434 |                   <td class="cnnColR">&nbsp;</td>
435 |                   <td class="cnnColH">&nbsp;</td>
436 |                   <td class="cnnColE">&nbsp;</td>
437 |                </tr>
438 |             </table>
439 |          </div>
440 |          <div class="cnnRight">
441 |             <ol>
442 |                <li class="cnnItem4">8:10 PM ET
443 |                   						
444 |                </li>
445 |             </ol>
446 |             <ul>
447 |                <li class="cnnItem0"><strong>Tigers</strong><a href="/baseball/mlb/players/7590/"><img src="http://i.cdn.turner.com/si/.e1d/img/4.0/global/baseball/mlb/players/7590_small.jpg" border="0" width="50" height="76" alt="Verlander" title="Verlander"></a><a href="/baseball/mlb/players/7590/">
448 |                      <div class="cnnLine0">Verlander</div>
449 |                      <div class="cnnLine4">0-1</div>
450 |                      <div class="cnnLine5">2.2&nbsp;ERA</div>
451 |                      <div class="cnnLine6">&nbsp;</div>
452 |                      <div class="cnnLine7">&nbsp;</div></a></li>
453 |                <li class="cnnItem1"><strong>Royals</strong><a href="/baseball/mlb/players/8932/"><img src="http://i.cdn.turner.com/si/.e1d/img/4.0/global/baseball/mlb/players/8932_small.jpg" border="0" width="50" height="76" alt="Duffy" title="Duffy"></a><a href="/baseball/mlb/players/8932//">
454 |                      <div class="cnnLine0">Duffy</div>
455 |                      <div class="cnnLine4">1-0</div>
456 |                      <div class="cnnLine5">0&nbsp;ERA</div>
457 |                      <div class="cnnLine6">&nbsp;</div>
458 |                      <div class="cnnLine7">&nbsp;</div></a></li>
459 |             </ul>
460 |          </div>
461 |       </div>
462 |    </div>
463 |    <!-- end scoreboard -->
464 |    
465 |    <!-- start navbar -->
466 |    <div class="cnngNavbar">
467 |       <table border="0" cellpadding="0" cellspacing="0">
468 |          <tr>
469 |             <td class="cnnCol0"><span>PREVIEW</span></td>
470 |             <td class="cnnCol0"><a href="40630_matchup.html">MATCHUP</a></td></li>
471 |             <td class="cnnCol3"><a href="40630_fancomment.html">FAN COMMENTS</a></td>
472 |          </tr>
473 |       </table>
474 |    </div>
475 |    <!-- end navbar -->
476 |    
477 |    <!-- start content -->
478 |    <div class="cnngContent">
479 |    	<div class="cnngPreview">
480 |    		<div class="cnnLeft">
481 |    			
482 |    <!-- REAPFINDREPLACE:20120515:/.element/ssi/story/4.1/wires/ap/expired_story.html:/baseball/mlb/gameflash/2012/04/16/40630_preview.html-->
483 |    <h1>Tigers-Royals Preview</h1>
484 |    <p>
485 |       
486 |       <span class="cnnDataLinked"><a href="/baseball/mlb/players/7590/index.html">Justin Verlander</a></span>
487 |       has pitched well in each of his first two starts, though he doesn't have a win to show for those efforts.
488 |       
489 |    </p>
490 |    <p>
491 |       He hasn't had much trouble earning victories against the 
492 |       <span class="cnnDataLinked"><a href="/baseball/mlb/teams/royals/index.html">Kansas City Royals</a></span>
493 |       .
494 |       
495 |    </p>
496 |    <p>
497 |       Verlander looks to continue his mastery of the Royals when the 
498 |       <span class="cnnDataLinked"><a href="/baseball/mlb/teams/tigers/index.html">Detroit Tigers</a></span>
499 |       visit Kauffman Stadium in the opener of a three-game series Monday night.
500 |       
501 |    </p>
502 |    <p>
503 |       The reigning AL 
504 |       <span class="cnnDataLinked"><a href="/baseball/mlb/players/49534/index.html">Cy Young</a></span>
505 |       winner and MVP had a 2-0 lead through eight innings in both of his outings, but the Tigers weren't able to hold the lead.
506 |       
507 |    </p>
508 |    <p>Verlander (0-1, 2.20 ERA) allowed two hits before running into trouble in the ninth against Tampa Bay on Wednesday, getting
509 |       charged with four runs in 8 1-3 innings of a 4-2 defeat.
510 |    </p>"Once a couple guys got on, really the first time I've cranked it up like that - and lost a little bit of my consistency that
511 |    I'd had all day," Verlander said. "It's inexcusable. This loss rests solely on my shoulders." 
512 |    <p>The right-hander did his part in his opening-day start against Boston on April 5, allowing two hits before the bullpen faltered.
513 |       Detroit ended up winning 3-2 with a run in the bottom of the ninth, though Verlander didn't earn a decision.
514 |    </p>
515 |    <p>That hasn't been the case in his last four starts against the Royals, winning each with a 1.82 ERA. Verlander is 13-2 with
516 |       a 2.40 ERA in 19 career starts versus Kansas City, and another win will give him more victories than he has against any other
517 |       team. He's also beaten Cleveland 13 times.
518 |    </p>
519 |    <p>Verlander is 8-2 with a 1.82 ERA lifetime at Kauffman Stadium, where the Royals (3-6) were swept in a three-game series against
520 |       the Indians with Sunday's 13-7 loss.
521 |    </p>
522 |    <p>
523 |       
524 |       <span class="cnnDataLinked"><a href="/baseball/mlb/players/7634/index.html">Billy Butler</a></span>
525 |       , who is 14 for 39 (.359) with two homers off Verlander, had an RBI single and is hitting .364 with four doubles and a homer
526 |       during a five-game hitting streak.
527 |       
528 |    </p>
529 |    <p>
530 |       Royals pitchers allowed seven home runs, 17 extra-base hits and 32 runs in the series, and manager 
531 |       <span class="cnnDataLinked"><a href="/baseball/mlb/players/1716/index.html">Ned Yost</a></span>
532 |       turned to outfielder 
533 |       <span class="cnnDataLinked"><a href="/baseball/mlb/players/7899/index.html">Mitch Maier</a></span>
534 |       in the ninth to pitched a scoreless inning Sunday.
535 |       
536 |    </p>"Let's hope it doesn't happen again," Maier said. "I don't like to be put in that situation, but we needed an inning." 
537 |    <p>
538 |       Kansas City will look to bounce back with the help of another solid outing from 
539 |       <span class="cnnDataLinked"><a href="/baseball/mlb/players/8932/index.html">Danny Duffy</a></span>
540 |       (1-0, 0.00), who allowed one hit and struck out eight in six innings of a 3-0 win over Oakland on Tuesday.
541 |       
542 |    </p>
543 |    <p>The left-hander will be seeking his first win against Detroit after going 0-2 with a 5.63 ERA in three starts versus the Tigers
544 |       as a rookie.
545 |    </p>
546 |    <p>
547 |       
548 |       <span class="cnnDataLinked"><a href="/baseball/mlb/players/7129/index.html">Gerald Laird</a></span>
549 |       was a triple short of the cycle and helped the Tigers (6-3) salvage the finale of a three-game series with a 5-2 victory over
550 |       Chicago on Sunday.
551 |       
552 |    </p>
553 |    <p>
554 |       
555 |       <span class="cnnDataLinked"><a href="/baseball/mlb/players/8419/index.html">Rick Porcello</a></span>
556 |       allowed one run in 7 2-3 innings to give Detroit's starting rotation its first victory.
557 |       
558 |    </p>"All the other starters have pitched well," Porcello said. "It's just the way it's happened so far." 
559 |    <p>Verlander allowed three runs in seven innings of a 4-3 win over the Royals on Aug. 6, beating Duffy, who gave up three runs
560 |       over five.
561 |    </p>
562 |    <!-- /REAPFINDREPLACE:20120515:/.element/ssi/story/4.1/wires/ap/expired_story.html:/baseball/mlb/gameflash/2012/04/16/40630_preview.html-->
563 |    			<p class="cnnLast">
564 |    				<a href="http://biz.stats.com/" target="new">&#169; 2011 STATS LLC <img src="http://i.a.cnn.net/si/images/STATSlogo.gif" align="absmiddle" alt="STATS, Inc"></a>
565 | 
566 |    			</p>
567 |    		</div>
568 |    		<div class="cnnRight">
569 |    			
570 |    			<div class="cnngCommentsBox" id="cnngCommentsBox">
571 |    				<div class="cnngComments">
572 |    					<div class="cnnHolder">						
573 |    						<div id="fanComments">
574 |    							<iframe src="http://www.fannation.com/gameday/gameflash_game_comments/320416107?sport_id=2" width="397" height="390" marginwidth="0" scrolling="no" frameborder="0"></iframe>
575 |    						</div>
576 |    					</div>
577 |    				</div>
578 |    				<div class="cnn_footer">
579 |    					<div class="cnngToggleOn"><a href="javascript:hidediv();">TURN COMMENTS <span>OFF</span></a></div>
580 |    					<div class="cnngToggleOff"><a href="javascript:showdiv();">TURN COMMENTS <span>ON</span></a></div>
581 |    				</div>
582 |    			</div>
583 |    		</div>
584 |    	</div>
585 |    </div>
586 |    <!-- end content -->
587 |    
588 |    <!-- start contentFooter -->
589 |    <div class="cnnWideSL"><script type="text/javascript">adsonar_placementId=1488671;adsonar_pid=769769;adsonar_ps=-1;adsonar_zw=978;adsonar_zh=150;</script><script>cnnad_createSL();</script></div>
590 | <!-- start footerbox -->
591 | <div class="cnnFooterBox">
592 | 	<div class="cnnHolder">
593 | 		<div class="cnnRight">
594 | 			<dl>
595 | 				<dt><a href="/"><img src="http://i.cdn.turner.com/si/.element/img/4.1/global/footer_logo.jpg" alt="SI.com" title="SI.com"/></a></dt>
596 | 				<dd><span>Hot Topics:</span>   <a href="/2012/writers/peter_king/04/16/countdown/index.html" title="Peter King: MMQB"class="cnnFirst">Peter King: MMQB</a>   <a href="http://nhl-red-light.si.com/2012/04/16/mayhem-reigns-in-stanley-cup-playoffs/" title="NHL Playoffs" target="new" >NHL Playoffs</a>   <a href="/2012/writers/george_schroeder/04/16/arkansas-football-petrino/index.html" title="Bobby Petrino">Bobby Petrino</a>   <a href="/2012/baseball/mlb/04/16/valentine.youkilis.ap/index.html" title="Bobby Valentine">Bobby Valentine</a>   <a href="/2012/writers/michael_mccann/04/16/roger.clemens.trial.preview/index.html" title="Roger Clemens">Roger Clemens</a>   <a href="/2012/baseball/mlb/04/16/power.rankings/index.html" title="MLB Power Rankings">MLB Power Rankings</a>   <a href="/2012/writers/richard_rothschild/04/13/jackie.robinson/index.html" title="Jackie Robinson">Jackie Robinson</a> </dd>
597 | 			</dl>
598 | 			<div class="cnnClear"></div>
599 | 			<ul>
600 | 				<li><a href="https://subscription.si.com/storefront/subscribe-to-sports-illustrated/link/1002348.html" target="_blank" rel="nofollow">SUBSCRIBE TO SI</a></li>
601 | 				<li><a href="http://www.sportsillustratedeverywhere.com" target="_blank" rel="nofollow">DIGITAL EDITION</a></li>
602 | 				<li><a href="/mobile/">SI MOBILE</a></li>
603 | 				<li><a href="/2010/about_us/jobs/">JOBS</a></li>
604 | 				<li><a href="/sitemap/">SITE MAP</a></li>
605 | 				<li><a href="https://subscription.si.com/storefront/subscribe-to-sports-illustrated/link/1003862.html" target="_blank" rel="nofollow">GIVE THE GIFT OF SI</a></li>
606 | 				<li><a href="http://sipictures.com/" target="_blank" rel="nofollow">SI PICTURE SALES</a></li>
607 | 				<li><a href="http://www.sportsillustratedsnapshot.com" target="_blank" rel="nofollow">PICTURES OF THE DAY</a></li>
608 | 				<li><a href="/about_us/">ABOUT US</a></li>
609 | 				<li><a href="http://simediakit.com" target="_blank" rel="nofollow">SI MEDIA KITS</a></li>
610 | 				<li><a href="http://www.sicovers.com/default.aspx?utm_source=sicom&utm_medium=ftr&utm_campaign=icrefer&xid=siftr" target="_blank" rel="nofollow">SI COVER COLLECTION</a></li>
611 | 				<li><a href="http://sicustomerservice.com/" target="_blank" rel="nofollow">SI CUSTOMER SERVICE</a></li>
612 | 				<li><a href="/2008/magazine/si.books/">SI BOOKS</a></li>
613 | 				<li><a href="/about_us/feedback/">CONTACT US</a></li>
614 | 				<li><a href="/services/rss/">ADD RSS HEADLINE</a></li>
615 | 			</ul>
616 | 			<div class="cnnClear"></div>
617 | 			<div class="cnnCopyright">
618 | 				<style>
619 | 				.cnnFooterBox .cnnHolder { overflow:hidden; }
620 | 				.cnnFooterBox .cnnRight DIV.cnnCopyright { line-height:16px;padding-top:2px;text-align:left; }
621 | 				.cnnFooterBox .cnnRight DIV.cnnCopyright IMG { float:left;margin:0 6px 14px 0; }
622 | 				.cnnFooterBox .cnnRight DIV.cnnCopyright IMG#cnnFooterAdOpt { float:none;margin:0 0 0 6px;vertical-align:bottom; }
623 | 				</style>
624 | 				<img src="http://i.cdn.turner.com//si/.element/img/4.1/global/logo_footer_turner.png" alt="Turner - SI Digital"/> 
625 | 				<script type="text/javascript">if( ( ( document.location.pathname ).indexOf( '/basketball/nba' ) >= 0 ) || ( ( document.location.pathname ).indexOf( '/video/nba' ) == 0 ) ) { document.write( 'TM & &#169; 2012 Turner Broadcasting System, Inc. A Time Warner Company. All Rights Reserved. SI.com is part of CNN Digital Network, which is part of the Turner Digital Network.' ); } else { document.write( 'TM & &#169; 2012 Turner Broadcasting System, Inc. A Time Warner Company. All Rights Reserved. SI.com is part of CNN Digital Network, which is part of the Turner Digital Network.' ); }</script><noscript>TM & &#169; 2012 Turner Broadcasting System, Inc. A Time Warner Company. All Rights Reserved. SI.com is part of CNN Digital Network, which is part of the Turner Digital Network.</noscript>
626 | 				<br/> <a href="/interactive_legal.html" rel="nofollow">Terms</a> under which this service is provided to you. Read our <a href="/privacy/" rel="nofollow">privacy guidelines</a>, <a href="https://subscription.timeinc.com/storefront/privacy/si/generic_privacy_new.html?dnp-source=E#california" rel="nofollow">your California privacy rights</a>, and <a href="http://subscription-assets.timeinc.com/prod/assets/themes/magazines/default/template-resources/html/legal/ti-corp-behavioral.html">ad choices<img src="http://i.cdn.turner.com/si/.element/img/4.1/global/logo_adchoices.gif" id="cnnFooterAdOpt"/></a>.
627 | 			</div>
628 | 		</div>
629 | 		<div class="cnnLeft"><a href="http://sportsillustrated.cnn.com/vault/cover/featured/11730/index.htm?xid=sivcoverhome"><img style="vertical-align:bottom;" title="SI Cover" alt="SI Cover" src="http://i.cdn.turner.com/si/si_online/covers/images/2012/0416_thumb.jpg"></a><a href="http://www.sportsillustratedeverywhere.com/?xid=sivcoverhome"><img style="vertical-align:bottom;" src="http://i.cdn.turner.com/sivault/.element/img/1.0/read_all_articles_96x12.gif" alt="Read All Articles" border="0" width="96" height="12"></a><a href="http://www.sicovers.com/ils.aspx?p=SPR20120416golf&utm_source=sivault&utm_medium=inet&utm_campain=icrefer &xid=sivcoverhome" target="_blank"><img style="vertical-align:bottom;" src="http://i.cdn.turner.com/sivault/.element/img/1.0/buy_cover_reprint.gif" alt="Buy Cover Reprint" border="0" width="96" height="12"></a>
630 | </div>
631 | 	</div>
632 | </div>
633 | 
634 | <!-- end footerbox -->
635 | 
636 | <!-- start searchbar -->
637 | <div class="cnnSearchFooter">
638 | 	<div class="cnnCenter"><form method="get" action="http://sportsillustrated.cnn.com/search/" name="footer_search"><input id="searchInputFooter" type="text" name="text" class="cnnLeft"/><input type="image" src="http://i.cdn.turner.com/si/.element/img/4.1/global/search.gif" alt="Search" title="Search" class="cnnRight"/></form></div>
639 | </div>
640 | 
641 | <!-- end searchbar -->
642 | 
643 | <!--START OF PAGELINKS.JS-->
644 | <script language="Javascript">// Post Processing code to update links with tracking references
645 | 
646 | var url = window.location.href.toString();
647 | url = url.replace(/http:\/\/[^\/]*/, '');
648 | url = url.replace(/\?.*$/, '');
649 | 
650 | // All links on page
651 | var links = document.getElementsByTagName('a');
652 | 
653 | for (var i=0; i < links.length; i++) {
654 | 	var link = links[i];
655 | 	if (link.href); else continue;
656 | 	if (link.href.indexOf('.html/')>0) { siLog.debug('Fix trail slash - ',link.href); link.href = link.href.replace(/\.html\//,'.html'); }
657 | 	if (!cnnPage.isHomepage) {
658 | 		// Loop through links, add erefs where expected
659 | 		if (link.href.indexOf('http://www.fannation.com/') == 0) {
660 | 			cnnAddQ( link, 'eref=fromSI' );
661 | 		}
662 | 		if (url != '/' && link.href.indexOf('/vault') > 0) {
663 | 			cnnAddQ( link, 'eref=sisf' );
664 | 		} 
665 | 		if (url.indexOf('/danpatrick') != 0 && link.href.indexOf('/danpatrick') > 0 && link.href.indexOf('.mp3') < 0) {
666 | 			cnnAddQ( link, 'eref=fromSI' );
667 | 		}
668 | 	}
669 | 	if (link.innerHTML == link.getAttribute('title')) {
670 | 		link.setAttribute('title','');
671 | 	}
672 | }
673 | 
674 | function cnnAddQ (link, add) {
675 | 	if (link.href.toLowerCase().indexOf('javascript') == -1) {
676 | 		if (link.href.indexOf('?') > 0) link.href = link.href + '&' + add;
677 | 		else link.href = link.href + '?' + add;
678 | 	}
679 | }
680 | 
681 | // Add whitespace to cnnClear
682 | var breaks = $c('cnnClear','div');
683 | 
684 | /* Homepage */
685 | if (cnnPage.isHomepage) {
686 | 	cnnTagHPLinks(); 
687 | 	/* iPad */
688 | 	if(navigator.userAgent.indexOf('iPad')>-1) {
689 | 		$e('cnnShareRow_mobile').href='http://ax.itunes.apple.com/WebObjects/MZStore.'
690 | 		+'woa/wa/browserRedirect?url=itms%253A%252F%252Fax.itunes.apple.com%252FWebObj'
691 | 		+'ects%252FMZStore.woa%252Fwa%252FviewSoftware%253Fid%253D329510739%2526mt%253D8';
692 | 	}
693 | 	/* Poll frame height issue */
694 | 	if ($e('cnnPollFrame')) { $e('cnnPollFrame').setAttribute('height','169'); }
695 | }</script>
696 | <!--END OF PAGELINKS.JS-->
697 | 
698 | </div>
699 | <div><!-- move tracking out of cnnpage -->
700 | <!-- ADBP/JSMD -->
701 | <!-- ADBP Meta Data -->
702 | <script type="text/javascript" src="http://i.cdn.turner.com/si/.e/js/4.1/global/jsmd/metadata.js"></script>
703 | <!-- /ADBP Meta Data -->	
704 | 
705 | <!-- JSMD Code --> 
706 | <script language="JavaScript" type="text/javascript" src="http://i.cdn.turner.com/si/.element/js/4.1/global/jsmd/jsmd.js"></script> 
707 | <script language="JavaScript"> 
708 | <!-- $pathname is defined in metadata.js
709 | if($pathname.indexOf("/.element/ssi/ads.iframes/") == -1 && $pathname.indexOf("/doubleclick/dartiframe.html") == -1) {
710 | 	var jsmd=_jsmd.init();
711 | 	if(document.referrer !== window.location.href){
712 | 		jsmd.send();
713 | 	}
714 | }
715 | //-->
716 | </script> 
717 | <!-- / End JSMD Code -->
718 | <!-- /ADBP/JSMD -->
719 | </div>
720 | 
721 | <div style="font-size:1px;line-height:1px;">
722 | <div><img src="/cookie.crumb" width="1" height="1"></div>
723 | </div>
724 | 
725 | <img src="http://i.cdn.turner.com/si/.e/img/4.0/global/pixels/blank_pixel.gif" alt="" id="TargetImageDE" name="TargetImageDE" onload="cnnad_getDEAdHeadCookie(this)" height="1" width="1">
726 | 
727 | <script language="JavaScript">
728 | 	siTracking.init();
729 | </script>
730 | <script language="JavaScript">
731 | 	//ADM
732 | 	cnnad_sendADMData();
733 | 	cnnad_ugsync();
734 | </script>
735 | 
736 | <!-- TIIAD -->
737 | <script type="text/javascript">
738 | function siQuantcast()
739 | {
740 | 	var lb = "Time Inc News Business and Sports,Sports Illustrated";
741 | 	var lb_ch = (jsmd.get("m:page.section[0]") ? jsmd.get("m:page.section[0]") : "");
742 | 	lb+=(lb_ch != null && typeof(lb_ch) == "string" && lb_ch.length > 0) ? "." + lb_ch:"";
743 | 	return lb;
744 | }
745 | _qoptions={
746 | 	qacct:"p-5dyPa639IrgIw",
747 | 	labels:siQuantcast()
748 | };
749 | </script>
750 | <script type="text/javascript" src="http://edge.quantserve.com/quant.js"></script>
751 | <noscript><img src="http://pixel.quantserve.com/pixel/p-5dyPa639IrgIw.gif?labels=Time Inc News Business and Sports,Sports Illustrated" style="display: none;" border="0" height="1" width="1" alt="Quantcast"/></noscript> 
752 | <script src="http://js.revsci.net/gateway/gw.js?csid=H07710&auto=t" type="text/javascript"></script>
753 | <!-- /TIIAD -->
754 | 
755 | <script src="http://i.cdn.turner.com/si/.e1d/js/4.1/global/pagelinks.js" type="text/javascript"></script>
756 | <script src="http://i.cdn.turner.com/si/.e1d/js/4.1/global/subnav.js" type="text/javascript"></script>
757 | 
758 |    <!-- end contentFooter -->
759 |    
760 |    <!--[if IE 6]></div></div><![endif]--><!--[if IE 7]></div></div><![endif]--><!--[if IE 8]></div></div><![endif]-->
761 |    </body>
762 |    </html>


--------------------------------------------------------------------------------
/tests/samples/summary-keep-all-images.sample.html:
--------------------------------------------------------------------------------
 1 | <!DOCTYPE html>
 2 | <html lang="en">
 3 | <head></head>
 4 | <body>
 5 | <h2>
 6 |     <span>
 7 |         H2 Headline H2 Headline H2 Headline H2 Headline H2 Headline H2 Headline H2 Headline H2 Headline H2 Headline H2 Headline
 8 |     </span>
 9 | </h2>
10 | <p>
11 |     <spa>
12 |         Text Text Text Text Text Text Text Text Text Text
13 |     </spa>
14 | </p>
15 | <div>
16 |     <span>
17 |         <a>
18 |             <img src="data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAoAAAAKCAIAAAACUFjqAAABhGlDQ1BJQ0MgcHJvZmlsZQAAKJF9kT1Iw1AUhU9TpSLVDnYQcchQnSyIFXHUKhShQqgVWnUweekfNDEkKS6OgmvBwZ/FqoOLs64OroIg+APiLjgpukiJ9yWFFjFeeLyP8+45vHcfIDSqTLO6xgFNt81MKinm8iti6BUBhBFBPxIys4xZSUrDt77uqZvqLs6z/Pv+rD61YDEgIBLPMMO0ideJpzZtg/M+cZSVZZX4nHjMpAsSP3Jd8fiNc8llgWdGzWxmjjhKLJY6WOlgVjY14knimKrplC/kPFY5b3HWqjXWuid/YbigLy9xndYwUljAIiSIUFBDBVXYiNOuk2IhQ+dJH/+Q65fIpZCrAkaOeWxAg+z6wf/g92ytYmLCSwonge4Xx/kYAUK7QLPuON/HjtM8AYLPwJXe9m80gOlP0uttLXYERLaBi+u2puwBlzvA4JMhm7IrBWkJxSLwfkbflAcGboHeVW9urXOcPgBZmlX6Bjg4BEZLlL3m8+6ezrn929Oa3w9e03KfJqsuOAAAAAlwSFlzAAAuIwAALiMBeKU/dgAAAAd0SU1FB+kBDA8PKt1W5MYAAAAZdEVYdENvbW1lbnQAQ3JlYXRlZCB3aXRoIEdJTVBXgQ4XAAAAFUlEQVQY02P8x+rFgBswMeAFI1UaAJ65AWFYB2G5AAAAAElFTkSuQmCC"
19 |             />
20 |          </a>
21 |     </span>
22 | </div>
23 | <p>
24 |     <spa>
25 |         Text Text Text Text Text Text Text Text Text Text
26 |     </spa>
27 | </p>
28 | </body>
29 | </html>


--------------------------------------------------------------------------------
/tests/samples/the-hurricane-rubin-carter-denzel-washington.html:
--------------------------------------------------------------------------------
   1 | 
   2 | <!DOCTYPE html>
   3 | <html id="js-context" class="js-off is-not-modern id--signed-out" data-page-path="/film/2014/apr/24/the-hurricane-rubin-carter-denzel-washington">
   4 | <head>
   5 | <meta charset="utf-8"/>
   6 | <title>The Hurricane: the facts of Rubin Carter's life story are beaten to a pulp | Film | The Guardian</title>
   7 | <meta http-equiv="X-UA-Compatible" content="IE=Edge"/>
   8 | <meta name="viewport" content="width=device-width, initial-scale=1.0"/>
   9 | <meta name="format-detection" content="telephone=no"/>
  10 | <meta name="HandheldFriendly" content="True"/>
  11 | <link rel="dns-prefetch" href="http://assets.guim.co.uk/"/>
  12 | <link rel="dns-prefetch" href="//i.guim.co.uk"/>
  13 | <link rel="dns-prefetch" href="http://api.nextgen.guardianapps.co.uk"/>
  14 | <link rel="dns-prefetch" href="http://hits.theguardian.com"/>
  15 | <link rel="dns-prefetch" href="//j.ophan.co.uk"/>
  16 | <link rel="dns-prefetch" href="//ophan.theguardian.com"/>
  17 | <link rel="dns-prefetch" href="//oas.theguardian.com"/>
  18 | <link rel="dns-prefetch" href="//beacon.guim.co.uk"/>
  19 | <link rel="shortcut icon" type="image/png" href="http://assets.guim.co.uk/images/favicons/79d7ab5a729562cebca9c6a13c324f0e/32x32.ico"/>
  20 | <link rel="apple-touch-icon" sizes="152x152" href="http://assets.guim.co.uk/images/favicons/451963ac2e23633472bf48e2856d3f04/152x152.png"/>
  21 | <link rel="apple-touch-icon" sizes="144x144" href="http://assets.guim.co.uk/images/favicons/1a3f98d8491f8cfdc224089b785da86b/144x144.png"/>
  22 | <link rel="apple-touch-icon" sizes="120x120" href="http://assets.guim.co.uk/images/favicons/cf23080600002e50f5869c72f5a904bd/120x120.png"/>
  23 | <link rel="apple-touch-icon" sizes="114x114" href="http://assets.guim.co.uk/images/favicons/f438f6041a4c1d0289e6debd112880c2/114x114.png"/>
  24 | <link rel="apple-touch-icon" sizes="72x72" href="http://assets.guim.co.uk/images/favicons/b5050517955e7cf1e493ccc53e64ca05/72x72.png"/>
  25 | <link rel="apple-touch-icon-precomposed" href="http://assets.guim.co.uk/images/favicons/4fd650035a2cebafea4e210990874c64/57x57.png"/>
  26 | <link rel="canonical" href="http://www.theguardian.com/film/2014/apr/24/the-hurricane-rubin-carter-denzel-washington"/>
  27 | <meta name="apple-mobile-web-app-title" content="Guardian"/>
  28 | <meta name="application-name" content="The Guardian"/>
  29 | <meta name="msapplication-TileColor" content="#005689"/>
  30 | <meta name="msapplication-TileImage" content="http://assets.guim.co.uk/images/favicons/f06f6996e193d1ddcd614ea852322d25/windows_tile_144_b.png"/>
  31 | <link rel="publisher" href="https://plus.google.com/113000071431138202574"/>
  32 | <meta name="author" content="Alex von Tunzelmann"/>
  33 | <meta name="description" content="Denzel Washington&#x27;s compelling performance gives Norman Jewison&#x27;s biopic punch – despite its many inaccuracies, writes Alex von Tunzelmann"/>
  34 | <meta name="thumbnail" content="//i.guim.co.uk/static/w-620/h--/q-95/sys-images/Arts/Arts_/Pictures/2014/4/23/1398270516995/The-Hurricane-007.jpg"/>
  35 | <meta name="keywords" content="Sport,Film,Denzel Washington"/>
  36 | <meta name="news_keywords" content="Sport,Film,Denzel Washington"/>
  37 | <meta property="og:url" content="http://www.theguardian.com/film/2014/apr/24/the-hurricane-rubin-carter-denzel-washington"/>
  38 | <meta property="article:author" content="http://www.theguardian.com/profile/alexvontunzelmann"/>
  39 | <meta property="og:description" content="Denzel Washington&#x27;s compelling performance gives Norman Jewison&#x27;s biopic punch – despite its many inaccuracies, writes Alex von Tunzelmann"/>
  40 | <meta property="og:image" content="http://static.guim.co.uk/sys-images/Arts/Arts_/Pictures/2014/4/23/1398270516995/The-Hurricane-007.jpg"/>
  41 | <meta property="article:publisher" content="https://www.facebook.com/theguardian"/>
  42 | <meta property="og:type" content="article"/>
  43 | <meta property="article:section" content="Film"/>
  44 | <meta property="article:published_time" content="2014-04-24T09:26:18.000Z"/>
  45 | <meta property="og:title" content="The Hurricane: the facts of Rubin Carter&#x27;s life story are beaten to a pulp"/>
  46 | <meta property="fb:app_id" content="180444840287"/>
  47 | <meta property="article:tag" content="Sport,Film,Denzel Washington"/>
  48 | <meta property="og:site_name" content="the Guardian"/>
  49 | <meta property="article:modified_time" content="2014-06-21T00:41:29.000+01:00"/>
  50 | <meta name="twitter:site" content="@guardian"/>
  51 | <meta name="twitter:app:name:iphone" content="The Guardian"/>
  52 | <meta name="twitter:app:id:iphone" content="409128287"/>
  53 | <meta name="twitter:app:name:googleplay" content="The Guardian"/>
  54 | <meta name="twitter:app:id:googleplay" content="com.guardian"/>
  55 | <meta name="twitter:app:url:googleplay" content="guardian://www.theguardian.com/film/2014/apr/24/the-hurricane-rubin-carter-denzel-washington"/>
  56 | <meta name="twitter:card" content="summary_large_image"/>
  57 | <meta name="google-site-verification" content="LR-FN6c2gIEUoo3k049w1nxyHykmac5ZE3SaUOiKc30"/>
  58 | <!-- scripts and CSS were cut -->
  59 | </head>
  60 | <body id="top" class="has-localnav" itemscope itemtype="http://schema.org/WebPage">
  61 | <div class="site-message js-site-message is-hidden" data-link-name="release message" role="dialog" aria-label="welcome" aria-describedby="site-message__message">
  62 | <div class="gs-container">
  63 | <div class="site-message__inner js-site-message-inner">
  64 | <div class="site-message__media">
  65 | <span class="inline-marque-36 inline-icon u-vertical-align-middle-icon">
  66 | <svg width="36" height="36" viewBox="0 0 36 36"><path d="M21.3 8.8c0-4.9-1.5-5.7-3.3-5.7-1.8 0-3.2.7-3.2 5.7s1.5 5.5 3.2 5.5c1.8-.1 3.3-.6 3.3-5.5m-6.5 18.8c-2.3 0-2.9 1.7-2.9 2.9 0 1.8 1.6 3.4 6.3 3.4 5.3 0 6.8-1.5 6.8-3.4 0-1.7-1.3-2.9-3.4-2.9h-6.8zM10.5 2.4C4.3 5.2 0 11.4 0 18.7c0 4.9 2 9.4 5.2 12.6V31c0-3.2 3.1-4.4 5.9-5-2.6-.6-3.9-2.5-3.9-4.4 0-2.6 2.9-4.8 4.3-5.8l-.2-.1c-2.5-1.4-4.1-3.8-4.1-7 0-2.7 1.2-4.9 3.3-6.3M36 18.8C36 11.4 31.5 5 25.1 2.3c2.1 1.4 3.4 3.5 3.5 6.3l.1.6c0 5.4-4.4 8.2-10.7 8.2-1.6 0-2.7-.1-4.1-.5-.6.4-1.1 1.1-1.1 1.8 0 .9.8 1.6 1.8 1.6h8.8c5.5 0 8.2 2.2 8.2 7.1 0 1.6-.3 3.1-1 4.3 3.3-3.4 5.4-7.9 5.4-12.9"/></svg>
  67 | </span>
  68 | </div>
  69 | <div class="site-message__copy js-site-message-copy u-cf">
  70 | </div>
  71 | <div class="site-message__close">
  72 | <button class="site-message__close-btn js-site-message-close" data-link-name="hide release message">
  73 | <span class="u-h">Close</span>
  74 | <span class="inline-close-icon-white-small inline-icon ">
  75 | <svg xmlns="http://www.w3.org/2000/svg" width="30" height="30"><path fill="#fff" d="M21 9.8l-.8-.8-5.2 4.8L9.8 9l-.8.8 4.8 5.2L9 20.2l.8.8 5.2-4.8 5.2 4.8.8-.8-4.8-5.2L21 9.8"/></svg>
  76 | </span>
  77 | </button>
  78 | </div>
  79 | </div>
  80 | </div>
  81 | </div>
  82 | <a class="u-h skip" href="#maincontent" data-link-name="skip : main content">Skip to main content</a>
  83 | <div class="sticky-nav-mt-test">
  84 | <div class="top-banner-ad-container top-banner-ad-container--desktop top-banner-ad-container--above-nav">
  85 | <div id="dfp-ad--top-above-nav" class="js-ad-slot ad-slot ad-slot--dfp ad-slot--top-above-nav ad-slot--top-banner-ad" data-link-name="ad slot top-above-nav" data-test-id="ad-slot-top-above-nav" data-name="top-above-nav" data-mobile="1,1|88,70|728,90" data-desktop="1,1|88,70|728,90|940,230|900,250|970,250"> </div>
  86 | </div>
  87 | <header id="header" class="l-header u-cf " role="banner" data-link-name="global navigation: header">
  88 | <div class="js-navigation-header navigation-container navigation-container--collapsed">
  89 | <div class="gs-container l-header__inner">
  90 | <div class="l-header-pre u-cf">
  91 | <div class="brand-bar">
  92 | <div class="brand-bar__item brand-bar__item--profile popup-container
  93 |                                     has-popup brand-bar__item--has-control js-profile-nav" data-component="identity-profile">
  94 | <a href="https://profile.theguardian.com/signin" data-link-name="User profile" data-toggle="popup--profile" class="brand-bar__item--action popup__toggle" data-test-id="sign-in-link" aria-haspopup="true">
  95 | <span class="inline-profile-36 inline-icon rounded-icon control__icon-wrapper">
  96 | <svg width="18" height="18"><path fill="#fff" d="M9 7.3c1.6 0 3.4-1.8 3.4-3.9S11.1 0 9 0 5.6 1.3 5.6 3.4s2 3.9 3.4 3.9zm5.9 3.4l-.9-.8c-1.7-.6-3.1-.9-5-.9s-3.3.3-5 .9l-.9.9L1 17.2l.9.8h14.3l.9-.9-2.2-6.4z"/></svg>
  97 | </span>
  98 | <span class="js-profile-info control__info" data-test-id="sign-in-name">sign in</span>
  99 | </a>
 100 | <div class="js-profile-popup"></div>
 101 | </div>
 102 | <div class="brand-bar__item has-popup brand-bar__item--has-control
 103 |                                 popup-container brand-bar__item--subscribe" data-component="subscribe">
 104 | <a href="http://subscribe.theguardian.com/?INTCMP=NGW_HEADER_UK_GU_SUBSCRIBE" class="brand-bar__item--action" data-link-name="common.editions.Uk$@51c63b2e : topNav : subscribe" class="brand-bar__item--action">
 105 | <span class="inline-marque-36 inline-icon rounded-icon control__icon-wrapper">
 106 | <svg width="36" height="36" viewBox="0 0 36 36"><path d="M21.3 8.8c0-4.9-1.5-5.7-3.3-5.7-1.8 0-3.2.7-3.2 5.7s1.5 5.5 3.2 5.5c1.8-.1 3.3-.6 3.3-5.5m-6.5 18.8c-2.3 0-2.9 1.7-2.9 2.9 0 1.8 1.6 3.4 6.3 3.4 5.3 0 6.8-1.5 6.8-3.4 0-1.7-1.3-2.9-3.4-2.9h-6.8zM10.5 2.4C4.3 5.2 0 11.4 0 18.7c0 4.9 2 9.4 5.2 12.6V31c0-3.2 3.1-4.4 5.9-5-2.6-.6-3.9-2.5-3.9-4.4 0-2.6 2.9-4.8 4.3-5.8l-.2-.1c-2.5-1.4-4.1-3.8-4.1-7 0-2.7 1.2-4.9 3.3-6.3M36 18.8C36 11.4 31.5 5 25.1 2.3c2.1 1.4 3.4 3.5 3.5 6.3l.1.6c0 5.4-4.4 8.2-10.7 8.2-1.6 0-2.7-.1-4.1-.5-.6.4-1.1 1.1-1.1 1.8 0 .9.8 1.6 1.8 1.6h8.8c5.5 0 8.2 2.2 8.2 7.1 0 1.6-.3 3.1-1 4.3 3.3-3.4 5.4-7.9 5.4-12.9"/></svg>
 107 | </span>
 108 | <span class="control__info">subscribe</span>
 109 | </a>
 110 | </div>
 111 | <div class="brand-bar__item has-popup popup-container brand-bar__item--has-control brand-bar__item--search" data-component="search">
 112 | <a href="https://www.google.co.uk/advanced_search?q=site:www.theguardian.com" data-is-ajax data-link-name="Search icon" class="brand-bar__item--action popup__toggle js-search-toggle" data-toggle="popup--search" aria-haspopup="true">
 113 | <span class="inline-search-36 inline-icon rounded-icon control__icon-wrapper">
 114 | <svg width="18" height="18" viewBox="0 0 18 18"><path d="M6.5 1.6c2.7 0 4.9 2.2 4.9 4.9s-2.2 4.9-4.9 4.9-4.9-2.2-4.9-4.9 2.2-4.9 4.9-4.9m0-1.6C2.9 0 0 2.9 0 6.5S2.9 13 6.5 13 13 10.1 13 6.5 10.1 0 6.5 0zm6.6 11.5l4.9 4.9-1.6 1.6-4.9-4.9v-.8l.8-.8h.8z"/></svg>
 115 | </span>
 116 | <span class="control__info">search</span>
 117 | </a>
 118 | </div>
 119 | <div class="brand-bar__item--right brand-bar__item--right--uk-edition" data-component="guardian-services">
 120 | <div class="brand-bar__item brand-bar__item--jobs">
 121 | <a class="brand-bar__item--action" data-link-name="uk : topNav : jobs" href="http://jobs.theguardian.com/?INTCMP=NGW_TOPNAV_UK_GU_JOBS">jobs</a>
 122 | </div>
 123 | <div class="brand-bar__item brand-bar__item--soulmates">
 124 | <a class="brand-bar__item--action" data-link-name="uk : topNav : soulmates" href="https://soulmates.theguardian.com/?INTCMP=NGW_TOPNAV_UK_GU_SOULMATES">dating</a>
 125 | </div>
 126 | <div class="brand-bar__item has-popup brand-bar__item--has-control brand-bar__item--more">
 127 | <a href="#guardian-services-top-menu" class="brand-bar__item--action popup__toggle" data-toggle="top-bar__popup--more" data-link-name="uk : topNav : more" aria-haspopup="true" aria-controls="guardian-services-top-menu">
 128 | <span class="rounded-icon control__icon-wrapper">
 129 | <!--[if (gt IE 8)&(IEMobile)]><!-->
 130 | <span class="inline-ellipsis-36 inline-icon ">
 131 | <svg xmlns="http://www.w3.org/2000/svg" width="24" height="18"><circle cx="3" cy="10" r="3" fill="#fff"/><circle cx="12" cy="10" r="3" fill="#fff"/><circle cx="21" cy="10" r="3" fill="#fff"/></svg>
 132 | </span>
 133 | <!--<![endif]-->
 134 | </span>
 135 | <span class="control__info" data-test-id="sign-in-name">more</span>
 136 | </a>
 137 | <div class="popup is-off top-bar__popup--more" id="guardian-services-top-menu">
 138 | <h3 class="popup__group-header">from the guardian:</h3>
 139 | <ul class="popup__group">
 140 | <li class="popup__item brand-bar__popup--jobs">
 141 | <a class="brand-bar__item--action" data-link-name="uk : topNav : jobs" href="http://jobs.theguardian.com/?INTCMP=NGW_TOPNAV_UK_GU_JOBS">jobs</a>
 142 | </li>
 143 | <li class="popup__item brand-bar__popup--soulmates">
 144 | <a class="brand-bar__item--action" data-link-name="uk : topNav : soulmates" href="https://soulmates.theguardian.com/?INTCMP=NGW_TOPNAV_UK_GU_SOULMATES">dating</a>
 145 | </li>
 146 | <li class="popup__item">
 147 | <a class="brand-bar__item--action" data-link-name="uk : topNav : masterclasses" href="http://www.theguardian.com/guardian-masterclasses?INTCMP=NGW_TOPNAV_UK_GU_MASTERCLASSES">masterclasses</a>
 148 | </li>
 149 | </ul>
 150 | <div class="brand-bar__popup--membership">
 151 | <h3 class="popup__group-header">join us:</h3>
 152 | <ul class="popup__group">
 153 | <li class="popup__item">
 154 | <a class="brand-bar__item--action" data-link-name="uk : topNav : membership" href="https://membership.theguardian.com/?INTCMP=NGW_TOPNAV_UK_GU_MEMBERSHIP">membership</a>
 155 | </li>
 156 | <li class="popup__item">
 157 | <a class="brand-bar__item--action" data-link-name="uk : topNav : subscribe" href="http://subscribe.theguardian.com/?INTCMP=NGW_TOPNAV_UK_GU_SUBSCRIBE">subscribe</a>
 158 | </li>
 159 | </ul>
 160 | </div>
 161 | <div class="brand-bar__popup--edition">
 162 | <h3 class="popup__group-header">change edition:</h3>
 163 | <ul class="popup__group">
 164 | <li class="popup__item">
 165 | <a class="brand-bar__item--action brand-bar__item--inline-action" data-edition="UK" data-link-name="switch to UK edition" href="http://www.theguardian.com/preference/edition/uk" title="Switch to the UK edition">
 166 | <span class="u-h">switch to the </span> UK <span class="u-h"> edition</span>
 167 | </a>
 168 | <a class="brand-bar__item--action brand-bar__item--inline-action" data-edition="US" data-link-name="switch to US edition" href="http://www.theguardian.com/preference/edition/us" title="Switch to the US edition">
 169 | <span class="u-h">switch to the </span> US <span class="u-h"> edition</span>
 170 | </a>
 171 | <a class="brand-bar__item--action brand-bar__item--inline-action" data-edition="AU" data-link-name="switch to AU edition" href="http://www.theguardian.com/preference/edition/au" title="Switch to the AU edition">
 172 | <span class="u-h">switch to the </span> AU <span class="u-h"> edition</span>
 173 | </a>
 174 | </li>
 175 | </ul>
 176 | </div>
 177 | </div>
 178 | </div>
 179 | <div class="brand-bar__item has-popup brand-bar__item--edition" data-component="edition">
 180 | <a href="#guardian-edition-menu" class="brand-bar__item--action brand-bar__item--action-beta popup__toggle" data-link-name="topNav : edition" data-toggle="top-bar__popup--edition" aria-haspopup="true" aria-controls="guardian-edition-menu">
 181 | <span class="brand-bar__edition-name">International</span>
 182 | <small class="brand-bar__beta">beta</small>
 183 | </a>
 184 | <ul class="popup popup__group is-off top-bar__popup--edition" id="guardian-edition-menu">
 185 | <li class="popup__item">
 186 | <a class="brand-bar__item--action" data-edition="UK" data-link-name="switch to UK edition" href="http://www.theguardian.com/preference/edition/uk" title="Switch to the UK edition">
 187 | <span class="u-h">switch to the </span>
 188 | <span class="brand-bar__edition-name">UK edition</span>
 189 | </a>
 190 | </li>
 191 | <li class="popup__item">
 192 | <a class="brand-bar__item--action" data-edition="US" data-link-name="switch to US edition" href="http://www.theguardian.com/preference/edition/us" title="Switch to the US edition">
 193 | <span class="u-h">switch to the </span>
 194 | <span class="brand-bar__edition-name">US edition</span>
 195 | </a>
 196 | </li>
 197 | <li class="popup__item">
 198 | <a class="brand-bar__item--action" data-edition="AU" data-link-name="switch to AU edition" href="http://www.theguardian.com/preference/edition/au" title="Switch to the AU edition">
 199 | <span class="u-h">switch to the </span>
 200 | <span class="brand-bar__edition-name">Australia edition</span>
 201 | </a>
 202 | </li>
 203 | </ul>
 204 | </div>
 205 | </div>
 206 | </div>
 207 | </div>
 208 | <div class="popup popup--search is-off"><div class="js-search-placeholder"></div></div>
 209 | <div class="l-header-main">
 210 | <a href="/international" data-link-name="site logo" id="logo" class="logo-wrapper" data-component="logo">
 211 | <span class="u-h">The Guardian</span>
 212 | <!--[if (gt IE 8)&(IEMobile)]><!-->
 213 | <span class="inline-guardian-logo-320 inline-logo ">
 214 | <svg width="320" height="60" viewBox="0 0 320 60"><path fill="#fff" d="M284 45h16v-3l-3-1.5v-20c1.2-.9 2.8-1.1 4.3-1.1 2.8 0 3.7.9 3.7 4.1v17l-3 1.5v3h16v-3l-3-1.5v-19c0-5.7-2.1-8.3-7.1-8.3-4.1 0-8.1 1.5-10.8 4V13h-1l-12.4 2.2v2.7l3.3 1.6v21l-3 1.5v3zM245.3.4c-3 0-5.4 2.4-5.4 5.5 0 3 2.4 5.4 5.4 5.4 2.9 0 5.4-2.4 5.4-5.4-.1-3.1-2.5-5.5-5.4-5.5zM237 15.1v2.8l3 1.6v20.9l-3 1.5V45h16v-3.1l-3-1.5V13.1h-1l-12 2zM223 39c-.7.6-1.7 1.1-3.2 1.1-4 0-5.9-3.3-5.9-10.9 0-8.7 2.4-11.6 5.6-11.6 1.8 0 2.8.6 3.5 1.4v20zm0-24.4c-1.2-.9-3.3-1.4-5-1.4-7.4 0-14.5 4.4-14.5 16.8 0 11.9 7.1 15.7 11.8 15.7 3.8 0 6.4-1.7 7.6-3.4h.3v3.3h.9l11.9-1.4v-2.3l-3-1.8V.6h-1l-12.6 2v2.8l3.6 1.5v7.7zM181 18l3 1.5v20.9l-3 1.5V45h17v-3.1l-4-1.5V24.1c1.8-1.4 4.1-1.9 6.8-1.9.9 0 1.6.2 2.2.3v-9c-.3-.1-.7-.2-1.2-.2-3.3 0-6 2.2-7.8 6.2V13h-1l-12 2v3zm-19.3-.8c3.9 0 5.3 2 5.3 5.9v3.5l-6.1 1.1c-5.9 1.1-10.4 2.9-10.4 9.3 0 5.1 3.5 8.7 8.3 8.7 3.8 0 7.4-1.7 8.7-4.4h.3c.5 3.3 3.3 4.4 6.4 4.4 2.4 0 4.8-.6 5.7-1.6v-2l-2.9-1.5v-18c0-7-5.2-9.4-13.3-9.4-5.3 0-8.6 1.3-11.4 2.6v7.8h4.7l2-6c1.1-.4 2.3-.4 2.7-.4zm2.3 22.9c-1.9 0-4-1.1-4-4.6 0-2.4 2.4-4.7 4.8-5l2.2-.5v8.5s-1.9 1.6-3 1.6zm100.7-22.9c3.9 0 5.3 2 5.3 5.9v3.5l-6.1 1.1c-5.9 1.1-10.4 2.9-10.4 9.3 0 5.1 3.5 8.7 8.3 8.7 3.8 0 7.4-1.7 8.7-4.4h.3c.5 3.3 3.3 4.4 6.4 4.4 2.4 0 4.8-.6 5.7-1.6v-2l-2.9-1.5v-18c0-7-5.2-9.4-13.3-9.4-5.3 0-8.6 1.3-11.4 2.6v7.8h4.7l2-6c1.1-.4 2.3-.4 2.7-.4zm2.3 22.9c-1.9 0-4-1.1-4-4.6 0-2.4 2.4-4.7 4.8-5l2.2-.5v8.5s-1.9 1.6-3 1.6zm-138.7 5.6c.4 0 .9 0 1.3-.1 3.5-.3 6.7-2 8.4-4.2v4.1l12-1.5v-2l-3-2V13h-1l-12 2.3V18l4 1.7V38c-1.1.8-2.4 1.3-4.2 1.3-2.5 0-4.8-.8-4.8-4.3V13h-1l-12 2.5v2.6l4 1.6V36c0 5.4 2.2 9.7 8.3 9.7zM96 38c-1.2 0-2.5-.8-2.5-1.9 0-.8.6-1.7 1.4-2.3 1.6.5 3 .6 5 .6 7.8 0 13.2-3.7 13.2-10.4 0-3-1.3-4.6-3.2-6.4L115 19v-6l-8.2 1.6c-1.9-.7-4.5-1.6-7-1.6-7.8 0-13.2 4.1-13.2 10.8 0 4.1 2 7.1 5 8.8l.2.2c-1.7 1.2-5.3 4-5.3 7.2 0 2.4 1.5 4.8 4.8 5.5-3.4.8-7.3 2.5-7.3 6.5 0 4.1 5.9 8 15.5 8 11.8 0 16.5-5.7 16.5-13 0-6.1-2.8-9-9.5-9H96zm7.5-14c0 5.7-1.3 6.5-3.5 6.5s-4-.8-4-6.5c0-5.8 1.8-7.5 4-7.5s3.5 2 3.5 7.5zM92 50.9c.1-1.5 1.1-3.4 3.7-3.6h8.6c2.5 0 3.7 2 3.7 3.6 0 3.2-2 4.4-8.3 4.4-5.5 0-7.8-2.2-7.7-4.4z"/><path fill="#AAD8F1" d="M83 30c0-13-5.1-16.9-13-16.9-9 0-15 6.2-15 16.4 0 10.5 5.5 16.2 15.8 16.2 5.6 0 9.7-2.7 11.2-4.7v-3c-2.1.7-3.9 1.2-7.7 1.2-5.6 0-9.3-3.2-9.3-9.2h18zM69.9 16.6c2.5 0 3.8 1.8 3.8 9.6l-8.4.7c.1-7.9 1.8-10.3 4.6-10.3zM37 45v-3l-3-1.5V21c1.2-.9 3.2-1.7 4.8-1.7 2.8 0 4.3 1.6 4.2 4.2v17L40 42v3h16v-3l-3-1.5v-19c0-5.7-3.3-8.3-7.7-8.3-4.1 0-8.6 1.3-11.3 3.8V0h-1L21 2v3l4 1.5v34L22 42v3h15zM4 36.4c0 5.7 2.8 9.3 8.9 9.3 3.1 0 6.2-.8 8.1-2.3v-3.8c-.8.3-1.9.5-2.9.5-2.9 0-4.1-1.6-4.1-4.6V19h7v-5h-7V6.5L4 8v6l-4 1v4h4v17.4z"/></svg>
 215 | </span>
 216 | <!--<![endif]-->
 217 | <!--[if (lt IE 9)&(!IEMobile)]>
 218 |                                 <span class="inline-logo inline-guardian-logo-320"></span>
 219 |                             <![endif]-->
 220 | <span class="logo__tagline hide-on-mobile">Winner of the Pulitzer prize 2014</span>
 221 | </a>
 222 | </div>
 223 | </div>
 224 | <div class="navigation navigation--has-local-navigation navigation--has-signposting">
 225 | <div class="gs-container">
 226 | <div class="navigation__inner">
 227 | <div class="navigation__scroll">
 228 | <nav class="navigation__container navigation__container--first" data-component="nav" role="navigation" aria-label="Current section">
 229 | <ul class="signposting">
 230 | <li class="signposting__item signposting__item--home">
 231 | <a class="signposting__action" href="/international" data-link-name="nav : signposting : home">home</a>
 232 | </li>
 233 | <li class="signposting__item signposting__item--parent">
 234 | <span class="signposting__separator" aria-hidden="true"><span class="signposting__separator__inner">›</span></span>
 235 | <a class="signposting__action" href="http://www.theguardian.com/uk/culture" data-link-name="nav : signposting : culture">culture</a>
 236 | </li>
 237 | <li class="signposting__item signposting__item--current">
 238 | <span class="signposting__separator"><span class="signposting__separator__inner">›</span></span>
 239 | <a class="signposting__action" href="http://www.theguardian.com/film" data-link-name="nav : signposting : culture &gt; film">film</a>
 240 | </li>
 241 | </ul>
 242 | <ul class="local-navigation">
 243 | <li class="local-navigation__item">
 244 | <a class="local-navigation__action" href="http://www.theguardian.com/tv-and-radio" data-link-name="nav : secondary : tv &amp; radio">tv & radio</a>
 245 | </li>
 246 | <li class="local-navigation__item">
 247 | <a class="local-navigation__action" href="http://www.theguardian.com/music" data-link-name="nav : secondary : music">music</a>
 248 | </li>
 249 | <li class="local-navigation__item">
 250 | <a class="local-navigation__action" href="http://www.theguardian.com/technology/games" data-link-name="nav : secondary : games">games</a>
 251 | </li>
 252 | <li class="local-navigation__item">
 253 | <a class="local-navigation__action" href="http://www.theguardian.com/books" data-link-name="nav : secondary : books">books</a>
 254 | </li>
 255 | <li class="local-navigation__item">
 256 | <a class="local-navigation__action" href="http://www.theguardian.com/artanddesign" data-link-name="nav : secondary : art &amp; design">art & design</a>
 257 | </li>
 258 | <li class="local-navigation__item">
 259 | <a class="local-navigation__action" href="http://www.theguardian.com/stage" data-link-name="nav : secondary : stage">stage</a>
 260 | </li>
 261 | <li class="local-navigation__item">
 262 | <a class="local-navigation__action" href="http://www.theguardian.com/music/classicalmusicandopera" data-link-name="nav : secondary : classical">classical</a>
 263 | </li>
 264 | </ul>
 265 | </nav>
 266 | <nav class="navigation__container navigation__container--second" data-component="nav" role="navigation" aria-label="Guardian sections">
 267 | <ul class="top-navigation js-top-navigation">
 268 | <li class="top-navigation__item top-navigation__item--home">
 269 | <a href="/international" class="top-navigation__action top-navigation__action--has-icon" data-link-name="nav : primary : home" title="Back to homepage">
 270 | <span class="top-navigation__icon-wrapper">
 271 | <span class="top-navigation__icon top-navigation__icon--home "></span>
 272 | </span>
 273 | <span class="u-h">home</span>
 274 | </a>
 275 | </li>
 276 | <li class="top-navigation__item">
 277 | <a class="top-navigation__action" href="http://www.theguardian.com/uk-news" data-link-name="nav : primary : UK">
 278 | UK
 279 | </a>
 280 | </li>
 281 | <li class="top-navigation__item">
 282 | <a class="top-navigation__action" href="http://www.theguardian.com/politics/general-election-2015" data-link-name="nav : primary : election">
 283 | election
 284 | </a>
 285 | </li>
 286 | <li class="top-navigation__item">
 287 | <a class="top-navigation__action" href="http://www.theguardian.com/world" data-link-name="nav : primary : world">
 288 | world
 289 | </a>
 290 | </li>
 291 | <li class="top-navigation__item">
 292 | <a class="top-navigation__action" href="http://www.theguardian.com/uk/sport" data-link-name="nav : primary : sport">
 293 | sport
 294 | </a>
 295 | </li>
 296 | <li class="top-navigation__item">
 297 | <a class="top-navigation__action" href="http://www.theguardian.com/football" data-link-name="nav : primary : football">
 298 | football
 299 | </a>
 300 | </li>
 301 | <li class="top-navigation__item">
 302 | <a class="top-navigation__action" href="http://www.theguardian.com/uk/commentisfree" data-link-name="nav : primary : opinion">
 303 | opinion
 304 | </a>
 305 | </li>
 306 | <li class="top-navigation__item top-navigation__item--current">
 307 | <a class="top-navigation__action" href="http://www.theguardian.com/uk/culture" data-link-name="nav : primary : culture">
 308 | culture
 309 | <span class="u-h">selected</span>
 310 | </a>
 311 | </li>
 312 | <li class="top-navigation__item">
 313 | <a class="top-navigation__action" href="http://www.theguardian.com/uk/business" data-link-name="nav : primary : business">
 314 | business
 315 | </a>
 316 | </li>
 317 | <li class="top-navigation__item">
 318 | <a class="top-navigation__action" href="http://www.theguardian.com/lifeandstyle" data-link-name="nav : primary : lifestyle">
 319 | lifestyle
 320 | </a>
 321 | </li>
 322 | <li class="top-navigation__item">
 323 | <a class="top-navigation__action" href="http://www.theguardian.com/fashion" data-link-name="nav : primary : fashion">
 324 | fashion
 325 | </a>
 326 | </li>
 327 | <li class="top-navigation__item">
 328 | <a class="top-navigation__action" href="http://www.theguardian.com/uk/environment" data-link-name="nav : primary : environment">
 329 | environment
 330 | </a>
 331 | </li>
 332 | <li class="top-navigation__item">
 333 | <a class="top-navigation__action" href="http://www.theguardian.com/uk/technology" data-link-name="nav : primary : tech">
 334 | tech
 335 | </a>
 336 | </li>
 337 | <li class="top-navigation__item">
 338 | <a class="top-navigation__action" href="http://www.theguardian.com/travel" data-link-name="nav : primary : travel">
 339 | travel
 340 | </a>
 341 | </li>
 342 | </ul>
 343 | </nav>
 344 | </div>
 345 | <a class="navigation-toggle js-navigation-toggle" href="#footer-nav" data-link-name="nav : allSections" data-target-nav="js-navigation-header">
 346 | <i class="burger-icon"></i><span class="navigation-toggle-label navigation-toggle-label--open" aria-haspopup="true" aria-controls="all-sections-popup" aria-label="browse all sections"><span class="navigation-toggle-label__extra navigation-toggle-label__extra--browse">browse </span>all<span class="navigation-toggle-label__extra"> sections</span></span>
 347 | <span class="navigation-toggle-label navigation-toggle-label--close" aria-label="close all sections">close</span>
 348 | </a>
 349 | </div>
 350 | <div id="all-sections-popup" class="navigation__expandable js-mega-nav-placeholder" data-component="all-nav"></div>
 351 | </div>
 352 | </div>
 353 | </div>
 354 | </header>
 355 | </div>
 356 | <div class="top-banner-below-nav-mt-test"></div>
 357 | <div class="top-banner-ad-container top-banner-ad-container--mobile">
 358 | <div id="dfp-ad--top" class="js-ad-slot ad-slot ad-slot--dfp ad-slot--top ad-slot--top-banner-ad" data-link-name="ad slot top" data-test-id="ad-slot-top" data-name="top" data-label="false" data-mobile="1,1|300,50|320,50|88,70"> </div>
 359 | </div>
 360 | <div id="maincontent" tabindex="0"></div>
 361 | <div class="js-breaking-news-placeholder breaking-news breaking-news--hidden breaking-news--fade-in" data-link-name="breaking news" data-component="breaking-news"></div>
 362 | <div class="l-side-margins">
 363 | <article id="article" data-test-id="article-root" class="content content--article tonal tonal--tone-news section-film
 364 |         
 365 |         " itemscope itemtype="http://schema.org/NewsArticle" role="main">
 366 | <header class="content__head tonal__head tonal__head--tone-news
 367 |     ">
 368 | <div class="content__header tonal__header u-cf">
 369 | <div class="gs-container">
 370 | <div class="content__main-column">
 371 | <div class="content__labels">
 372 | <div class="content__section-label">
 373 | <a class="content__section-label__link" data-link-name="article section" href="http://www.theguardian.com/film/sport">Sport</a>
 374 | </div>
 375 | <div class="content__series-label ">
 376 | <a class="content__series-label__link" href="http://www.theguardian.com/film/series/reelhistory">Reel history</a>
 377 | </div>
 378 | </div>
 379 | <h1 class="content__headline js-score" itemprop="headline">
 380 | The Hurricane: the facts of Rubin Carter's life story are beaten to a pulp
 381 | </h1>
 382 | </div>
 383 | </div>
 384 | </div>
 385 | <div class="tonal__standfirst u-cf">
 386 | <div class="gs-container">
 387 | <div class="content__main-column">
 388 | <div class="content__standfirst" data-link-name="standfirst" data-component="standfirst">
 389 | <meta itemprop="description" content="Denzel Washington&#x27;s compelling performance gives Norman Jewison&#x27;s biopic punch – despite its many inaccuracies, writes Alex von Tunzelmann"/>
 390 | Denzel Washington's compelling performance gives Norman Jewison's biopic punch – despite its many inaccuracies
 391 | </div>
 392 | </div>
 393 | </div>
 394 | </div>
 395 | </header>
 396 | <div class="content__main tonal__main tonal__main--tone-news">
 397 | <div class="gs-container">
 398 | <div class="content__main-column content__main-column--article js-content-main-column ">
 399 | <div class="js-football-tabs football-tabs content__mobile-full-width"></div>
 400 | <figure itemprop="associatedMedia image" itemscope itemtype="http://schema.org/ImageObject" data-component="image" class="media-primary media-content">
 401 | <a href="/film/2014/apr/24/the-hurricane-rubin-carter-denzel-washington#img-1" class="article__img-container js-gallerythumbs" data-link-name="Launch Article Lightbox" data-is-ajax>
 402 | <div class="">
 403 | <img src="//i.guim.co.uk/static/w-300/h--/q-95/sys-images/Arts/Arts_/Pictures/2014/4/23/1398270512680/The-Hurricane-004.jpg" srcset="//i.guim.co.uk/static/w-620/h--/q-95/sys-images/Arts/Arts_/Pictures/2014/4/23/1398270516995/The-Hurricane-007.jpg 620w, //i.guim.co.uk/static/w-700/h--/q-95/sys-images/Arts/Arts_/Pictures/2014/4/23/1398270518562/The-Hurricane-008.jpg 700w, //i.guim.co.uk/static/w-645/h--/q-95/sys-images/Arts/Arts_/Pictures/2014/4/23/1398270518562/The-Hurricane-008.jpg 645w, //i.guim.co.uk/static/w-465/h--/q-95/sys-images/Arts/Arts_/Pictures/2014/4/23/1398266760181/The-Hurricane-Boxing-Matc-002.jpg 465w" sizes="(min-width: 980px) 620px, (min-width: 740px) 700px, (min-width: 660px) 620px, (min-width: 480px) 645px, 465px" class="maxed responsive-img" itemprop="contentUrl representativeOfPage" alt="The Hurricane"/>
 404 | <span class="rounded-icon article__fullscreen"><i class="i i-expand-white"></i><i class="i i-expand-black"></i></span>
 405 | </div>
 406 | </a>
 407 | <figcaption class="caption caption--main caption--img" itemprop="description">
 408 | Packing punch … Denzel Washington in The Hurricane Photograph: Allstar/Cinetext Collection/Sportsphoto/Allstar/Cinetext Collection
 409 | </figcaption>
 410 | </figure>
 411 | <div class="content__meta-container js-football-meta u-cf
 412 |     
 413 |     
 414 |     
 415 |     
 416 |     
 417 |     
 418 |     ">
 419 | <p class="byline" data-link-name="byline" data-component="meta-byline"><span itemscope="" itemtype="http://schema.org/Person" itemprop="author">
 420 | <a rel="author" class="tone-colour" itemprop="url name" data-link-name="auto tag link" href="http://www.theguardian.com/profile/alexvontunzelmann">Alex von Tunzelmann</a></span></p>
 421 | <p class="content__dateline" aria-hidden="true">
 422 | <time itemprop="datePublished" datetime='2014-04-24T10:26:18+0100' data-timestamp="1398331578000" class="content__dateline-wpd js-wpd">
 423 | Thursday 24 April 2014 <span class="content__dateline-time">10.26 BST</span>
 424 | </time>
 425 | <time itemprop="dateModified" datetime='2014-06-21T00:41:29+0100' data-timestamp="1403307689000" class="content__dateline-lm js-lm u-h">
 426 | Last modified on Saturday 21 June 2014 <span class="content__dateline-time">00.41 BST</span>
 427 | </time>
 428 | </p>
 429 | <div class="meta__extras">
 430 | <div class="meta__social" data-component="share">
 431 | <ul class="social social--top u-unstyled u-cf" data-component="social">
 432 | <li class="social__item social__item--facebook" data-link-name="facebook">
 433 | <a class="social__action social-icon-wrapper" data-link-name="social top" href="https://www.facebook.com/sharer/sharer.php?u=http%3A%2F%2Fgu.com%2Fp%2F3zjk4%2Fsfb&amp;ref=responsive" target="_blank" title="Facebook">
 434 | <span class="u-h">Share on Facebook</span>
 435 | <span class="rounded-icon social-icon social-icon--facebook">
 436 | <i class="i-share-facebook--white i"></i>
 437 | </span>
 438 | </a>
 439 | </li>
 440 | <li class="social__item social__item--twitter" data-link-name="twitter">
 441 | <a class="social__action social-icon-wrapper" data-link-name="social top" href="https://twitter.com/intent/tweet?text=The+Hurricane%3A+the+facts+of+Rubin+Carter%27s+life+story+are+beaten+to+a+pulp&amp;url=http%3A%2F%2Fgu.com%2Fp%2F3zjk4%2Fstw" target="_blank" title="Twitter">
 442 | <span class="u-h">Share on Twitter</span>
 443 | <span class="rounded-icon social-icon social-icon--twitter">
 444 | <i class="i-share-twitter--white i"></i>
 445 | </span>
 446 | </a>
 447 | </li>
 448 | <li class="social__item social__item--email" data-link-name="email">
 449 | <a class="social__action social-icon-wrapper" data-link-name="social top" href="mailto:?subject=The%20Hurricane%3A%20the%20facts%20of%20Rubin%20Carter&#x27;s%20life%20story%20are%20beaten%20to%20a%20pulp&amp;body=http%3A%2F%2Fgu.com%2Fp%2F3zjk4%2Fsbl" target="_blank" title="Email">
 450 | <span class="u-h">Share via Email</span>
 451 | <span class="rounded-icon social-icon social-icon--email">
 452 | <i class="i-share-email--white i"></i>
 453 | </span>
 454 | </a>
 455 | </li>
 456 | <li class="social__item social__item--linkedin" data-link-name="linkedin">
 457 | <a class="social__action social-icon-wrapper" data-link-name="social top" href="http://www.linkedin.com/shareArticle?mini=true&amp;title=The+Hurricane%3A+the+facts+of+Rubin+Carter%27s+life+story+are+beaten+to+a+pulp&amp;url=http%3A%2F%2Fgu.com%2Fp%2F3zjk4" target="_blank" title="LinkedIn">
 458 | <span class="u-h">Share on LinkedIn</span>
 459 | <span class="rounded-icon social-icon social-icon--linkedin">
 460 | <i class="i-share-linkedin--white i"></i>
 461 | </span>
 462 | </a>
 463 | </li>
 464 | <li class="social__item social__item--gplus" data-link-name="gplus">
 465 | <a class="social__action social-icon-wrapper" data-link-name="social top" href="https://plus.google.com/share?url=http%3A%2F%2Fgu.com%2Fp%2F3zjk4%2Fsgp&amp;amp;hl=en-GB&amp;amp;wwc=1" target="_blank" title="Google plus">
 466 | <span class="u-h">Share on Google+</span>
 467 | <span class="rounded-icon social-icon social-icon--gplus">
 468 | <i class="i-share-gplus--white i"></i>
 469 | </span>
 470 | </a>
 471 | </li>
 472 | <li class="social__item social__item--whatsapp" data-link-name="whatsapp">
 473 | <a class="social__action social-icon-wrapper" data-link-name="social top" href="whatsapp://send?text=%22The%20Hurricane%3A%20the%20facts%20of%20Rubin%20Carter&#x27;s%20life%20story%20are%20beaten%20to%20a%20pulp%22%20http%3A%2F%2Fgu.com%2Fp%2F3zjk4%2Fswa" target="_blank" title="WhatsApp">
 474 | <span class="u-h">Share on WhatsApp</span>
 475 | <span class="rounded-icon social-icon social-icon--whatsapp">
 476 | <i class="i-share-whatsapp--white i"></i>
 477 | </span>
 478 | </a>
 479 | </li>
 480 | </ul>
 481 | </div>
 482 | <div class="meta__numbers modern-visible">
 483 | <div class="u-h meta__number js-sharecount">
 484 | </div>
 485 | <div class="u-h meta__number" data-discussion-id="/p/3zjk4" data-commentcount-format="content" data-discussion-closed="false">
 486 | </div>
 487 | </div>
 488 | <div class="meta__save-for-later" data-link-name="meta-save-for-later" data-component="meta-save-for-later">
 489 | </div>
 490 | </div>
 491 | </div>
 492 | <div class="content__article-body from-content-api js-article__body" itemprop="articleBody" data-test-id="article-review-body">
 493 | <p><a href="http://www.theguardian.com/film/movie/81975/hurricane" title="" data-link-name="in body link" data-component="in-body-link" class=" u-underline">The Hurricane</a> (1999)<br/>Director: Norman Jewison<br/>Entertainment grade: B<br/>History grade: D–</p>
 494 | <p><a href="http://www.theguardian.com/sport/2014/apr/20/rubin-hurricane-carter-boxer-dies-76" title="" data-link-name="in body link" data-component="in-body-link" class=" u-underline">Rubin &quot;Hurricane&quot; Carter</a>, who died this week, was a boxer in the United States. He was convicted of a 1966 triple homicide in two trials and became a cause celebre, inspiring Bob Dylan's song <a href="http://www.openculture.com/2014/04/bob-dylan-plays-first-live-performance-of-hurricane.html" title="" data-link-name="in body link" data-component="in-body-link" class=" u-underline">Hurricane</a>. The convictions were set aside by a federal court in 1985, on the grounds that they had been &quot;predicated upon an appeal to racism rather than reason&quot;.</p>
 495 | <h2>Fictionalisation</h2>
 496 | <body>
 497 | <figure itemprop="associatedMedia image" itemscope="" itemtype="http://schema.org/ImageObject" class="element element-image img--landscape">
 498 | <img srcset="//i.guim.co.uk/static/w-620/h--/q-95/sys-images/Arts/Arts_/Pictures/2014/4/23/1398267192388/The-Hurricaine-prison-011.jpg 620w, //i.guim.co.uk/static/w-605/h--/q-95/sys-images/Arts/Arts_/Pictures/2014/4/23/1398267192388/The-Hurricaine-prison-011.jpg 605w, //i.guim.co.uk/static/w-445/h--/q-95/sys-images/Arts/Arts_/Pictures/2014/4/23/1398267192388/The-Hurricaine-prison-011.jpg 445w" sizes="(min-width: 660px) 620px, (min-width: 480px) 605px, 445px" src="//i.guim.co.uk/static/w-300/h--/q-95/sys-images/Arts/Arts_/Pictures/2014/4/23/1398267192388/The-Hurricaine-prison-011.jpg" alt="The Hurricaine prison" class="gu-image" itemprop="contentUrl"/>
 499 | <figcaption itemprop="description" class="caption caption--img caption caption--img">
 500 | High points … the film's most gripping scenes show Carter's time in prison. Allstar/UNIVERSAL PICTURES/Sportsphoto Ltd./Allstar
 501 | </figcaption>
 502 | </figure>
 503 | </body>
 504 | <p>A title card before the film admits that some characters have been composited or invented, and some incidents fictionalised. That's fair enough, of course – though viewers would do well to keep the disclaimer at the front of their minds throughout. The film's narrative skips back and forth, from Carter (Denzel Washington) protesting in prison in 1973, back to a boxing match in the 1960s (filmed in black and white, with a nod to <a href="http://[http://www.theguardian.com/film/filmblog/2013/may/09/raging-bull-reel-history-martin-scorsese" title="" data-link-name="in body link" data-component="in-body-link" class=" u-underline">Raging Bull</a>), forward to the night of the triple homicide at the Lafayette Bar &amp; Grill in Paterson, New Jersey, in 1966.</p>
 505 | <h2>Record</h2>
 506 | <p>Having established the crime, the film delves into Carter's youth. It is true that he ran away from a juvenile detention centre and joined the army, but in The Hurricane he appears to emerge from it with full honours. In real life, he underwent four court martials for various behavioural and discipline offences and was eventually discharged as &quot;unfit for military service&quot;. He was afterwards convicted of three muggings. Perhaps the film-makers felt that this background made Carter an unsympathetic character – but, in real life, the fact that Carter didn't get on with army authority and had a criminal record was part of his story. Nothing in his background made it any more acceptable that he was wrongfully convicted of three murders.</p>
 507 | <h2>Sport</h2>
 508 | <p>As an alternative narrative, the film chooses to establish Carter's alienation as a black man through a middleweight title fight in 1964. On screen, Carter clearly wins over defender Joey Giardello – but the white judges award the title to the white Giardello anyway. It's one of those incidents that the flimsy opening disclaimer is presumably supposed to cover. In real life, Carter boxed well for the first five rounds, but Giardello took control as the match went on and was awarded a unanimous victory by the judges. <a href="http://nypost.com/2014/04/20/boxer-rubin-hurricane-carter-dies-at-76/" title="" data-link-name="in body link" data-component="in-body-link" class=" u-underline">Carter agreed</a> that Giardello deserved his victory. So upset was Giardello by this inaccurate portrayal that he launched a <a href="http://www.theguardian.com/film/2000/feb/23/news1" title="" data-link-name="in body link" data-component="in-body-link" class=" u-underline">lawsuit</a> against the makers of the film. Reportedly, they settled out of court for an undisclosed sum.</p>
 509 | <h2>Romance</h2>
 510 | <body>
 511 | <figure itemprop="associatedMedia image" itemscope="" itemtype="http://schema.org/ImageObject" class="element element-image img--landscape">
 512 | <img srcset="//i.guim.co.uk/static/w-620/h--/q-95/sys-images/Arts/Arts_/Pictures/2014/4/23/1398269449711/THE-HURRICANE-006.jpg 620w, //i.guim.co.uk/static/w-605/h--/q-95/sys-images/Arts/Arts_/Pictures/2014/4/23/1398269449711/THE-HURRICANE-006.jpg 605w, //i.guim.co.uk/static/w-445/h--/q-95/sys-images/Arts/Arts_/Pictures/2014/4/23/1398269449711/THE-HURRICANE-006.jpg 445w" sizes="(min-width: 660px) 620px, (min-width: 480px) 605px, 445px" src="//i.guim.co.uk/static/w-300/h--/q-95/sys-images/Arts/Arts_/Pictures/2014/4/23/1398269449711/THE-HURRICANE-006.jpg" alt="THE HURRICANE" class="gu-image" itemprop="contentUrl"/>
 513 | <figcaption itemprop="description" class="caption caption--img caption caption--img">
 514 | Loyal in love … Debbi Morgan as Carter's wife, Mae Thelma. Allstar/UNIVERSAL PICTURES/Sportsphoto Ltd./Allstar
 515 | </figcaption>
 516 | </figure>
 517 | </body>
 518 | <p>The most gripping parts of The Hurricane show Carter's time in prison. He decides he must give up wanting things, in order that his jailers cannot take anything away from him. At the height of his self-denial, his loyal, adoring wife Mae Thelma visits him. &quot;I want you to divorce me,&quot; he says. &quot;I'm dead. Just bury me. Please.&quot; It's a beautifully acted and affecting scene – but the truth was not quite so noble. <a href="http://www.theguardian.com/world/2014/apr/21/rubin-hurricane-carter" title="" data-link-name="in body link" data-component="in-body-link" class=" u-underline">Thelma divorced Carter</a> on the grounds of his repeated infidelities with supporters.</p>
 519 | <h2>Justice</h2>
 520 | <body>
 521 | <figure itemprop="associatedMedia image" itemscope="" itemtype="http://schema.org/ImageObject" class="element element-image img--landscape">
 522 | <img srcset="//i.guim.co.uk/static/w-620/h--/q-95/sys-images/Arts/Arts_/Pictures/2014/4/23/1398268795613/THE-HURRICANE-VICELLOUS-R-006.jpg 620w, //i.guim.co.uk/static/w-605/h--/q-95/sys-images/Arts/Arts_/Pictures/2014/4/23/1398268795613/THE-HURRICANE-VICELLOUS-R-006.jpg 605w, //i.guim.co.uk/static/w-445/h--/q-95/sys-images/Arts/Arts_/Pictures/2014/4/23/1398268795613/THE-HURRICANE-VICELLOUS-R-006.jpg 445w" sizes="(min-width: 660px) 620px, (min-width: 480px) 605px, 445px" src="//i.guim.co.uk/static/w-300/h--/q-95/sys-images/Arts/Arts_/Pictures/2014/4/23/1398268795613/THE-HURRICANE-VICELLOUS-R-006.jpg" alt="THE HURRICANE VICELLOUS REON SHANNON" class="gu-image" itemprop="contentUrl"/>
 523 | <figcaption itemprop="description" class="caption caption--img caption caption--img">
 524 | A new chapter … Vicellous Reon Shannon (right) plays Lesra Martin, who is enchanted by Carter's autobiography Allstar/UNIVERSAL PICTURES/Sportsphoto Ltd./Allstar
 525 | </figcaption>
 526 | </figure>
 527 | </body>
 528 | <p> In Toronto in the 1980s, young Lesra Martin (Vicellous Reon Shannon) buys Carter's autobiography in a second-hand book sale and is enchanted. Martin, a black boy from Brooklyn, lives in a commune of Canadians who seem to be harmless, though even the film's best efforts can't prevent them from seeming a bit weird. The commune is run by three well-meaning white liberals, who set out to free Carter. The screenplay can't decide whether they're heroes or idiots, and makes a right old mess of the facts of the case while it tries to work that out. Fortunately, Washington's performance is so powerful, nuanced and intensely compelling that it carries the film to the finish line, making it watchable despite the growing heaps of inaccuracies. He lost the Oscar in 2000 to Kevin Spacey for American Beauty, but deservedly won a Golden Globe.</p>
 529 | <h2>Verdict</h2>
 530 | <p>The Hurricane goes 15 rounds with history and beats it to a pulp.</p>
 531 | </div>
 532 | <div class="submeta">
 533 | <hr/>
 534 | <div data-link-name="keywords" data-component="keywords">
 535 | <h2 class="submeta__head">Topics</h2>
 536 | <ul class="keyword-list inline-list">
 537 | <li class="inline-list__item ">
 538 | <a class="  button button--small button--tag button--secondary" href="http://www.theguardian.com/film/sport" data-link-name="keyword: film/sport" itemprop="keywords">
 539 | Sport
 540 | </a>
 541 | </li>
 542 | <li class="inline-list__item ">
 543 | <a class="  button button--small button--tag button--secondary" href="http://www.theguardian.com/film/denzelwashington" data-link-name="keyword: film/denzelwashington" itemprop="keywords">
 544 | Denzel Washington
 545 | </a>
 546 | </li>
 547 | </ul>
 548 | </div>
 549 | <hr/>
 550 | <div data-component="share">
 551 | <ul class="social social--bottom u-unstyled u-cf" data-component="social">
 552 | <li class="social__item social__item--facebook" data-link-name="facebook">
 553 | <a class="social__action social-icon-wrapper" data-link-name="social bottom" href="https://www.facebook.com/sharer/sharer.php?u=http%3A%2F%2Fgu.com%2Fp%2F3zjk4%2Fsfb&amp;ref=responsive" target="_blank" title="Facebook">
 554 | <span class="u-h">Share on Facebook</span>
 555 | <span class="rounded-icon social-icon social-icon--facebook">
 556 | <i class="i-share-facebook--white i"></i>
 557 | </span>
 558 | </a>
 559 | </li>
 560 | <li class="social__item social__item--twitter" data-link-name="twitter">
 561 | <a class="social__action social-icon-wrapper" data-link-name="social bottom" href="https://twitter.com/intent/tweet?text=The+Hurricane%3A+the+facts+of+Rubin+Carter%27s+life+story+are+beaten+to+a+pulp&amp;url=http%3A%2F%2Fgu.com%2Fp%2F3zjk4%2Fstw" target="_blank" title="Twitter">
 562 | <span class="u-h">Share on Twitter</span>
 563 | <span class="rounded-icon social-icon social-icon--twitter">
 564 | <i class="i-share-twitter--white i"></i>
 565 | </span>
 566 | </a>
 567 | </li>
 568 | <li class="social__item social__item--email" data-link-name="email">
 569 | <a class="social__action social-icon-wrapper" data-link-name="social bottom" href="mailto:?subject=The%20Hurricane%3A%20the%20facts%20of%20Rubin%20Carter&#x27;s%20life%20story%20are%20beaten%20to%20a%20pulp&amp;body=http%3A%2F%2Fgu.com%2Fp%2F3zjk4%2Fsbl" target="_blank" title="Email">
 570 | <span class="u-h">Share via Email</span>
 571 | <span class="rounded-icon social-icon social-icon--email">
 572 | <i class="i-share-email--white i"></i>
 573 | </span>
 574 | </a>
 575 | </li>
 576 | <li class="social__item social__item--linkedin" data-link-name="linkedin">
 577 | <a class="social__action social-icon-wrapper" data-link-name="social bottom" href="http://www.linkedin.com/shareArticle?mini=true&amp;title=The+Hurricane%3A+the+facts+of+Rubin+Carter%27s+life+story+are+beaten+to+a+pulp&amp;url=http%3A%2F%2Fgu.com%2Fp%2F3zjk4" target="_blank" title="LinkedIn">
 578 | <span class="u-h">Share on LinkedIn</span>
 579 | <span class="rounded-icon social-icon social-icon--linkedin">
 580 | <i class="i-share-linkedin--white i"></i>
 581 | </span>
 582 | </a>
 583 | </li>
 584 | <li class="social__item social__item--gplus" data-link-name="gplus">
 585 | <a class="social__action social-icon-wrapper" data-link-name="social bottom" href="https://plus.google.com/share?url=http%3A%2F%2Fgu.com%2Fp%2F3zjk4%2Fsgp&amp;amp;hl=en-GB&amp;amp;wwc=1" target="_blank" title="Google plus">
 586 | <span class="u-h">Share on Google+</span>
 587 | <span class="rounded-icon social-icon social-icon--gplus">
 588 | <i class="i-share-gplus--white i"></i>
 589 | </span>
 590 | </a>
 591 | </li>
 592 | <li class="social__item social__item--whatsapp" data-link-name="whatsapp">
 593 | <a class="social__action social-icon-wrapper" data-link-name="social bottom" href="whatsapp://send?text=%22The%20Hurricane%3A%20the%20facts%20of%20Rubin%20Carter&#x27;s%20life%20story%20are%20beaten%20to%20a%20pulp%22%20http%3A%2F%2Fgu.com%2Fp%2F3zjk4%2Fswa" target="_blank" title="WhatsApp">
 594 | <span class="u-h">Share on WhatsApp</span>
 595 | <span class="rounded-icon social-icon social-icon--whatsapp">
 596 | <i class="i-share-whatsapp--white i"></i>
 597 | </span>
 598 | </a>
 599 | </li>
 600 | </ul>
 601 | </div>
 602 | </div>
 603 | <div class="after-article js-after-article"></div>
 604 | </div>
 605 | <div class="content__secondary-column js-secondary-column" aria-hidden="true">
 606 | <div class="mpu-container js-mpu-ad-slot"></div>
 607 | <div class="js-components-container"></div>
 608 | </div>
 609 | </div>
 610 | </div>
 611 | </article>
 612 | <div class="content-footer ">
 613 | <div id="comments" class="discussion discussion--not-staff discussion--loading js-comments
 614 |         discussion--closed
 615 |         u-cf" data-discussion-key="/p/3zjk4" data-discussion-closed="true" data-component="discussion">
 616 | <div class="discussion__top-border gs-container">
 617 | <div class="content__main-column">
 618 | <div class="modern-hidden">
 619 | <div class="discussion__heading">
 620 | <div class="container__meta modern-hidden">
 621 | <h2 class="container__meta__title">
 622 | <a href="http://www.theguardian.com/discussion/p/3zjk4" data-link-name="View all comments">
 623 | View all comments &gt;</a>
 624 | </h2>
 625 | </div>
 626 | </div>
 627 | </div>
 628 | <div class="modern-visible">
 629 | <div class="discussion__heading">
 630 | <div class="container__meta">
 631 | <h2 class="container__meta__title">comments <span class="discussion__comment-count js-discussion-comment-count"></span></h2>
 632 | <p class="container__meta__item discussion__meta discussion__meta--open-signed-out"><a class="u-underline" href="https://profile.theguardian.com/signin">Sign in</a>
 633 | or <a class="u-underline" href="https://profile.theguardian.com/register">create your Guardian account</a> to join the discussion.
 634 | </p>
 635 | <p class="container__meta__item discussion__meta discussion__meta--closed">This discussion is closed for comments.</p>
 636 | <p class="container__meta__item discussion__meta discussion__meta--readonly">
 637 | We’re doing some maintenance right now. You can still read comments, but please come back later to add your own.
 638 | </p>
 639 | <p class="container__meta__item d-discussion__error discussion__meta discussion__meta--banned">
 640 | Commenting has been disabled for this account (<a href="/community-faqs#321a">why?</a>)
 641 | </p>
 642 | </div>
 643 | </div>
 644 | <div class="discussion__comment-box js-discussion-comment-box--top"></div>
 645 | <div class="discussion__top-comments js-discussion-top-comments"></div>
 646 | <div class="discussion__toolbar js-discussion-toolbar u-cf">
 647 | <div class="discussion__toolbar-dropdown js-comment-order-dropdown">
 648 | <button class="u-button-reset popup__toggle" data-toggle="popup--comments-order" aria-haspopup="true" aria-controls="comments-order-popup">Order by <span class="js-comment-order"></span></button>
 649 | <ul id="comments-order-popup" class="popup popup__group popup--comments-order is-off">
 650 | <li class="popup__item">
 651 | <button class="u-button-reset popup__action" data-order="newest" data-link-name="comments-newest">newest</button>
 652 | </li>
 653 | <li class="popup__item">
 654 | <button class="u-button-reset popup__action" data-order="oldest" data-link-name="comments-oldest">oldest</button>
 655 | </li>
 656 | </ul>
 657 | </div>
 658 | <div class="discussion__toolbar-dropdown hide-until-tablet js-comment-pagesize-dropdown sign-in-required">
 659 | <button class="u-button-reset popup__toggle" data-toggle="popup--comments-pagesize" aria-haspopup="true" aria-controls="comments-pagesize-popup">Show <span class="js-comment-pagesize">25</span></button>
 660 | <ul id="comments-pagesize-popup" class="popup popup__group popup--comments-pagesize is-off">
 661 | <li class="popup__item">
 662 | <button class="u-button-reset popup__action" data-pagesize="25" data-link-name="comments-pagesize-25">25</button>
 663 | </li>
 664 | <li class="popup__item">
 665 | <button class="u-button-reset popup__action" data-pagesize="50" data-link-name="comments-pagesize-50">50</button>
 666 | </li>
 667 | <li class="popup__item">
 668 | <button class="u-button-reset popup__action" data-pagesize="100" data-link-name="comments-pagesize-100">100</button>
 669 | </li>
 670 | <li class="popup__item">
 671 | <button class="u-button-reset popup__action" data-pagesize="All" data-link-name="comments-pagesize-All">All</button>
 672 | </li>
 673 | </ul>
 674 | </div>
 675 | <div class="discussion__toolbar-dropdown js-comment-threading-dropdown">
 676 | <button class="u-button-reset popup__toggle" data-toggle="popup--comments-threading" aria-haspopup="true" aria-controls="comments-order-threading">Threads <span class="js-comment-threading"></span></button>
 677 | <ul id="comments-order-threading" class="popup popup__group popup--comments-threading is-off">
 678 | <li class="popup__item">
 679 | <button class="u-button-reset popup__action" data-threading="collapsed" data-link-name="comments-threading-collapsed">collapsed</button>
 680 | </li>
 681 | <li class="popup__item">
 682 | <button class="u-button-reset popup__action" data-threading="expanded" data-link-name="comments-threading-expanded">expanded</button>
 683 | </li>
 684 | <li class="popup__item">
 685 | <button class="u-button-reset popup__action" data-threading="unthreaded" data-link-name="comments-threading-unthreaded">unthreaded</button>
 686 | </li>
 687 | </ul>
 688 | </div>
 689 | <div class="discussion__pagination discussion__pagination--top js-discussion-pagination"></div>
 690 | </div>
 691 | <div class="preload-msg discussion__loader">Loading comments… <a href=/discussion/p/3zjk4 class="accessible-link">Trouble loading?</a><div class="is-updating"></div></div>
 692 | <div class="discussion__main-comments js-discussion-main-comments"></div>
 693 | <div class="discussion__comment-box discussion__comment-box--bottom js-discussion-comment-box--bottom"></div>
 694 | <button class="discussion__show-button button--show-more button button--large button--primary js-discussion-show-button" data-link-name="more-comments">
 695 | <i class="i i-plus-white"></i>
 696 | View more comments
 697 | </button>
 698 | <script type="text/template" id="tmpl-comment-box">
 699 |                         
 700 | <form class="component js-comment-box d-comment-box">
 701 | 
 702 |     <div class="d-comment-box__meta">
 703 |         <span class="d-comment-box__avatar-wrapper"></span>
 704 |         <div class="d-comment-box__meta-text">
 705 |             <span class="d-comment-box__author-label">Signed in as</span>
 706 |             <span class="d-comment-box__author"></span>
 707 |             <span class="i i-reply-grey"></span>
 708 |             <span class="d-comment-box__reply-to-author"></span>
 709 |             <span class="u-fauxlink d-comment-box__show-parent" role="button">Show comment</span>
 710 |             <span class="u-fauxlink d-comment-box__hide-parent" role="button">Hide comment</span>
 711 |         </div>
 712 |     </div>
 713 |     <div class="d-comment-box__parent-comment-wrapper">
 714 |         <div class="d-comment-box__parent-comment-spout"></div>
 715 |         <div class="d-comment-box__parent-comment">
 716 |             <span class="d-comment-box__parent-comment-author"></span>
 717 |             <div class="d-comment-box__parent-comment-body"></div>
 718 |             <span class="u-fauxlink d-comment-box__hide-parent" role="button">Hide comment</span>
 719 |         </div>
 720 |     </div>
 721 | 
 722 |     <div class="d-comment-box__content">
 723 |         <div class="d-comment-box__messages"></div>
 724 |         <div class="d-discussion__error d-comment-box__premod">
 725 |             <i class="i i-status-alert"></i>
 726 |             <span class="d-discussion__error-text">Your comments are currently being pre-moderated (<a href="/community-faqs#311" target="_blank">why?</a>)</span>
 727 |         </div>
 728 |         <textarea name="body" class="textarea d-comment-box__body" placeholder="Join the discussion…"></textarea>
 729 |         <button type="submit" class="u-button-reset button button--large button--primary submit-input d-comment-box__submit">Post your comment</button>
 730 |         <span class="u-fauxlink d-comment-box__preview" role="button">Preview</span>
 731 |         <span class="u-fauxlink d-comment-box__hide-preview" role="button">Hide preview</span>
 732 |         <span class="u-fauxlink d-comment-box__cancel" role="button">Cancel</span>
 733 |         <ul class="d-comment-box__formatting-controls">
 734 |             <li class="d-comment-box__formatting-bold" title="Bold">B</li>
 735 |             <li class="d-comment-box__formatting-italic" title="Italic">i</li>
 736 |             <li class="d-comment-box__formatting-quote" title="Quote">&#8221;</li>
 737 |             <li class="d-comment-box__formatting-link" title="Link">Link</li>
 738 |         </ul>
 739 |         <div class="d-comment-box__preview-wrapper">
 740 |             <div class="d-comment-box__preview-spout"></div>
 741 |             <div class="d-comment-box__preview-block">
 742 |                 <div class="d-comment-box__preview-body"></div>
 743 |             </div>
 744 |         </div>
 745 |     </div>
 746 | </form>
 747 |                         </script>
 748 | </div>
 749 | </div>
 750 | </div>
 751 | </div>
 752 | <div class="fc-container fc-container--commercial-high">
 753 | <div id="dfp-ad--merchandising-high" class="js-ad-slot ad-slot ad-slot--dfp ad-slot--merchandising-high ad-slot--commercial-component-high" data-link-name="ad slot merchandising-high" data-test-id="ad-slot-merchandising-high" data-name="merchandising-high" data-label="false" data-refresh="false" data-mobile="1,1|88,87"> </div>
 754 | </div>
 755 | <aside class="related js-related hide-on-childrens-books-site" role="complementary" data-test-id="related-content">
 756 | </aside>
 757 | <aside class="onward js-onward facia-container facia-container--layout-content tone-news" role="complementary"></aside>
 758 | <div class="js-repositioned-comments content__repositioned-comments"></div>
 759 | <section class="fc-container fc-container--has-toggle" data-link-name="most-popular" data-component="most-popular">
 760 | <div class="fc-container__inner">
 761 | <div class="fc-container__header js-container__header">
 762 | <h2 class="fc-container__header__title">
 763 | <a href="http://www.theguardian.com/most-read/film" data-link-name="Most viewed film">popular</a>
 764 | </h2>
 765 | </div>
 766 | <div class="fc-container__body fc-container--rolled-up-hide js-popular-trails">
 767 | </div>
 768 | </div>
 769 | </section>
 770 | <div class="fc-container fc-container--outbrain hide-on-childrens-books-site">
 771 | <div class="fc-container__inner" data-component="outbrain" data-link-name="outbrain">
 772 | <div class="OUTBRAIN" data-src="DROP_PERMALINK_HERE" data-ob-template="guardian"></div>
 773 | </div>
 774 | </div>
 775 | <div class="fc-container fc-container--commercial">
 776 | <div id="dfp-ad--merchandising" class="js-ad-slot ad-slot ad-slot--dfp ad-slot--merchandising ad-slot--commercial-component" data-link-name="ad slot merchandising" data-test-id="ad-slot-merchandising" data-name="merchandising" data-label="false" data-refresh="false" data-mobile="1,1|88,88"> </div>
 777 | </div>
 778 | </div>
 779 | </div>
 780 | <footer class="l-footer u-cf" data-link-name="footer" data-component="footer">
 781 | <div class="l-footer__primary">
 782 | <div id="footer-nav" class="gs-container">
 783 | <div class="brand-bar u-cf">
 784 | <a href="/international" data-link-name="site logo" class="guardian-logo-footer hide-on-mobile">
 785 | <span class="u-h">The Guardian</span>
 786 | <i class="i i-guardian-logo-160"></i>
 787 | </a>
 788 | <a class="brand-bar__item brand-bar__item--action" data-link-name="back to top" href="#top">
 789 | <span class="rounded-icon control__icon-wrapper">
 790 | <span class="inline-arrow-up inline-icon ">
 791 | <svg width="24" height="18" viewBox="0 0 24 18"><path d="M.4 15.3l10.5-8.4L12 6l1.1.9 10.5 8.4-.5.7L12 9.7.9 16l-.5-.7z"/></svg>
 792 | </span>
 793 | </span>
 794 | <span class="control__info">back to top</span>
 795 | </a>
 796 | </div>
 797 | <div class="l-footer__navigation-wrapper u-cf">
 798 | <div class="js-navigation-footer navigation-container navigation-container--collapsed">
 799 | <div class="gs-container navigation">
 800 | <div class="navigation__inner" aria-hidden="true">
 801 | <div class="navigation__scroll">
 802 | <div class="navigation__container navigation__container--second" data-component="footer-nav">
 803 | <ul class="top-navigation js-top-navigation">
 804 | <li class="top-navigation__item top-navigation__item--home">
 805 | <a href="/international" class="top-navigation__action top-navigation__action--has-icon" data-link-name="nav : primary : home" title="Back to homepage">
 806 | <span class="top-navigation__icon-wrapper">
 807 | <span class="top-navigation__icon top-navigation__icon--home "></span>
 808 | </span>
 809 | <span class="u-h">home</span>
 810 | </a>
 811 | </li>
 812 | <li class="top-navigation__item">
 813 | <a class="top-navigation__action" href="http://www.theguardian.com/uk-news" data-link-name="nav : primary : UK">
 814 | UK
 815 | </a>
 816 | </li>
 817 | <li class="top-navigation__item">
 818 | <a class="top-navigation__action" href="http://www.theguardian.com/politics/general-election-2015" data-link-name="nav : primary : election">
 819 | election
 820 | </a>
 821 | </li>
 822 | <li class="top-navigation__item">
 823 | <a class="top-navigation__action" href="http://www.theguardian.com/world" data-link-name="nav : primary : world">
 824 | world
 825 | </a>
 826 | </li>
 827 | <li class="top-navigation__item">
 828 | <a class="top-navigation__action" href="http://www.theguardian.com/uk/sport" data-link-name="nav : primary : sport">
 829 | sport
 830 | </a>
 831 | </li>
 832 | <li class="top-navigation__item">
 833 | <a class="top-navigation__action" href="http://www.theguardian.com/football" data-link-name="nav : primary : football">
 834 | football
 835 | </a>
 836 | </li>
 837 | <li class="top-navigation__item">
 838 | <a class="top-navigation__action" href="http://www.theguardian.com/uk/commentisfree" data-link-name="nav : primary : opinion">
 839 | opinion
 840 | </a>
 841 | </li>
 842 | <li class="top-navigation__item top-navigation__item--current">
 843 | <a class="top-navigation__action" href="http://www.theguardian.com/uk/culture" data-link-name="nav : primary : culture">
 844 | culture
 845 | <span class="u-h">selected</span>
 846 | </a>
 847 | </li>
 848 | <li class="top-navigation__item">
 849 | <a class="top-navigation__action" href="http://www.theguardian.com/uk/business" data-link-name="nav : primary : business">
 850 | business
 851 | </a>
 852 | </li>
 853 | <li class="top-navigation__item">
 854 | <a class="top-navigation__action" href="http://www.theguardian.com/lifeandstyle" data-link-name="nav : primary : lifestyle">
 855 | lifestyle
 856 | </a>
 857 | </li>
 858 | <li class="top-navigation__item">
 859 | <a class="top-navigation__action" href="http://www.theguardian.com/fashion" data-link-name="nav : primary : fashion">
 860 | fashion
 861 | </a>
 862 | </li>
 863 | <li class="top-navigation__item">
 864 | <a class="top-navigation__action" href="http://www.theguardian.com/uk/environment" data-link-name="nav : primary : environment">
 865 | environment
 866 | </a>
 867 | </li>
 868 | <li class="top-navigation__item">
 869 | <a class="top-navigation__action" href="http://www.theguardian.com/uk/technology" data-link-name="nav : primary : tech">
 870 | tech
 871 | </a>
 872 | </li>
 873 | <li class="top-navigation__item">
 874 | <a class="top-navigation__action" href="http://www.theguardian.com/travel" data-link-name="nav : primary : travel">
 875 | travel
 876 | </a>
 877 | </li>
 878 | </ul>
 879 | </div>
 880 | </div>
 881 | <a class="navigation-toggle js-navigation-toggle" href="#footer-nav" data-link-name="nav : allSections" data-target-nav="js-navigation-footer">
 882 | <i class="burger-icon"></i>
 883 | <span class="navigation-toggle-label navigation-toggle-label--open" aria-haspopup="true" aria-controls="all-sections-popup" aria-label="view all sections">all<span class="navigation-toggle-label__extra"> sections</span></span>
 884 | <span class="navigation-toggle-label navigation-toggle-label--close" aria-label="close all sections">close</span>
 885 | </a>
 886 | </div>
 887 | <div class="js-mega-nav navigation__expandable" data-component="all-footer-nav" data-link-name="global navigation: footer : sections">
 888 | <nav role="navigation" aria-label="All sections">
 889 | <ul class="global-navigation js-global-navigation">
 890 | <li class="global-navigation__section global-navigation__section--home">
 891 | <a class="global-navigation__title" href="/international" data-link-name="nav : globalTop : home">
 892 | home
 893 | </a>
 894 | </li>
 895 | <li class="global-navigation__section">
 896 | <a class="global-navigation__title" href="http://www.theguardian.com/uk-news" data-link-name="nav : globalTop : UK">
 897 | UK
 898 | </a>
 899 | <ul class="global-navigation__children">
 900 | <li class="global-navigation__child">
 901 | <a class="global-navigation__action" href="http://www.theguardian.com/politics" data-link-name="nav : globalSub : politics">
 902 | politics
 903 | </a>
 904 | </li>
 905 | <li class="global-navigation__child">
 906 | <a class="global-navigation__action" href="http://www.theguardian.com/education" data-link-name="nav : globalSub : education">
 907 | education
 908 | </a>
 909 | </li>
 910 | <li class="global-navigation__child">
 911 | <a class="global-navigation__action" href="http://www.theguardian.com/uk/media" data-link-name="nav : globalSub : media">
 912 | media
 913 | </a>
 914 | </li>
 915 | <li class="global-navigation__child">
 916 | <a class="global-navigation__action" href="http://www.theguardian.com/society" data-link-name="nav : globalSub : society">
 917 | society
 918 | </a>
 919 | </li>
 920 | <li class="global-navigation__child">
 921 | <a class="global-navigation__action" href="http://www.theguardian.com/law" data-link-name="nav : globalSub : law">
 922 | law
 923 | </a>
 924 | </li>
 925 | <li class="global-navigation__child">
 926 | <a class="global-navigation__action" href="http://www.theguardian.com/uk/scotland" data-link-name="nav : globalSub : scotland">
 927 | scotland
 928 | </a>
 929 | </li>
 930 | <li class="global-navigation__child">
 931 | <a class="global-navigation__action" href="http://www.theguardian.com/uk/wales" data-link-name="nav : globalSub : wales">
 932 | wales
 933 | </a>
 934 | </li>
 935 | <li class="global-navigation__child">
 936 | <a class="global-navigation__action" href="http://www.theguardian.com/uk/northernireland" data-link-name="nav : globalSub : northern ireland">
 937 | northern ireland
 938 | </a>
 939 | </li>
 940 | </ul>
 941 | </li>
 942 | <li class="global-navigation__section">
 943 | <a class="global-navigation__title" href="http://www.theguardian.com/politics/general-election-2015" data-link-name="nav : globalTop : election">
 944 | election
 945 | </a>
 946 | </li>
 947 | <li class="global-navigation__section">
 948 | <a class="global-navigation__title" href="http://www.theguardian.com/world" data-link-name="nav : globalTop : world">
 949 | world
 950 | </a>
 951 | <ul class="global-navigation__children">
 952 | <li class="global-navigation__child">
 953 | <a class="global-navigation__action" href="http://www.theguardian.com/world/europe-news" data-link-name="nav : globalSub : europe">
 954 | europe
 955 | </a>
 956 | </li>
 957 | <li class="global-navigation__child">
 958 | <a class="global-navigation__action" href="http://www.theguardian.com/us-news" data-link-name="nav : globalSub : US">
 959 | US
 960 | </a>
 961 | </li>
 962 | <li class="global-navigation__child">
 963 | <a class="global-navigation__action" href="http://www.theguardian.com/world/americas" data-link-name="nav : globalSub : americas">
 964 | americas
 965 | </a>
 966 | </li>
 967 | <li class="global-navigation__child">
 968 | <a class="global-navigation__action" href="http://www.theguardian.com/world/asia" data-link-name="nav : globalSub : asia">
 969 | asia
 970 | </a>
 971 | </li>
 972 | <li class="global-navigation__child">
 973 | <a class="global-navigation__action" href="http://www.theguardian.com/australia-news" data-link-name="nav : globalSub : australia">
 974 | australia
 975 | </a>
 976 | </li>
 977 | <li class="global-navigation__child">
 978 | <a class="global-navigation__action" href="http://www.theguardian.com/world/africa" data-link-name="nav : globalSub : africa">
 979 | africa
 980 | </a>
 981 | </li>
 982 | <li class="global-navigation__child">
 983 | <a class="global-navigation__action" href="http://www.theguardian.com/world/middleeast" data-link-name="nav : globalSub : middle east">
 984 | middle east
 985 | </a>
 986 | </li>
 987 | <li class="global-navigation__child">
 988 | <a class="global-navigation__action" href="http://www.theguardian.com/cities" data-link-name="nav : globalSub : cities">
 989 | cities
 990 | </a>
 991 | </li>
 992 | <li class="global-navigation__child">
 993 | <a class="global-navigation__action" href="http://www.theguardian.com/global-development" data-link-name="nav : globalSub : development">
 994 | development
 995 | </a>
 996 | </li>
 997 | </ul>
 998 | </li>
 999 | <li class="global-navigation__section">
1000 | <a class="global-navigation__title" href="http://www.theguardian.com/uk/sport" data-link-name="nav : globalTop : sport">
1001 | sport
1002 | </a>
1003 | <ul class="global-navigation__children">
1004 | <li class="global-navigation__child">
1005 | <a class="global-navigation__action" href="http://www.theguardian.com/football" data-link-name="nav : globalSub : football">
1006 | football
1007 | </a>
1008 | </li>
1009 | <li class="global-navigation__child">
1010 | <a class="global-navigation__action" href="http://www.theguardian.com/sport/cricket" data-link-name="nav : globalSub : cricket">
1011 | cricket
1012 | </a>
1013 | </li>
1014 | <li class="global-navigation__child">
1015 | <a class="global-navigation__action" href="http://www.theguardian.com/sport/rugby-union" data-link-name="nav : globalSub : rugby union">
1016 | rugby union
1017 | </a>
1018 | </li>
1019 | <li class="global-navigation__child">
1020 | <a class="global-navigation__action" href="http://www.theguardian.com/sport/formulaone" data-link-name="nav : globalSub : F1">
1021 | F1
1022 | </a>
1023 | </li>
1024 | <li class="global-navigation__child">
1025 | <a class="global-navigation__action" href="http://www.theguardian.com/sport/tennis" data-link-name="nav : globalSub : tennis">
1026 | tennis
1027 | </a>
1028 | </li>
1029 | <li class="global-navigation__child">
1030 | <a class="global-navigation__action" href="http://www.theguardian.com/sport/golf" data-link-name="nav : globalSub : golf">
1031 | golf
1032 | </a>
1033 | </li>
1034 | <li class="global-navigation__child">
1035 | <a class="global-navigation__action" href="http://www.theguardian.com/sport/cycling" data-link-name="nav : globalSub : cycling">
1036 | cycling
1037 | </a>
1038 | </li>
1039 | <li class="global-navigation__child">
1040 | <a class="global-navigation__action" href="http://www.theguardian.com/sport/boxing" data-link-name="nav : globalSub : boxing">
1041 | boxing
1042 | </a>
1043 | </li>
1044 | <li class="global-navigation__child">
1045 | <a class="global-navigation__action" href="http://www.theguardian.com/sport/horse-racing" data-link-name="nav : globalSub : racing">
1046 | racing
1047 | </a>
1048 | </li>
1049 | <li class="global-navigation__child">
1050 | <a class="global-navigation__action" href="http://www.theguardian.com/sport/rugbyleague" data-link-name="nav : globalSub : rugby league">
1051 | rugby league
1052 | </a>
1053 | </li>
1054 | <li class="global-navigation__child">
1055 | <a class="global-navigation__action" href="http://www.theguardian.com/sport/us-sport" data-link-name="nav : globalSub : US sports">
1056 | US sports
1057 | </a>
1058 | </li>
1059 | </ul>
1060 | </li>
1061 | <li class="global-navigation__section">
1062 | <a class="global-navigation__title" href="http://www.theguardian.com/football" data-link-name="nav : globalTop : football">
1063 | football
1064 | </a>
1065 | <ul class="global-navigation__children">
1066 | <li class="global-navigation__child">
1067 | <a class="global-navigation__action" href="http://www.theguardian.com/football/live" data-link-name="nav : globalSub : live scores">
1068 | live scores
1069 | </a>
1070 | </li>
1071 | <li class="global-navigation__child">
1072 | <a class="global-navigation__action" href="http://www.theguardian.com/football/tables" data-link-name="nav : globalSub : tables">
1073 | tables
1074 | </a>
1075 | </li>
1076 | <li class="global-navigation__child">
1077 | <a class="global-navigation__action" href="http://www.theguardian.com/football/competitions" data-link-name="nav : globalSub : competitions">
1078 | competitions
1079 | </a>
1080 | </li>
1081 | <li class="global-navigation__child">
1082 | <a class="global-navigation__action" href="http://www.theguardian.com/football/results" data-link-name="nav : globalSub : results">
1083 | results
1084 | </a>
1085 | </li>
1086 | <li class="global-navigation__child">
1087 | <a class="global-navigation__action" href="http://www.theguardian.com/football/fixtures" data-link-name="nav : globalSub : fixtures">
1088 | fixtures
1089 | </a>
1090 | </li>
1091 | <li class="global-navigation__child">
1092 | <a class="global-navigation__action" href="http://www.theguardian.com/football/teams" data-link-name="nav : globalSub : clubs">
1093 | clubs
1094 | </a>
1095 | </li>
1096 | </ul>
1097 | </li>
1098 | <li class="global-navigation__section">
1099 | <a class="global-navigation__title" href="http://www.theguardian.com/uk/commentisfree" data-link-name="nav : globalTop : opinion">
1100 | opinion
1101 | </a>
1102 | <ul class="global-navigation__children">
1103 | <li class="global-navigation__child">
1104 | <a class="global-navigation__action" href="http://www.theguardian.com/index/contributors" data-link-name="nav : globalSub : columnists">
1105 | columnists
1106 | </a>
1107 | </li>
1108 | </ul>
1109 | </li>
1110 | <li class="global-navigation__section global-navigation__section--current">
1111 | <a class="global-navigation__title" href="http://www.theguardian.com/uk/culture" data-link-name="nav : globalTop : culture">
1112 | culture
1113 | <span class="u-h">selected</span>
1114 | </a>
1115 | <ul class="global-navigation__children">
1116 | <li class="global-navigation__child global-navigation__child--current">
1117 | <a class="global-navigation__action" href="http://www.theguardian.com/film" data-link-name="nav : globalSub : film">
1118 | film
1119 | <span class="u-h">selected</span>
1120 | </a>
1121 | </li>
1122 | <li class="global-navigation__child">
1123 | <a class="global-navigation__action" href="http://www.theguardian.com/tv-and-radio" data-link-name="nav : globalSub : tv &amp; radio">
1124 | tv & radio
1125 | </a>
1126 | </li>
1127 | <li class="global-navigation__child">
1128 | <a class="global-navigation__action" href="http://www.theguardian.com/music" data-link-name="nav : globalSub : music">
1129 | music
1130 | </a>
1131 | </li>
1132 | <li class="global-navigation__child">
1133 | <a class="global-navigation__action" href="http://www.theguardian.com/technology/games" data-link-name="nav : globalSub : games">
1134 | games
1135 | </a>
1136 | </li>
1137 | <li class="global-navigation__child">
1138 | <a class="global-navigation__action" href="http://www.theguardian.com/books" data-link-name="nav : globalSub : books">
1139 | books
1140 | </a>
1141 | </li>
1142 | <li class="global-navigation__child">
1143 | <a class="global-navigation__action" href="http://www.theguardian.com/artanddesign" data-link-name="nav : globalSub : art &amp; design">
1144 | art & design
1145 | </a>
1146 | </li>
1147 | <li class="global-navigation__child">
1148 | <a class="global-navigation__action" href="http://www.theguardian.com/stage" data-link-name="nav : globalSub : stage">
1149 | stage
1150 | </a>
1151 | </li>
1152 | <li class="global-navigation__child">
1153 | <a class="global-navigation__action" href="http://www.theguardian.com/music/classicalmusicandopera" data-link-name="nav : globalSub : classical">
1154 | classical
1155 | </a>
1156 | </li>
1157 | </ul>
1158 | </li>
1159 | <li class="global-navigation__section">
1160 | <a class="global-navigation__title" href="http://www.theguardian.com/uk/business" data-link-name="nav : globalTop : business">
1161 | business
1162 | </a>
1163 | <ul class="global-navigation__children">
1164 | <li class="global-navigation__child">
1165 | <a class="global-navigation__action" href="http://www.theguardian.com/business/economics" data-link-name="nav : globalSub : economics">
1166 | economics
1167 | </a>
1168 | </li>
1169 | <li class="global-navigation__child">
1170 | <a class="global-navigation__action" href="http://www.theguardian.com/business/banking" data-link-name="nav : globalSub : banking">
1171 | banking
1172 | </a>
1173 | </li>
1174 | <li class="global-navigation__child">
1175 | <a class="global-navigation__action" href="http://www.theguardian.com/business/retail" data-link-name="nav : globalSub : retail">
1176 | retail
1177 | </a>
1178 | </li>
1179 | <li class="global-navigation__child">
1180 | <a class="global-navigation__action" href="http://www.theguardian.com/business/stock-markets" data-link-name="nav : globalSub : markets">
1181 | markets
1182 | </a>
1183 | </li>
1184 | <li class="global-navigation__child">
1185 | <a class="global-navigation__action" href="http://www.theguardian.com/business/eurozone" data-link-name="nav : globalSub : eurozone">
1186 | eurozone
1187 | </a>
1188 | </li>
1189 | </ul>
1190 | </li>
1191 | <li class="global-navigation__section">
1192 | <a class="global-navigation__title" href="http://www.theguardian.com/lifeandstyle" data-link-name="nav : globalTop : lifestyle">
1193 | lifestyle
1194 | </a>
1195 | <ul class="global-navigation__children">
1196 | <li class="global-navigation__child">
1197 | <a class="global-navigation__action" href="http://www.theguardian.com/lifeandstyle/food-and-drink" data-link-name="nav : globalSub : food">
1198 | food
1199 | </a>
1200 | </li>
1201 | <li class="global-navigation__child">
1202 | <a class="global-navigation__action" href="http://www.theguardian.com/lifeandstyle/health-and-wellbeing" data-link-name="nav : globalSub : health &amp; fitness">
1203 | health & fitness
1204 | </a>
1205 | </li>
1206 | <li class="global-navigation__child">
1207 | <a class="global-navigation__action" href="http://www.theguardian.com/lifeandstyle/love-and-sex" data-link-name="nav : globalSub : love &amp; sex">
1208 | love & sex
1209 | </a>
1210 | </li>
1211 | <li class="global-navigation__child">
1212 | <a class="global-navigation__action" href="http://www.theguardian.com/lifeandstyle/family" data-link-name="nav : globalSub : family">
1213 | family
1214 | </a>
1215 | </li>
1216 | <li class="global-navigation__child">
1217 | <a class="global-navigation__action" href="http://www.theguardian.com/lifeandstyle/women" data-link-name="nav : globalSub : women">
1218 | women
1219 | </a>
1220 | </li>
1221 | <li class="global-navigation__child">
1222 | <a class="global-navigation__action" href="http://www.theguardian.com/lifeandstyle/home-and-garden" data-link-name="nav : globalSub : home &amp; garden">
1223 | home & garden
1224 | </a>
1225 | </li>
1226 | </ul>
1227 | </li>
1228 | <li class="global-navigation__section">
1229 | <a class="global-navigation__title" href="http://www.theguardian.com/fashion" data-link-name="nav : globalTop : fashion">
1230 | fashion
1231 | </a>
1232 | </li>
1233 | <li class="global-navigation__section">
1234 | <a class="global-navigation__title" href="http://www.theguardian.com/uk/environment" data-link-name="nav : globalTop : environment">
1235 | environment
1236 | </a>
1237 | <ul class="global-navigation__children">
1238 | <li class="global-navigation__child">
1239 | <a class="global-navigation__action" href="http://www.theguardian.com/environment/climate-change" data-link-name="nav : globalSub : climate change">
1240 | climate change
1241 | </a>
1242 | </li>
1243 | <li class="global-navigation__child">
1244 | <a class="global-navigation__action" href="http://www.theguardian.com/environment/wildlife" data-link-name="nav : globalSub : wildlife">
1245 | wildlife
1246 | </a>
1247 | </li>
1248 | <li class="global-navigation__child">
1249 | <a class="global-navigation__action" href="http://www.theguardian.com/environment/energy" data-link-name="nav : globalSub : energy">
1250 | energy
1251 | </a>
1252 | </li>
1253 | <li class="global-navigation__child">
1254 | <a class="global-navigation__action" href="http://www.theguardian.com/environment/pollution" data-link-name="nav : globalSub : pollution">
1255 | pollution
1256 | </a>
1257 | </li>
1258 | </ul>
1259 | </li>
1260 | <li class="global-navigation__section">
1261 | <a class="global-navigation__title" href="http://www.theguardian.com/uk/technology" data-link-name="nav : globalTop : tech">
1262 | tech
1263 | </a>
1264 | </li>
1265 | <li class="global-navigation__section">
1266 | <a class="global-navigation__title" href="http://www.theguardian.com/travel" data-link-name="nav : globalTop : travel">
1267 | travel
1268 | </a>
1269 | <ul class="global-navigation__children">
1270 | <li class="global-navigation__child">
1271 | <a class="global-navigation__action" href="http://www.theguardian.com/travel/uk" data-link-name="nav : globalSub : UK">
1272 | UK
1273 | </a>
1274 | </li>
1275 | <li class="global-navigation__child">
1276 | <a class="global-navigation__action" href="http://www.theguardian.com/travel/europe" data-link-name="nav : globalSub : europe">
1277 | europe
1278 | </a>
1279 | </li>
1280 | <li class="global-navigation__child">
1281 | <a class="global-navigation__action" href="http://www.theguardian.com/travel/usa" data-link-name="nav : globalSub : US">
1282 | US
1283 | </a>
1284 | </li>
1285 | </ul>
1286 | </li>
1287 | <li class="global-navigation__section">
1288 | <a class="global-navigation__title" href="http://www.theguardian.com/uk/money" data-link-name="nav : globalTop : money">
1289 | money
1290 | </a>
1291 | <ul class="global-navigation__children">
1292 | <li class="global-navigation__child">
1293 | <a class="global-navigation__action" href="http://www.theguardian.com/money/property" data-link-name="nav : globalSub : property">
1294 | property
1295 | </a>
1296 | </li>
1297 | <li class="global-navigation__child">
1298 | <a class="global-navigation__action" href="http://www.theguardian.com/money/savings" data-link-name="nav : globalSub : savings">
1299 | savings
1300 | </a>
1301 | </li>
1302 | <li class="global-navigation__child">
1303 | <a class="global-navigation__action" href="http://www.theguardian.com/money/pensions" data-link-name="nav : globalSub : pensions">
1304 | pensions
1305 | </a>
1306 | </li>
1307 | <li class="global-navigation__child">
1308 | <a class="global-navigation__action" href="http://www.theguardian.com/money/debt" data-link-name="nav : globalSub : borrowing">
1309 | borrowing
1310 | </a>
1311 | </li>
1312 | <li class="global-navigation__child">
1313 | <a class="global-navigation__action" href="http://www.theguardian.com/money/work-and-careers" data-link-name="nav : globalSub : careers">
1314 | careers
1315 | </a>
1316 | </li>
1317 | </ul>
1318 | </li>
1319 | <li class="global-navigation__section">
1320 | <a class="global-navigation__title" href="http://www.theguardian.com/science" data-link-name="nav : globalTop : science">
1321 | science
1322 | </a>
1323 | </li>
1324 | <li class="global-navigation__section">
1325 | <a class="global-navigation__title" href="http://www.theguardian.com/guardian-professional" data-link-name="nav : globalTop : professional networks">
1326 | professional networks
1327 | </a>
1328 | </li>
1329 | <li class="global-navigation__section">
1330 | <a class="global-navigation__title" href="http://www.theguardian.com/observer" data-link-name="nav : globalTop : the observer">
1331 | the observer
1332 | </a>
1333 | </li>
1334 | <li class="global-navigation__section">
1335 | <a class="global-navigation__title" href="http://www.theguardian.com/theguardian" data-link-name="nav : globalTop : today&#x27;s paper">
1336 | today's paper
1337 | </a>
1338 | <ul class="global-navigation__children">
1339 | <li class="global-navigation__child">
1340 | <a class="global-navigation__action" href="http://www.theguardian.com/theguardian/mainsection/editorialsandreply" data-link-name="nav : globalSub : editorials and letters">
1341 | editorials and letters
1342 | </a>
1343 | </li>
1344 | <li class="global-navigation__child">
1345 | <a class="global-navigation__action" href="http://www.theguardian.com/tone/obituaries" data-link-name="nav : globalSub : obituaries">
1346 | obituaries
1347 | </a>
1348 | </li>
1349 | <li class="global-navigation__child">
1350 | <a class="global-navigation__action" href="http://www.theguardian.com/theguardian/g2" data-link-name="nav : globalSub : g2">
1351 | g2
1352 | </a>
1353 | </li>
1354 | <li class="global-navigation__child">
1355 | <a class="global-navigation__action" href="http://www.theguardian.com/theguardian/weekend" data-link-name="nav : globalSub : weekend">
1356 | weekend
1357 | </a>
1358 | </li>
1359 | <li class="global-navigation__child">
1360 | <a class="global-navigation__action" href="http://www.theguardian.com/theguardian/theguide" data-link-name="nav : globalSub : the guide">
1361 | the guide
1362 | </a>
1363 | </li>
1364 | <li class="global-navigation__child">
1365 | <a class="global-navigation__action" href="http://www.theguardian.com/theguardian/guardianreview" data-link-name="nav : globalSub : saturday review">
1366 | saturday review
1367 | </a>
1368 | </li>
1369 | </ul>
1370 | </li>
1371 | <li class="global-navigation__section">
1372 | <a class="global-navigation__title" href="http://www.theguardian.com/membership" data-link-name="nav : globalTop : membership">
1373 | membership
1374 | </a>
1375 | </li>
1376 | <li class="global-navigation__section">
1377 | <a class="global-navigation__title" href="http://www.theguardian.com/crosswords" data-link-name="nav : globalTop : crosswords">
1378 | crosswords
1379 | </a>
1380 | </li>
1381 | <li class="global-navigation__section">
1382 | <a class="global-navigation__title" href="http://www.theguardian.com/video" data-link-name="nav : globalTop : video">
1383 | video
1384 | </a>
1385 | </li>
1386 | </ul>
1387 | </nav>
1388 | </div>
1389 | </div>
1390 | </div>
1391 | </div>
1392 | <ul class="breadcrumb signposting">
1393 | <li class="signposting__item signposting__item--parent">
1394 | <div itemscope itemtype="http://data-vocabulary.org/Breadcrumb">
1395 | <a itemprop="url" href="http://www.theguardian.com/uk/culture" data-link-name="/culture" class="signposting__action"><span itemprop="title">Culture</span></a>
1396 | </div>
1397 | </li>
1398 | <li class="signposting__item signposting__item--parent">
1399 | <div itemscope itemtype="http://data-vocabulary.org/Breadcrumb">
1400 | <span class="signposting__separator" aria-hidden="true"><span class="signposting__separator__inner">›</span></span>
1401 | <a itemprop="url" href="http://www.theguardian.com/film" data-link-name="/film" class="signposting__action"><span itemprop="title">Film</span></a>
1402 | </div>
1403 | </li>
1404 | <li class="signposting__item signposting__item--parent">
1405 | <div itemscope itemtype="http://data-vocabulary.org/Breadcrumb">
1406 | <span class="signposting__separator" aria-hidden="true"><span class="signposting__separator__inner">›</span></span>
1407 | <a itemprop="url" href="http://www.theguardian.com/film/sport" data-link-name="/film/sport" class="signposting__action"><span itemprop="title">Sport</span></a>
1408 | </div>
1409 | </li>
1410 | </ul>
1411 | </div>
1412 | </div>
1413 | <div class="l-footer__secondary gs-container" role="contentinfo">
1414 | <ul class="colophon u-cf">
1415 | <li class="colophon__item"><a data-link-name="uk : footer : membership" href="https://membership.theguardian.com/?INTCMP=NGW_FOOTER_UK_GU_MEMBERSHIP">
1416 | membership</a></li>
1417 | <li class="colophon__item"><a data-link-name="uk : footer : jobs" href="http://jobs.theguardian.com/?INTCMP=NGW_FOOTER_UK_GU_JOBS">
1418 | jobs</a></li>
1419 | <li class="colophon__item"><a data-link-name="uk : footer : soulmates" href="https://soulmates.theguardian.com/?INTCMP=NGW_FOOTER_UK_GU_SOULMATES">
1420 | dating</a></li>
1421 | <li class="colophon__item"><a data-link-name="uk : footer : masterclasses" href="http://www.theguardian.com/guardian-masterclasses?INTCMP=NGW_FOOTER_UK_GU_MASTERCLASSES">
1422 | masterclasses</a></li>
1423 | <li class="colophon__item"><a data-link-name="uk : footer : subscribe" href="http://subscribe.theguardian.com/?INTCMP=NGW_FOOTER_UK_GU_SUBSCRIBE">
1424 | subscribe</a></li>
1425 | <li class="colophon__item"><a data-link-name="all topics" href="http://www.theguardian.com/index/subjects/a">all topics</a></li>
1426 | <li class="colophon__item"><a data-link-name="all contributors" href="http://www.theguardian.com/index/contributors">all contributors</a></li>
1427 | <li class="colophon__item"><a data-link-name="uk : footer : about us" href="http://www.theguardian.com/info">
1428 | about us</a></li>
1429 | <li class="colophon__item"><a data-link-name="uk : footer : contact us" href="http://www.theguardian.com/help/contact-us">
1430 | contact us</a></li>
1431 | <li class="colophon__item">
1432 | <form class="js-tech-feedback" action="//beacon.guim.co.uk/tech-feedback?uri=http://www.theguardian.com/film/2014/apr/24/the-hurricane-rubin-carter-denzel-washington" method="post">
1433 | <input data-link-name="tech feedback" type="submit" value="report technical issue">
1434 | </form>
1435 | </li>
1436 | <li class="colophon__item"><a data-link-name="complaints" href="http://www.theguardian.com/info/complaints-and-corrections">
1437 | complaints &amp; corrections</a></li>
1438 | <li class="colophon__item"><a data-link-name="terms" href="http://www.theguardian.com/help/terms-of-service">terms &amp; conditions</a></li>
1439 | <li class="colophon__item"><a data-link-name="privacy" href="http://www.theguardian.com/info/privacy">privacy policy</a></li>
1440 | <li class="colophon__item"><a data-link-name="cookie" href="http://www.theguardian.com/info/cookies">cookie policy</a></li>
1441 | <li class="colophon__item"><a data-link-name="securedrop" href="https://securedrop.theguardian.com/">securedrop</a></li>
1442 | </ul>
1443 | <div class="l-footer__misc">
1444 | <div class="really-serious-copyright">© 2015 Guardian News and Media Limited or its affiliated companies. All rights reserved.</div>
1445 | </div>
1446 | </div>
1447 | </footer>
1448 | <noscript id="omnitureNoScript">
1449 | <div>
1450 | <img id="omnitureNoScriptImage" alt="" src="http://hits.theguardian.com/b/ss/guardiangu-frontend,guardiangu-network/1/H.25.3/?c3=theguardian.com&c4=Sport%2CFilm%2CDenzel+Washington&v19=frontend&cdp=2&c11=film&c8=2081110&v7=GFE%3Afilm%3AArticle%3Athe-hurricane-rubin-carter-denzel-washington&event=&ns=guardian&c19=frontend&v23=&c56=No+Javascript&c67=nextgenServed&v8=2081110&c9=Article&ch=film&c30=content&c6=Alex+von+Tunzelmann&e27=&g=www.theguardian.com%2Ffilm%2F2014%2Fapr%2F24%2Fthe-hurricane-rubin-carter-denzel-washington&c13=Reel+history&pageName=GFE%3Afilm%3AArticle%3Athe-hurricane-rubin-carter-denzel-washington&c14=6562&c10=&c25=" width="1" height="1" class="u-h"/>
1451 | </div>
1452 | </noscript>
1453 | <script>if(!guardian.isModernBrowser){var analyticsImage=document.createElement("img");analyticsImage.src="http://hits.theguardian.com/b/ss/guardiangu-frontend,guardiangu-network/1/H.25.3/?c3=theguardian.com&c4=Sport%2CFilm%2CDenzel+Washington&v19=frontend&cdp=2&c11=film&c8=2081110&v7=GFE%3Afilm%3AArticle%3Athe-hurricane-rubin-carter-denzel-washington&event=&ns=guardian&c19=frontend&v23=&c56=Partial+Javascript&c67=nextgenServed&v8=2081110&c9=Article&ch=film&c30=content&c6=Alex+von+Tunzelmann&e27=&g=www.theguardian.com%2Ffilm%2F2014%2Fapr%2F24%2Fthe-hurricane-rubin-carter-denzel-washington&c13=Reel+history&pageName=GFE%3Afilm%3AArticle%3Athe-hurricane-rubin-carter-denzel-washington&c14=6562&c10=&c25=";analyticsImage.width="1";analyticsImage.height="1";document.body.appendChild(analyticsImage);var img=new Image();img.src="//beacon.guim.co.uk/count/pva.gif";var s=document.createElement('script'),sc=document.getElementsByTagName('script')[0];s.src='http://assets.guim.co.uk/javascripts/bootstraps/b8bebbfd475d40dfd6b7dcfb23e8f546/ophan.js';s.aysnc=true;sc.parentNode.insertBefore(s,sc);}</script>
1454 | <noscript>
1455 | <div style="display:inline;">
1456 | <img height="1" width="1" style="border-style:none;" alt="" src="//googleads.g.doubleclick.net/pagead/viewthroughconversion/971225648/?value=0&amp;guid=ON&amp;script=0"/>
1457 | </div>
1458 | </noscript>
1459 | <img src="//beacon.guim.co.uk/count/pv.gif" alt="" style="display : none ;" rel="nofollow"/>
1460 | <script>(function(isVeryModern){function insertUserName(){function getCookieValue(a){var d=[],e=document.cookie.split(";");a=RegExp("^\\s*"+a+"=\\s*(.*?)\\s*$");for(var b=0;b<e.length;b++){var f=e[b].match(a);f&&d.push(f[1]);}
1461 | if(d.length>0){return d[0];}
1462 | return null;}
1463 | function decodeBase64(str){return decodeURIComponent(encodeURIComponent(atob(str.replace(/-/g,'+').replace(/_/g,'/').replace(/,/g,'='))));}
1464 | function getUserDisplayNameFromCookie(){var cookieData=getCookieValue('GU_U');var userData=cookieData?JSON.parse(decodeBase64(cookieData.split('.')[0])):null;if(userData){return userData[2];}
1465 | return null;}
1466 | var userDisplayName=getUserDisplayNameFromCookie();if(userDisplayName){document.getElementsByClassName('js-profile-info')[0].innerHTML=userDisplayName;document.getElementsByClassName('js-profile-nav')[0].classList.add('is-signed-in');if(window.guardian.config.switches.becomeAMember){var $register=document.getElementsByClassName('js-profile-register')[0];$register.parentElement.removeChild($register);}}}
1467 | insertUserName();})(guardian.isModernBrowser&&'atob'in window&&'classList'in document.documentElement);</script>
1468 | <div id="dfp-ad--pageskin-inread" class="js-ad-slot ad-slot ad-slot--dfp ad-slot--pageskin-inread ad-slot--page-skin" data-link-name="ad slot pageskin-inread" data-test-id="ad-slot-pageskin-inread" data-name="pageskin-inread" data-label="false" data-refresh="false" data-out-of-page="true" data-wide="1,1"> </div>
1469 | </body>
1470 | </html>
1471 | 


--------------------------------------------------------------------------------
/tests/samples/utf-8-kanji.sample.html:
--------------------------------------------------------------------------------
 1 | <!DOCTYPE html>
 2 | <html lang="ja">
 3 |   <body>
 4 |     <div>
 5 |       <article>
 6 |         <div>
 7 |           草枕
 8 |           夏目漱石
 9 | 
10 | 
11 |           一
12 | 
13 |           　山路を登りながら、こう考えた。
14 |           　智に働けば角が立つ。情に棹させば流される。意地を通せば窮屈だ。とかくに人の世は住みにくい。
15 |           　住みにくさが高じると、安い所へ引き越したくなる。どこへ越しても住みにくいと悟った時、詩が生れて、画が出来る。
16 |           　人の世を作ったものは神でもなければ鬼でもない。やはり向う三軒両隣りにちらちらするただの人である。ただの人が作った人の世が住みにくいからとて、越す国はあるまい。あれば人でなしの国へ行くばかりだ。人でなしの国は人の世よりもなお住みにくかろう。
17 |           　越す事のならぬ世が住みにくければ、住みにくい所をどれほどか、寛容て、束の間の命を、束の間でも住みよくせねばならぬ。ここに詩人という天職が出来て、ここに画家という使命が降る。あらゆる芸術の士は人の世を長閑にし、人の心を豊かにするが故に尊とい。
18 |           　住みにくき世から、住みにくき煩いを引き抜いて、ありがたい世界をまのあたりに写すのが詩である、画である。あるは音楽と彫刻である。こまかに云えば写さないでもよい。ただまのあたりに見れば、そこに詩も生き、歌も湧く。着想を紙に落さぬとも※(「王＋膠のつくり」、第3水準1-88-22)鏘の音は胸裏に起る。丹青は画架に向って塗抹せんでも五彩の絢爛は自から心眼に映る。ただおのが住む世を、かく観じ得て、霊台方寸のカメラに澆季溷濁の俗界を清くうららかに収め得れば足る。この故に無声の詩人には一句なく、無色の画家には尺※(「糸＋賺のつくり」、第3水準1-90-17)なきも、かく人世を観じ得るの点において、かく煩悩を解脱するの点において、かく清浄界に出入し得るの点において、またこの不同不二の乾坤を建立し得るの点において、我利私慾の覊絆を掃蕩するの点において、――千金の子よりも、万乗の君よりも、あらゆる俗界の寵児よりも幸福である。
19 |           　世に住むこと二十年にして、住むに甲斐ある世と知った。二十五年にして明暗は表裏のごとく、日のあたる所にはきっと影がさすと悟った。三十の今日はこう思うている。――喜びの深きとき憂いよいよ深く、楽みの大いなるほど苦しみも大きい。これを切り放そうとすると身が持てぬ。片づけようとすれば世が立たぬ。金は大事だ、大事なものが殖えれば寝る間も心配だろう。恋はうれしい、嬉しい恋が積もれば、恋をせぬ昔がかえって恋しかろ。閣僚の肩は数百万人の足を支えている。背中には重い天下がおぶさっている。うまい物も食わねば惜しい。少し食えば飽き足らぬ。存分食えばあとが不愉快だ。……
20 |           　余の考がここまで漂流して来た時に、余の右足は突然坐りのわるい角石の端を踏み損くなった。平衡を保つために、すわやと前に飛び出した左足が、仕損じの埋め合せをすると共に、余の腰は具合よく方三尺ほどな岩の上に卸りた。肩にかけた絵の具箱が腋の下から躍り出しただけで、幸いと何の事もなかった。
21 |           　立ち上がる時に向うを見ると、路から左の方にバケツを伏せたような峰が聳えている。杉か檜か分からないが根元から頂きまでことごとく蒼黒い中に、山桜が薄赤くだんだらに棚引いて、続ぎ目が確と見えぬくらい靄が濃い。少し手前に禿山が一つ、群をぬきんでて眉に逼る。禿げた側面は巨人の斧で削り去ったか、鋭どき平面をやけに谷の底に埋めている。天辺に一本見えるのは赤松だろう。枝の間の空さえ判然している。行く手は二丁ほどで切れているが、高い所から赤い毛布が動いて来るのを見ると、登ればあすこへ出るのだろう。路はすこぶる難義だ。
22 |           　土をならすだけならさほど手間も入るまいが、土の中には大きな石がある。土は平らにしても石は平らにならぬ。石は切り砕いても、岩は始末がつかぬ。掘崩した土の上に悠然と峙って、吾らのために道を譲る景色はない。向うで聞かぬ上は乗り越すか、廻らなければならん。巌のない所でさえ歩るきよくはない。左右が高くって、中心が窪んで、まるで一間幅を三角に穿って、その頂点が真中を貫いていると評してもよい。路を行くと云わんより川底を渉ると云う方が適当だ。固より急ぐ旅でないから、ぶらぶらと七曲りへかかる。
23 |           　たちまち足の下で雲雀の声がし出した。谷を見下したが、どこで鳴いてるか影も形も見えぬ。ただ声だけが明らかに聞える。せっせと忙しく、絶間なく鳴いている。方幾里の空気が一面に蚤に刺されていたたまれないような気がする。あの鳥の鳴く音には瞬時の余裕もない。のどかな春の日を鳴き尽くし、鳴きあかし、また鳴き暮らさなければ気が済まんと見える。その上どこまでも登って行く、いつまでも登って行く。雲雀はきっと雲の中で死ぬに相違ない。登り詰めた揚句は、流れて雲に入って、漂うているうちに形は消えてなくなって、ただ声だけが空の裡に残るのかも知れない。
24 |           　巌角を鋭どく廻って、按摩なら真逆様に落つるところを、際どく右へ切れて、横に見下すと、菜の花が一面に見える。雲雀はあすこへ落ちるのかと思った。いいや、あの黄金の原から飛び上がってくるのかと思った。次には落ちる雲雀と、上る雲雀が十文字にすれ違うのかと思った。最後に、落ちる時も、上る時も、また十文字に擦れ違うときにも元気よく鳴きつづけるだろうと思った。
25 |           　春は眠くなる。猫は鼠を捕る事を忘れ、人間は借金のある事を忘れる。時には自分の魂の居所さえ忘れて正体なくなる。ただ菜の花を遠く望んだときに眼が醒める。雲雀の声を聞いたときに魂のありかが判然する。雲雀の鳴くのは口で鳴くのではない、魂全体が鳴くのだ。魂の活動が声にあらわれたもののうちで、あれほど元気のあるものはない。ああ愉快だ。こう思って、こう愉快になるのが詩である。
26 |           　たちまちシェレーの雲雀の詩を思い出して、口のうちで覚えたところだけ暗誦して見たが、覚えているところは二三句しかなかった。その二三句のなかにこんなのがある。
27 |           　　We look before and after
28 |           　　　　And pine for what is not:
29 |           　　Our sincerest laughter
30 |           　　　　With some pain is fraught;
31 |           Our sweetest songs are those that tell of saddest thought.
32 |           「前をみては、後えを見ては、物欲しと、あこがるるかなわれ。腹からの、笑といえど、苦しみの、そこにあるべし。うつくしき、極みの歌に、悲しさの、極みの想、籠るとぞ知れ」
33 |           　なるほどいくら詩人が幸福でも、あの雲雀のように思い切って、一心不乱に、前後を忘却して、わが喜びを歌う訳には行くまい。西洋の詩は無論の事、支那の詩にも、よく万斛の愁などと云う字がある。詩人だから万斛で素人なら一合で済むかも知れぬ。して見ると詩人は常の人よりも苦労性で、凡骨の倍以上に神経が鋭敏なのかも知れん。超俗の喜びもあろうが、無量の悲も多かろう。そんならば詩人になるのも考え物だ。
34 |           　しばらくは路が平で、右は雑木山、左は菜の花の見つづけである。足の下に時々蒲公英を踏みつける。鋸のような葉が遠慮なく四方へのして真中に黄色な珠を擁護している。菜の花に気をとられて、踏みつけたあとで、気の毒な事をしたと、振り向いて見ると、黄色な珠は依然として鋸のなかに鎮座している。呑気なものだ。また考えをつづける。
35 |           　詩人に憂はつきものかも知れないが、あの雲雀を聞く心持になれば微塵の苦もない。菜の花を見ても、ただうれしくて胸が躍るばかりだ。蒲公英もその通り、桜も――桜はいつか見えなくなった。こう山の中へ来て自然の景物に接すれば、見るものも聞くものも面白い。面白いだけで別段の苦しみも起らぬ。起るとすれば足が草臥れて、旨いものが食べられぬくらいの事だろう。
36 |           　しかし苦しみのないのはなぜだろう。ただこの景色を一幅の画として観、一巻の詩として読むからである。画であり詩である以上は地面を貰って、開拓する気にもならねば、鉄道をかけて一儲けする了見も起らぬ。ただこの景色が――腹の足しにもならぬ、月給の補いにもならぬこの景色が景色としてのみ、余が心を楽ませつつあるから苦労も心配も伴わぬのだろう。自然の力はここにおいて尊とい。吾人の性情を瞬刻に陶冶して醇乎として醇なる詩境に入らしむるのは自然である。
37 |           　恋はうつくしかろ、孝もうつくしかろ、忠君愛国も結構だろう。しかし自身がその局に当れば利害の旋風に捲き込まれて、うつくしき事にも、結構な事にも、目は眩んでしまう。したがってどこに詩があるか自身には解しかねる。
38 |           　これがわかるためには、わかるだけの余裕のある第三者の地位に立たねばならぬ。三者の地位に立てばこそ芝居は観て面白い。小説も見て面白い。芝居を見て面白い人も、小説を読んで面白い人も、自己の利害は棚へ上げている。見たり読んだりする間だけは詩人である。
39 |           　それすら、普通の芝居や小説では人情を免かれぬ。苦しんだり、怒ったり、騒いだり、泣いたりする。見るものもいつかその中に同化して苦しんだり、怒ったり、騒いだり、泣いたりする。取柄は利慾が交らぬと云う点に存するかも知れぬが、交らぬだけにその他の情緒は常よりは余計に活動するだろう。それが嫌だ。
40 |           　苦しんだり、怒ったり、騒いだり、泣いたりは人の世につきものだ。余も三十年の間それを仕通して、飽々した。飽き飽きした上に芝居や小説で同じ刺激を繰り返しては大変だ。余が欲する詩はそんな世間的の人情を鼓舞するようなものではない。俗念を放棄して、しばらくでも塵界を離れた心持ちになれる詩である。いくら傑作でも人情を離れた芝居はない、理非を絶した小説は少かろう。どこまでも世間を出る事が出来ぬのが彼らの特色である。ことに西洋の詩になると、人事が根本になるからいわゆる詩歌の純粋なるものもこの境を解脱する事を知らぬ。どこまでも同情だとか、愛だとか、正義だとか、自由だとか、浮世の勧工場にあるものだけで用を弁じている。いくら詩的になっても地面の上を馳けてあるいて、銭の勘定を忘れるひまがない。シェレーが雲雀を聞いて嘆息したのも無理はない。
41 |           　うれしい事に東洋の詩歌はそこを解脱したのがある。採菊東籬下、悠然見南山。ただそれぎりの裏に暑苦しい世の中をまるで忘れた光景が出てくる。垣の向うに隣りの娘が覗いてる訳でもなければ、南山に親友が奉職している次第でもない。超然と出世間的に利害損得の汗を流し去った心持ちになれる。独坐幽篁裏、弾琴復長嘯、深林人不知、明月来相照。ただ二十字のうちに優に別乾坤を建立している。この乾坤の功徳は「不如帰」や「金色夜叉」の功徳ではない。汽船、汽車、権利、義務、道徳、礼義で疲れ果てた後に、すべてを忘却してぐっすり寝込むような功徳である。
42 |           　二十世紀に睡眠が必要ならば、二十世紀にこの出世間的の詩味は大切である。惜しい事に今の詩を作る人も、詩を読む人もみんな、西洋人にかぶれているから、わざわざ呑気な扁舟を泛べてこの桃源に溯るものはないようだ。余は固より詩人を職業にしておらんから、王維や淵明の境界を今の世に布教して広げようと云う心掛も何もない。ただ自分にはこう云う感興が演芸会よりも舞踏会よりも薬になるように思われる。ファウストよりも、ハムレットよりもありがたく考えられる。こうやって、ただ一人絵の具箱と三脚几を担いで春の山路をのそのそあるくのも全くこれがためである。淵明、王維の詩境を直接に自然から吸収して、すこしの間でも非人情の天地に逍遥したいからの願。一つの酔興だ。
43 |           　もちろん人間の一分子だから、いくら好きでも、非人情はそう長く続く訳には行かぬ。淵明だって年が年中南山を見詰めていたのでもあるまいし、王維も好んで竹藪の中に蚊帳を釣らずに寝た男でもなかろう。やはり余った菊は花屋へ売りこかして、生えた筍は八百屋へ払い下げたものと思う。こう云う余もその通り。いくら雲雀と菜の花が気に入ったって、山のなかへ野宿するほど非人情が募ってはおらん。こんな所でも人間に逢う。じんじん端折りの頬冠りや、赤い腰巻の姉さんや、時には人間より顔の長い馬にまで逢う。百万本の檜に取り囲まれて、海面を抜く何百尺かの空気を呑んだり吐いたりしても、人の臭いはなかなか取れない。それどころか、山を越えて落ちつく先の、今宵の宿は那古井の温泉場だ。
44 |           　ただ、物は見様でどうでもなる。レオナルド・ダ・ヴィンチが弟子に告げた言に、あの鐘の音を聞け、鐘は一つだが、音はどうとも聞かれるとある。一人の男、一人の女も見様次第でいかようとも見立てがつく。どうせ非人情をしに出掛けた旅だから、そのつもりで人間を見たら、浮世小路の何軒目に狭苦しく暮した時とは違うだろう。よし全く人情を離れる事が出来んでも、せめて御能拝見の時くらいは淡い心持ちにはなれそうなものだ。能にも人情はある。七騎落でも、墨田川でも泣かぬとは保証が出来ん。しかしあれは情三分芸七分で見せるわざだ。我らが能から享けるありがた味は下界の人情をよくそのままに写す手際から出てくるのではない。そのままの上へ芸術という着物を何枚も着せて、世の中にあるまじき悠長な振舞をするからである。
45 |           　しばらくこの旅中に起る出来事と、旅中に出逢う人間を能の仕組と能役者の所作に見立てたらどうだろう。まるで人情を棄てる訳には行くまいが、根が詩的に出来た旅だから、非人情のやりついでに、なるべく節倹してそこまでは漕ぎつけたいものだ。南山や幽篁とは性の違ったものに相違ないし、また雲雀や菜の花といっしょにする事も出来まいが、なるべくこれに近づけて、近づけ得る限りは同じ観察点から人間を視てみたい。芭蕉と云う男は枕元へ馬が尿するのをさえ雅な事と見立てて発句にした。余もこれから逢う人物を――百姓も、町人も、村役場の書記も、爺さんも婆さんも――ことごとく大自然の点景として描き出されたものと仮定して取こなして見よう。もっとも画中の人物と違って、彼らはおのがじし勝手な真似をするだろう。しかし普通の小説家のようにその勝手な真似の根本を探ぐって、心理作用に立ち入ったり、人事葛藤の詮議立てをしては俗になる。動いても構わない。画中の人間が動くと見れば差し支ない。画中の人物はどう動いても平面以外に出られるものではない。平面以外に飛び出して、立方的に働くと思えばこそ、こっちと衝突したり、利害の交渉が起ったりして面倒になる。面倒になればなるほど美的に見ている訳に行かなくなる。これから逢う人間には超然と遠き上から見物する気で、人情の電気がむやみに双方で起らないようにする。そうすれば相手がいくら働いても、こちらの懐には容易に飛び込めない訳だから、つまりは画の前へ立って、画中の人物が画面の中をあちらこちらと騒ぎ廻るのを見るのと同じ訳になる。間三尺も隔てていれば落ちついて見られる。あぶな気なしに見られる。言を換えて云えば、利害に気を奪われないから、全力を挙げて彼らの動作を芸術の方面から観察する事が出来る。余念もなく美か美でないかと鑒識する事が出来る。
46 |           　ここまで決心をした時、空があやしくなって来た。煮え切れない雲が、頭の上へ靠垂れ懸っていたと思ったが、いつのまにか、崩れ出して、四方はただ雲の海かと怪しまれる中から、しとしとと春の雨が降り出した。菜の花は疾くに通り過して、今は山と山の間を行くのだが、雨の糸が濃かでほとんど霧を欺くくらいだから、隔たりはどれほどかわからぬ。時々風が来て、高い雲を吹き払うとき、薄黒い山の背が右手に見える事がある。何でも谷一つ隔てて向うが脈の走っている所らしい。左はすぐ山の裾と見える。深く罩める雨の奥から松らしいものが、ちょくちょく顔を出す。出すかと思うと、隠れる。雨が動くのか、木が動くのか、夢が動くのか、何となく不思議な心持ちだ。
47 |           　路は存外広くなって、かつ平だから、あるくに骨は折れんが、雨具の用意がないので急ぐ。帽子から雨垂れがぽたりぽたりと落つる頃、五六間先きから、鈴の音がして、黒い中から、馬子がふうとあらわれた。
48 |           「ここらに休む所はないかね」
49 |           「もう十五丁行くと茶屋がありますよ。だいぶ濡れたね」
50 |           　まだ十五丁かと、振り向いているうちに、馬子の姿は影画のように雨につつまれて、またふうと消えた。
51 |           　糠のように見えた粒は次第に太く長くなって、今は一筋ごとに風に捲かれる様までが目に入る。羽織はとくに濡れ尽して肌着に浸み込んだ水が、身体の温度で生暖く感ぜられる。気持がわるいから、帽を傾けて、すたすた歩行く。
52 |           　茫々たる薄墨色の世界を、幾条の銀箭が斜めに走るなかを、ひたぶるに濡れて行くわれを、われならぬ人の姿と思えば、詩にもなる、句にも咏まれる。有体なる己れを忘れ尽して純客観に眼をつくる時、始めてわれは画中の人物として、自然の景物と美しき調和を保つ。ただ降る雨の心苦しくて、踏む足の疲れたるを気に掛ける瞬間に、われはすでに詩中の人にもあらず、画裡の人にもあらず。依然として市井の一豎子に過ぎぬ。雲煙飛動の趣も眼に入らぬ。落花啼鳥の情けも心に浮ばぬ。蕭々として独り春山を行く吾の、いかに美しきかはなおさらに解せぬ。初めは帽を傾けて歩行た。後にはただ足の甲のみを見詰めてあるいた。終りには肩をすぼめて、恐る恐る歩行た。雨は満目の樹梢を揺かして四方より孤客に逼る。非人情がちと強過ぎたようだ。
53 |         </div>
54 |       </article>
55 |     </div>
56 | 
57 |     <div>
58 |       <a href="https://www.aozora.gr.jp/cards/000148/card776.html">青空文庫 - 図書カード：No.776</a>
59 |     </div>
60 | </html>
61 | 


--------------------------------------------------------------------------------
/tests/test_article_only.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import time
  3 | import unittest
  4 | 
  5 | from readability import Document
  6 | from functools import wraps
  7 | 
  8 | 
  9 | class TimeoutException(Exception):
 10 |     """Exception raised when a function exceeds its time limit."""
 11 |     pass
 12 | 
 13 | 
 14 | def timeout(seconds):
 15 |     """Decorator to enforce a timeout on function execution."""
 16 |     def decorator(func):
 17 |         @wraps(func)
 18 |         def wrapper(*args, **kwargs):
 19 |             start_time = time.perf_counter()
 20 |             result = func(*args, **kwargs)
 21 |             end_time = time.perf_counter()
 22 |             elapsed_time = end_time - start_time
 23 |             if elapsed_time > seconds:
 24 |                 raise TimeoutException(
 25 |                     f"Function '{func.__name__}' exceeded time limit of {seconds} seconds "
 26 |                     f"with an execution time of {elapsed_time:.4f} seconds"
 27 |                 )
 28 |             return result
 29 |         return wrapper
 30 |     return decorator
 31 | 
 32 | 
 33 | SAMPLES = os.path.join(os.path.dirname(__file__), "samples")
 34 | 
 35 | 
 36 | def load_sample(filename):
 37 |     """Helper to get the content out of the sample files"""
 38 |     with open(os.path.join(SAMPLES, filename)) as f:
 39 |         html = f.read()
 40 |     return html
 41 | 
 42 | 
 43 | class TestArticleOnly(unittest.TestCase):
 44 |     """The option to not get back a full html doc should work
 45 | 
 46 |     Given a full html document, the call can request just divs of processed
 47 |     content. In this way the developer can then wrap the article however they
 48 |     want in their own view or application.
 49 | 
 50 |     """
 51 | 
 52 |     def test_si_sample(self):
 53 |         """Using the si sample, load article with only opening body element"""
 54 |         sample = load_sample("si-game.sample.html")
 55 |         doc = Document(
 56 |             sample,
 57 |             url="http://sportsillustrated.cnn.com/baseball/mlb/gameflash/2012/04/16/40630_preview.html",
 58 |         )
 59 |         res = doc.summary()
 60 |         self.assertEqual("<html><body><div><div class", res[0:27])
 61 | 
 62 |     def test_si_sample_html_partial(self):
 63 |         """Using the si sample, make sure we can get the article alone."""
 64 |         sample = load_sample("si-game.sample.html")
 65 |         doc = Document(
 66 |             sample,
 67 |             url="http://sportsillustrated.cnn.com/baseball/mlb/gameflash/2012/04/16/40630_preview.html",
 68 |         )
 69 |         res = doc.summary(html_partial=True)
 70 |         self.assertEqual('<div><div class="', res[0:17])
 71 | 
 72 |     def test_too_many_images_sample_html_partial(self):
 73 |         """Using the too-many-images sample, make sure we still get the article."""
 74 |         sample = load_sample("too-many-images.sample.html")
 75 |         doc = Document(sample)
 76 |         res = doc.summary(html_partial=True)
 77 |         self.assertEqual('<div><div class="post-body', res[0:26])
 78 | 
 79 |     def test_wrong_link_issue_49(self):
 80 |         """We shouldn't break on bad HTML."""
 81 |         sample = load_sample("the-hurricane-rubin-carter-denzel-washington.html")
 82 |         doc = Document(sample)
 83 |         res = doc.summary(html_partial=True)
 84 |         self.assertEqual('<div><div class="content__article-body ', res[0:39])
 85 | 
 86 |     def test_best_elem_is_root_and_passing(self):
 87 |         sample = (
 88 |             '<html class="article" id="body">'
 89 |             "   <body>"
 90 |             "       <p>1234567890123456789012345</p>"
 91 |             "   </body>"
 92 |             "</html>"
 93 |         )
 94 |         doc = Document(sample)
 95 |         doc.summary()
 96 | 
 97 |     def test_correct_cleanup(self):
 98 |         sample = """
 99 |         <html>
100 |             <body>
101 |                 <section>test section</section>
102 |                 <article class="">
103 | <p>Lot of text here.</p>
104 |                 <div id="advertisement"><a href="link">Ad</a></div>
105 | <p>More text is written here, and contains punctuation and dots.</p>
106 | </article>
107 |                 <aside id="comment1"/>
108 |                 <div id="comment2">
109 |                     <a href="asd">spam</a>
110 |                     <a href="asd">spam</a>
111 |                     <a href="asd">spam</a>
112 |                 </div>
113 |                 <div id="comment3"/>
114 |                 <aside id="comment4">A small comment.</aside>
115 |                 <div id="comment5"><p>The comment is also helpful, but it's
116 |                     still not the correct item to be extracted.</p>
117 |                     <p>It's even longer than the article itself!"</p></div>
118 |             </body>
119 |         </html>
120 |         """
121 |         doc = Document(sample)
122 |         s = doc.summary()
123 |         # print(s)
124 |         assert "punctuation" in s
125 |         assert not "comment" in s
126 |         assert not "aside" in s
127 | 
128 |     # Many spaces make some regexes run forever
129 |     @timeout(3)
130 |     def test_many_repeated_spaces(self):
131 |         long_space = " " * 1000000
132 |         sample = "<html><body><p>foo" + long_space + "</p></body></html>"
133 | 
134 |         doc = Document(sample)
135 |         s = doc.summary()
136 | 
137 |         assert "foo" in s
138 | 
139 |     def test_not_self_closing(self):
140 |         sample = '<h2><a href="#"></a>foobar</h2>'
141 |         doc = Document(sample)
142 |         assert (
143 |             '<body id="readabilityBody"><h2><a href="#"></a>foobar</h2></body>'
144 |             == doc.summary()
145 |         )
146 | 
147 |     def test_utf8_kanji(self):
148 |         """Using the UTF-8 kanji sample, load article which is written in kanji"""
149 |         sample = load_sample("utf-8-kanji.sample.html")
150 |         doc = Document(sample)
151 |         res = doc.summary()
152 |         assert 0 < len(res) < 10000
153 | 
154 |     def test_author_present(self):
155 |         sample = load_sample("the-hurricane-rubin-carter-denzel-washington.html")
156 |         doc = Document(sample)
157 |         assert 'Alex von Tunzelmann' == doc.author()
158 | 
159 |     def test_author_absent(self):
160 |         sample = load_sample("si-game.sample.html")
161 |         doc = Document(sample)
162 |         assert '[no-author]' == doc.author()
163 | 
164 |     def test_keep_images_present(self):
165 |         sample = load_sample("summary-keep-all-images.sample.html")
166 | 
167 |         doc = Document(sample)
168 | 
169 |         assert "<img" in doc.summary(keep_all_images=True)
170 | 
171 |     def test_keep_images_absent(self):
172 |         sample = load_sample("summary-keep-all-images.sample.html")
173 | 
174 |         doc = Document(sample)
175 | 
176 |         assert "<img" not in doc.summary(keep_all_images=False)
177 | 
178 |     def test_keep_images_absent_by_defautl(self):
179 |         sample = load_sample("summary-keep-all-images.sample.html")
180 | 
181 |         doc = Document(sample)
182 | 
183 |         assert "<img" not in doc.summary()
184 | 
185 |     def test_cjk_summary(self):
186 |         """Check we can extract CJK text correctly."""
187 |         html = """
188 |         <html>
189 |             <head>
190 |                 <title>这是标题</title>
191 |             </head>
192 |             <body>
193 |                 <div>一些无关紧要的内容</div>
194 |                 <div class="article-content">
195 |                     <h1>主要文章标题</h1>
196 |                     <p>这是主要内容的第一段。</p>
197 |                     <p>これはコンテンツの第2段落です。</p>
198 |                     <p>이것은 콘텐츠의 세 번째 단락입니다.</p>
199 |                     <p>This is the fourth paragraph.</p>
200 |                 </div>
201 |                 <div>More irrelevant stuff</div>
202 |             </body>
203 |         </html>
204 |         """
205 |         doc = Document(html)
206 |         summary = doc.summary()
207 |         # Check that the main CJK content is present in the summary
208 |         self.assertTrue("这是主要内容的第一段" in summary)
209 |         self.assertTrue("これはコンテンツの第2段落です" in summary)
210 |         self.assertTrue("이것은 콘텐츠의 세 번째 단락입니다" in summary)
211 |         # Check that irrelevant content is mostly gone
212 |         self.assertFalse("一些无关紧要的内容" in summary)
213 | 
214 |     def test_shorten_title_delimiter_bug(self):
215 |         """Test that shorten_title handles delimiters correctly when the last part is valid.
216 | 
217 |         This specifically targets a potential bug where 'p1' might be used instead of 'pl'.
218 |         """
219 |         html = """
220 |         <html>
221 |             <head>
222 |                 <title>Short Part | これは長いです</title>
223 |             </head>
224 |             <body>
225 |                 <div>Content</div>
226 |             </body>
227 |         </html>
228 |         """
229 |         doc = Document(html)
230 |         # With the bug, this call might raise NameError: name 'p1' is not defined
231 |         # With the fix, it should correctly return the last part.
232 |         short_title = doc.short_title()
233 |         self.assertEqual(short_title, "これは長いです")
234 | 


--------------------------------------------------------------------------------
/tox.ini:
--------------------------------------------------------------------------------
 1 | # Tox (http://tox.testrun.org/) is a tool for running tests
 2 | # in multiple virtualenvs. This configuration file will run the
 3 | # test suite on all supported python versions. To use it, "pip install tox"
 4 | # and then run "tox" from this directory.
 5 | 
 6 | [tox]
 7 | envlist =
 8 |     py{38,39,310,311,312,313,py3}, doc
 9 | skip_missing_interpreters =
10 |     True
11 | 
12 | [testenv]
13 | deps =
14 |     pytest
15 |     doc: sphinx
16 |     doc: sphinx_rtd_theme
17 |     doc: myst-parser
18 | 
19 | # This creates the virtual envs with --site-packages so already packages
20 | # that are already installed will be reused. This is especially useful on
21 | # Windows. Since we use lxml instead of compiling it locally (which in turn
22 | # requires a Compiler and the build dependencies), you can download
23 | # it from http://www.lfd.uci.edu/~gohlke/pythonlibs/#lxml and install it via
24 | # $PYTHONDIR\Scripts\pip.exe install *.whl
25 | sitepackages=
26 |     True
27 | commands =
28 |     pip install -r requirements.txt -e ".[test]"
29 |     py.test
30 | 
31 | [testenv:doc]
32 | commands =
33 |     sphinx-build -b html doc/source/ build/
34 | 


--------------------------------------------------------------------------------