├── .github
    └── workflows
    │   ├── lint.yml
    │   └── test.yml
├── .gitignore
├── .pylintrc
├── CHANGELOG.md
├── LICENSE
├── MANIFEST.in
├── Makefile
├── README.md
├── benchmarks
    ├── Dockerfile
    └── README.md
├── make_release.py
├── readabilipy
    ├── __init__.py
    ├── __main__.py
    ├── __version__.py
    ├── extractors
    │   ├── __init__.py
    │   ├── extract_date.py
    │   ├── extract_element.py
    │   └── extract_title.py
    ├── javascript
    │   ├── ExtractArticle.js
    │   └── package.json
    ├── simple_json.py
    ├── simple_tree.py
    ├── simplifiers
    │   ├── __init__.py
    │   ├── html.py
    │   └── text.py
    └── utils.py
├── setup.py
└── tests
    ├── checks.py
    ├── data
        ├── addictinginfo.com-1_full_article.html
        ├── addictinginfo.com-1_full_page.html
        ├── addictinginfo.com-1_full_page_javascript.json
        ├── addictinginfo.com-1_plain_text_paragraphs_from_simple_article.json
        ├── addictinginfo.com-1_simple_article_from_full_article.json
        ├── addictinginfo.com-1_simple_article_from_full_page.json
        ├── addictinginfo.com-1_simple_article_from_full_page_content_digest.json
        ├── addictinginfo.com-1_simple_article_from_full_page_content_digest_node_indexes.json
        ├── addictinginfo.com-1_simple_article_from_full_page_node_indexes.json
        ├── benchmarkinghuge.html
        ├── conservativehq.com-1_full_page.html
        ├── conservativehq.com-1_simple_article_from_full_page.json
        ├── davidwolfe.com-1_full_page.html
        ├── davidwolfe.com-1_simple_article_from_full_page.json
        ├── list_items_full_page.html
        ├── list_items_plain_text_paragraph_node_indexes.json
        ├── list_items_simple_article_from_full_page.json
        ├── list_items_simple_article_from_full_page_content_digests.json
        ├── list_items_simple_article_from_full_page_node_indexes.json
        ├── non_article_full_page.html
        ├── non_article_full_page.json
        ├── plain-content-test_full_article.html
        └── plain-content-test_full_article_javascript.json
    ├── test_article_extraction.py
    ├── test_benchmarking.py
    ├── test_date_functions.py
    ├── test_extract_element.py
    ├── test_html_elements.py
    ├── test_javascript.py
    ├── test_normal_html.py
    ├── test_simple_json.py
    ├── test_simple_tree.py
    ├── test_simplifiers_html.py
    ├── test_simplifiers_text.py
    ├── test_title_functions.py
    └── test_weird_html.py


/.github/workflows/lint.yml:
--------------------------------------------------------------------------------
 1 | name: ReadabiliPy CI Linting  # runs pyflakes, pycodestyle and pylint for every Python version in the matrix
 2 | 
 3 | on:
 4 |   push:
 5 |     branches: [ master ]  # pushes are linted only on master
 6 |   pull_request:  # no branch filter is given for pull requests
 7 | 
 8 | jobs:
 9 |   build:
10 | 
11 |     runs-on: "ubuntu-20.04"  # NOTE(review): confirm GitHub still offers this hosted runner image
12 |     strategy:
13 |       matrix:
14 |         python-version: ["3.6", "3.7", "3.8", "3.9", "3.10", "3.11", "3.12"]  # quoted so YAML keeps "3.10" a string instead of the float 3.1
15 | 
16 |     steps:
17 |       - uses: actions/checkout@v4
18 |       - name: Set up Python ${{ matrix.python-version }}
19 |         uses: actions/setup-python@v4
20 |         with:
21 |           python-version: ${{ matrix.python-version }}
22 | 
23 |       - name: Install dependencies
24 |         run: |
25 |           python -m pip install --upgrade pip
26 |           pip install -e ".[test]"
27 | 
28 |       - name: Lint with pyflakes
29 |         run: |
30 |           pyflakes *.py readabilipy tests
31 | 
32 |       - name: check PEP8
33 |         run: |
34 |           pycodestyle --statistics --ignore=E501 --count *.py readabilipy tests
35 | 
36 |       - name: Run pylint for stricter error checking
37 |         run: |
38 |           pylint readabilipy
39 |           pylint ./tests/*.py
40 | 


--------------------------------------------------------------------------------
/.github/workflows/test.yml:
--------------------------------------------------------------------------------
 1 | name: ReadabiliPy CI  # runs the pytest suite with coverage; benchmarks are disabled
 2 | 
 3 | on:
 4 |   push:
 5 |     branches: [ master ]  # pushes are tested only on master
 6 |   pull_request:  # no branch filter is given for pull requests
 7 | 
 8 | jobs:
 9 |   build:
10 | 
11 |     runs-on: "ubuntu-20.04"  # NOTE(review): confirm GitHub still offers this hosted runner image
12 |     strategy:
13 |       matrix:  # two axes: every Python version is paired with every Node.js version
14 |         python-version: ["3.6", "3.7", "3.8", "3.9", "3.10", "3.11", "3.12"]  # quoted so YAML keeps "3.10" a string instead of the float 3.1
15 |         node-version: [18.x, 20.x, 22.x]  # the ".x" suffix keeps these plain scalars strings
16 | 
17 |     steps:
18 |       - uses: actions/checkout@v4
19 |       - name: Set up Python ${{ matrix.python-version }}
20 |         uses: actions/setup-python@v4
21 |         with:
22 |           python-version: ${{ matrix.python-version }}
23 | 
24 |       - name: Set up Node.js ${{ matrix.node-version }}
25 |         uses: actions/setup-node@v4
26 |         with:
27 |           node-version: ${{ matrix.node-version }}
28 | 
29 |       - name: Install dependencies
30 |         run: |
31 |           python -m pip install --upgrade pip
32 |           pip install -e ".[test]"
33 | 
34 |       - name: Test with pytest
35 |         run: |
36 |           pytest -v . --cov readabilipy --cov-report term-missing --benchmark-disable
37 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
  1 | # PyCharm IDE stuff
  2 | .idea/
  3 | 
  4 | # OSX temporary files
  5 | .DS_Store
  6 | 
  7 | # ===== PYTHON STUFF ====
  8 | # Byte-compiled / optimized / DLL files
  9 | __pycache__/
 10 | *.py[cod]
 11 | *$py.class
 12 | 
 13 | # C extensions
 14 | *.so
 15 | 
 16 | # Distribution / packaging
 17 | .Python
 18 | build/
 19 | develop-eggs/
 20 | dist/
 21 | downloads/
 22 | eggs/
 23 | .eggs/
 24 | lib/
 25 | lib64/
 26 | parts/
 27 | sdist/
 28 | var/
 29 | wheels/
 30 | *.egg-info/
 31 | .installed.cfg
 32 | *.egg
 33 | MANIFEST
 34 | 
 35 | # PyInstaller
 36 | #  Usually these files are written by a python script from a template
 37 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 38 | *.manifest
 39 | *.spec
 40 | 
 41 | # Installer logs
 42 | pip-log.txt
 43 | pip-delete-this-directory.txt
 44 | 
 45 | # Unit test / coverage reports
 46 | htmlcov/
 47 | .tox/
 48 | .nox/
 49 | .coverage
 50 | .coverage.*
 51 | .cache
 52 | nosetests.xml
 53 | coverage.xml
 54 | *.cover
 55 | .hypothesis/
 56 | .pytest_cache/
 57 | 
 58 | # Translations
 59 | *.mo
 60 | *.pot
 61 | 
 62 | # Django stuff:
 63 | *.log
 64 | local_settings.py
 65 | db.sqlite3
 66 | 
 67 | # Flask stuff:
 68 | instance/
 69 | .webassets-cache
 70 | 
 71 | # Scrapy stuff:
 72 | .scrapy
 73 | 
 74 | # Sphinx documentation
 75 | docs/_build/
 76 | 
 77 | # PyBuilder
 78 | target/
 79 | 
 80 | # Jupyter Notebook
 81 | .ipynb_checkpoints
 82 | 
 83 | # IPython
 84 | profile_default/
 85 | ipython_config.py
 86 | 
 87 | # pyenv
 88 | .python-version
 89 | 
 90 | # celery beat schedule file
 91 | celerybeat-schedule
 92 | 
 93 | # SageMath parsed files
 94 | *.sage.py
 95 | 
 96 | # Environments
 97 | .env
 98 | .venv
 99 | env/
100 | venv/
101 | ENV/
102 | env.bak/
103 | venv.bak/
104 | 
105 | # Spyder project settings
106 | .spyderproject
107 | .spyproject
108 | 
109 | # Rope project settings
110 | .ropeproject
111 | 
112 | # mkdocs documentation
113 | /site
114 | 
115 | # mypy
116 | .mypy_cache/
117 | .dmypy.json
118 | dmypy.json
119 | 
120 | # Pyre type checker
121 | .pyre/
122 | 
123 | 
124 | # ===== NODE STUFF =====
125 | # Logs
126 | logs
127 | *.log
128 | npm-debug.log*
129 | yarn-debug.log*
130 | yarn-error.log*
131 | 
132 | # Runtime data
133 | pids
134 | *.pid
135 | *.seed
136 | *.pid.lock
137 | 
138 | # Directory for instrumented libs generated by jscoverage/JSCover
139 | lib-cov
140 | 
141 | # Coverage directory used by tools like istanbul
142 | coverage
143 | 
144 | # nyc test coverage
145 | .nyc_output
146 | 
147 | # Grunt intermediate storage (http://gruntjs.com/creating-plugins#storing-task-files)
148 | .grunt
149 | 
150 | # Bower dependency directory (https://bower.io/)
151 | bower_components
152 | 
153 | # node-waf configuration
154 | .lock-wscript
155 | 
156 | # Compiled binary addons (https://nodejs.org/api/addons.html)
157 | build/Release
158 | 
159 | # Dependency directories
160 | node_modules/
161 | jspm_packages/
162 | 
163 | # TypeScript v1 declaration files
164 | typings/
165 | 
166 | # Optional npm cache directory
167 | .npm
168 | 
169 | # Optional eslint cache
170 | .eslintcache
171 | 
172 | # Optional REPL history
173 | .node_repl_history
174 | 
175 | # Output of 'npm pack'
176 | *.tgz
177 | 
178 | # Yarn Integrity file
179 | .yarn-integrity
180 | 
181 | # dotenv environment variables file
182 | .env
183 | 
184 | # next.js build output
185 | .next
186 | 
187 | # package-lock
188 | package-lock.json
189 | 


--------------------------------------------------------------------------------
/CHANGELOG.md:
--------------------------------------------------------------------------------
 1 | # Change Log
 2 | 
 3 | ## Version 0.3.0
 4 | * Fixed some bugs. Updated supported Python versions.
 5 | 
 6 | ## Version 0.2.0
 7 | * Restructured project ready for initial PyPI upload.
 8 | 
 9 | ## Version 0.1.0
10 | * Final version used by Misinformation project.
11 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2018 The Alan Turing Institute
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include readabilipy/javascript/*.js
2 | include readabilipy/javascript/package.json
3 | 


--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
  1 | # Makefile for easier installation and cleanup.
  2 | #
  3 | # Uses self-documenting macros from here:
  4 | # http://marmelab.com/blog/2016/02/29/auto-documented-makefile.html
  5 | 
  6 | SHELL := bash
  7 | .SHELLFLAGS := -eu -o pipefail -c
  8 | MAKEFLAGS += --warn-undefined-variables --no-builtin-rules
  9 | 
 10 | PACKAGE=readabilipy
 11 | DOC_DIR=./docs
 12 | VENV_DIR=/tmp/rdpy_venv
 13 | TEST_DIR=./tests
 14 | 
 15 | .PHONY: help
 16 | 
 17 | .DEFAULT_GOAL := help
 18 | 
 19 | # Display a help message when called without target, using the ## comments
 20 | help:
 21 | 	@grep -E '^[0-9a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) |\
 22 | 		 awk 'BEGIN {FS = ":.*?## "}; {printf "\033[36m%-15s\033[0m\
 23 | 		 %s\n", $$1, $$2}'
 24 | 
 25 | ################
 26 | # Installation #
 27 | ################
 28 | 
 29 | .PHONY: install
 30 | 
 31 | install: ## Install for the current user using the default python command
 32 | 	python setup.py build_ext --inplace
 33 | 	python setup.py install --user
 34 | 
 35 | 
 36 | ################
 37 | # Distribution #
 38 | ################
 39 | 
 40 | .PHONY: release dist
 41 | 
 42 | release: ## Make a release
 43 | 	python make_release.py
 44 | 
 45 | dist: ## Make Python source distribution
 46 | 	python setup.py sdist bdist_wheel
 47 | 
 48 | 
 49 | ###########
 50 | # Testing #
 51 | ###########
 52 | 
 53 | .PHONY: test
 54 | 
 55 | test: venv ## Run unit tests
 56 | 	source $(VENV_DIR)/bin/activate && cd $(TEST_DIR) && python -m pytest -v . --cov readabilipy --cov-report term-missing --benchmark-disable
 57 | 	source $(VENV_DIR)/bin/activate && pyflakes *.py readabilipy $(TEST_DIR)
 58 | 	source $(VENV_DIR)/bin/activate && pycodestyle --statistics --ignore=E501 --count *.py readabilipy $(TEST_DIR)
 59 | 	source $(VENV_DIR)/bin/activate && pylint readabilipy $(TEST_DIR)/*.py
 60 | 
 61 | #################
 62 | # Documentation #
 63 | #################
 64 | 
 65 | .PHONY: docs
 66 | 
 67 | docs: install ## Build documentation with Sphinx
 68 | 	@echo "make docs: not implemented" >&2; exit 1 # non-zero status stops make here, a bare exit returned 0 and the lines below still ran
 69 | 	source $(VENV_DIR)/bin/activate && m2r README.md && mv README.rst $(DOC_DIR)
 70 | 	source $(VENV_DIR)/bin/activate && m2r CHANGELOG.md && mv CHANGELOG.rst $(DOC_DIR)
 71 | 	cd $(DOC_DIR) && \
 72 | 		rm source/* && \
 73 | 		source $(VENV_DIR)/bin/activate && \
 74 | 		sphinx-apidoc -H 'ReadabiliPy API Documentation' -o source ../$(PACKAGE) && \
 75 | 		touch source/AUTOGENERATED
 76 | 	$(MAKE) -C $(DOC_DIR) html
 77 | 
 78 | #######################
 79 | # Virtual environment #
 80 | #######################
 81 | 
 82 | .PHONY: venv clean_venv
 83 | 
 84 | venv: $(VENV_DIR)/bin/activate
 85 | 
 86 | $(VENV_DIR)/bin/activate: setup.py
 87 | 	test -d $(VENV_DIR) || python -m venv $(VENV_DIR)
 88 | 	source $(VENV_DIR)/bin/activate && pip install .[dev]
 89 | 	touch $(VENV_DIR)/bin/activate
 90 | 
 91 | clean_venv:
 92 | 	rm -rf $(VENV_DIR)
 93 | 
 94 | ############
 95 | # Clean up #
 96 | ############
 97 | 
 98 | .PHONY: clean
 99 | 
100 | clean: ## Clean build dist and egg directories left after install
101 | 	rm -rf ./dist
102 | 	rm -rf ./build
103 | 	rm -rf ./$(PACKAGE).egg-info
104 | 	rm -rf $(VENV_DIR)
105 | 	rm -f MANIFEST
106 | 	rm -rf $(PACKAGE)/javascript/node_modules
107 | 	find . -type f -iname '*.pyc' -delete
108 | 	find . -type d -name '__pycache__' -empty -delete
109 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # ReadabiliPy
  2 | 
  3 | [![Coverage Status](https://coveralls.io/repos/github/alan-turing-institute/ReadabiliPy/badge.svg?branch=master)](https://coveralls.io/github/alan-turing-institute/ReadabiliPy?branch=master)
  4 | 
  5 | `ReadabiliPy` contains a Python wrapper for Mozilla's [Readability.js](https://github.com/mozilla/readability) Node.js package, as well as article extraction routines written in pure Python.
  6 | 
  7 | This package augments the output of `Readability.js` to also return a list of plain text representations of article paragraphs.
  8 | 
  9 | `ReadabiliPy` comes with a handy command line application: ``readabilipy``.
 10 | 
 11 | ## Installation
 12 | 
 13 | To use the `Readability.js` wrapper you need to have a working [Node.js](https://nodejs.org/en/download/) installation of version 14 or higher.
 14 | Make sure to install Node.js before installing this package, as this ensures Readability.js will be installed.
 15 | If you only want to use the Python-based article extraction, you **do not need** to install Node.js.
 16 | 
 17 | `ReadabiliPy` can be installed simply from PyPI:
 18 | 
 19 | ```
 20 | $ pip install readabilipy
 21 | ```
 22 | 
 23 | Note that to update to a new version of `Readability.js` you can simply reinstall `ReadabiliPy`.
 24 | 
 25 | ## Usage
 26 | 
 27 | `ReadabiliPy` can be used either as a command line application or as a Python library.
 28 | 
 29 | ### Command line application
 30 | 
 31 | The ``readabilipy`` command line application can be used to extract an article from an HTML source file.
 32 | 
 33 | For example, if you have the article saved as ``input.html`` in the current directory then you can run:
 34 | 
 35 | ```
 36 | $ readabilipy -i ./input.html -o article.json
 37 | ```
 38 | 
 39 | The extracted article can then be found in the ``article.json`` file. By default ReadabiliPy will use the Readability.js functionality to extract the article, provided this is available. If instead you'd like to use the Python-based extraction, run:
 40 | 
 41 | ```
 42 | $ readabilipy -p -i ./input.html -o article.json
 43 | ```
 44 | 
 45 | The complete help text of the command line application is as follows:
 46 | 
 47 | ```
 48 | $ readabilipy -h
 49 | usage: readabilipy [-h] -i INPUT_FILE -o OUTPUT_FILE [-c] [-n] [-p] [-V]
 50 | 
 51 | Extract article data from a HTML file using either Mozilla's Readability.js
 52 | package or a simplified python-only alternative.
 53 | 
 54 | optional arguments:
 55 |   -h, --help            show this help message and exit
 56 |   -i INPUT_FILE, --input-file INPUT_FILE
 57 |                         Path to input file containing HTML.
 58 |   -o OUTPUT_FILE, --output-file OUTPUT_FILE
 59 |                         Path to file to output the article data to as JSON.
 60 |   -c, --content-digests
 61 |                         Add a 'data-content-digest' attribute containing a
 62 |                         SHA256-based digest of the element's contents to each
 63 |                         HTML element in the plain_content output.
 64 |   -n, --node-indexes    Add a 'data-node-index' attribute containing a
 65 |                         hierarchical representation of the element's position
 66 |                         in the HTML structure each HTML element in the
 67 |                         plain_content output.
 68 |   -p, --use-python-parser
 69 |                         Use the pure-python 'plain_html' parser included in
 70 |                         this project rather than Mozilla's Readability.js.
 71 |   -V, --version         Show version and exit
 72 | ```
 73 | 
 74 | ## Library
 75 | 
 76 | ReadabiliPy can also be used as a Python package.
 77 | The main routine is called ``simple_json_from_html_string`` and expects the HTML article as a string.
 78 | Here is an example of extracting an article after downloading the page using [requests](https://requests.readthedocs.io/en/master/):
 79 | 
 80 | ```python
 81 | >>> import requests
 82 | >>> from readabilipy import simple_json_from_html_string
 83 | >>> req = requests.get('https://en.wikipedia.org/wiki/Readability')
 84 | >>> article = simple_json_from_html_string(req.text, use_readability=True)
 85 | ```
 86 | 
 87 | Note that you need to use the flag ``use_readability=True`` to use Readability.js, otherwise the Python-based extraction is used.
 88 | 
 89 | The ``simple_json_from_html_string`` function returns a dictionary with the following fields:
 90 | 
 91 |  - `title`: The article title
 92 |  - `byline`: Author information
 93 |  - `content`: A simplified HTML representation of the article, with all article text contained in paragraph elements.
 94 |  - `plain_content`: A "plain" version of the simplified `Readability.js` article HTML present in the `content` field. This attempts to retain only the plain text content of the article, while preserving the HTML structure.
 95 |  - `plain_text`: A list containing plain text representations of each paragraph (`<p>`) or list (`<ol>` or `<ul>`) present in the simplified `Readability.js` article HTML in the `content` field. Each paragraph or list is represented as a single string. List strings look like `"* item 1, * item 2, * item 3,"` for both ordered and unordered lists (note the trailing `,`).
 96 | 
 97 | Note further that:
 98 | 
 99 | - All fields are guaranteed to be present. If individual fields are missing from the output of `Readability.js`, the value of these fields will be `None`. If no article data is returned by `Readability.js`, the value of all fields will be `None`.
100 | - All text in the `plain_content` and `plain_text` fields is encoded as unicode normalised using the "NFKC" normal form. This normal form is used to try and ensure as much as possible that things that appear visually the same are encoded with the same unicode representation (the K part) and characters are represented as a single composite character where possible (the C part).
101 | - An optional `content_digests` flag can be passed to the Python wrapper. When this is set to `True`, each HTML element in the `plain_content` field has a `data-content-digest` attribute, which holds the SHA-256 hash of its plain text content. For "leaf" nodes (containing only plain text in the output), this is the SHA-256 hash of their plain text content. For nodes containing other nodes, this is the SHA-256 hash of the concatenated SHA-256 hashes of their child nodes.
102 | - An optional `node_indexes` flag can be passed to the Python wrapper. When this is set to `True`, each HTML element in the `plain_content` field has a `data-node-index` attribute, which holds a hierarchical index describing the location of the element within the `plain_content` HTML structure.
103 | - An optional `use_readability` flag can be passed to the Python wrapper. When this is set to `True`, Mozilla's `Readability.js` will be used as the parser. If it is set to `False` then the pure-Python parser included in this package will be used instead.
104 | 
105 | The second top-level function exported by ReadabiliPy is ``simple_tree_from_html_string``. This returns a cleaned, parsed HTML tree of the article as a [BeautifulSoup](https://www.crummy.com/software/BeautifulSoup/bs4/doc/) object.
106 | 
107 | ## Notes
108 | 
109 | License: MIT License, see the `LICENSE` file.
110 | 
111 | Copyright (c) 2018, The Alan Turing Institute
112 | 
113 | If you encounter any issues or have any suggestions for improvement, please open an issue [on Github](https://github.com/alan-turing-institute/ReadabiliPy).
114 | You're helping to make this project better for everyone!
115 | 


--------------------------------------------------------------------------------
/benchmarks/Dockerfile:
--------------------------------------------------------------------------------
 1 | FROM python:3
 2 | 
 3 | # Install system requirements (Node.js is needed for the Readability.js wrapper)
 4 | RUN apt-get update
 5 | RUN apt-get -y install curl
 6 | RUN curl -sL https://deb.nodesource.com/setup_11.x | bash -
 7 | RUN apt-get install -y nodejs
 8 | RUN npm install
 9 | RUN pip install --upgrade pip
10 | RUN apt-get install -y git
11 | 
12 | # Clone ReadabiliPy and install it with its test extras, as the CI workflows do
13 | RUN git clone https://github.com/alan-turing-institute/ReadabiliPy
14 | WORKDIR "/ReadabiliPy"
15 | RUN git pull
16 | RUN pip install -e ".[test]"
17 | 
18 | # Run the benchmarks with Pytest
19 | CMD pytest --benchmark-only
20 | 


--------------------------------------------------------------------------------
/benchmarks/README.md:
--------------------------------------------------------------------------------
 1 | Benchmarking [ReadabiliPy](https://github.com/alan-turing-institute/ReadabiliPy/tree/master) with containers
 2 | ====
 3 | 
 4 | This directory contains a Dockerfile to build a benchmarking image for [ReadabiliPy](https://github.com/alan-turing-institute/ReadabiliPy/tree/master) as per the guidelines specified by the [Benchmarking with containers](https://alan-turing-institute.github.io/data-science-benchmarking/) project, at the Alan Turing Institute.
 5 | 
 6 | **Software:** [ReadabiliPy](https://github.com/alan-turing-institute/ReadabiliPy) - A simple HTML content extractor in Python. Can be run as a wrapper for Mozilla's Readability.js package or in pure-python mode.
 7 | 
 8 | **Benchmarks:** Benchmark the speed of core package functions at extracting information from an input HTML with [pytest-benchmark](https://pypi.org/project/pytest-benchmark/). See [test_benchmarking.py](https://github.com/alan-turing-institute/ReadabiliPy/blob/master/tests/test_benchmarking.py).
 9 | 
10 | Running benchmarks
11 | ----
12 | 
13 | Using the [pytest-benchmark](https://pypi.org/project/pytest-benchmark/) package, we benchmark some of the package functions, including extraction of titles and dates from article HTML and the full article content in JSON format.
14 | 
15 | Benchmarks can be run from the top directory of the package with the following command: ```pytest --benchmark-only```.
16 | 
17 | Building a Docker image for Benchmarking ReadabiliPy
18 | ----
19 | 
20 | The [Dockerfile](https://github.com/alan-turing-institute/ReadabiliPy/blob/master/benchmarks/Dockerfile) specifies an image that installs the requirements for ReadabiliPy, clones the package from GitHub, then runs the benchmarks with pytest.
21 | 
22 | Docker Hub Automated build
23 | ----
24 | 
25 | An image was built with this Dockerfile and pushed to [Docker Hub](https://cloud.docker.com/repository/docker/edwardchalstrey/readabilipy_benchmark) as ```edwardchalstrey/readabilipy_benchmark```. An automated build was set up so that the ```latest``` tag is built whenever the master branch of the ReadabiliPy GitHub repo has a new commit.
26 | 
27 | Run the containerised benchmarks
28 | ----
29 | 
30 | The benchmark image can be pulled from the remote registry (Docker Hub), and run on any computing platform with Docker. Benchmarks can be run whenever new features are added.
31 | 
32 | ### Results
33 | 
34 | I have benchmarked three of the HTML parsing features of ReadabiliPy on an example HTML file; see the tests in the [ReadabiliPy](https://github.com/alan-turing-institute/ReadabiliPy/tree/master) repo within ```tests/test_benchmarking.py```.
35 | 
36 | The benchmarks below were run on the following dates, against the following [ReadabiliPy](https://github.com/alan-turing-institute/ReadabiliPy/tree/master) commits, and report the **mean time in ms**:
37 | 1. 2019-05-02 => [9ba2fdb7...](https://github.com/alan-turing-institute/ReadabiliPy/tree/9ba2fdb71b3b014f3252a29672ff41159203e45c)
38 | 2. 2019-05-14 => [d3b3c365...](https://github.com/alan-turing-institute/ReadabiliPy/tree/d3b3c365984aa26ce0a8f0fda6b3fd75b9e837a2)
39 | 3. 2019-05-31 => [73493922...](https://github.com/alan-turing-institute/ReadabiliPy/tree/734939221048041e545e3a4bd205a84e87631a3f)
40 | 
41 | **Benchmarks on a Macbook:**
42 | 
43 | | Date  | Date parse  | Title parse  | Full parse  |
44 | |---|---|---|---|
45 | | 2019-05-02  | 69.5056  | 55.5296  | 2140.0745  |
46 | | 2019-05-14  | 44.4991  | 54.8936  | 1942.1609  |
47 | | 2019-05-31  | 80.5528  | 94.9283  | 2290.3153  |
48 | 
49 | 
50 | **Benchmarks on a Macbook in Docker container:**
51 | 
52 | | Date  | Date parse  | Title parse  | Full parse  |
53 | |---|---|---|---|
54 | | 2019-05-02  | 46.4389  | 40.2649  | 3065.2467  |
55 | | 2019-05-14  | 32.8276  | 39.7405  | 2642.1735  |
56 | | 2019-05-31  | 34.8774  | 41.2476  | 2838.9681  |
57 | 


--------------------------------------------------------------------------------
/make_release.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | # -*- coding: utf-8 -*-
  3 | 
  4 | """
  5 | Do-nothing script for making a release
  6 | 
  7 | This idea comes from here:
  8 | https://blog.danslimmon.com/2019/07/15/do-nothing-scripting-the-key-to-gradual-automation/
  9 | 
 10 | This file is part of ReadabiliPy.
 11 | 
 12 | Copyright: 2020, The Alan Turing Institute
 13 | License: See LICENSE file.
 14 | 
 15 | """
 16 | 
 17 | import os
 18 | import sys
 19 | import tempfile
 20 | import webbrowser
 21 | 
 22 | try:
 23 |     import colorama
 24 |     colorama.init()
 25 |     BE_COLOURFUL = True
 26 | except ImportError:
 27 |     BE_COLOURFUL = False
 28 | 
 29 | 
 30 | URLS = {
 31 |     "Travis": "https://travis-ci.org/alan-turing-institute/ReadabiliPy",
 32 | }
 33 | 
 34 | 
 35 | def coloured(msg, colour=None, style=None):  # Wrap msg in colorama colour/style codes followed by a reset code
 36 |     if not BE_COLOURFUL:  # colorama could not be imported: return msg untouched
 37 |         return msg
 38 | 
 39 |     colours = {
 40 |         "red": colorama.Fore.RED,
 41 |         "green": colorama.Fore.GREEN,
 42 |         "cyan": colorama.Fore.CYAN,
 43 |         "yellow": colorama.Fore.YELLOW,
 44 |         "magenta": colorama.Fore.MAGENTA,
 45 |         None: "",  # no colour requested -> empty prefix
 46 |     }
 47 |     styles = {
 48 |         "bright": colorama.Style.BRIGHT,
 49 |         "dim": colorama.Style.DIM,
 50 |         None: "",  # no style requested -> empty prefix
 51 |     }
 52 |     pre = colours[colour] + styles[style]  # a name missing from either table raises KeyError
 53 |     post = colorama.Style.RESET_ALL
 54 |     return f"{pre}{msg}{post}"
 55 | 
 56 | 
 57 | def cprint(msg, colour=None, style=None):  # Print msg after passing it through coloured() with the same colour and style
 58 |     print(coloured(msg, colour=colour, style=style))
 59 | 
 60 | 
 61 | def wait_for_enter():  # Block on input() until the operator presses Enter, then print a blank line
 62 |     input(coloured("\nPress Enter to continue", style="dim"))
 63 |     print()
 64 | 
 65 | 
 66 | def get_package_name():
 67 |     with open("./setup.py", "r") as fp:
 68 |         nameline = next(
 69 |             (line.strip() for line in fp if line.startswith("NAME = ")), None
 70 |         )
 71 |         return nameline.split("=")[-1].strip().strip('"')
 72 | 
 73 | 
 74 | def get_package_version(pkgname):  # Return __version__ as defined in <pkgname lower-cased>/__version__.py
 75 |     ctx = {}
 76 |     with open(f"{pkgname.lower()}/__version__.py", "r") as fp:
 77 |         exec(fp.read(), ctx)  # runs the version file's source with ctx as its globals
 78 |     return ctx["__version__"]  # KeyError if the file does not define __version__
 79 | 
 80 | 
 81 | class Step:  # Base class for one release step; subclasses supply action(context)
 82 |     def pre(self, context):  # hook called before action(); does nothing by default
 83 |         pass
 84 | 
 85 |     def post(self, context):  # hook called after action(); waits for Enter by default
 86 |         wait_for_enter()
 87 | 
 88 |     def run(self, context):  # call pre, action, post in order; Ctrl-C prints a message and exits with status 1
 89 |         try:
 90 |             self.pre(context)
 91 |             self.action(context)  # action() is not defined on Step itself
 92 |             self.post(context)
 93 |         except KeyboardInterrupt:
 94 |             cprint("\nInterrupted.", colour="red")
 95 |             raise SystemExit(1)
 96 | 
 97 |     def instruct(self, msg):  # print an instruction for the operator in green
 98 |         cprint(msg, colour="green")
 99 | 
100 |     def print_run(self, msg):  # print "Run:" followed by a command the operator should run by hand
101 |         cprint("Run:", colour="cyan", style="bright")
102 |         self.print_cmd(msg)
103 | 
104 |     def print_cmd(self, msg):  # print msg tab-indented in bright cyan
105 |         cprint("\t" + msg, colour="cyan", style="bright")
106 | 
107 |     def do_cmd(self, cmd):  # announce cmd, wait for Enter, then execute it through os.system
108 |         cprint(f"Going to run: {cmd}", colour="magenta", style="bright")
109 |         wait_for_enter()
110 |         os.system(cmd)  # the exit status is not checked
111 | 
112 | 
113 | class GitToMain(Step):  # Reminds the operator to check out main. NOTE(review): .github/workflows trigger on "master" - confirm the branch name
114 |     def action(self, context):
115 |         self.instruct("Make sure you're on main and changes are merged in")
116 |         self.print_run("git checkout main")
117 | 
118 | 
119 | class UpdateChangelog(Step):  # Prompts the operator to edit CHANGELOG.md for context["version"]
120 |     def action(self, context):
121 |         self.instruct(f"Update change log for version {context['version']}")
122 |         self.print_run("vi CHANGELOG.md")
123 | 
124 | 
125 | class UpdateReadme(Step):  # Prompts the operator to edit README.md if needed
126 |     def action(self, context):
127 |         self.instruct("Update readme if necessary")
128 |         self.print_run("vi README.md")
129 | 
130 | 
131 | class RunTests(Step):  # Prompts the operator to run "make test" by hand
132 |     def action(self, context):
133 |         self.instruct("Run the unit tests")
134 |         self.print_run("make test")
135 | 
136 | 
137 | class BumpVersionPackage(Step):  # Prompts for a manual version bump, then stores the new version in the context
138 |     def action(self, context):
139 |         self.instruct("Update __version__.py with the new version")
140 | 
141 |     def post(self, context):  # overrides Step.post: after Enter, re-read the version that was just edited
142 |         wait_for_enter()
143 |         context["version"] = self._get_version(context)
144 | 
145 |     def _get_version(self, context):
146 |         # Get the version from the version file
147 |         return get_package_version(context["pkgname"])
148 | 
149 | 
150 | class MakeClean(Step):  # Runs "make clean" once the operator confirms
151 |     def action(self, context):
152 |         self.do_cmd("make clean")
153 | 
154 | 
155 | class MakeDocs(Step):  # Runs "make docs" once the operator confirms
156 |     def action(self, context):
157 |         self.do_cmd("make docs")
158 | 
159 | 
160 | class MakeDist(Step):  # Runs "make dist" once the operator confirms
161 |     def action(self, context):
162 |         self.do_cmd("make dist")
163 | 
164 | 
165 | class PushToTestPyPI(Step):  # Uploads everything in dist/ to the TestPyPI repository URL with twine
166 |     def action(self, context):
167 |         self.do_cmd(
168 |             "twine upload --repository-url https://test.pypi.org/legacy/ dist/*"
169 |         )
170 | 
171 | 
class InstallFromTestPyPI(Step):
    """Release step that installs the uploaded package from TestPyPI into a fresh venv."""

    def action(self, context):
        """Create a temporary virtualenv and pip-install the package into it.

        The package name and version are read from ``context``; the path of
        the temporary venv is stored in ``context["tmpvenv"]`` for later steps.
        """
        tmpvenv = tempfile.mkdtemp(prefix="rdpy_venv_")
        # Activate with the POSIX "." builtin rather than "source": the command
        # is run via os.system, i.e. through /bin/sh, which is not always bash
        # (e.g. dash has no "source" builtin).
        self.do_cmd(
            f"python -m venv {tmpvenv} && . {tmpvenv}/bin/activate && "
            "pip install --no-cache-dir --index-url "
            "https://test.pypi.org/simple/ "
            "--extra-index-url https://pypi.org/simple "
            f"{context['pkgname']}=={context['version']}"
        )
        context["tmpvenv"] = tmpvenv
183 | 
184 | 
class TestPackage(Step):
    """Release step that checks the version reported by the installed package."""

    def action(self, context):
        """Run ``readabilipy -V`` inside the venv stored in ``context["tmpvenv"]``."""
        self.instruct(
            f"Ensure that the following command gives version {context['version']}"
        )
        # Use the POSIX "." builtin instead of "source": os.system runs the
        # command through /bin/sh, which is not guaranteed to be bash.
        self.do_cmd(f". {context['tmpvenv']}/bin/activate && readabilipy -V")
191 | 
192 | 
class RemoveVenv(Step):
    """Release step that deletes the temporary virtualenv."""

    def action(self, context):
        """Run ``rm -rf`` on the path stored in ``context["tmpvenv"]``."""
        venv_path = context['tmpvenv']
        self.do_cmd(f"rm -rf {venv_path}")
196 | 
197 | 
class GitTagVersion(Step):
    """Release step that tags the current commit with the release version."""

    def action(self, context):
        """Run ``git tag v<version>`` using the version from ``context``."""
        tag_name = f"v{context['version']}"
        self.do_cmd(f"git tag {tag_name}")
201 | 
202 | 
class GitAdd(Step):
    """Release step asking the operator to stage and commit all changes."""

    def action(self, context):
        """Show the instruction and the suggested git command."""
        reminder = "Add everything to git and commit"
        command = "git gui"
        self.instruct(reminder)
        self.print_run(command)
207 | 
208 | 
class PushToPyPI(Step):
    """Release step that uploads the built distributions with twine."""

    def action(self, context):
        """Run ``twine upload`` on everything in ``dist/``."""
        command = "twine upload dist/*"
        self.do_cmd(command)
212 | 
213 | 
class PushToGitHub(Step):
    """Release step that pushes main and tags to the ``origin`` remote."""

    def action(self, context):
        """Run the git push command through ``do_cmd``."""
        command = "git push -u --tags origin main"
        self.do_cmd(command)
217 | 
218 | 
class WaitForTravis(Step):
    """Release step asking the operator to wait for the CI run to pass."""

    def action(self, context):
        """Open the URL stored under ``URLS['Travis']`` and show the instruction."""
        ci_url = URLS['Travis']
        webbrowser.open(ci_url)
        reminder = "Wait for Travis to complete and verify that its successful"
        self.instruct(reminder)
225 | 
226 | 
class WaitForRTD(Step):
    """Release step asking the operator to wait for the ReadTheDocs build."""

    def action(self, context):
        """Show the instruction to check the documentation build."""
        reminder = "Wait for ReadTheDocs to complete and verify that its successful"
        self.instruct(reminder)
232 | 
233 | 
def main(target=None):
    """Run the release procedure, optionally resuming from a named step.

    Args:
        target: Name of the step to start from. When falsy, the procedure
            starts at the first step.

    Raises:
        SystemExit: If ``target`` is given but does not name a known step.
            Previously an unknown name silently skipped every step and still
            reported success.
    """
    procedure = [
        ("gittomain", GitToMain()),
        ("clean1", MakeClean()),
        ("tests1", RunTests()),
        ("gitadd1", GitAdd()),
        ("push1", PushToGitHub()),
        ("travis1", WaitForTravis()),
        ("bumpversion", BumpVersionPackage()),
        ("changelog", UpdateChangelog()),
        ("readme", UpdateReadme()),
        ("dist", MakeDist()),
        ("testpypi", PushToTestPyPI()),
        ("install", InstallFromTestPyPI()),
        ("testpkg", TestPackage()),
        ("remove_venv", RemoveVenv()),
        ("gitadd2", GitAdd()),
        ("pypi", PushToPyPI()),
        ("tag", GitTagVersion()),
        ("push2", PushToGitHub()),
    ]
    step_names = [name for name, _ in procedure]
    if target and target not in step_names:
        raise SystemExit(
            f"Unknown step '{target}'. Valid steps: {', '.join(step_names)}"
        )

    context = {}
    context["pkgname"] = get_package_name()
    context["version"] = get_package_version(context["pkgname"])

    # Skip everything before the requested step (if any), then run the rest in order
    first_index = step_names.index(target) if target else 0
    for _, step in procedure[first_index:]:
        step.run(context)
    cprint("\nDone!", colour="yellow", style="bright")
265 | 
266 | 
if __name__ == "__main__":
    # The optional first command line argument names the step to start from
    target = None
    if len(sys.argv) > 1:
        target = sys.argv[1]
    main(target=target)
270 | 


--------------------------------------------------------------------------------
/readabilipy/__init__.py:
--------------------------------------------------------------------------------
1 | from .simple_json import simple_json_from_html_string
2 | from .simple_tree import simple_tree_from_html_string
3 | 
4 | __all__ = [
5 |     'simple_json_from_html_string',
6 |     'simple_tree_from_html_string',
7 | ]
8 | 


--------------------------------------------------------------------------------
/readabilipy/__main__.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | """Command line interface
 4 | 
 5 | """
 6 | 
 7 | import argparse
 8 | import json
 9 | import sys
10 | 
11 | from .__version__ import __version__
12 | from .simple_json import simple_json_from_html_string, have_node
13 | 
14 | 
class _LazyVersionAction(argparse.Action):
    """Version flag whose text is only computed when the flag is actually used.

    The version text includes the result of ``have_node()``, which runs an
    external process; building it eagerly would pay that cost on every
    invocation of the command line interface.
    """

    def __init__(self, option_strings, dest=argparse.SUPPRESS, default=argparse.SUPPRESS, help=None):  # pylint: disable=redefined-builtin
        super().__init__(option_strings=option_strings, dest=dest, default=default, nargs=0, help=help)

    def __call__(self, parser, namespace, values, option_string=None):
        node_support = 'yes' if have_node() else 'no'
        print(f"{__version__} (Readability.js supported: {node_support})")
        parser.exit()


def main():
    """Command line entry point: read HTML, extract the article, write JSON.

    Input is read from the file given by ``--input-file`` (stdin for '-') and
    the article is written as JSON to ``--output-file`` (stdout for '-').
    Files opened here are always closed; the standard streams are left open.
    """
    parser = argparse.ArgumentParser(
        description="Extract article data from a HTML file using either Mozilla's Readability.js package or a simplified python-only alternative.",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    parser.add_argument(
        "-i",
        "--input-file",
        default="-",
        help="Path to input file containing HTML, use '-' for stdin.",
    )
    parser.add_argument(
        "-o",
        "--output-file",
        default="-",
        help="Path to file to output the article data to as JSON, use '-' for stdout.",
    )
    parser.add_argument(
        "-c",
        "--content-digests",
        action="store_true",
        help="Add a 'data-content-digest' attribute containing a SHA256-based digest of the element's contents to each HTML element in the plain_content output.",
    )
    parser.add_argument(
        "-n",
        "--node-indexes",
        action="store_true",
        help="Add a 'data-node-index' attribute containing a hierarchical representation of the element's position in the HTML structure each HTML element in the plain_content output.",
    )
    parser.add_argument(
        "-p",
        "--use-python-parser",
        action="store_true",
        help="Use the pure-python 'plain_html' parser included in this project rather than Mozilla's Readability.js.",
    )
    parser.add_argument(
        "-V",
        "--version",
        help="Show version and exit",
        action=_LazyVersionAction,
    )

    args = parser.parse_args()

    # Read the input HTML from stdin or from the named file
    if args.input_file == "-":
        if hasattr(sys.stdin, "reconfigure"):
            sys.stdin.reconfigure(encoding="utf-8", errors="replace")
        html = sys.stdin.read()
    else:
        with open(args.input_file, encoding="utf-8", errors="replace") as input_file:
            html = input_file.read()

    article = simple_json_from_html_string(
        html,
        content_digests=args.content_digests,
        node_indexes=args.node_indexes,
        use_readability=(not args.use_python_parser),
    )

    # Write the article as JSON to stdout or to the named file
    if args.output_file == "-":
        json.dump(article, sys.stdout, ensure_ascii=False)
        sys.stdout.flush()
    else:
        with open(args.output_file, "w", encoding="utf-8") as output_file:
            json.dump(article, output_file, ensure_ascii=False)
90 | 
91 | 
# Run the command line interface when this module is executed directly
# (e.g. via ``python -m readabilipy``)
if __name__ == "__main__":
    main()
94 | 


--------------------------------------------------------------------------------
/readabilipy/__version__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | 
# Version number as a tuple of integers
VERSION = (0, 3, 0)

# Dotted version string built by joining the VERSION components with "."
__version__ = '.'.join(map(str, VERSION))
6 | 


--------------------------------------------------------------------------------
/readabilipy/extractors/__init__.py:
--------------------------------------------------------------------------------
1 | from .extract_date import extract_date, ensure_iso_date_format
2 | from .extract_title import extract_title
3 | 
4 | __all__ = [
5 |     'extract_date',
6 |     'extract_title',
7 |     'ensure_iso_date_format',
8 | ]
9 | 


--------------------------------------------------------------------------------
/readabilipy/extractors/extract_date.py:
--------------------------------------------------------------------------------
 1 | from datetime import datetime
 2 | from .extract_element import extract_element
 3 | 
 4 | 
 5 | def extract_date(html):
 6 |     """Return the article date from the article HTML"""
 7 | 
 8 |     # List of xpaths for HTML tags that could contain a date
 9 |     # Tuple scores reflect confidence in these xpaths and the preference used for extraction
10 |     xpaths = [
11 |         ('//meta[@property="article:published_time"]/@content', 13),
12 |         ('//meta[@property="og:updated_time"]/@content', 10),
13 |         ('//meta[@property="og:article:published_time"]/@content', 10),
14 |         ('//meta[@property="og:article:modified_time"]/@content', 10),
15 |         ('//meta[@property="article:published"]/@content', 7),
16 |         ('//meta[@itemprop="datePublished"]/@content', 3),
17 |         ('//time/@datetime', 3),
18 |         ('//meta[@itemprop="dateModified"]/@content', 2),
19 |         ('//meta[@property="article:modified_time"]/@content', 2),
20 |     ]
21 | 
22 |     # Get all the dates
23 |     extracted_dates = extract_element(html, xpaths)
24 |     if not extracted_dates:
25 |         return None
26 | 
27 |     # Search through the extracted date strings in order of score and take the first that is in isoformat
28 |     for date_string in sorted(extracted_dates, key=lambda ds: extracted_dates[ds]["score"], reverse=True):
29 |         iso_date = ensure_iso_date_format(date_string)
30 |         if iso_date:
31 |             return iso_date
32 |     return None
33 | 
34 | 
def ensure_iso_date_format(date_string, ignoretz=True):
    """Return ``date_string`` in ISO format if it matches a supported format.

    Args:
        date_string: The candidate date string.
        ignoretz: When true, timezone information and microseconds are
            dropped from the result.

    Returns:
        The ISO-formatted date string, or None when ``date_string`` matches
        none of the supported formats (including strings too short to parse).
    """
    supported_date_formats = [
        "%Y-%m-%dT%H:%M:%S",      # '2014-10-24T17:32:46'
        "%Y-%m-%dT%H:%M:%S%z",    # '2014-10-24T17:32:46+12:00'
        "%Y-%m-%dT%H:%M%z",       # '2014-10-24T17:32+12:00'
        "%Y-%m-%dT%H:%M:%SZ",     # '2014-10-24T17:32:46Z'
        "%Y-%m-%dT%H:%M:%S.%fZ",  # '2014-10-24T17:32:46.000Z'
        "%Y-%m-%dT%H:%M:%S.%f"    # '2014-10-24T17:32:46.493'
    ]
    timezone_formats = ("%Y-%m-%dT%H:%M:%S%z", "%Y-%m-%dT%H:%M%z")

    # Slice rather than index so strings shorter than three characters
    # compare unequal instead of raising IndexError
    has_tz_colon = date_string[-3:-2] == ':'

    for date_format in supported_date_formats:
        try:
            # For python < 3.7, strptime() is not able to parse timezones containing
            # colons (eg. 2014-10-24T17:32:46+12:00). By stripping the colon here,
            # we ensure that all versions of python can parse datetimes like these
            if date_format in timezone_formats and has_tz_colon:
                isodate = datetime.strptime(date_string[:-3] + date_string[-2:], date_format)
            else:
                isodate = datetime.strptime(date_string, date_format)
            if ignoretz:
                isodate = isodate.replace(tzinfo=None, microsecond=0)
            return isodate.isoformat()
        except ValueError:
            # Not this format; try the next one
            continue
    return None
61 | 


--------------------------------------------------------------------------------
/readabilipy/extractors/extract_element.py:
--------------------------------------------------------------------------------
 1 | from collections import defaultdict
 2 | import lxml.html
 3 | from ..simplifiers import normalise_whitespace
 4 | 
 5 | 
 6 | def extract_element(html, xpaths, process_dict_fn=None):
 7 |     """Return the relevant elements (titles, dates or bylines) from article HTML, specified by xpaths.
 8 |         xpaths should be a list of tuples, each with the xpath and a reliability scores.
 9 |         Processing of the dictionary can be handled with the arg function.
10 |         The returned dictionary should have the processed elements as keys and dicts with scores and the xpaths used as values
11 |     """
12 |     # Attempt to parse the html, aborting here if it is not parseable
13 |     try:
14 |         lxml_html = lxml.html.fromstring(html)
15 |     except lxml.etree.ParserError:
16 |         return None
17 | 
18 |     # Get all elements specified and combine scores
19 |     extracted_strings = defaultdict(dict)
20 |     for extraction_xpath, score in xpaths:
21 |         found_elements = lxml_html.xpath(extraction_xpath)
22 |         found_elements = found_elements if isinstance(found_elements, list) else [found_elements]
23 |         for found_element in found_elements:
24 |             element = normalise_whitespace(found_element)
25 |             if element:
26 |                 try:
27 |                     extracted_strings[element]['score'] += score
28 |                     extracted_strings[element]['xpaths'].append(extraction_xpath)
29 |                     extracted_strings[element]['xpaths'].sort()
30 |                 except KeyError:
31 |                     extracted_strings[element]['score'] = score
32 |                     extracted_strings[element]['xpaths'] = [extraction_xpath]
33 | 
34 |     # Edit the dictionary
35 |     if process_dict_fn:
36 |         extracted_strings = process_dict_fn(extracted_strings)
37 | 
38 |     return extracted_strings
39 | 


--------------------------------------------------------------------------------
/readabilipy/extractors/extract_title.py:
--------------------------------------------------------------------------------
 1 | from itertools import permutations
 2 | from .extract_element import extract_element
 3 | 
 4 | 
 5 | def extract_title(html):
 6 |     """Return the article title from the article HTML"""
 7 | 
 8 |     # List of xpaths for HTML tags that could contain a title
 9 |     # Tuple scores reflect confidence in these xpaths and the preference used for extraction
10 |     xpaths = [
11 |         ('//header[@class="entry-header"]/h1[@class="entry-title"]//text()', 4),
12 |         ('//meta[@property="og:title"]/@content', 4),
13 |         ('//h1[@class="entry-title"]//text()', 3),
14 |         ('//h1[@itemprop="headline"]//text()', 3),
15 |         ('//h2[@itemprop="headline"]//text()', 2),
16 |         ('//meta[contains(@itemprop, "headline")]/@content', 2),
17 |         ('//body/title//text()', 1),
18 |         ('//div[@class="postarea"]/h2/a//text()', 1),
19 |         ('//h1[@class="post__title"]//text()', 1),
20 |         ('//h1[@class="title"]//text()', 1),
21 |         ('//head/title//text()', 1),
22 |         ('//header/h1//text()', 1),
23 |         ('//meta[@name="dcterms.title"]/@content', 1),
24 |         ('//meta[@name="fb_title"]/@content', 1),
25 |         ('//meta[@name="sailthru.title"]/@content', 1),
26 |         ('//meta[@name="title"]/@content', 1),
27 |     ]
28 | 
29 |     extracted_titles = extract_element(html, xpaths, process_dict_fn=combine_similar_titles)
30 |     if not extracted_titles:
31 |         return None
32 |     return max(extracted_titles, key=lambda x: extracted_titles[x].get('score'))
33 | 
34 | 
def combine_similar_titles(extracted_strings):
    """Merge the scores of titles judged to be the same and return the dictionary.

    ``extracted_strings`` maps each title to a dict with a 'score' and a list
    of 'xpaths'. The dictionary is updated in place.
    """

    def absorb(keeper, donor):
        # Add the donor's score and xpaths onto the keeper's entry
        extracted_strings[keeper]['score'] += extracted_strings[donor]['score']
        extracted_strings[keeper]['xpaths'].extend(extracted_strings[donor]['xpaths'])

    def capital_count(text):
        return sum(1 for char in text if char.isupper())

    # Visit every ordered pair of distinct titles
    for first, second in permutations(extracted_strings, 2):
        if first in second:
            # The shorter title is contained in the longer one: credit the shorter
            absorb(first, second)
            continue
        if first.lower() != second.lower():
            continue
        # Same title up to case: credit the variant with more capital letters
        if capital_count(first) > capital_count(second):
            absorb(first, second)

    for entry in extracted_strings.values():
        entry['xpaths'].sort()
    return extracted_strings
52 | 


--------------------------------------------------------------------------------
/readabilipy/javascript/ExtractArticle.js:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * This file is part of ReadabiliPy
 3 |  */
 4 | 
 5 | const fs = require('fs');
 6 | const { Readability } = require('@mozilla/readability');
 7 | const { JSDOM } = require('jsdom');
 8 | 
 9 | function readFile(filePath) {
10 | 	return fs.readFileSync(filePath, {encoding: "utf-8"}).trim();
11 | }
12 | 
// Write `data` to `filePath` synchronously as UTF-8.
function writeFile(data, filePath) {
	const options = {encoding: "utf-8"};
	return fs.writeFileSync(filePath, data, options);
}
// Parse the HTML file given by -i with Readability and write the resulting
// article as JSON to the file given by -o (default: "<input>.simple.json").
// Returns 0 on success and 1 when no input file was given.
function main() {
	const argv = require('minimist')(process.argv.slice(2));
	if (argv['i'] === undefined) {
		// Report usage errors on stderr so callers capturing stderr see them
		console.error("Input file required.");
		return 1;
	}

	const inFilePath = argv['i'];
	const outFilePath = (typeof(argv['o']) !== 'undefined')
		? argv['o']
		: inFilePath + ".simple.json";

	const html = readFile(inFilePath);
	const doc = new JSDOM(html);
	const reader = new Readability(doc.window.document);
	const article = reader.parse();

	writeFile(JSON.stringify(article), outFilePath);
	return 0;
}

// Propagate main's status as the process exit code instead of discarding it
process.exitCode = main();
42 | 


--------------------------------------------------------------------------------
/readabilipy/javascript/package.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "name": "ReadabiliPy",
 3 |   "version": "0.1.0",
 4 |   "description": "An augmented Python wrapper for the Mozilla standalone Readability.js package.",
 5 |   "main": "ExtractArticle.js",
 6 |   "scripts": {},
 7 |   "repository": {
 8 |     "type": "git",
 9 |     "url": "https://github.com/alan-turing-institute/ReadabiliPy"
10 |   },
11 |   "author": "",
12 |   "license": "Apache-2.0",
13 |   "bugs": {
14 |     "url": "https://github.com/alan-turing-institute/ReadabiliPy/issues"
15 |   },
16 |   "engines": {
17 |     "node": ">=14.0.0"
18 |   },
19 |   "homepage": "https://github.com/alan-turing-institute/ReadabiliPy",
20 |   "devDependencies": {},
21 |   "dependencies": {
22 |     "@mozilla/readability": ">=0.4.1",
23 |     "jsdom": ">=12.2.0",
24 |     "minimist": "^1.2.3"
25 |   }
26 | }
27 | 


--------------------------------------------------------------------------------
/readabilipy/simple_json.py:
--------------------------------------------------------------------------------
  1 | import hashlib
  2 | import json
  3 | import os
  4 | import tempfile
  5 | import subprocess
  6 | import sys
  7 | 
  8 | from bs4 import BeautifulSoup
  9 | from bs4.element import Comment, NavigableString, CData
 10 | from .simple_tree import simple_tree_from_html_string
 11 | from .extractors import extract_date, extract_title
 12 | from .simplifiers import normalise_text
 13 | from .utils import run_npm_install
 14 | 
 15 | 
 16 | def have_node():
 17 |     """Check that we can run node and have a new enough version """
 18 |     try:
 19 |         cp = subprocess.run(['node', '-v'], stdout=subprocess.PIPE, stderr=subprocess.PIPE, check=False)
 20 |     except FileNotFoundError:
 21 |         return False
 22 | 
 23 |     if not cp.returncode == 0:
 24 |         return False
 25 | 
 26 |     major = int(cp.stdout.split(b'.')[0].lstrip(b'v'))
 27 |     if major < 10:
 28 |         return False
 29 | 
 30 |     # check that this package has a node_modules dir in the javascript
 31 |     # directory, if it doesn't, it wasn't installed with Node support
 32 |     jsdir = os.path.join(os.path.dirname(__file__), 'javascript')
 33 |     node_modules = os.path.join(jsdir, 'node_modules')
 34 |     if not os.path.exists(node_modules):
 35 |         # Try installing node dependencies.
 36 |         run_npm_install()
 37 |     return os.path.exists(node_modules)
 38 | 
 39 | 
 40 | def simple_json_from_html_string(html, content_digests=False, node_indexes=False, use_readability=False):
 41 |     if use_readability and not have_node():
 42 |         print("Warning: node executable not found, reverting to pure-Python mode. Install Node.js v10 or newer to use Readability.js.", file=sys.stderr)
 43 |         use_readability = False
 44 | 
 45 |     if use_readability:
 46 |         # Write input HTML to temporary file so it is available to the node.js script
 47 |         # It is important that this file be unique in case this function is called concurrently
 48 |         with tempfile.NamedTemporaryFile(delete=False, mode="w+", encoding="utf-8", prefix="readabilipy") as f_html:
 49 |             f_html.write(html)
 50 |             f_html.close()
 51 |         tmp_html_path = f_html.name
 52 | 
 53 |         # We assume appending ".json" to the html name will also be a unique filename
 54 |         tmp_json_path = tmp_html_path + ".json"
 55 | 
 56 |         # Call Mozilla's Readability.js Readability.parse() function via node, writing output to a temporary file
 57 |         jsdir = os.path.join(os.path.dirname(__file__), 'javascript')
 58 |         try:
 59 |             subprocess.run(
 60 |                 ["node", "ExtractArticle.js", "-i", tmp_html_path, "-o", tmp_json_path],
 61 |                 cwd=jsdir,
 62 |                 check=True,
 63 |                 stdout=subprocess.PIPE,
 64 |                 stderr=subprocess.PIPE,
 65 |                 universal_newlines=True)
 66 |         except subprocess.CalledProcessError as e:
 67 |             print(e.stderr)
 68 |             raise
 69 | 
 70 |         # Read output of call to Readability.parse() from JSON file as Python dictionary
 71 |         with open(tmp_json_path, "r", encoding="utf-8") as json_file:
 72 |             input_json = json.load(json_file)
 73 | 
 74 |         # Delete temporary input and output files after processing
 75 |         os.unlink(tmp_json_path)
 76 |         os.unlink(tmp_html_path)
 77 |     else:
 78 |         input_json = {
 79 |             "title": extract_title(html),
 80 |             "date": extract_date(html),
 81 |             "content": str(simple_tree_from_html_string(html))
 82 |         }
 83 | 
 84 |     # Only keep the subset of Readability.js fields we are using (and therefore testing for accuracy of extraction)
 85 |     # NB: Need to add tests for additional fields and include them when we look at packaging this wrapper up for PyPI
 86 |     # Initialise output article to include all fields with null values
 87 |     article_json = {
 88 |         "title": None,
 89 |         "byline": None,
 90 |         "date": None,
 91 |         "content": None,
 92 |         "plain_content": None,
 93 |         "plain_text": None
 94 |     }
 95 |     # Populate article fields from readability fields where present
 96 |     if input_json:
 97 |         if "title" in input_json and input_json["title"]:
 98 |             article_json["title"] = input_json["title"]
 99 |         if "byline" in input_json and input_json["byline"]:
100 |             article_json["byline"] = input_json["byline"]
101 |         if "date" in input_json and input_json["date"]:
102 |             article_json["date"] = input_json["date"]
103 |         if "content" in input_json and input_json["content"]:
104 |             article_json["content"] = input_json["content"]
105 |             article_json["plain_content"] = plain_content(article_json["content"], content_digests, node_indexes)
106 |             if use_readability:
107 |                 article_json["plain_text"] = extract_text_blocks_js(article_json["plain_content"])
108 |             else:
109 |                 article_json["plain_text"] = extract_text_blocks_as_plain_text(article_json["plain_content"])
110 | 
111 |     return article_json
112 | 
113 | 
def extract_text_blocks_js(paragraph_html):
    """Return one ``{"text": ...}`` dict for every string node in the HTML."""
    document = BeautifulSoup(paragraph_html, 'html.parser')
    text_blocks = []
    for text_node in document.find_all(string=True):
        text_blocks.append({"text": str(text_node)})
    return text_blocks
120 | 
121 | 
def extract_text_blocks_as_plain_text(paragraph_html):
    """Return the non-empty text blocks of the HTML as plain-text dicts.

    Each ``ul``/``ol`` is first collapsed into a single ``p`` whose text is
    the concatenation of its items' plain text. Every string node's parent is
    then converted with ``plain_text_leaf_node`` and blocks without text are
    dropped.
    """
    document = BeautifulSoup(paragraph_html, 'html.parser')

    # Turn each list into one paragraph built from its items' plain text
    for list_element in document.find_all(['ul', 'ol']):
        item_texts = (plain_text_leaf_node(item)["text"] for item in list_element.find_all('li'))
        list_element.string = "".join(text for text in item_texts if text)
        list_element.name = "p"

    # Convert the parent of every string node, then keep only blocks with text
    candidate_blocks = [plain_text_leaf_node(text_node.parent) for text_node in document.find_all(string=True)]
    return [block for block in candidate_blocks if block["text"] is not None]
138 | 
139 | 
def plain_text_leaf_node(element):
    """Return the element's normalised text as a dict.

    The result has a "text" key (None when the text is empty; list items are
    wrapped as "* <text>, ") and, when the element carries a
    ``data-node-index`` attribute, a "node_index" key as well.
    """
    # All text below the element, with child tags stripped, then normalised
    plain_text = normalise_text(element.get_text())
    if plain_text == "":
        plain_text = None
    elif element.name == "li":
        plain_text = f"* {plain_text}, "

    if "data-node-index" not in element.attrs:
        return {"text": plain_text}
    return {"node_index": element["data-node-index"], "text": plain_text}
152 | 
153 | 
def plain_content(readability_content, content_digests, node_indexes):
    """Return the article HTML with every element reduced to plain content.

    When ``node_indexes`` is true, each top-level element (and its
    descendants) also receives ``data-node-index`` attributes.
    """
    document = BeautifulSoup(readability_content, 'html.parser')
    plain_nodes = plain_elements(document.contents, content_digests, node_indexes)
    if node_indexes:
        plain_nodes = [add_node_indexes(node) for node in plain_nodes]
    # Swap the document's contents for the plain versions
    document.contents = plain_nodes
    return str(document)
165 | 
166 | 
def plain_elements(elements, content_digests, node_indexes):
    """Return plain versions of ``elements``, with content digests if requested."""
    simplified = [plain_element(node, content_digests, node_indexes) for node in elements]
    if not content_digests:
        return simplified
    # Attach a content digest attribute to each node
    return [add_content_digest(node) for node in simplified]
175 | 
176 | 
def plain_element(element, content_digests, node_indexes):
    """Return a plain-content version of a single node.

    Leaf elements (see ``is_leaf``) have their contents replaced by their
    normalised text. String nodes are replaced by a new node of the same type
    holding the normalised text, or an empty string for non-printing types.
    Any other element has its children processed recursively.
    """
    if is_leaf(element):
        # Replace the leaf's contents with its text, tags discarded and normalised
        element.string = normalise_text(element.get_text())
        return element

    if not is_text(element):
        # Container element: rebuild it from the plain versions of its children
        simplified_children = plain_elements(element.contents, content_digests, node_indexes)
        element.clear()
        element.extend(simplified_children)
        return element

    # The simplified HTML may have come from Readability.js so might have
    # non-printing text (e.g. Comment or CData). In that case the node is kept
    # but its string is emptied.
    replacement_text = "" if is_non_printing(element) else normalise_text(element.string)
    return type(element)(replacement_text)
203 | 
204 | 
def is_leaf(element):
    """Return True when the element's tag name is 'p' or 'li'."""
    leaf_tags = ('p', 'li')
    return element.name in leaf_tags
207 | 
208 | 
def is_text(element):
    """Return True when the node is a ``NavigableString`` (or a subclass)."""
    is_string_node = isinstance(element, NavigableString)
    return is_string_node
211 | 
212 | 
def is_non_printing(element):
    """Return True when the node is a ``Comment`` or ``CData`` instance."""
    return isinstance(element, (Comment, CData))
215 | 
216 | 
def add_node_indexes(element, node_index="0"):
    """Set ``data-node-index`` on the element and, recursively, its child elements.

    Child elements are numbered from 1 in document order, skipping string
    nodes, and their index is the parent's index followed by "." and that
    number. String nodes are returned unchanged.
    """
    # String types cannot carry attributes
    if is_text(element):
        return element

    element["data-node-index"] = node_index

    child_position = 0
    for child in list(element.contents):
        if is_text(child):
            continue
        child_position += 1
        add_node_indexes(child, node_index=f"{node_index}.{child_position}")
    return element
230 | 
231 | 
def add_content_digest(element):
    """Set ``data-content-digest`` on non-string nodes and return the node."""
    if is_text(element):
        return element
    element["data-content-digest"] = content_digest(element)
    return element
236 | 
237 | 
def content_digest(element):
    """Return a SHA256-based hex digest of a node's contents.

    String nodes hash their stripped text (empty text gives ""). Elements
    with no children give "", elements with one child reuse that child's
    digest, and elements with several children hash the non-empty digests of
    their children in order.
    """
    if is_text(element):
        stripped = element.string.strip()
        if not stripped:
            return ""
        return hashlib.sha256(stripped.encode('utf-8')).hexdigest()

    children = element.contents
    if len(children) == 0:
        # Nothing to hash
        return ""
    if len(children) == 1:
        # A single child passes its digest straight up
        return content_digest(children[0])

    # Combine the non-empty child digests into one hash
    hasher = hashlib.sha256()
    for child in children:
        child_digest = content_digest(child)
        if child_digest != "":
            hasher.update(child_digest.encode('utf-8'))
    return hasher.hexdigest()
264 | 


--------------------------------------------------------------------------------
/readabilipy/simple_tree.py:
--------------------------------------------------------------------------------
 1 | """Turn input HTML into a cleaned parsed tree."""
 2 | from bs4 import BeautifulSoup
 3 | from .simplifiers.html import consolidate_text, insert_paragraph_breaks, normalise_strings, process_special_elements, process_unknown_elements, recursively_prune_elements, remove_blacklist, remove_empty_strings_and_elements, remove_metadata, strip_attributes, structural_elements, unnest_paragraphs, unwrap_elements, wrap_bare_text
 4 | 
 5 | 
def simple_tree_from_html_string(html):
    """Turn input HTML into a cleaned parsed tree.

    The HTML is parsed with html5lib and then passed through a fixed sequence
    of simplification steps. Several steps rely on earlier ones having run
    (see the inline comments), so the order of the calls matters.

    Args:
        html: the HTML document or fragment to simplify, as a string.

    Returns:
        The parsed soup when its only top-level child is a <div>; otherwise a
        newly created <div> tag with the soup appended to it.
    """
    # Insert space into non-spaced comments so that html5lib can interpret them correctly
    html = html.replace("<!---->", "<!-- -->")

    # Convert the HTML into a Soup parse tree
    soup = BeautifulSoup(html, "html5lib")

    # Remove comments, CDATA (which is converted to comments) and DOCTYPE
    remove_metadata(soup)

    # Strip the 'class' and 'style' attributes from every tag
    strip_attributes(soup)

    # Remove blacklisted elements
    remove_blacklist(soup)

    # Unwrap elements where we want to keep the text but drop the containing tag
    unwrap_elements(soup)

    # Process elements with special innerText handling
    process_special_elements(soup)

    # Process unknown elements
    process_unknown_elements(soup)

    # Consolidate text, joining any consecutive NavigableStrings together.
    # Must come before any whitespace operations (eg. remove_empty_strings_and_elements or normalise_strings)
    consolidate_text(soup)

    # Remove empty string elements
    remove_empty_strings_and_elements(soup)

    # Split out block-level elements illegally contained inside paragraphs
    unnest_paragraphs(soup)

    # Replace <br> and <hr> elements with paragraph breaks
    # Must come after remove_empty_strings_and_elements so that consecutive <br>s can be identified
    # Re-consolidates strings at the end, so must come before normalise_strings
    insert_paragraph_breaks(soup)

    # Wrap any remaining bare text in a suitable block level element
    # Must come after consolidate_text and insert_paragraph_breaks
    # otherwise there may be multiple strings inside a <p> tag which would create nested <p>s
    wrap_bare_text(soup)

    # Normalise all strings, removing whitespace and fixing unicode issues
    # Must come after consolidate_text and insert_paragraph_breaks which join
    # strings with semantic whitespace
    normalise_strings(soup)

    # Recursively remove any elements which have no children or only zero-length children
    recursively_prune_elements(soup)

    # Finally ensure that the whole tree is wrapped in a div
    # Strip out enclosing elements that cannot live inside a div
    # (repeatedly unwraps the first top-level node while it is html/head/body)
    while soup.contents and (soup.contents[0].name in structural_elements()):
        soup.contents[0].unwrap()
    # If the outermost tag is a single div then return the soup that holds it
    if len(soup.contents) == 1 and soup.contents[0].name == "div":
        return soup

    # ... otherwise wrap in a div and return that
    root = soup.new_tag("div")
    root.append(soup)
    return root
72 | 


--------------------------------------------------------------------------------
/readabilipy/simplifiers/__init__.py:
--------------------------------------------------------------------------------
 1 | from .text import normalise_text, normalise_unicode, normalise_whitespace, strip_control_characters, strip_html_whitespace
 2 | 
# Names re-exported from the .text module as this package's public interface
__all__ = [
    "normalise_text",
    "normalise_unicode",
    "normalise_whitespace",
    "strip_control_characters",
    "strip_html_whitespace",
]
10 | 


--------------------------------------------------------------------------------
/readabilipy/simplifiers/html.py:
--------------------------------------------------------------------------------
  1 | """Common HTML cleaning functions."""
  2 | from bs4 import Comment, Doctype, NavigableString
  3 | from .text import normalise_text
  4 | 
  5 | 
  6 | def elements_to_delete():
  7 |     """Elements that will be deleted together with their contents."""
  8 |     html5_form_elements = ['button', 'datalist', 'fieldset', 'form', 'input',
  9 |                            'label', 'legend', 'meter', 'optgroup', 'option',
 10 |                            'output', 'progress', 'select', 'textarea']
 11 |     html5_image_elements = ['area', 'img', 'map', 'picture', 'source']
 12 |     html5_media_elements = ['audio', 'track', 'video']
 13 |     html5_embedded_elements = ['embed', 'iframe', 'math', 'object', 'param', 'svg']
 14 |     html5_interactive_elements = ['details', 'dialog', 'summary']
 15 |     html5_scripting_elements = ['canvas', 'noscript', 'script', 'template']
 16 |     html5_data_elements = ['data', 'link']
 17 |     html5_formatting_elements = ['style']
 18 |     html5_navigation_elements = ['nav']
 19 | 
 20 |     elements = html5_form_elements + html5_image_elements \
 21 |         + html5_media_elements + html5_embedded_elements \
 22 |         + html5_interactive_elements + html5_scripting_elements \
 23 |         + html5_data_elements + html5_formatting_elements \
 24 |         + html5_navigation_elements
 25 | 
 26 |     return elements
 27 | 
 28 | 
 29 | def elements_to_replace_with_contents():
 30 |     """Elements that we will discard while keeping their contents."""
 31 |     elements = ['a', 'abbr', 'address', 'b', 'bdi', 'bdo', 'center', 'cite',
 32 |                 'code', 'del', 'dfn', 'em', 'i', 'ins', 'kbs', 'mark',
 33 |                 'rb', 'ruby', 'rp', 'rt', 'rtc', 's', 'samp', 'small', 'span',
 34 |                 'strong', 'time', 'u', 'var', 'wbr']
 35 |     return elements
 36 | 
 37 | 
 38 | def special_elements():
 39 |     """Elements that we will discard while keeping their contents that need
 40 |     additional processing."""
 41 |     elements = ['q', 'sub', 'sup']
 42 |     return elements
 43 | 
 44 | 
 45 | def block_level_whitelist():
 46 |     """Elements that we will always accept."""
 47 |     elements = ['article', 'aside', 'blockquote', 'caption', 'colgroup', 'col',
 48 |                 'div', 'dl', 'dt', 'dd', 'figure', 'figcaption', 'footer',
 49 |                 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'header', 'li', 'main',
 50 |                 'ol', 'p', 'pre', 'section', 'table', 'tbody', 'thead',
 51 |                 'tfoot', 'tr', 'td', 'th', 'ul']
 52 |     return elements
 53 | 
 54 | 
 55 | def structural_elements():
 56 |     """Structural elements we do no further processing on (though we do remove attributes and alter their contents)"""
 57 |     return ['html', 'head', 'body']
 58 | 
 59 | 
 60 | def metadata_elements():
 61 |     """Metadata elements we do no further processing on (though we do remove attributes and alter their contents)"""
 62 |     return ['meta', 'link', 'base', 'title']
 63 | 
 64 | 
 65 | def linebreak_elements():
 66 |     return ['br', 'hr']
 67 | 
 68 | 
 69 | def known_elements():
 70 |     """All elements that we know by name."""
 71 |     return structural_elements() + metadata_elements() + linebreak_elements() + elements_to_delete() \
 72 |         + elements_to_replace_with_contents() + special_elements() \
 73 |         + block_level_whitelist()
 74 | 
 75 | 
 76 | def remove_metadata(soup):
 77 |     """Remove comments, CData and doctype. These are not rendered by browsers.
 78 |     The lxml-based parsers automatically convert CData to comments unless it is
 79 |     inside <script> tags. CData will therefore be removed either as a comment
 80 |     or as part of a <script> but if any other behaviour is desired, the HTML
 81 |     will need to be pre-processed before giving it to the BeautifulSoup parser.
 82 | 
 83 |     We were a bit worried about potentially removing content here but satisfied
 84 |     ourselves it won't be displayed by most browsers in most cases
 85 |     (see https://github.com/alan-turing-institute/ReadabiliPy/issues/32)"""
 86 |     for comment in soup.findAll(string=lambda text: any(isinstance(text, x) for x in [Comment, Doctype])):
 87 |         comment.extract()
 88 | 
 89 | 
 90 | def strip_attributes(soup):
 91 |     """Strip class and style attributes."""
 92 |     for element in soup.find_all():
 93 |         element.attrs.pop("class", None)
 94 |         element.attrs.pop("style", None)
 95 | 
 96 | 
 97 | def remove_blacklist(soup):
 98 |     """Remove all blacklisted elements."""
 99 |     for element_name in elements_to_delete():
100 |         for element in soup.find_all(element_name):
101 |             element.decompose()
102 | 
103 | 
def unwrap_elements(soup):
    """Replace each element we only want the contents of by those contents."""
    # Working "bottom up" is unnecessary: unwrapping merely swaps a tag for
    # its children, so nested matches stay in the tree and are still unwrapped
    # when their turn comes.
    for inline_name in elements_to_replace_with_contents():
        matches = soup.find_all(inline_name)
        for match in matches:
            match.unwrap()
111 | 
112 | 
def process_special_elements(soup):
    """Unwrap the special elements, marking their position with plain text.

    A <q> gets a double quote inserted before and after it, a <sub> gets '_'
    before it and a <sup> gets '^' before it; the tag is then replaced by its
    contents.
    """
    # (text inserted before, text inserted after) for each special element
    markers = {
        'q': ('"', '"'),
        'sub': ('_', None),
        'sup': ('^', None),
    }
    for element_name in special_elements():
        for element in soup.find_all(element_name):
            prefix, suffix = markers.get(element.name, (None, None))
            if prefix is not None:
                element.insert_before(NavigableString(prefix))
            if suffix is not None:
                element.insert_after(NavigableString(suffix))
            # Replace the element by its contents
            element.unwrap()
127 | 
128 | 
def process_unknown_elements(soup):
    """Replace any unknown elements with their contents.

    An element is unknown when its name is not returned by known_elements().
    """
    # Build the lookup once instead of rebuilding the whole list for every tag
    known = frozenset(known_elements())
    for element in soup.find_all():
        if element.name not in known:
            element.unwrap()
134 | 
135 | 
def consolidate_text(soup):
    """Merge runs of adjacent string nodes of the same type into a single string."""
    for string_node in soup.find_all(string=True):
        preceding = string_node.previous_sibling
        # Only merge when the node directly before is exactly the same kind of string
        if type(preceding) is type(string_node):
            merged = "".join([str(preceding), str(string_node)])
            # The merged text takes the place of the earlier node; the later one is dropped
            preceding.replace_with(merged)
            string_node.extract()
145 | 
146 | 
def remove_empty_strings_and_elements(soup):
    """Extract every node whose text form normalises to an empty string.

    This drops whitespace-only strings; without it, consecutive linebreaks
    may not be identified correctly.
    """
    # NOTE(review): str() of a tag presumably includes its markup, in which case
    # only string nodes can ever match here - confirm.
    # Snapshot the descendants first because nodes are removed during the walk
    snapshot = list(soup.descendants)
    for node in snapshot:
        if normalise_text(str(node)) == "":
            node.extract()
153 | 
154 | 
def unnest_paragraphs(soup):
    """Split out block-level elements illegally contained inside paragraphs.

    Each <p> that contains one of the illegal elements is replaced by up to
    three siblings: a <p> holding whatever preceded the nested element, the
    nested element itself, and a <p> holding whatever followed it. The search
    is repeated until no <p> contains an element of that type.

    Args:
        soup: the parsed tree, modified in place.
    """
    # Headings are listed individually: a literal "h1>-<h6" entry never matches any tag
    illegal_elements = ["address", "article", "aside", "blockquote", "canvas", "dd", "div", "dl", "dt", "fieldset",
                        "figcaption", "figure", "footer", "form", "h1", "h2", "h3", "h4", "h5", "h6", "header",
                        "hr", "li", "main", "nav", "noscript", "ol", "p", "pre", "section", "table", "tfoot", "ul",
                        "video"]
    for nested_type in illegal_elements:
        # Search for nested elements that need to be split out
        nested_elements = [e for e in soup.find_all('p') if e.find(nested_type)]
        while nested_elements:
            # Separate this element into the nested element, plus before and after
            elem_nested = nested_elements[0].find(nested_type)
            p_before = soup.new_tag("p")
            # previous_siblings walks backwards from the nested element, so reverse
            # it to append the preceding content in its original document order
            for sibling in reversed(list(elem_nested.previous_siblings)):
                p_before.append(sibling)
            p_after = soup.new_tag("p")
            for sibling in list(elem_nested.next_siblings):
                p_after.append(sibling)
            # Replace element by before/nested/after.
            # NB. this is done in reverse order as we are adding after the current position
            nested_elements[0].insert_after(p_after)
            nested_elements[0].insert_after(elem_nested)
            nested_elements[0].insert_after(p_before)
            nested_elements[0].decompose()
            # Rerun search for nested elements now that we have rewritten the tree
            nested_elements = [e for e in soup.find_all('p') if e.find(nested_type)]
180 | 
181 | 
def insert_paragraph_breaks(soup):
    """Identify <br> and <hr> and split their parent element into multiple elements where appropriate.

    A lone <br> becomes a single space. A run of two or more <br>s, and every
    <hr>, becomes a paragraph break: inside a <p> the text after the break is
    moved into new sibling <p> tags, elsewhere the text is split into separate
    strings.

    Args:
        soup: the parsed tree, modified in place.
    """
    # Indicator which is used as a placeholder to mark paragraph breaks
    BREAK_INDICATOR = "|BREAK_HERE|"

    # Find consecutive <br> elements and replace with a break marker
    for element in soup.find_all('br'):
        # Only act on the last <br> of a run, then walk backwards to count how long the chain is
        if (element.next_sibling is None) or (element.next_sibling.name != 'br'):
            br_element_chain = [element]
            while (br_element_chain[-1].previous_sibling is not None) and (br_element_chain[-1].previous_sibling.name == 'br'):
                br_element_chain.append(br_element_chain[-1].previous_sibling)

            # If there's only one <br> then we replace it with a space
            if len(br_element_chain) == 1:
                br_element_chain[0].replace_with(' ')
            # If there are multiple <br>s then replace them with BREAK_INDICATOR
            else:
                br_element_chain[0].replace_with(BREAK_INDICATOR)
                for inner_element in br_element_chain[1:]:
                    inner_element.decompose()

    # Replace every <hr> element with a break marker
    # Use a list rather than the generator, since we are altering the tree as we traverse it
    for element in list(soup.find_all('hr')):
        element.replace_with(BREAK_INDICATOR)

    # Consolidate the text again now that we have added strings to the tree
    consolidate_text(soup)

    # Iterate through the tree, splitting string elements which contain BREAK_INDICATOR
    # Use a list rather than the generator, since we are altering the tree as we traverse it
    for element in list(soup.find_all(string=True)):
        if BREAK_INDICATOR in element:
            # Split the text into two or more fragments (there may be multiple BREAK_INDICATORs in the string)
            text_fragments = [s.strip() for s in str(element).split(BREAK_INDICATOR)]

            # Get the parent element
            parent_element = element.parent

            # If the parent is a paragraph then we want to close and reopen by creating a new tag
            if parent_element.name == "p":
                # Iterate in reverse order as we are repeatedly adding new elements directly after the original one
                # (the slice [:0:-1] visits every fragment except the first, last one first)
                for text_fragment in text_fragments[:0:-1]:
                    new_p_element = soup.new_tag("p")
                    new_p_element.string = text_fragment
                    parent_element.insert_after(new_p_element)
                # Replace this element by a navigable string containing the first text fragment
                element.replace_with(NavigableString(text_fragments[0]))
            # Otherwise we want to simply include all the text fragments as independent NavigableStrings (that will be wrapped later)
            else:
                # Iterate in reverse order as we are repeatedly adding new elements directly after the original one
                for text_fragment in text_fragments[:0:-1]:
                    element.insert_after(soup.new_string(text_fragment))
                # The original string keeps only the first fragment
                element.string.replace_with(text_fragments[0])
237 | 
238 | 
def normalise_strings(soup):
    """Normalise the whitespace and unicode of every string in the tree."""
    # Covers all strings in the tree, including bare strings outside tags
    for string_node in soup.find_all(string=True):
        # Normalisation works on (and returns) a plain str, so convert first
        cleaned = normalise_text(str(string_node))
        # Rebuild a node of the same string class around the cleaned text
        string_class = type(string_node)
        string_node.replace_with(string_class(cleaned))
248 | 
249 | 
def wrap_bare_text(soup):
    """Wrap any remaining bare text in <p> tags.

    The aim is a strong, unique correspondence between presentational
    paragraphs and DOM structure:
     - every presentational paragraph is the only content of its immediate parent
     - presentational paragraphs at the same conceptual level are equally nested
     - the string as displayed in the browser is equivalent to the innerHTML of
       the parent (so that indexing is equivalent between presentation and source)

    The following shapes are therefore not allowed:

     1. Two presentational elements at the same DOM level have non-equivalent index levels
       <div index="1.1">
         text
         <p index="1.1.1">more text</p>
       </div>

     2. Index 1.1 might contain both strings
       <div index="1.1">
         <p index="1.1.1">more text</p>
         text
       </div>

     3. Two presentational paragraphs are included in the same index
       <div index="1.1">
         text
         <p index="1.1.1">more text</p>
         yet more text
       </div>
    """
    for text_node in soup.find_all(string=True):
        container = text_node.parent
        # A string that is the sole child of a whitelisted block is left alone.
        # Adding <p> tags there
        # - might not be allowed for all whitelisted elements
        # - would add structure that was not present in the original document
        if container.name in block_level_whitelist() and len(container.contents) == 1:
            continue
        # Every other string gets its own <p> wrapper
        wrapper = soup.new_tag("p")
        wrapper.string = text_node
        text_node.replace_with(wrapper)
292 | 
293 | 
def recursively_prune_elements(soup):
    """Recursively prune out any elements which have no children or only zero-length children.

    Removing an element can leave its parent empty, so the pruning pass is
    repeated until a pass removes nothing.

    Args:
        soup: the parsed tree, modified in place.
    """
    def single_replace():
        """Run one pruning pass over the tree and return how many elements it removed."""
        n_removed = 0
        # Remove elements with no children
        for element in soup.find_all(lambda elem: len(list(elem.children)) == 0):
            element.decompose()
            n_removed += 1
        # Remove elements with only zero-length children
        # (presumably len() of a string child is its character count - TODO confirm)
        for element in soup.find_all(lambda elem: sum(len(c) for c in elem.children) == 0):
            element.decompose()
            n_removed += 1
        return n_removed
    # Repeatedly apply single_replace() until no elements are being removed
    while single_replace():
        pass
310 | 


--------------------------------------------------------------------------------
/readabilipy/simplifiers/text.py:
--------------------------------------------------------------------------------
 1 | """Common text manipulation functions."""
 2 | import unicodedata
 3 | import regex
 4 | 
# Pairs of (opening, closing) punctuation marks
matched_punctuation_marks = [('“', '”'), ('‘', '’'), ('(', ')'), ('[', ']'), ('{', '}')]
# Punctuation marks listed as terminal
terminal_punctuation_marks = ['.', ',', '!', ':', ';', '?']
 7 | 
 8 | 
 9 | def normalise_unicode(text):
10 |     """Normalise unicode such that things that are visually equivalent map to the same unicode string where possible."""
11 |     normal_form = "NFKC"
12 |     text = unicodedata.normalize(normal_form, text)
13 |     return text
14 | 
15 | 
def normalise_whitespace(text):
    """Collapse each run of whitespace into one space and trim both ends.

    This mirrors what happens to whitespace when HTML text is displayed.
    """
    collapsed = regex.sub(r"\s+", " ", text)
    # Drop whatever whitespace is left at the start and end
    return collapsed.strip()
22 | 
23 | 
def normalise_text(text):
    """Strip control characters, then normalise unicode and whitespace.

    The steps run in that order: unicode is normalised before whitespace so
    that whitespace characters are standardised as far as possible before
    they are collapsed.
    """
    without_controls = strip_control_characters(text)
    unified = normalise_unicode(without_controls)
    return normalise_whitespace(unified)
31 | 
32 | 
def strip_html_whitespace(text):
    """Simplify HTML by normalising its text and removing spaces next to tags.

    After normalisation, a space directly before '<' and a space directly
    after '>' are removed.
    """
    simplified = normalise_text(text)
    for padded, bare in ((" <", "<"), ("> ", ">")):
        simplified = simplified.replace(padded, bare)
    return simplified
39 | 
40 | 
def strip_control_characters(text):
    """Remove unicode control characters which might break the parsing.

    Characters in the following general categories are dropped, except for
    tab, newline, carriage return and form feed, which are kept:
      Cc: Other, Control [includes new lines]
      Cf: Other, Format
      Cn: Other, Not Assigned
      Co: Other, Private Use
      Cs: Other, Surrogate
    """
    control_categories = {'Cc', 'Cf', 'Cn', 'Co', 'Cs'}
    retained_chars = ('\t', '\n', '\r', '\f')

    kept = []
    for char in text:
        # Keep explicitly retained characters and anything outside the control categories
        if char in retained_chars or unicodedata.category(char) not in control_categories:
            kept.append(char)
    return "".join(kept)
54 | 


--------------------------------------------------------------------------------
/readabilipy/utils.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | """Utility functions
 4 | 
 5 | """
 6 | 
 7 | import os
 8 | import subprocess
 9 | import sys
10 | 
11 | from contextlib import contextmanager
12 | 
13 | 
@contextmanager
def chdir(path):
    """Context manager that switches the working directory to `path`.

    The directory that was current on entry is restored on exit, even when
    the body of the `with` block raises.
    """
    # From https://stackoverflow.com/a/37996581, couldn't find a built-in
    previous_dir = os.getcwd()
    os.chdir(path)
    try:
        yield
    finally:
        os.chdir(previous_dir)
24 | 
25 | 
def have_npm():
    """Report whether a working `npm` executable is available.

    Runs `npm version` with its output discarded.

    Returns:
        True when the command could be started and exited with status 0,
        False when it could not be started or exited with a non-zero status.
    """
    try:
        # check=False: a non-zero exit is reported through the return value
        # below instead of raising CalledProcessError at the caller
        cp = subprocess.run(
            ["npm", "version"],
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
            check=False
        )
    except OSError:
        # Covers FileNotFoundError (npm not installed) and e.g. a non-executable npm
        return False
    return cp.returncode == 0
37 | 
38 | 
def run_npm_install():
    """Run `npm install` inside this package's `javascript` directory.

    Nothing is raised when installation is impossible or fails; a message is
    written to stderr instead:
      - a warning when no working npm is found,
      - an error when `javascript/package.json` is missing,
      - an error when `npm install` cannot be started or exits non-zero.
    """
    # Run NPM installation
    if not have_npm():
        print(
            "Warning: A working NPM installation was not found. The package will use Python-based article extraction.",
            file=sys.stderr,
        )
        return

    here = os.path.abspath(os.path.dirname(__file__))
    jsdir = os.path.join(here, "javascript")
    pkgjson = os.path.join(jsdir, "package.json")
    if not os.path.exists(pkgjson):
        print(
            "Error: Couldn't find package.json. Package will use Python-based extraction.",
            file=sys.stderr,
        )
        return

    with chdir(jsdir):
        try:
            # check=False so that a failed install reaches the error message
            # below rather than raising CalledProcessError
            cp = subprocess.run(["npm", "install"], check=False)
            returncode = cp.returncode
        except OSError:
            # npm could not be started at all (e.g. FileNotFoundError)
            returncode = 1

    if returncode != 0:
        print(
            "Error: Failed to install dependencies with npm. Package will fall back on Python-based extraction.",
            file=sys.stderr,
        )
70 | 


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | # -*- coding: utf-8 -*-
  3 | 
  4 | import io
  5 | import os
  6 | 
  7 | from contextlib import contextmanager
  8 | 
  9 | from setuptools import find_packages, setup
 10 | 
# Package meta-data.
NAME = "readabilipy"
DESCRIPTION = "Python wrapper for Mozilla's Readability.js"
URL = "https://github.com/alan-turing-institute/ReadabiliPy"
AUTHOR = "The Alan Turing Institute"
AUTHOR_EMAIL = "info@turing.ac.uk"
MAINTAINER = "James Robinson"
MAINTAINER_EMAIL = "jrobinson@turing.ac.uk"
REQUIRES_PYTHON = ">=3.6.0"
# When left as None the version is read from the package's __version__.py further down
VERSION = None

# What packages are required for this module to be executed?
REQUIRED = [
    "beautifulsoup4>=4.7.1",
    "html5lib",
    "lxml",
    "regex",
]

# Optional dependency groups, exposed as extras below
docs_require = ["sphinx", "m2r"]
test_require = ["coveralls", "pycodestyle", "pyflakes", "pylint", "pytest", "pytest-benchmark", "pytest-cov"]
dev_require = []

# What packages are optional?
# The "dev" extra is the union of the docs, test and dev groups
EXTRAS = {
    "docs": docs_require,
    "test": test_require,
    "dev": docs_require + test_require + dev_require,
}
 40 | 
 41 | # The rest you shouldn't have to touch too much :)
 42 | # ------------------------------------------------
 43 | # Except, perhaps the License and Trove Classifiers!
 44 | # If you do change the License, remember to change the Trove Classifier for that!
 45 | 
 46 | here = os.path.abspath(os.path.dirname(__file__))
 47 | 
 48 | # Import the README and use it as the long-description.
 49 | # Note: this will only work if 'README.md' is present in your MANIFEST.in file!
 50 | try:
 51 |     with io.open(os.path.join(here, "README.md"), encoding="utf-8") as f:
 52 |         long_description = "\n" + f.read()
 53 | except FileNotFoundError:
 54 |     long_description = DESCRIPTION
 55 | 
 56 | # Load the package's __version__.py module as a dictionary.
 57 | about = {}
 58 | if not VERSION:
 59 |     project_slug = NAME.lower().replace("-", "_").replace(" ", "_")
 60 |     with open(os.path.join(here, project_slug, "__version__.py")) as f:
 61 |         exec(f.read(), about)
 62 | else:
 63 |     about["__version__"] = VERSION
 64 | 
 65 | 
 66 | @contextmanager
 67 | def chdir(path):
 68 |     # From https://stackoverflow.com/a/37996581, couldn't find a built-in
 69 |     original_path = os.getcwd()
 70 |     os.chdir(path)
 71 |     try:
 72 |         yield
 73 |     finally:
 74 |         os.chdir(original_path)
 75 | 
 76 | 
# Where the magic happens:
# Registers the package using the metadata, requirements and extras defined above.
setup(
    name=NAME,
    version=about["__version__"],
    description=DESCRIPTION,
    long_description=long_description,
    long_description_content_type="text/markdown",
    author=AUTHOR,
    author_email=AUTHOR_EMAIL,
    maintainer=MAINTAINER,
    maintainer_email=MAINTAINER_EMAIL,
    python_requires=REQUIRES_PYTHON,
    url=URL,
    # Ship every package found except the test packages
    packages=find_packages(
        exclude=["tests", "*.tests", "*.tests.*", "tests.*"]
    ),
    # Installs a `readabilipy` command that calls main() in readabilipy/__main__.py
    entry_points={
        "console_scripts": ["readabilipy=readabilipy.__main__:main"],
    },
    install_requires=REQUIRED,
    extras_require=EXTRAS,
    include_package_data=True,
    license="MIT",
    classifiers=[
        # Trove classifiers
        # Full list: https://pypi.python.org/pypi?%3Aaction=list_classifiers
        "License :: OSI Approved :: MIT License",
        "Programming Language :: Python",
        "Programming Language :: Python :: 3",
        "Programming Language :: Python :: 3.6",
        "Programming Language :: Python :: 3.7",
        "Programming Language :: Python :: 3.8",
        "Programming Language :: Python :: 3.9",
        "Programming Language :: Python :: 3.10",
        "Programming Language :: Python :: 3.11",
        "Programming Language :: Python :: 3.12",
        "Programming Language :: Python :: Implementation :: CPython",
        "Programming Language :: Python :: Implementation :: PyPy",
    ],
)
117 | 


--------------------------------------------------------------------------------
/tests/checks.py:
--------------------------------------------------------------------------------
 1 | import json
 2 | import os
 3 | from readabilipy import simple_json_from_html_string
 4 | from readabilipy.simplifiers import strip_html_whitespace
 5 | from readabilipy.simple_json import extract_text_blocks_as_plain_text
 6 | 
 7 | 
 8 | def get_normalised_html_output(test_fragment, expected_output=None):
 9 |     """Get normalised HTML output."""
10 |     if expected_output is None:
11 |         expected_output = test_fragment
12 |     article_json = simple_json_from_html_string(test_fragment)
13 |     content = str(article_json["plain_content"])
14 |     # Check that expected output is present after simplifying the HTML
15 |     normalised_expectation = strip_html_whitespace(expected_output)
16 |     normalised_result = strip_html_whitespace(content)
17 |     print("expectation:", normalised_expectation)
18 |     print("result:", normalised_result)
19 |     return (normalised_expectation, normalised_result)
20 | 
21 | 
def check_exact_html_output(test_fragment, expected_output=None):
    """Assert that parsing the HTML fragment yields exactly the expected output."""
    expectation, result = get_normalised_html_output(test_fragment, expected_output)
    assert expectation == result
26 | 
27 | 
def check_html_output_contains_text(test_fragment, expected_output=None):
    """Assert that the expected output appears somewhere in the parsed HTML fragment."""
    expectation, result = get_normalised_html_output(test_fragment, expected_output)
    assert expectation in result
32 | 
33 | 
def check_extract_article(test_filename, expected_filename, content_digests=False, node_indexes=False, use_readability_js=False):
    """Test end-to-end article extraction.

    Parses the HTML in `test_filename` and asserts that the result equals the
    JSON stored in `expected_filename`. Both files are looked up in the `data`
    directory next to this module.
    """
    data_dir = os.path.join(os.path.dirname(__file__), "data")

    # Read HTML test file
    with open(os.path.join(data_dir, test_filename), encoding="utf-8") as html_file:
        html = html_file.read()

    # Extract simplified article HTML; use_readability is only passed when requested
    extra_kwargs = {"use_readability": True} if use_readability_js else {}
    article_json = simple_json_from_html_string(html, content_digests, node_indexes, **extra_kwargs)

    # Get expected simplified article HTML
    with open(os.path.join(data_dir, expected_filename), encoding="utf-8") as json_file:
        expected_article_json = json.loads(json_file.read())

    # Test full JSON matches (checks for unexpected fields in either actual or expected JSON)
    assert article_json == expected_article_json
55 | 
56 | 
def check_extract_paragraphs_as_plain_text(test_filename, expected_filename):
    """Assert that the plain-text paragraphs extracted from an article match the expected JSON.

    Both files are looked up in the `data` directory next to this module.
    """
    data_dir = os.path.join(os.path.dirname(__file__), "data")

    # Read readable article test file
    with open(os.path.join(data_dir, test_filename), encoding="utf-8") as article_file:
        article = json.loads(article_file.read())

    # Extract plain text paragraphs
    paragraphs = extract_text_blocks_as_plain_text(article["plain_content"])

    # Get expected plain text paragraphs
    with open(os.path.join(data_dir, expected_filename), encoding="utf-8") as expected_file:
        expected_paragraphs = json.loads(expected_file.read())

    # Test
    assert paragraphs == expected_paragraphs
75 | 
76 | 
def check_html_has_no_output(test_fragment):
    """Assert that parsing the HTML fragment yields no content.

    The "plain_content" entry returned by simple_json_from_html_string must be
    either None or the empty wrapper "<div></div>".
    """
    plain_content = simple_json_from_html_string(test_fragment)["plain_content"]
    # "No output" covers both a missing value and an empty wrapper div
    assert plain_content is None or plain_content == "<div></div>"
82 | 
83 | 
def check_html_output_does_not_contain_tag(test_fragment, vetoed_tag):
    """Check that vetoed tag is not present when parsing HTML fragment.

    The fragment is parsed with simple_json_from_html_string and its
    "plain_content" entry is searched for the literal strings "<tag>" and
    "</tag>". When "plain_content" is None there is nothing to search and the
    check passes.

    Note that only the bare forms are matched: an opening tag that carries
    attributes (for example "<tag class=...>") is not detected by this check.
    """
    article_json = simple_json_from_html_string(test_fragment)
    plain_content = article_json["plain_content"]
    # Test for None *before* converting to str: str(None) is the string "None",
    # so a None check made after the conversion could never be false.
    if plain_content is None:
        return
    content = str(plain_content)
    # Neither <tag> nor </tag> may appear in the output
    for element in (f"<{vetoed_tag}>", f"</{vetoed_tag}>"):
        assert element not in content
92 | 


--------------------------------------------------------------------------------
/tests/data/addictinginfo.com-1_full_article.html:
--------------------------------------------------------------------------------
  1 | <div id="post-342398" class="clearfix post-342398 post type-post status-publish format-standard has-post-thumbnail hentry category-current-issues category-news tag-donald-trump tag-elizabeth-warren">
  2 | 	<div class="breadcrumbs"><a href="http://addictinginfo.com/category/info/current-issues/" rel="category tag">Current
  3 | 			Issues</a>, <a href="http://addictinginfo.com/category/news/" rel="category tag">News</a></div>
  4 | 	<h1 class="entry-title">
  5 | 		<a href="http://addictinginfo.com/2018/10/15/trump-denies-charitable-donation-he-promised-if-elizabeth-warren-releases-dna-results-and-its-on-video/"
  6 | 		 title="Permalink to Trump Denies Charitable Donation He Promised If Elizabeth Warren Releases DNA Results And It&#8217;s On Video"
  7 | 		 rel="bookmark">Trump Denies Charitable Donation He Promised If Elizabeth Warren Releases DNA Results And
  8 | 			It&#8217;s On Video</a>
  9 | 	</h1>
 10 | 	<div class="post-meta">
 11 | 		<span>By <a class="url fn n" href="http://addictinginfo.com/author/conover100gmail-com/">Conover Kennard</a></span>
 12 | 		<time class="entry-date published updated" datetime="2018-10-15T12:13:54+00:00">
 13 | 			on October 15, 2018 12:13 pm </time>
 14 | 		<span class="separator">&middot;</span>
 15 | 	</div>
 16 | 	<div class="entry entry-content">
 17 | 		<p>Donald Trump has repeatedly lashed out at Sen. Elizabeth Warren (D-MA) at his rallies, calling her Pochahontas
 18 | 			to cast doubt on her Native American Heritage. Trump even referred to Warren as Pocahontas at a November 2017
 19 | 			White House event honoring Navajo code-breakers. At a rally in Montana this past July, Trump offered Sen. Warren
 20 | 			$1 million if she took a DNA test to prove her heritage. Trump spoke of bringing a DNA test to a debate if she
 21 | 			challenges him.</p>
 22 | 		<div class='ad-inserter ad-inserter-1' style='margin: 8px auto; text-align: center; clear: both;'>
 23 | 			<div class="topgooglenews"></div>
 24 | 		</div>
 25 | 		<p>“We will take that little kit — but we have to do it gently. Because we’re in the #MeToo generation, we have
 26 | 			to do it gently,” Trump said. “And we will very gently take that kit, and slowly toss it, hoping it doesn’t
 27 | 			injure her arm, and we will say: I will give you a million dollars to your favorite charity, paid for by Trump,
 28 | 			if you take the test and it shows you’re an Indian. I have a feeling she will say no.”</p>
 29 | 		<div class='ad-inserter ad-inserter-13' style='margin: 8px 0; clear: both;'>
 30 | 			<script>
 31 | 				function parseRSS(t, n) {
 32 | 					$.ajax({
 33 | 						url: "https://api.rss2json.com/v1/api.json?rss_url=" + encodeURIComponent(t),
 34 | 						dataType: "json",
 35 | 						success: function (t) {
 36 | 							n(t)
 37 | 						}
 38 | 					})
 39 | 				}
 40 | 				var rssUrl = "http://addictinginfo.com/feed/";
 41 | 				html = "", parseRSS(rssUrl, function (t) {
 42 | 					var rand = Math.floor(Math.random() * 10 + 1);
 43 | 					var n = t.items;
 44 | 					for (i = 0; i < 1; i++) html = "<span style='color:red;font-weight:bold;'>Related: </span> <a href='" + n[
 45 | 						rand].link + "' target=_blank>" + n[rand].title + "</a>";
 46 | 					$("#related-story").html(html)
 47 | 				});
 48 | 			</script>
 49 | 			<div id="related-story" style="float:left;width:100%;margin:10px 0;font-size:15px;font-family: sans-serif;font-weight:500;"></div>
 50 | 		</div>
 51 | 		<div class='ad-inserter ad-inserter-2' style='margin: 8px auto; text-align: center; clear: both;'>
 52 | 			<script async src="http://cdn.a3m.io/i1.js" styling="bottom: 20,right: 20, width: 320"></script>
 53 | 		</div>
 54 | 		<p>Today the Massachusetts Senator <a href="http://usnationalnews.org/elizabeth-warren-just-sent-huge-fck-you-to-trump-releases-her-dna-results/"
 55 | 			 target="_blank" rel="noopener">released her DNA results</a>.</p>
 56 | 		<p>&#8220;The facts suggest that you absolutely have a Native American ancestor in your pedigree,” an
 57 | 			Ancestry.com and 23andMe adviser told Warren.</p>
 58 | 		<p>Warren tweeted Trump to demand he pay what he promised and requested that he make the donation to the National
 59 | 			Indigenous Women’s Resource Center.</p>
 60 | 		<blockquote class="twitter-tweet" data-lang="en">
 61 | 			<p dir="ltr" lang="en">By the way, <a href="https://twitter.com/realDonaldTrump?ref_src=twsrc%5Etfw">@realDonaldTrump</a>:
 62 | 				Remember saying on 7/5 that you’d give $1M to a charity of my choice if my DNA showed Native American ancestry?
 63 | 				I remember – and here&#8217;s the verdict. Please send the check to the National Indigenous Women’s Resource
 64 | 				Center: <a href="https://t.co/I6YQ9hf7Tv">https://t.co/I6YQ9hf7Tv</a> <a href="https://t.co/J4gBamaeeo">pic.twitter.com/J4gBamaeeo</a></p>
 65 | 			<p>— Elizabeth Warren (@elizabethforma) <a href="https://twitter.com/elizabethforma/status/1051820933982359553?ref_src=twsrc%5Etfw">October
 66 | 					15, 2018</a></p>
 67 | 		</blockquote>
 68 | 		<p>
 69 | 			<script async src="https://platform.twitter.com/widgets.js" charset="utf-8"></script>
 70 | 		</p>
 71 | 		<p>In response, White House counselor Kellyanne Conway<a href="http://usnationalnews.org/unhinged-kellyanne-conway-drops-shocking-statement-about-warrens-latest-dna-results/"
 72 | 			 target="_blank" rel="noopener"> called</a> DNA testing &#8220;junk science.&#8221;</p>
 73 | 		<p>Then, Trump flat out denied ever promising to make that donation even though it&#8217;s on tape.</p>
 74 | 		<p>“Who cares?” Trump told reporters today. “I didn’t say that. Nope, you better read it again.”</p>
 75 | 		<p>Well, MSNBC has the receipts:</p>
 76 | 		<blockquote class="twitter-tweet" data-conversation="none" data-lang="en">
 77 | 			<p dir="ltr" lang="en">MSNBC put together video of Trump unequivocally telling Elizabeth Warren in July that
 78 | 				&#8220;I&#8217;ll give you a million dollars, to your favorite charity, paid for by Trump, if you take [a DNA]
 79 | 				test and it shows you&#8217;re an Indian&#8221; &#8212; followed by him denying he ever said that this morning.
 80 | 				<a href="https://t.co/zR3n2DqaiY">pic.twitter.com/zR3n2DqaiY</a></p>
 81 | 			<p>— Aaron Rupar (@atrupar) <a href="https://twitter.com/atrupar/status/1051846550236930053?ref_src=twsrc%5Etfw">October
 82 | 					15, 2018</a></p>
 83 | 		</blockquote>
 84 | 		<p>
 85 | 			<script async src="https://platform.twitter.com/widgets.js" charset="utf-8"></script>
 86 | 		</p>
 87 | 		<p><em>Image via screen capture.</em></p>
 88 | 		<div class='ad-inserter ad-inserter-3' style='margin: 8px auto; text-align: center; clear: both;'>
 89 | 			<center>
 90 | 				<div class="teadbelowcontentad"></div>
 91 | 			</center>
 92 | 			<div class="switchlockerdome"></div>
 93 | 
 94 | 			<div class="bottom-socialwrap">
 95 | 				<div class="socialwrap-title">Share this Article!</div>
 96 | 				<a href="" target="_blank" class="socicon-facebook fburl">
 97 | 					<div class="facebooklike-left"><i class="icon-facebook"></i> Share on Facebook</div>
 98 | 				</a>
 99 | 				<a href="https://twitter.com/intent/tweet?text=<?php the_title();?>&url=<?php the_permalink();?>&image=<?php echo wp_get_attachment_url( get_post_thumbnail_id( get_the_id() ) );?>?w=640"
100 | 				 target="_blank" class="socicon-twitter twurl">
101 | 					<div class="twitterlike-right"><i class="icon-twitter"></i> Share on Twitter</div>
102 | 				</a>
103 | 			</div>
104 | 			<style>
105 | 				.bottom-socialwrap {float: left;width: calc(100% - 40px);padding: 30px 20px;margin: 30px 0;background-color: #ffffff;border-top: 7px solid #eee;border-bottom: 7px solid #eee;}
106 | .socialwrap-title {float: left;width: 100%;text-align: center;font-size: 21px;color: #CA0002;font-family: verdana;letter-spacing: 0.6px;margin-top: -13px;margin-bottom: 16px;}
107 | .facebooklike-left {float: left;width: 47%;height: 46px;background: #3960BA;text-align: center;color: #FFFFFF;font-size: 14px;line-height: 46px;font-family: 'Open Sans', sans-serif;}
108 | .icon-facebook:before {content: "\f09a";font-family: FontAwesome;font-style: normal;padding-right: 6px;display:inline-block;}
109 | .twitterlike-right {float: right;width: 47%;height: 46px;background: #52A6F7;text-align: center;color: #FFFFFF;font-size: 14px;line-height: 46px;font-family: 'Open Sans', sans-serif;}
110 | .icon-twitter:before {content: "\f099";font-family: FontAwesome;font-style: normal;padding-right: 6px;display:inline-block;}
111 | @media screen and (max-width:500px) {.bottom-socialwrap {width: 100% !important;padding: 30px 0 !important;}.icon-facebook, .icon-twitter {display: none;}}
112 | </style>
113 | 			<script type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/jquery/1.9.0/jquery.min.js"></script>
114 | 			<script>
115 | 				jQuery(document).ready(function ($) {
116 | 					var currentPageTitle = $('.post h1 a').text();
117 | 					var currentPage = window.location.href;
118 | 					$('.fburl').attr('href', 'https://www.facebook.com/sharer/sharer.php?u=' + currentPage);
119 | 					$('.twurl').attr('href', 'https://twitter.com/intent/tweet?text=' + currentPageTitle + '&url=' + currentPage);
120 | 					$('.socicon-facebook').click(function () {
121 | 						winWidth = 520;
122 | 						winHeight = 350;
123 | 						var winTop = (screen.height / 2) - (winHeight / 2);
124 | 						var winLeft = (screen.width / 2) - (winWidth / 2);
125 | 						var currentPageTitle = $('.post h1 a').text();
126 | 						var currentPage = window.location.href;
127 | 						window.open('https://www.facebook.com/sharer/sharer.php?u=' + currentPage, 'sharer', 'top=' + winTop +
128 | 							',left=' + winLeft + ',toolbar=0,status=0,width=' + winWidth + ',height=' + winHeight);
129 | 					});
130 | 				});
131 | 			</script>
132 | 		</div>
133 | 		<div class='ad-inserter ad-inserter-4 ai-viewport-1' style='margin: 8px auto; text-align: center; clear: both;'>
134 | 
135 | 			<ins class="adbladeads" data-cid="30884-1603787504" data-host="web.adblade.com" data-tag-type="4" style="display:none"></ins>
136 | 			<script async src="http://web.adblade.com/js/ads/async/show.js" type="text/javascript"></script>
137 | 
138 | 
139 | 		</div>
140 | 		<div class='ad-inserter ad-inserter-7' style='margin: 8px 0; clear: both;'>
141 | 
142 | 			<script type="text/javascript">
143 | 				_atrk_opts = {
144 | 					atrk_acct: "Ckvzg1awO700y8",
145 | 					domain: "addictinginfo.org",
146 | 					dynamic: true
147 | 				};
148 | 				(function () {
149 | 					var as = document.createElement('script');
150 | 					as.type = 'text/javascript';
151 | 					as.async = true;
152 | 					as.src = "https://d31qbv1cthcecs.cloudfront.net/atrk.js";
153 | 					var s = document.getElementsByTagName('script')[0];
154 | 					s.parentNode.insertBefore(as, s);
155 | 				})();
156 | 			</script>
157 | 			<noscript><img src="https://d5nxst8fruw4z.cloudfront.net/atrk.gif?account=Ckvzg1awO700y8" style="display:none"
158 | 				 height="1" width="1" alt="" /></noscript>
159 | 		</div>
160 | 		<div class='ad-inserter ad-inserter-10 ai-viewport-2 ai-viewport-3' style='margin: 8px 0; clear: both;'>
161 | 			<ins class="adbladeads" data-cid="30888-2250940359" data-host="web.adblade.com" data-tag-type="4" style="display:none"></ins>
162 | 			<script async src="http://web.adblade.com/js/ads/async/show.js" type="text/javascript"></script>
163 | 		</div>
164 | 		<div class="clear"></div>
165 | 		<div class="clear"></div>
166 | 	</div>
167 | 	<div class="clear"></div>
168 | 	<div class="post_author vcard">
169 | 		<img alt='' src='http://0.gravatar.com/avatar/f6d2c6722372d33d52a0080cd1072001?s=70&#038;d=identicon&#038;r=g'
170 | 		 srcset='http://0.gravatar.com/avatar/f6d2c6722372d33d52a0080cd1072001?s=140&#038;d=identicon&#038;r=g 2x' class='avatar avatar-70 photo'
171 | 		 height='70' width='70' /><span>Author: <a class="url fn n" href="http://addictinginfo.com/author/conover100gmail-com/">Conover
172 | 				Kennard</a></span>Conover makes tea partiers cry as a hobby. She was Commander of Jade Helm15 during the failed
173 | 		takeover of the South. She's also one of the biggest arseholes on Twitter. At night, she can be found drinking
174 | 		Conservative tears while pulling off the wings of flies just because she can. She is the founder of a Marxist,
175 | 		Commie, Maoist, Socialist site and has contributed to several other sites, blah blah blah. She is an awful person
176 | 		but she doesn't like to brag about that.
177 | 		<div class="clear"></div>
178 | 	</div>
179 | </div>
180 | 


--------------------------------------------------------------------------------
/tests/data/addictinginfo.com-1_full_page_javascript.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "title": "Trump Denies Charitable Donation He Promised If Elizabeth Warren Releases DNA Results And It’s On Video",
 3 |     "byline": "Conover Kennard",
 4 |     "date": null,
 5 |     "content": "<div id=\"readability-page-1\" class=\"page\"><div id=\"post-342398\">\n\n\n\n<div>\n<p>Donald Trump has repeatedly lashed out at Sen. Elizabeth Warren (D-MA) at his rallies, calling her Pochahontas to cast doubt on her Native American Heritage. Trump even referred to Warren as Pocahontas at a November 2017 White House event honoring Navajo code-breakers. At a rally in Montana this past July, Trump offered Sen. Warren $1 million if she took a DNA test to prove her heritage. Trump spoke of bringing a DNA test to a debate if she challenges him.</p>\n<p>“We will take that little kit — but we have to do it gently. Because we’re in the #MeToo generation, we have to do it gently,” Trump said. “And we will very gently take that kit, and slowly toss it, hoping it doesn’t injure her arm, and we will say: I will give you a million dollars to your favorite charity, paid for by Trump, if you take the test and it shows you’re an Indian. I have a feeling she will say no.”</p>\n\n<p>Today the Massachusetts Senator <a href=\"http://usnationalnews.org/elizabeth-warren-just-sent-huge-fck-you-to-trump-releases-her-dna-results/\" target=\"_blank\" rel=\"noopener\">released her DNA results</a>.</p>\n<p>“The facts suggest that you absolutely have a Native American ancestor in your pedigree,” an Ancestry.com and 23andMe adviser told Warren.</p>\n<p>Warren tweeted Trump to demand he pay what he promised and requested that he make the donation to the National Indigenous Women’s Resource Center.</p>\n<blockquote data-lang=\"en\">\n<p dir=\"ltr\" lang=\"en\">By the way, <a href=\"https://twitter.com/realDonaldTrump?ref_src=twsrc%5Etfw\">@realDonaldTrump</a>: Remember saying on 7/5 that you’d give $1M to a charity of my choice if my DNA showed Native American ancestry? I remember – and here’s the verdict. 
Please send the check to the National Indigenous Women’s Resource Center: <a href=\"https://t.co/I6YQ9hf7Tv\">https://t.co/I6YQ9hf7Tv</a> <a href=\"https://t.co/J4gBamaeeo\">pic.twitter.com/J4gBamaeeo</a></p>\n<p>— Elizabeth Warren (@elizabethforma) <a href=\"https://twitter.com/elizabethforma/status/1051820933982359553?ref_src=twsrc%5Etfw\">October 15, 2018</a></p></blockquote>\n\n<p>In response, White House counselor Kellyanne Conway<a href=\"http://usnationalnews.org/unhinged-kellyanne-conway-drops-shocking-statement-about-warrens-latest-dna-results/\" target=\"_blank\" rel=\"noopener\"> called</a> DNA testing “junk science.”</p>\n<p>Then, Trump flat out denied ever promising to make that donation even though it’s on tape.</p>\n<p>“Who cares?” Trump told reporters today. “I didn’t say that. Nope, you better read it again.”</p>\n<p>Well, MSNBC has the receipts:</p>\n<blockquote data-conversation=\"none\" data-lang=\"en\">\n<p dir=\"ltr\" lang=\"en\">MSNBC put together video of Trump unequivocally telling Elizabeth Warren in July that “I’ll give you a million dollars, to your favorite charity, paid for by Trump, if you take [a DNA] test and it shows you’re an Indian” — followed by him denying he ever said that this morning. <a href=\"https://t.co/zR3n2DqaiY\">pic.twitter.com/zR3n2DqaiY</a></p>\n<p>— Aaron Rupar (@atrupar) <a href=\"https://twitter.com/atrupar/status/1051846550236930053?ref_src=twsrc%5Etfw\">October 15, 2018</a></p></blockquote>\n\n<p><em>Image via screen capture.</em></p>\n\n\n\n\n\n\n</div>\n\n<div>\n<p><img alt=\"\" src=\"http://0.gravatar.com/avatar/f6d2c6722372d33d52a0080cd1072001?s=70&amp;d=identicon&amp;r=g\" srcset=\"http://0.gravatar.com/avatar/f6d2c6722372d33d52a0080cd1072001?s=140&amp;d=identicon&amp;r=g 2x\" height=\"70\" width=\"70\"><span>Author: <a href=\"http://addictinginfo.com/author/conover100gmail-com/\">Conover Kennard</a></span>Conover makes tea partiers cry as a hobby. 
She was Commander of Jade Helm15 during the failed takeover of the South. She's also one of the biggest arseholes on Twitter. At night, she can be found drinking Conservative tears while pulling off the wings of flies just because she can. She is the founder of a Marxist, Commie, Maoist, Socialist site and has contributed to several other sites, blah blah blah. She is an awful person but she doesn't like to brag about that.\n</p>\n</div>\n</div></div>",
 6 |     "plain_content": "<div class=\"page\" id=\"readability-page-1\"><div id=\"post-342398\"><div><p>Donald Trump has repeatedly lashed out at Sen. Elizabeth Warren (D-MA) at his rallies, calling her Pochahontas to cast doubt on her Native American Heritage. Trump even referred to Warren as Pocahontas at a November 2017 White House event honoring Navajo code-breakers. At a rally in Montana this past July, Trump offered Sen. Warren $1 million if she took a DNA test to prove her heritage. Trump spoke of bringing a DNA test to a debate if she challenges him.</p><p>“We will take that little kit — but we have to do it gently. Because we’re in the #MeToo generation, we have to do it gently,” Trump said. “And we will very gently take that kit, and slowly toss it, hoping it doesn’t injure her arm, and we will say: I will give you a million dollars to your favorite charity, paid for by Trump, if you take the test and it shows you’re an Indian. I have a feeling she will say no.”</p><p>Today the Massachusetts Senator released her DNA results.</p><p>“The facts suggest that you absolutely have a Native American ancestor in your pedigree,” an Ancestry.com and 23andMe adviser told Warren.</p><p>Warren tweeted Trump to demand he pay what he promised and requested that he make the donation to the National Indigenous Women’s Resource Center.</p><blockquote data-lang=\"en\"><p dir=\"ltr\" lang=\"en\">By the way, @realDonaldTrump: Remember saying on 7/5 that you’d give $1M to a charity of my choice if my DNA showed Native American ancestry? I remember – and here’s the verdict. 
Please send the check to the National Indigenous Women’s Resource Center: https://t.co/I6YQ9hf7Tv pic.twitter.com/J4gBamaeeo</p><p>— Elizabeth Warren (@elizabethforma) October 15, 2018</p></blockquote><p>In response, White House counselor Kellyanne Conway called DNA testing “junk science.”</p><p>Then, Trump flat out denied ever promising to make that donation even though it’s on tape.</p><p>“Who cares?” Trump told reporters today. “I didn’t say that. Nope, you better read it again.”</p><p>Well, MSNBC has the receipts:</p><blockquote data-conversation=\"none\" data-lang=\"en\"><p dir=\"ltr\" lang=\"en\">MSNBC put together video of Trump unequivocally telling Elizabeth Warren in July that “I’ll give you a million dollars, to your favorite charity, paid for by Trump, if you take [a DNA] test and it shows you’re an Indian” — followed by him denying he ever said that this morning. pic.twitter.com/zR3n2DqaiY</p><p>— Aaron Rupar (@atrupar) October 15, 2018</p></blockquote><p>Image via screen capture.</p></div><div><p>Author: Conover KennardConover makes tea partiers cry as a hobby. She was Commander of Jade Helm15 during the failed takeover of the South. She's also one of the biggest arseholes on Twitter. At night, she can be found drinking Conservative tears while pulling off the wings of flies just because she can. She is the founder of a Marxist, Commie, Maoist, Socialist site and has contributed to several other sites, blah blah blah. She is an awful person but she doesn't like to brag about that.</p></div></div></div>",
 7 |     "plain_text": [
 8 |         {"text": "Donald Trump has repeatedly lashed out at Sen. Elizabeth Warren (D-MA) at his rallies, calling her Pochahontas to cast doubt on her Native American Heritage. Trump even referred to Warren as Pocahontas at a November 2017 White House event honoring Navajo code-breakers. At a rally in Montana this past July, Trump offered Sen. Warren $1 million if she took a DNA test to prove her heritage. Trump spoke of bringing a DNA test to a debate if she challenges him."},
 9 |         {"text": "“We will take that little kit — but we have to do it gently. Because we’re in the #MeToo generation, we have to do it gently,” Trump said. “And we will very gently take that kit, and slowly toss it, hoping it doesn’t injure her arm, and we will say: I will give you a million dollars to your favorite charity, paid for by Trump, if you take the test and it shows you’re an Indian. I have a feeling she will say no.”"},
10 |         {"text": "Today the Massachusetts Senator released her DNA results."},
11 |         {"text": "“The facts suggest that you absolutely have a Native American ancestor in your pedigree,” an Ancestry.com and 23andMe adviser told Warren."},
12 |         {"text": "Warren tweeted Trump to demand he pay what he promised and requested that he make the donation to the National Indigenous Women’s Resource Center."},
13 |         {"text": "By the way, @realDonaldTrump: Remember saying on 7/5 that you’d give $1M to a charity of my choice if my DNA showed Native American ancestry? I remember – and here’s the verdict. Please send the check to the National Indigenous Women’s Resource Center: https://t.co/I6YQ9hf7Tv pic.twitter.com/J4gBamaeeo"},
14 |         {"text": "— Elizabeth Warren (@elizabethforma) October 15, 2018"},
15 |         {"text": "In response, White House counselor Kellyanne Conway called DNA testing “junk science.”"},
16 |         {"text": "Then, Trump flat out denied ever promising to make that donation even though it’s on tape."},
17 |         {"text": "“Who cares?” Trump told reporters today. “I didn’t say that. Nope, you better read it again.”"},
18 |         {"text": "Well, MSNBC has the receipts:"},
19 |         {"text": "MSNBC put together video of Trump unequivocally telling Elizabeth Warren in July that “I’ll give you a million dollars, to your favorite charity, paid for by Trump, if you take [a DNA] test and it shows you’re an Indian” — followed by him denying he ever said that this morning. pic.twitter.com/zR3n2DqaiY"},
20 |         {"text": "— Aaron Rupar (@atrupar) October 15, 2018"},
21 |         {"text": "Image via screen capture."},
22 |         {"text": "Author: Conover KennardConover makes tea partiers cry as a hobby. She was Commander of Jade Helm15 during the failed takeover of the South. She's also one of the biggest arseholes on Twitter. At night, she can be found drinking Conservative tears while pulling off the wings of flies just because she can. She is the founder of a Marxist, Commie, Maoist, Socialist site and has contributed to several other sites, blah blah blah. She is an awful person but she doesn't like to brag about that."}
23 |     ]
24 | }
25 | 


--------------------------------------------------------------------------------
/tests/data/addictinginfo.com-1_plain_text_paragraphs_from_simple_article.json:
--------------------------------------------------------------------------------
 1 | [
 2 |   {"text": "Current Issues, News"},
 3 |   {"text": "Trump Denies Charitable Donation He Promised If Elizabeth Warren Releases DNA Results And It’s On Video"},
 4 |   {"text": "By Conover Kennard on October 15, 2018 12:13 pm ·"},
 5 |   {"text": "Donald Trump has repeatedly lashed out at Sen. Elizabeth Warren (D-MA) at his rallies, calling her Pochahontas to cast doubt on her Native American Heritage. Trump even referred to Warren as Pocahontas at a November 2017 White House event honoring Navajo code-breakers. At a rally in Montana this past July, Trump offered Sen. Warren $1 million if she took a DNA test to prove her heritage. Trump spoke of bringing a DNA test to a debate if she challenges him."},
 6 |   {"text": "“We will take that little kit — but we have to do it gently. Because we’re in the #MeToo generation, we have to do it gently,” Trump said. “And we will very gently take that kit, and slowly toss it, hoping it doesn’t injure her arm, and we will say: I will give you a million dollars to your favorite charity, paid for by Trump, if you take the test and it shows you’re an Indian. I have a feeling she will say no.”"},
 7 |   {"text": "Today the Massachusetts Senator released her DNA results."},
 8 |   {"text": "“The facts suggest that you absolutely have a Native American ancestor in your pedigree,” an Ancestry.com and 23andMe adviser told Warren."},
 9 |   {"text": "Warren tweeted Trump to demand he pay what he promised and requested that he make the donation to the National Indigenous Women’s Resource Center."},
10 |   {"text": "By the way, @realDonaldTrump: Remember saying on 7/5 that you’d give $1M to a charity of my choice if my DNA showed Native American ancestry? I remember – and here’s the verdict. Please send the check to the National Indigenous Women’s Resource Center: https://t.co/I6YQ9hf7Tv pic.twitter.com/J4gBamaeeo"},
11 |   {"text": "— Elizabeth Warren (@elizabethforma) October 15, 2018"},
12 |   {"text": "In response, White House counselor Kellyanne Conway called DNA testing “junk science.”"},
13 |   {"text": "Then, Trump flat out denied ever promising to make that donation even though it’s on tape."},
14 |   {"text": "“Who cares?” Trump told reporters today. “I didn’t say that. Nope, you better read it again.”"},
15 |   {"text": "Well, MSNBC has the receipts:"},
16 |   {"text": "MSNBC put together video of Trump unequivocally telling Elizabeth Warren in July that “I’ll give you a million dollars, to your favorite charity, paid for by Trump, if you take [a DNA] test and it shows you’re an Indian” — followed by him denying he ever said that this morning. pic.twitter.com/zR3n2DqaiY"},
17 |   {"text": "— Aaron Rupar (@atrupar) October 15, 2018"},
18 |   {"text": "Image via screen capture."},
19 |   {"text": "Share this Article!"},
20 |   {"text": "Share on Facebook"},
21 |   {"text": "Share on Twitter"},
22 |   {"text": "Author: Conover KennardConover makes tea partiers cry as a hobby. She was Commander of Jade Helm15 during the failed takeover of the South. She's also one of the biggest arseholes on Twitter. At night, she can be found drinking Conservative tears while pulling off the wings of flies just because she can. She is the founder of a Marxist, Commie, Maoist, Socialist site and has contributed to several other sites, blah blah blah. She is an awful person but she doesn't like to brag about that."}
23 | ]


--------------------------------------------------------------------------------
/tests/data/addictinginfo.com-1_simple_article_from_full_article.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "title": "Trump Denies Charitable Donation He Promised If Elizabeth Warren Releases DNA Results And It’s On Video",
 3 |   "byline": null,
 4 |   "date": "2018-10-15T12:13:54",
 5 |   "content": "<div id=\"post-342398\"><div>Current Issues, News</div><h1>Trump Denies Charitable Donation He Promised If Elizabeth Warren Releases DNA Results And It’s On Video</h1><div>By Conover Kennard on October 15, 2018 12:13 pm ·</div><div><p>Donald Trump has repeatedly lashed out at Sen. Elizabeth Warren (D-MA) at his rallies, calling her Pochahontas to cast doubt on her Native American Heritage. Trump even referred to Warren as Pocahontas at a November 2017 White House event honoring Navajo code-breakers. At a rally in Montana this past July, Trump offered Sen. Warren $1 million if she took a DNA test to prove her heritage. Trump spoke of bringing a DNA test to a debate if she challenges him.</p><p>“We will take that little kit — but we have to do it gently. Because we’re in the #MeToo generation, we have to do it gently,” Trump said. “And we will very gently take that kit, and slowly toss it, hoping it doesn’t injure her arm, and we will say: I will give you a million dollars to your favorite charity, paid for by Trump, if you take the test and it shows you’re an Indian. I have a feeling she will say no.”</p><p>Today the Massachusetts Senator released her DNA results.</p><p>“The facts suggest that you absolutely have a Native American ancestor in your pedigree,” an Ancestry.com and 23andMe adviser told Warren.</p><p>Warren tweeted Trump to demand he pay what he promised and requested that he make the donation to the National Indigenous Women’s Resource Center.</p><blockquote data-lang=\"en\"><p dir=\"ltr\" lang=\"en\">By the way, @realDonaldTrump: Remember saying on 7/5 that you’d give $1M to a charity of my choice if my DNA showed Native American ancestry? I remember – and here’s the verdict. 
Please send the check to the National Indigenous Women’s Resource Center: https://t.co/I6YQ9hf7Tv pic.twitter.com/J4gBamaeeo</p><p>— Elizabeth Warren (@elizabethforma) October 15, 2018</p></blockquote><p>In response, White House counselor Kellyanne Conway called DNA testing “junk science.”</p><p>Then, Trump flat out denied ever promising to make that donation even though it’s on tape.</p><p>“Who cares?” Trump told reporters today. “I didn’t say that. Nope, you better read it again.”</p><p>Well, MSNBC has the receipts:</p><blockquote data-conversation=\"none\" data-lang=\"en\"><p dir=\"ltr\" lang=\"en\">MSNBC put together video of Trump unequivocally telling Elizabeth Warren in July that “I’ll give you a million dollars, to your favorite charity, paid for by Trump, if you take [a DNA] test and it shows you’re an Indian” — followed by him denying he ever said that this morning. pic.twitter.com/zR3n2DqaiY</p><p>— Aaron Rupar (@atrupar) October 15, 2018</p></blockquote><p>Image via screen capture.</p><div><div><div>Share this Article!</div><div>Share on Facebook</div><div>Share on Twitter</div></div></div></div><div><p>Author: Conover KennardConover makes tea partiers cry as a hobby. She was Commander of Jade Helm15 during the failed takeover of the South. She's also one of the biggest arseholes on Twitter. At night, she can be found drinking Conservative tears while pulling off the wings of flies just because she can. She is the founder of a Marxist, Commie, Maoist, Socialist site and has contributed to several other sites, blah blah blah. She is an awful person but she doesn't like to brag about that.</p></div></div>",
 6 |   "plain_content": "<div id=\"post-342398\"><div>Current Issues, News</div><h1>Trump Denies Charitable Donation He Promised If Elizabeth Warren Releases DNA Results And It’s On Video</h1><div>By Conover Kennard on October 15, 2018 12:13 pm ·</div><div><p>Donald Trump has repeatedly lashed out at Sen. Elizabeth Warren (D-MA) at his rallies, calling her Pochahontas to cast doubt on her Native American Heritage. Trump even referred to Warren as Pocahontas at a November 2017 White House event honoring Navajo code-breakers. At a rally in Montana this past July, Trump offered Sen. Warren $1 million if she took a DNA test to prove her heritage. Trump spoke of bringing a DNA test to a debate if she challenges him.</p><p>“We will take that little kit — but we have to do it gently. Because we’re in the #MeToo generation, we have to do it gently,” Trump said. “And we will very gently take that kit, and slowly toss it, hoping it doesn’t injure her arm, and we will say: I will give you a million dollars to your favorite charity, paid for by Trump, if you take the test and it shows you’re an Indian. I have a feeling she will say no.”</p><p>Today the Massachusetts Senator released her DNA results.</p><p>“The facts suggest that you absolutely have a Native American ancestor in your pedigree,” an Ancestry.com and 23andMe adviser told Warren.</p><p>Warren tweeted Trump to demand he pay what he promised and requested that he make the donation to the National Indigenous Women’s Resource Center.</p><blockquote data-lang=\"en\"><p dir=\"ltr\" lang=\"en\">By the way, @realDonaldTrump: Remember saying on 7/5 that you’d give $1M to a charity of my choice if my DNA showed Native American ancestry? I remember – and here’s the verdict. 
Please send the check to the National Indigenous Women’s Resource Center: https://t.co/I6YQ9hf7Tv pic.twitter.com/J4gBamaeeo</p><p>— Elizabeth Warren (@elizabethforma) October 15, 2018</p></blockquote><p>In response, White House counselor Kellyanne Conway called DNA testing “junk science.”</p><p>Then, Trump flat out denied ever promising to make that donation even though it’s on tape.</p><p>“Who cares?” Trump told reporters today. “I didn’t say that. Nope, you better read it again.”</p><p>Well, MSNBC has the receipts:</p><blockquote data-conversation=\"none\" data-lang=\"en\"><p dir=\"ltr\" lang=\"en\">MSNBC put together video of Trump unequivocally telling Elizabeth Warren in July that “I’ll give you a million dollars, to your favorite charity, paid for by Trump, if you take [a DNA] test and it shows you’re an Indian” — followed by him denying he ever said that this morning. pic.twitter.com/zR3n2DqaiY</p><p>— Aaron Rupar (@atrupar) October 15, 2018</p></blockquote><p>Image via screen capture.</p><div><div><div>Share this Article!</div><div>Share on Facebook</div><div>Share on Twitter</div></div></div></div><div><p>Author: Conover KennardConover makes tea partiers cry as a hobby. She was Commander of Jade Helm15 during the failed takeover of the South. She's also one of the biggest arseholes on Twitter. At night, she can be found drinking Conservative tears while pulling off the wings of flies just because she can. She is the founder of a Marxist, Commie, Maoist, Socialist site and has contributed to several other sites, blah blah blah. She is an awful person but she doesn't like to brag about that.</p></div></div>",
 7 |   "plain_text": [
 8 |     {"text": "Current Issues, News"},
 9 |     {"text": "Trump Denies Charitable Donation He Promised If Elizabeth Warren Releases DNA Results And It’s On Video"},
10 |     {"text": "By Conover Kennard on October 15, 2018 12:13 pm ·"},
11 |     {"text": "Donald Trump has repeatedly lashed out at Sen. Elizabeth Warren (D-MA) at his rallies, calling her Pochahontas to cast doubt on her Native American Heritage. Trump even referred to Warren as Pocahontas at a November 2017 White House event honoring Navajo code-breakers. At a rally in Montana this past July, Trump offered Sen. Warren $1 million if she took a DNA test to prove her heritage. Trump spoke of bringing a DNA test to a debate if she challenges him."},
12 |     {"text": "“We will take that little kit — but we have to do it gently. Because we’re in the #MeToo generation, we have to do it gently,” Trump said. “And we will very gently take that kit, and slowly toss it, hoping it doesn’t injure her arm, and we will say: I will give you a million dollars to your favorite charity, paid for by Trump, if you take the test and it shows you’re an Indian. I have a feeling she will say no.”"},
13 |     {"text": "Today the Massachusetts Senator released her DNA results."},
14 |     {"text": "“The facts suggest that you absolutely have a Native American ancestor in your pedigree,” an Ancestry.com and 23andMe adviser told Warren."},
15 |     {"text": "Warren tweeted Trump to demand he pay what he promised and requested that he make the donation to the National Indigenous Women’s Resource Center."},
16 |     {"text": "By the way, @realDonaldTrump: Remember saying on 7/5 that you’d give $1M to a charity of my choice if my DNA showed Native American ancestry? I remember – and here’s the verdict. Please send the check to the National Indigenous Women’s Resource Center: https://t.co/I6YQ9hf7Tv pic.twitter.com/J4gBamaeeo"},
17 |     {"text": "— Elizabeth Warren (@elizabethforma) October 15, 2018"},
18 |     {"text": "In response, White House counselor Kellyanne Conway called DNA testing “junk science.”"},
19 |     {"text": "Then, Trump flat out denied ever promising to make that donation even though it’s on tape."},
20 |     {"text": "“Who cares?” Trump told reporters today. “I didn’t say that. Nope, you better read it again.”"},
21 |     {"text": "Well, MSNBC has the receipts:"},
22 |     {"text": "MSNBC put together video of Trump unequivocally telling Elizabeth Warren in July that “I’ll give you a million dollars, to your favorite charity, paid for by Trump, if you take [a DNA] test and it shows you’re an Indian” — followed by him denying he ever said that this morning. pic.twitter.com/zR3n2DqaiY"},
23 |     {"text": "— Aaron Rupar (@atrupar) October 15, 2018"},
24 |     {"text": "Image via screen capture."},
25 |     {"text": "Share this Article!"},
26 |     {"text": "Share on Facebook"},
27 |     {"text": "Share on Twitter"},
28 |     {"text": "Author: Conover KennardConover makes tea partiers cry as a hobby. She was Commander of Jade Helm15 during the failed takeover of the South. She's also one of the biggest arseholes on Twitter. At night, she can be found drinking Conservative tears while pulling off the wings of flies just because she can. She is the founder of a Marxist, Commie, Maoist, Socialist site and has contributed to several other sites, blah blah blah. She is an awful person but she doesn't like to brag about that."}
29 |   ]
30 | }
31 | 


--------------------------------------------------------------------------------
/tests/data/addictinginfo.com-1_simple_article_from_full_page.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "title": "Trump Denies Charitable Donation He Promised If Elizabeth Warren Releases DNA Results And It’s On Video",
 3 |   "byline": null,
 4 |   "date": "2018-10-15T12:13:54",
 5 |   "content": "<div><title><p>Trump Denies Charitable Donation He Promised If Elizabeth Warren Releases DNA Results And It’s On Video | Addicting Info | The Knowledge You Crave</p></title><title><p>Trump Denies Charitable Donation He Promised If Elizabeth Warren Releases DNA Results And It’s On Video</p></title><body><div><div id=\"header\"><div id=\"menutop\"><div id=\"topmenu-wrap\"><ul id=\"topmenu\"><li id=\"menu-item-5835\">About Us</li><li id=\"menu-item-5836\">Contact Us</li><li id=\"menu-item-7590\">Creative Commons License</li><li id=\"menu-item-149826\">Privacy Policy</li><li id=\"menu-item-341224\">Terms of Service</li></ul></div></div><div id=\"menu\"><div id=\"menu-wrap\"><ul id=\"mainmenu\"><li id=\"menu-item-126746\">Home</li><li id=\"menu-item-5799\">News</li><li id=\"menu-item-5801\">Important Information</li><li id=\"menu-item-5803\">Discredited Myths</li><li id=\"menu-item-5802\">Historical Information</li><li id=\"menu-item-5806\">Political Humor</li></ul></div></div></div><div id=\"content\"><div id=\"main\"><div id=\"post-342398\"><div>Current Issues, News</div><h1>Trump Denies Charitable Donation He Promised If Elizabeth Warren Releases DNA Results And It’s On Video</h1><div>By Conover Kennard on October 15, 2018 12:13 pm ·</div><div><p>Donald Trump has repeatedly lashed out at Sen. Elizabeth Warren (D-MA) at his rallies, calling her Pochahontas to cast doubt on her Native American Heritage. Trump even referred to Warren as Pocahontas at a November 2017 White House event honoring Navajo code-breakers. At a rally in Montana this past July, Trump offered Sen. Warren $1 million if she took a DNA test to prove her heritage. Trump spoke of bringing a DNA test to a debate if she challenges him.</p><p>“We will take that little kit — but we have to do it gently. Because we’re in the #MeToo generation, we have to do it gently,” Trump said. 
“And we will very gently take that kit, and slowly toss it, hoping it doesn’t injure her arm, and we will say: I will give you a million dollars to your favorite charity, paid for by Trump, if you take the test and it shows you’re an Indian. I have a feeling she will say no.”</p><p>Today the Massachusetts Senator released her DNA results.</p><p>“The facts suggest that you absolutely have a Native American ancestor in your pedigree,” an Ancestry.com and 23andMe adviser told Warren.</p><p>Warren tweeted Trump to demand he pay what he promised and requested that he make the donation to the National Indigenous Women’s Resource Center.</p><blockquote data-lang=\"en\"><p dir=\"ltr\" lang=\"en\">By the way, @realDonaldTrump: Remember saying on 7/5 that you’d give $1M to a charity of my choice if my DNA showed Native American ancestry? I remember – and here’s the verdict. Please send the check to the National Indigenous Women’s Resource Center: https://t.co/I6YQ9hf7Tv pic.twitter.com/J4gBamaeeo</p><p>— Elizabeth Warren (@elizabethforma) October 15, 2018</p></blockquote><p>In response, White House counselor Kellyanne Conway called DNA testing “junk science.”</p><p>Then, Trump flat out denied ever promising to make that donation even though it’s on tape.</p><p>“Who cares?” Trump told reporters today. “I didn’t say that. Nope, you better read it again.”</p><p>Well, MSNBC has the receipts:</p><blockquote data-conversation=\"none\" data-lang=\"en\"><p dir=\"ltr\" lang=\"en\">MSNBC put together video of Trump unequivocally telling Elizabeth Warren in July that “I’ll give you a million dollars, to your favorite charity, paid for by Trump, if you take [a DNA] test and it shows you’re an Indian” — followed by him denying he ever said that this morning. 
pic.twitter.com/zR3n2DqaiY</p><p>— Aaron Rupar (@atrupar) October 15, 2018</p></blockquote><p>Image via screen capture.</p><div><div><div>Share this Article!</div><div>Share on Facebook</div><div>Share on Twitter</div></div></div></div><div><p>Author: Conover KennardConover makes tea partiers cry as a hobby. She was Commander of Jade Helm15 during the failed takeover of the South. She's also one of the biggest arseholes on Twitter. At night, she can be found drinking Conservative tears while pulling off the wings of flies just because she can. She is the founder of a Marxist, Commie, Maoist, Socialist site and has contributed to several other sites, blah blah blah. She is an awful person but she doesn't like to brag about that.</p></div></div></div><div id=\"sidebar\"><div id=\"search-2\"><h3>Search</h3></div><div id=\"archives-5\"><h3>Archives</h3></div></div><div><div>Share on Facebook</div><div>Comments</div></div></div></div><div id=\"footer\"><div><div><h2><p>Addicting Info | The Knowledge You Crave</p></h2></div><div>Copyright © 2018 AddictingInfo.org. All Rights Reserved</div></div></div></body></div>",
 6 |   "plain_content": "<div><title><p>Trump Denies Charitable Donation He Promised If Elizabeth Warren Releases DNA Results And It’s On Video | Addicting Info | The Knowledge You Crave</p></title><title><p>Trump Denies Charitable Donation He Promised If Elizabeth Warren Releases DNA Results And It’s On Video</p></title><body><div><div id=\"header\"><div id=\"menutop\"><div id=\"topmenu-wrap\"><ul id=\"topmenu\"><li id=\"menu-item-5835\">About Us</li><li id=\"menu-item-5836\">Contact Us</li><li id=\"menu-item-7590\">Creative Commons License</li><li id=\"menu-item-149826\">Privacy Policy</li><li id=\"menu-item-341224\">Terms of Service</li></ul></div></div><div id=\"menu\"><div id=\"menu-wrap\"><ul id=\"mainmenu\"><li id=\"menu-item-126746\">Home</li><li id=\"menu-item-5799\">News</li><li id=\"menu-item-5801\">Important Information</li><li id=\"menu-item-5803\">Discredited Myths</li><li id=\"menu-item-5802\">Historical Information</li><li id=\"menu-item-5806\">Political Humor</li></ul></div></div></div><div id=\"content\"><div id=\"main\"><div id=\"post-342398\"><div>Current Issues, News</div><h1>Trump Denies Charitable Donation He Promised If Elizabeth Warren Releases DNA Results And It’s On Video</h1><div>By Conover Kennard on October 15, 2018 12:13 pm ·</div><div><p>Donald Trump has repeatedly lashed out at Sen. Elizabeth Warren (D-MA) at his rallies, calling her Pochahontas to cast doubt on her Native American Heritage. Trump even referred to Warren as Pocahontas at a November 2017 White House event honoring Navajo code-breakers. At a rally in Montana this past July, Trump offered Sen. Warren $1 million if she took a DNA test to prove her heritage. Trump spoke of bringing a DNA test to a debate if she challenges him.</p><p>“We will take that little kit — but we have to do it gently. Because we’re in the #MeToo generation, we have to do it gently,” Trump said. 
“And we will very gently take that kit, and slowly toss it, hoping it doesn’t injure her arm, and we will say: I will give you a million dollars to your favorite charity, paid for by Trump, if you take the test and it shows you’re an Indian. I have a feeling she will say no.”</p><p>Today the Massachusetts Senator released her DNA results.</p><p>“The facts suggest that you absolutely have a Native American ancestor in your pedigree,” an Ancestry.com and 23andMe adviser told Warren.</p><p>Warren tweeted Trump to demand he pay what he promised and requested that he make the donation to the National Indigenous Women’s Resource Center.</p><blockquote data-lang=\"en\"><p dir=\"ltr\" lang=\"en\">By the way, @realDonaldTrump: Remember saying on 7/5 that you’d give $1M to a charity of my choice if my DNA showed Native American ancestry? I remember – and here’s the verdict. Please send the check to the National Indigenous Women’s Resource Center: https://t.co/I6YQ9hf7Tv pic.twitter.com/J4gBamaeeo</p><p>— Elizabeth Warren (@elizabethforma) October 15, 2018</p></blockquote><p>In response, White House counselor Kellyanne Conway called DNA testing “junk science.”</p><p>Then, Trump flat out denied ever promising to make that donation even though it’s on tape.</p><p>“Who cares?” Trump told reporters today. “I didn’t say that. Nope, you better read it again.”</p><p>Well, MSNBC has the receipts:</p><blockquote data-conversation=\"none\" data-lang=\"en\"><p dir=\"ltr\" lang=\"en\">MSNBC put together video of Trump unequivocally telling Elizabeth Warren in July that “I’ll give you a million dollars, to your favorite charity, paid for by Trump, if you take [a DNA] test and it shows you’re an Indian” — followed by him denying he ever said that this morning. 
pic.twitter.com/zR3n2DqaiY</p><p>— Aaron Rupar (@atrupar) October 15, 2018</p></blockquote><p>Image via screen capture.</p><div><div><div>Share this Article!</div><div>Share on Facebook</div><div>Share on Twitter</div></div></div></div><div><p>Author: Conover KennardConover makes tea partiers cry as a hobby. She was Commander of Jade Helm15 during the failed takeover of the South. She's also one of the biggest arseholes on Twitter. At night, she can be found drinking Conservative tears while pulling off the wings of flies just because she can. She is the founder of a Marxist, Commie, Maoist, Socialist site and has contributed to several other sites, blah blah blah. She is an awful person but she doesn't like to brag about that.</p></div></div></div><div id=\"sidebar\"><div id=\"search-2\"><h3>Search</h3></div><div id=\"archives-5\"><h3>Archives</h3></div></div><div><div>Share on Facebook</div><div>Comments</div></div></div></div><div id=\"footer\"><div><div><h2><p>Addicting Info | The Knowledge You Crave</p></h2></div><div>Copyright © 2018 AddictingInfo.org. All Rights Reserved</div></div></div></body></div>",
 7 |   "plain_text": [
 8 |     {"text": "Trump Denies Charitable Donation He Promised If Elizabeth Warren Releases DNA Results And It’s On Video | Addicting Info | The Knowledge You Crave"},
 9 |     {"text": "Trump Denies Charitable Donation He Promised If Elizabeth Warren Releases DNA Results And It’s On Video"},
10 |     {"text": "* About Us, * Contact Us, * Creative Commons License, * Privacy Policy, * Terms of Service,"},
11 |     {"text": "* Home, * News, * Important Information, * Discredited Myths, * Historical Information, * Political Humor,"},
12 |     {"text": "Current Issues, News"},
13 |     {"text": "Trump Denies Charitable Donation He Promised If Elizabeth Warren Releases DNA Results And It’s On Video"},
14 |     {"text": "By Conover Kennard on October 15, 2018 12:13 pm ·"},
15 |     {"text": "Donald Trump has repeatedly lashed out at Sen. Elizabeth Warren (D-MA) at his rallies, calling her Pochahontas to cast doubt on her Native American Heritage. Trump even referred to Warren as Pocahontas at a November 2017 White House event honoring Navajo code-breakers. At a rally in Montana this past July, Trump offered Sen. Warren $1 million if she took a DNA test to prove her heritage. Trump spoke of bringing a DNA test to a debate if she challenges him."},
16 |     {"text": "“We will take that little kit — but we have to do it gently. Because we’re in the #MeToo generation, we have to do it gently,” Trump said. “And we will very gently take that kit, and slowly toss it, hoping it doesn’t injure her arm, and we will say: I will give you a million dollars to your favorite charity, paid for by Trump, if you take the test and it shows you’re an Indian. I have a feeling she will say no.”"},
17 |     {"text": "Today the Massachusetts Senator released her DNA results."},
18 |     {"text": "“The facts suggest that you absolutely have a Native American ancestor in your pedigree,” an Ancestry.com and 23andMe adviser told Warren."},
19 |     {"text": "Warren tweeted Trump to demand he pay what he promised and requested that he make the donation to the National Indigenous Women’s Resource Center."},
20 |     {"text": "By the way, @realDonaldTrump: Remember saying on 7/5 that you’d give $1M to a charity of my choice if my DNA showed Native American ancestry? I remember – and here’s the verdict. Please send the check to the National Indigenous Women’s Resource Center: https://t.co/I6YQ9hf7Tv pic.twitter.com/J4gBamaeeo"},
21 |     {"text": "— Elizabeth Warren (@elizabethforma) October 15, 2018"},
22 |     {"text": "In response, White House counselor Kellyanne Conway called DNA testing “junk science.”"},
23 |     {"text": "Then, Trump flat out denied ever promising to make that donation even though it’s on tape."},
24 |     {"text": "“Who cares?” Trump told reporters today. “I didn’t say that. Nope, you better read it again.”"},
25 |     {"text": "Well, MSNBC has the receipts:"},
26 |     {"text": "MSNBC put together video of Trump unequivocally telling Elizabeth Warren in July that “I’ll give you a million dollars, to your favorite charity, paid for by Trump, if you take [a DNA] test and it shows you’re an Indian” — followed by him denying he ever said that this morning. pic.twitter.com/zR3n2DqaiY"},
27 |     {"text": "— Aaron Rupar (@atrupar) October 15, 2018"},
28 |     {"text": "Image via screen capture."},
29 |     {"text": "Share this Article!"},
30 |     {"text": "Share on Facebook"},
31 |     {"text": "Share on Twitter"},
32 |     {"text": "Author: Conover KennardConover makes tea partiers cry as a hobby. She was Commander of Jade Helm15 during the failed takeover of the South. She's also one of the biggest arseholes on Twitter. At night, she can be found drinking Conservative tears while pulling off the wings of flies just because she can. She is the founder of a Marxist, Commie, Maoist, Socialist site and has contributed to several other sites, blah blah blah. She is an awful person but she doesn't like to brag about that."},
33 |     {"text": "Search"},
34 |     {"text": "Archives"},
35 |     {"text": "Share on Facebook"},
36 |     {"text": "Comments"},
37 |     {"text": "Addicting Info | The Knowledge You Crave"},
38 |     {"text": "Copyright © 2018 AddictingInfo.org. All Rights Reserved"}
39 |   ]
40 | }
41 | 


--------------------------------------------------------------------------------
/tests/data/addictinginfo.com-1_simple_article_from_full_page_node_indexes.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "title": "Trump Denies Charitable Donation He Promised If Elizabeth Warren Releases DNA Results And It’s On Video",
 3 |   "byline": null,
 4 |   "date": "2018-10-15T12:13:54",
 5 |   "content": "<div><title><p>Trump Denies Charitable Donation He Promised If Elizabeth Warren Releases DNA Results And It’s On Video | Addicting Info | The Knowledge You Crave</p></title><title><p>Trump Denies Charitable Donation He Promised If Elizabeth Warren Releases DNA Results And It’s On Video</p></title><body><div><div id=\"header\"><div id=\"menutop\"><div id=\"topmenu-wrap\"><ul id=\"topmenu\"><li id=\"menu-item-5835\">About Us</li><li id=\"menu-item-5836\">Contact Us</li><li id=\"menu-item-7590\">Creative Commons License</li><li id=\"menu-item-149826\">Privacy Policy</li><li id=\"menu-item-341224\">Terms of Service</li></ul></div></div><div id=\"menu\"><div id=\"menu-wrap\"><ul id=\"mainmenu\"><li id=\"menu-item-126746\">Home</li><li id=\"menu-item-5799\">News</li><li id=\"menu-item-5801\">Important Information</li><li id=\"menu-item-5803\">Discredited Myths</li><li id=\"menu-item-5802\">Historical Information</li><li id=\"menu-item-5806\">Political Humor</li></ul></div></div></div><div id=\"content\"><div id=\"main\"><div id=\"post-342398\"><div>Current Issues, News</div><h1>Trump Denies Charitable Donation He Promised If Elizabeth Warren Releases DNA Results And It’s On Video</h1><div>By Conover Kennard on October 15, 2018 12:13 pm ·</div><div><p>Donald Trump has repeatedly lashed out at Sen. Elizabeth Warren (D-MA) at his rallies, calling her Pochahontas to cast doubt on her Native American Heritage. Trump even referred to Warren as Pocahontas at a November 2017 White House event honoring Navajo code-breakers. At a rally in Montana this past July, Trump offered Sen. Warren $1 million if she took a DNA test to prove her heritage. Trump spoke of bringing a DNA test to a debate if she challenges him.</p><p>“We will take that little kit — but we have to do it gently. Because we’re in the #MeToo generation, we have to do it gently,” Trump said. 
“And we will very gently take that kit, and slowly toss it, hoping it doesn’t injure her arm, and we will say: I will give you a million dollars to your favorite charity, paid for by Trump, if you take the test and it shows you’re an Indian. I have a feeling she will say no.”</p><p>Today the Massachusetts Senator released her DNA results.</p><p>“The facts suggest that you absolutely have a Native American ancestor in your pedigree,” an Ancestry.com and 23andMe adviser told Warren.</p><p>Warren tweeted Trump to demand he pay what he promised and requested that he make the donation to the National Indigenous Women’s Resource Center.</p><blockquote data-lang=\"en\"><p dir=\"ltr\" lang=\"en\">By the way, @realDonaldTrump: Remember saying on 7/5 that you’d give $1M to a charity of my choice if my DNA showed Native American ancestry? I remember – and here’s the verdict. Please send the check to the National Indigenous Women’s Resource Center: https://t.co/I6YQ9hf7Tv pic.twitter.com/J4gBamaeeo</p><p>— Elizabeth Warren (@elizabethforma) October 15, 2018</p></blockquote><p>In response, White House counselor Kellyanne Conway called DNA testing “junk science.”</p><p>Then, Trump flat out denied ever promising to make that donation even though it’s on tape.</p><p>“Who cares?” Trump told reporters today. “I didn’t say that. Nope, you better read it again.”</p><p>Well, MSNBC has the receipts:</p><blockquote data-conversation=\"none\" data-lang=\"en\"><p dir=\"ltr\" lang=\"en\">MSNBC put together video of Trump unequivocally telling Elizabeth Warren in July that “I’ll give you a million dollars, to your favorite charity, paid for by Trump, if you take [a DNA] test and it shows you’re an Indian” — followed by him denying he ever said that this morning. 
pic.twitter.com/zR3n2DqaiY</p><p>— Aaron Rupar (@atrupar) October 15, 2018</p></blockquote><p>Image via screen capture.</p><div><div><div>Share this Article!</div><div>Share on Facebook</div><div>Share on Twitter</div></div></div></div><div><p>Author: Conover KennardConover makes tea partiers cry as a hobby. She was Commander of Jade Helm15 during the failed takeover of the South. She's also one of the biggest arseholes on Twitter. At night, she can be found drinking Conservative tears while pulling off the wings of flies just because she can. She is the founder of a Marxist, Commie, Maoist, Socialist site and has contributed to several other sites, blah blah blah. She is an awful person but she doesn't like to brag about that.</p></div></div></div><div id=\"sidebar\"><div id=\"search-2\"><h3>Search</h3></div><div id=\"archives-5\"><h3>Archives</h3></div></div><div><div>Share on Facebook</div><div>Comments</div></div></div></div><div id=\"footer\"><div><div><h2><p>Addicting Info | The Knowledge You Crave</p></h2></div><div>Copyright © 2018 AddictingInfo.org. All Rights Reserved</div></div></div></body></div>",
 6 |   "plain_content": "<div data-node-index=\"0\"><title data-node-index=\"0.1\"><p data-node-index=\"0.1.1\">Trump Denies Charitable Donation He Promised If Elizabeth Warren Releases DNA Results And It’s On Video | Addicting Info | The Knowledge You Crave</p></title><title data-node-index=\"0.2\"><p data-node-index=\"0.2.1\">Trump Denies Charitable Donation He Promised If Elizabeth Warren Releases DNA Results And It’s On Video</p></title><body data-node-index=\"0.3\"><div data-node-index=\"0.3.1\"><div data-node-index=\"0.3.1.1\" id=\"header\"><div data-node-index=\"0.3.1.1.1\" id=\"menutop\"><div data-node-index=\"0.3.1.1.1.1\" id=\"topmenu-wrap\"><ul data-node-index=\"0.3.1.1.1.1.1\" id=\"topmenu\"><li data-node-index=\"0.3.1.1.1.1.1.1\" id=\"menu-item-5835\">About Us</li><li data-node-index=\"0.3.1.1.1.1.1.2\" id=\"menu-item-5836\">Contact Us</li><li data-node-index=\"0.3.1.1.1.1.1.3\" id=\"menu-item-7590\">Creative Commons License</li><li data-node-index=\"0.3.1.1.1.1.1.4\" id=\"menu-item-149826\">Privacy Policy</li><li data-node-index=\"0.3.1.1.1.1.1.5\" id=\"menu-item-341224\">Terms of Service</li></ul></div></div><div data-node-index=\"0.3.1.1.2\" id=\"menu\"><div data-node-index=\"0.3.1.1.2.1\" id=\"menu-wrap\"><ul data-node-index=\"0.3.1.1.2.1.1\" id=\"mainmenu\"><li data-node-index=\"0.3.1.1.2.1.1.1\" id=\"menu-item-126746\">Home</li><li data-node-index=\"0.3.1.1.2.1.1.2\" id=\"menu-item-5799\">News</li><li data-node-index=\"0.3.1.1.2.1.1.3\" id=\"menu-item-5801\">Important Information</li><li data-node-index=\"0.3.1.1.2.1.1.4\" id=\"menu-item-5803\">Discredited Myths</li><li data-node-index=\"0.3.1.1.2.1.1.5\" id=\"menu-item-5802\">Historical Information</li><li data-node-index=\"0.3.1.1.2.1.1.6\" id=\"menu-item-5806\">Political Humor</li></ul></div></div></div><div data-node-index=\"0.3.1.2\" id=\"content\"><div data-node-index=\"0.3.1.2.1\" id=\"main\"><div data-node-index=\"0.3.1.2.1.1\" id=\"post-342398\"><div 
data-node-index=\"0.3.1.2.1.1.1\">Current Issues, News</div><h1 data-node-index=\"0.3.1.2.1.1.2\">Trump Denies Charitable Donation He Promised If Elizabeth Warren Releases DNA Results And It’s On Video</h1><div data-node-index=\"0.3.1.2.1.1.3\">By Conover Kennard on October 15, 2018 12:13 pm ·</div><div data-node-index=\"0.3.1.2.1.1.4\"><p data-node-index=\"0.3.1.2.1.1.4.1\">Donald Trump has repeatedly lashed out at Sen. Elizabeth Warren (D-MA) at his rallies, calling her Pochahontas to cast doubt on her Native American Heritage. Trump even referred to Warren as Pocahontas at a November 2017 White House event honoring Navajo code-breakers. At a rally in Montana this past July, Trump offered Sen. Warren $1 million if she took a DNA test to prove her heritage. Trump spoke of bringing a DNA test to a debate if she challenges him.</p><p data-node-index=\"0.3.1.2.1.1.4.2\">“We will take that little kit — but we have to do it gently. Because we’re in the #MeToo generation, we have to do it gently,” Trump said. “And we will very gently take that kit, and slowly toss it, hoping it doesn’t injure her arm, and we will say: I will give you a million dollars to your favorite charity, paid for by Trump, if you take the test and it shows you’re an Indian. 
I have a feeling she will say no.”</p><p data-node-index=\"0.3.1.2.1.1.4.3\">Today the Massachusetts Senator released her DNA results.</p><p data-node-index=\"0.3.1.2.1.1.4.4\">“The facts suggest that you absolutely have a Native American ancestor in your pedigree,” an Ancestry.com and 23andMe adviser told Warren.</p><p data-node-index=\"0.3.1.2.1.1.4.5\">Warren tweeted Trump to demand he pay what he promised and requested that he make the donation to the National Indigenous Women’s Resource Center.</p><blockquote data-lang=\"en\" data-node-index=\"0.3.1.2.1.1.4.6\"><p data-node-index=\"0.3.1.2.1.1.4.6.1\" dir=\"ltr\" lang=\"en\">By the way, @realDonaldTrump: Remember saying on 7/5 that you’d give $1M to a charity of my choice if my DNA showed Native American ancestry? I remember – and here’s the verdict. Please send the check to the National Indigenous Women’s Resource Center: https://t.co/I6YQ9hf7Tv pic.twitter.com/J4gBamaeeo</p><p data-node-index=\"0.3.1.2.1.1.4.6.2\">— Elizabeth Warren (@elizabethforma) October 15, 2018</p></blockquote><p data-node-index=\"0.3.1.2.1.1.4.7\">In response, White House counselor Kellyanne Conway called DNA testing “junk science.”</p><p data-node-index=\"0.3.1.2.1.1.4.8\">Then, Trump flat out denied ever promising to make that donation even though it’s on tape.</p><p data-node-index=\"0.3.1.2.1.1.4.9\">“Who cares?” Trump told reporters today. “I didn’t say that. Nope, you better read it again.”</p><p data-node-index=\"0.3.1.2.1.1.4.10\">Well, MSNBC has the receipts:</p><blockquote data-conversation=\"none\" data-lang=\"en\" data-node-index=\"0.3.1.2.1.1.4.11\"><p data-node-index=\"0.3.1.2.1.1.4.11.1\" dir=\"ltr\" lang=\"en\">MSNBC put together video of Trump unequivocally telling Elizabeth Warren in July that “I’ll give you a million dollars, to your favorite charity, paid for by Trump, if you take [a DNA] test and it shows you’re an Indian” — followed by him denying he ever said that this morning. 
pic.twitter.com/zR3n2DqaiY</p><p data-node-index=\"0.3.1.2.1.1.4.11.2\">— Aaron Rupar (@atrupar) October 15, 2018</p></blockquote><p data-node-index=\"0.3.1.2.1.1.4.12\">Image via screen capture.</p><div data-node-index=\"0.3.1.2.1.1.4.13\"><div data-node-index=\"0.3.1.2.1.1.4.13.1\"><div data-node-index=\"0.3.1.2.1.1.4.13.1.1\">Share this Article!</div><div data-node-index=\"0.3.1.2.1.1.4.13.1.2\">Share on Facebook</div><div data-node-index=\"0.3.1.2.1.1.4.13.1.3\">Share on Twitter</div></div></div></div><div data-node-index=\"0.3.1.2.1.1.5\"><p data-node-index=\"0.3.1.2.1.1.5.1\">Author: Conover KennardConover makes tea partiers cry as a hobby. She was Commander of Jade Helm15 during the failed takeover of the South. She's also one of the biggest arseholes on Twitter. At night, she can be found drinking Conservative tears while pulling off the wings of flies just because she can. She is the founder of a Marxist, Commie, Maoist, Socialist site and has contributed to several other sites, blah blah blah. She is an awful person but she doesn't like to brag about that.</p></div></div></div><div data-node-index=\"0.3.1.2.2\" id=\"sidebar\"><div data-node-index=\"0.3.1.2.2.1\" id=\"search-2\"><h3 data-node-index=\"0.3.1.2.2.1.1\">Search</h3></div><div data-node-index=\"0.3.1.2.2.2\" id=\"archives-5\"><h3 data-node-index=\"0.3.1.2.2.2.1\">Archives</h3></div></div><div data-node-index=\"0.3.1.2.3\"><div data-node-index=\"0.3.1.2.3.1\">Share on Facebook</div><div data-node-index=\"0.3.1.2.3.2\">Comments</div></div></div></div><div data-node-index=\"0.3.2\" id=\"footer\"><div data-node-index=\"0.3.2.1\"><div data-node-index=\"0.3.2.1.1\"><h2 data-node-index=\"0.3.2.1.1.1\"><p data-node-index=\"0.3.2.1.1.1.1\">Addicting Info | The Knowledge You Crave</p></h2></div><div data-node-index=\"0.3.2.1.2\">Copyright © 2018 AddictingInfo.org. All Rights Reserved</div></div></div></body></div>",
 7 |   "plain_text": [
 8 |     {"node_index": "0.1.1", "text": "Trump Denies Charitable Donation He Promised If Elizabeth Warren Releases DNA Results And It’s On Video | Addicting Info | The Knowledge You Crave"},
 9 |     {"node_index": "0.2.1", "text": "Trump Denies Charitable Donation He Promised If Elizabeth Warren Releases DNA Results And It’s On Video"},
10 |     {"node_index": "0.3.1.1.1.1.1", "text": "* About Us, * Contact Us, * Creative Commons License, * Privacy Policy, * Terms of Service,"},
11 |     {"node_index": "0.3.1.1.2.1.1", "text": "* Home, * News, * Important Information, * Discredited Myths, * Historical Information, * Political Humor,"},
12 |     {"node_index": "0.3.1.2.1.1.1", "text": "Current Issues, News"},
13 |     {"node_index": "0.3.1.2.1.1.2", "text": "Trump Denies Charitable Donation He Promised If Elizabeth Warren Releases DNA Results And It’s On Video"},
14 |     {"node_index": "0.3.1.2.1.1.3", "text": "By Conover Kennard on October 15, 2018 12:13 pm ·"},
15 |     {"node_index": "0.3.1.2.1.1.4.1", "text": "Donald Trump has repeatedly lashed out at Sen. Elizabeth Warren (D-MA) at his rallies, calling her Pochahontas to cast doubt on her Native American Heritage. Trump even referred to Warren as Pocahontas at a November 2017 White House event honoring Navajo code-breakers. At a rally in Montana this past July, Trump offered Sen. Warren $1 million if she took a DNA test to prove her heritage. Trump spoke of bringing a DNA test to a debate if she challenges him."},
16 |     {"node_index": "0.3.1.2.1.1.4.2", "text": "“We will take that little kit — but we have to do it gently. Because we’re in the #MeToo generation, we have to do it gently,” Trump said. “And we will very gently take that kit, and slowly toss it, hoping it doesn’t injure her arm, and we will say: I will give you a million dollars to your favorite charity, paid for by Trump, if you take the test and it shows you’re an Indian. I have a feeling she will say no.”"},
17 |     {"node_index": "0.3.1.2.1.1.4.3", "text": "Today the Massachusetts Senator released her DNA results."},
18 |     {"node_index": "0.3.1.2.1.1.4.4", "text": "“The facts suggest that you absolutely have a Native American ancestor in your pedigree,” an Ancestry.com and 23andMe adviser told Warren."},
19 |     {"node_index": "0.3.1.2.1.1.4.5", "text": "Warren tweeted Trump to demand he pay what he promised and requested that he make the donation to the National Indigenous Women’s Resource Center."},
20 |     {"node_index": "0.3.1.2.1.1.4.6.1", "text": "By the way, @realDonaldTrump: Remember saying on 7/5 that you’d give $1M to a charity of my choice if my DNA showed Native American ancestry? I remember – and here’s the verdict. Please send the check to the National Indigenous Women’s Resource Center: https://t.co/I6YQ9hf7Tv pic.twitter.com/J4gBamaeeo"},
21 |     {"node_index": "0.3.1.2.1.1.4.6.2", "text": "— Elizabeth Warren (@elizabethforma) October 15, 2018"},
22 |     {"node_index": "0.3.1.2.1.1.4.7", "text": "In response, White House counselor Kellyanne Conway called DNA testing “junk science.”"},
23 |     {"node_index": "0.3.1.2.1.1.4.8", "text": "Then, Trump flat out denied ever promising to make that donation even though it’s on tape."},
24 |     {"node_index": "0.3.1.2.1.1.4.9", "text": "“Who cares?” Trump told reporters today. “I didn’t say that. Nope, you better read it again.”"},
25 |     {"node_index": "0.3.1.2.1.1.4.10", "text": "Well, MSNBC has the receipts:"},
26 |     {"node_index": "0.3.1.2.1.1.4.11.1", "text": "MSNBC put together video of Trump unequivocally telling Elizabeth Warren in July that “I’ll give you a million dollars, to your favorite charity, paid for by Trump, if you take [a DNA] test and it shows you’re an Indian” — followed by him denying he ever said that this morning. pic.twitter.com/zR3n2DqaiY"},
27 |     {"node_index": "0.3.1.2.1.1.4.11.2", "text": "— Aaron Rupar (@atrupar) October 15, 2018"},
28 |     {"node_index": "0.3.1.2.1.1.4.12", "text": "Image via screen capture."},
29 |     {"node_index": "0.3.1.2.1.1.4.13.1.1", "text": "Share this Article!"},
30 |     {"node_index": "0.3.1.2.1.1.4.13.1.2", "text": "Share on Facebook"},
31 |     {"node_index": "0.3.1.2.1.1.4.13.1.3", "text": "Share on Twitter"},
32 |     {"node_index": "0.3.1.2.1.1.5.1", "text": "Author: Conover KennardConover makes tea partiers cry as a hobby. She was Commander of Jade Helm15 during the failed takeover of the South. She's also one of the biggest arseholes on Twitter. At night, she can be found drinking Conservative tears while pulling off the wings of flies just because she can. She is the founder of a Marxist, Commie, Maoist, Socialist site and has contributed to several other sites, blah blah blah. She is an awful person but she doesn't like to brag about that."},
33 |     {"node_index": "0.3.1.2.2.1.1", "text": "Search"},
34 |     {"node_index": "0.3.1.2.2.2.1", "text": "Archives"},
35 |     {"node_index": "0.3.1.2.3.1", "text": "Share on Facebook"},
36 |     {"node_index": "0.3.1.2.3.2", "text": "Comments"},
37 |     {"node_index": "0.3.2.1.1.1.1", "text": "Addicting Info | The Knowledge You Crave"},
38 |     {"node_index": "0.3.2.1.2", "text": "Copyright © 2018 AddictingInfo.org. All Rights Reserved"}
39 |   ]
40 | }
41 | 


--------------------------------------------------------------------------------
/tests/data/list_items_full_page.html:
--------------------------------------------------------------------------------
 1 | <html>
 2 |   <head></head>
 3 |   <body>
 4 |   <div id="ordered-lists">
 5 |     <p>
 6 |       An unordered list inside a paragraph.
 7 |       <ul>
 8 |         <li>Unordered thing one</li>
 9 |         <li>Unordered <a>thing</a> two</li>
10 |         <li></li>
11 |         <li>Unordered thing three</li>
12 |       </ul>
13 |     <p>
14 |       A paragraph with no list before an unordered list outside a paragraph.
15 |     </p>
16 |     <ul>
17 |       <li>Unordered town A</li>
18 |       <li>Unordered town B</li>
19 |       <li></li>
20 |       <li>Unordered town C</li>
21 |     </ul>
22 |     <p>
23 |       A paragraph with no list after an unordered list outside a paragraph.
24 |     </p>
25 |   </div>
26 |   <div id="unordered-lists">
27 |     <p>
28 |       An ordered list inside a paragraph.
29 |       <ol>
30 |         <li>Ordered <span>thing</span> one</li>
31 |         <li>Ordered thing two</li>
32 |         <li>Ordered thing three</li>
33 |       </ol>
34 |     <p>
35 |       A paragraph with no list before an ordered list outside a paragraph.
36 |     </p>
37 |     <ol>
38 |       <li>Ordered town A</li>
39 |       <li>Ordered town B</li>
40 |       <li>Ordered town C</li>
41 |     </ol>
42 |     <p>
43 |       A paragraph with no list after an ordered list outside a paragraph.
44 |     </p>
45 |   </div>
46 |   </body>
47 | </html>


--------------------------------------------------------------------------------
/tests/data/list_items_plain_text_paragraph_node_indexes.json:
--------------------------------------------------------------------------------
 1 | [
 2 |   {"node_index": "0.1.1", "text": "An unordered list inside a paragraph."},
 3 |   {"node_index": "0.1.2", "text": "* Unordered thing one, * Unordered thing two, * Unordered thing three,"},
 4 |   {"node_index": "0.1.3", "text": "A paragraph with no list before an unordered list outside a paragraph."},
 5 |   {"node_index": "0.1.4", "text": "* Unordered town A, * Unordered town B, * Unordered town C,"},
 6 |   {"node_index": "0.1.5", "text": "A paragraph with no list after an unordered list outside a paragraph."},
 7 |   {"node_index": "0.2.1", "text": "An ordered list inside a paragraph."},
 8 |   {"node_index": "0.2.2", "text": "* Ordered thing one, * Ordered thing two, * Ordered thing three,"},
 9 |   {"node_index": "0.2.3", "text": "A paragraph with no list before an ordered list outside a paragraph."},
10 |   {"node_index": "0.2.4", "text": "* Ordered town A, * Ordered town B, * Ordered town C,"},
11 |   {"node_index": "0.2.5", "text": "A paragraph with no list after an ordered list outside a paragraph."}
12 | ]


--------------------------------------------------------------------------------
/tests/data/list_items_simple_article_from_full_page.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "title": null,
 3 |   "byline": null,
 4 |   "date": null,
 5 |   "content": "<div><div id=\"ordered-lists\"><p>An unordered list inside a paragraph.</p><ul><li>Unordered thing one</li><li>Unordered thing two</li><li>Unordered thing three</li></ul><p>A paragraph with no list before an unordered list outside a paragraph.</p><ul><li>Unordered town A</li><li>Unordered town B</li><li>Unordered town C</li></ul><p>A paragraph with no list after an unordered list outside a paragraph.</p></div><div id=\"unordered-lists\"><p>An ordered list inside a paragraph.</p><ol><li>Ordered thing one</li><li>Ordered thing two</li><li>Ordered thing three</li></ol><p>A paragraph with no list before an ordered list outside a paragraph.</p><ol><li>Ordered town A</li><li>Ordered town B</li><li>Ordered town C</li></ol><p>A paragraph with no list after an ordered list outside a paragraph.</p></div></div>",
 6 |   "plain_content": "<div><div id=\"ordered-lists\"><p>An unordered list inside a paragraph.</p><ul><li>Unordered thing one</li><li>Unordered thing two</li><li>Unordered thing three</li></ul><p>A paragraph with no list before an unordered list outside a paragraph.</p><ul><li>Unordered town A</li><li>Unordered town B</li><li>Unordered town C</li></ul><p>A paragraph with no list after an unordered list outside a paragraph.</p></div><div id=\"unordered-lists\"><p>An ordered list inside a paragraph.</p><ol><li>Ordered thing one</li><li>Ordered thing two</li><li>Ordered thing three</li></ol><p>A paragraph with no list before an ordered list outside a paragraph.</p><ol><li>Ordered town A</li><li>Ordered town B</li><li>Ordered town C</li></ol><p>A paragraph with no list after an ordered list outside a paragraph.</p></div></div>",
 7 |   "plain_text": [
 8 |     {"text": "An unordered list inside a paragraph."},
 9 |     {"text": "* Unordered thing one, * Unordered thing two, * Unordered thing three,"},
10 |     {"text": "A paragraph with no list before an unordered list outside a paragraph."},
11 |     {"text": "* Unordered town A, * Unordered town B, * Unordered town C,"},
12 |     {"text": "A paragraph with no list after an unordered list outside a paragraph."},
13 |     {"text": "An ordered list inside a paragraph."},
14 |     {"text": "* Ordered thing one, * Ordered thing two, * Ordered thing three,"},
15 |     {"text": "A paragraph with no list before an ordered list outside a paragraph."},
16 |     {"text": "* Ordered town A, * Ordered town B, * Ordered town C,"},
17 |     {"text": "A paragraph with no list after an ordered list outside a paragraph."}
18 |   ]
19 | }
20 | 


--------------------------------------------------------------------------------
/tests/data/list_items_simple_article_from_full_page_content_digests.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "title": null,
 3 |   "byline": null,
 4 |   "date": null,
 5 |   "content": "<div><div id=\"ordered-lists\"><p>An unordered list inside a paragraph.</p><ul><li>Unordered thing one</li><li>Unordered thing two</li><li>Unordered thing three</li></ul><p>A paragraph with no list before an unordered list outside a paragraph.</p><ul><li>Unordered town A</li><li>Unordered town B</li><li>Unordered town C</li></ul><p>A paragraph with no list after an unordered list outside a paragraph.</p></div><div id=\"unordered-lists\"><p>An ordered list inside a paragraph.</p><ol><li>Ordered thing one</li><li>Ordered thing two</li><li>Ordered thing three</li></ol><p>A paragraph with no list before an ordered list outside a paragraph.</p><ol><li>Ordered town A</li><li>Ordered town B</li><li>Ordered town C</li></ol><p>A paragraph with no list after an ordered list outside a paragraph.</p></div></div>",
 6 |   "plain_content": "<div data-content-digest=\"e9bb4353c87a5e6fdb8d2be2f36c4695d4c7ff618a0e82f0a746119f5cef0f90\"><div data-content-digest=\"40786ff56358941b23bb41c441a766fa75de45bce97c4bf712c39e3a314a74d7\" id=\"ordered-lists\"><p data-content-digest=\"23ccc43d7d274c5f9c129cad053e2c2610e912ae91859753768fd7e7fc2348a4\">An unordered list inside a paragraph.</p><ul data-content-digest=\"6a6b7854988c8b77960e1a332b45ae5ec2c15d8d5baf076089db5b586fcbb09c\"><li data-content-digest=\"80a6808aacfe0318d824b4643e42c589b1b0b6e20cb84ecccf4b0478d1839133\">Unordered thing one</li><li data-content-digest=\"4c59f8ae20978a92c780ea0eec76a42c125789b6d35d4fddb98d82dd49064123\">Unordered thing two</li><li data-content-digest=\"ec2d8aa4b21a543068c370910b7e17f53af83dd4adde664ba3336a1c86bcf76e\">Unordered thing three</li></ul><p data-content-digest=\"cdeb5342b8727366b0432af57ad8cc4b0e26e1f971c066be44b9ccf33843d02c\">A paragraph with no list before an unordered list outside a paragraph.</p><ul data-content-digest=\"75e94d667c82b2aecf965953e8d8e871a690b98e05483b56e4c78ae89841fc3c\"><li data-content-digest=\"d1ad1707275ac70339e54519bcabf0c805e59e9cbc96e772bd3559bfd2fe2c10\">Unordered town A</li><li data-content-digest=\"4baf512aa623268fc5b96b1d3385bf81fefcab83515db67bf2bedf4ebe0f649c\">Unordered town B</li><li data-content-digest=\"cc00319e1533e14149551372f6087782119c0752c2508ab6ea0f1cc8706d8d7d\">Unordered town C</li></ul><p data-content-digest=\"a3d3a7142056c8a8162f7c1979fb81c52ae435d195871259570fdab4dbcd4150\">A paragraph with no list after an unordered list outside a paragraph.</p></div><div data-content-digest=\"ac70a47027a1176aa8752c6e3d3f221745816edf73366564721a897896f723a8\" id=\"unordered-lists\"><p data-content-digest=\"77a88ede677c3cfc43c48d90d986b808a42077816c1417508639aa593702f6e8\">An ordered list inside a paragraph.</p><ol data-content-digest=\"5d1176738cf1bba08063ee54ec6e4cceadc2cf838c7db92d168485b9b2ffbbbb\"><li 
data-content-digest=\"5d42930b4354c65b6c1236c8cc3c7ea558a0d8c901ef8c3fddbc9f2a7f7b9649\">Ordered thing one</li><li data-content-digest=\"154b0725d85f0cd973785ebb15290c3200bb8609c1fc0040b2d8834330704af1\">Ordered thing two</li><li data-content-digest=\"9908b35afc2ed54ca0bec8d7e33965b2d4539814a75c7b9833d390c5aa4c2fde\">Ordered thing three</li></ol><p data-content-digest=\"432522f27359153581d30f1c18263de2666959750e5f64d1de3425417f754d89\">A paragraph with no list before an ordered list outside a paragraph.</p><ol data-content-digest=\"cf7c0afe5736fb6642b7834e424e4b0749ccdc6abb8fdb4827c28fe4a0139a55\"><li data-content-digest=\"34379d3f4d216af2d72b73034ed3602809fc0fa54022224030d6af0407558eb3\">Ordered town A</li><li data-content-digest=\"b84fc3569b2d8f67d8538282285bed83a73110d34d1607167f0f0ca80265865c\">Ordered town B</li><li data-content-digest=\"68edeeb158527cc01de28fcdc06c740c08f781f43a6a6734b9fc3f0d82b403b1\">Ordered town C</li></ol><p data-content-digest=\"37902c8d6c34b08f68cede9b72945e7a92c1095f7e5b3d91abef96e0c570cc07\">A paragraph with no list after an ordered list outside a paragraph.</p></div></div>",
 7 |   "plain_text": [
 8 |     {"text": "An unordered list inside a paragraph."},
 9 |     {"text": "* Unordered thing one, * Unordered thing two, * Unordered thing three,"},
10 |     {"text": "A paragraph with no list before an unordered list outside a paragraph."},
11 |     {"text": "* Unordered town A, * Unordered town B, * Unordered town C,"},
12 |     {"text": "A paragraph with no list after an unordered list outside a paragraph."},
13 |     {"text": "An ordered list inside a paragraph."},
14 |     {"text": "* Ordered thing one, * Ordered thing two, * Ordered thing three,"},
15 |     {"text": "A paragraph with no list before an ordered list outside a paragraph."},
16 |     {"text": "* Ordered town A, * Ordered town B, * Ordered town C,"},
17 |     {"text": "A paragraph with no list after an ordered list outside a paragraph."}
18 |   ]
19 | }
20 | 


--------------------------------------------------------------------------------
/tests/data/list_items_simple_article_from_full_page_node_indexes.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "title": null,
 3 |   "byline": null,
 4 |   "date": null,
 5 |   "content": "<div><div id=\"ordered-lists\"><p>An unordered list inside a paragraph.</p><ul><li>Unordered thing one</li><li>Unordered thing two</li><li>Unordered thing three</li></ul><p>A paragraph with no list before an unordered list outside a paragraph.</p><ul><li>Unordered town A</li><li>Unordered town B</li><li>Unordered town C</li></ul><p>A paragraph with no list after an unordered list outside a paragraph.</p></div><div id=\"unordered-lists\"><p>An ordered list inside a paragraph.</p><ol><li>Ordered thing one</li><li>Ordered thing two</li><li>Ordered thing three</li></ol><p>A paragraph with no list before an ordered list outside a paragraph.</p><ol><li>Ordered town A</li><li>Ordered town B</li><li>Ordered town C</li></ol><p>A paragraph with no list after an ordered list outside a paragraph.</p></div></div>",
 6 |   "plain_content": "<div data-node-index=\"0\"><div data-node-index=\"0.1\" id=\"ordered-lists\"><p data-node-index=\"0.1.1\">An unordered list inside a paragraph.</p><ul data-node-index=\"0.1.2\"><li data-node-index=\"0.1.2.1\">Unordered thing one</li><li data-node-index=\"0.1.2.2\">Unordered thing two</li><li data-node-index=\"0.1.2.3\">Unordered thing three</li></ul><p data-node-index=\"0.1.3\">A paragraph with no list before an unordered list outside a paragraph.</p><ul data-node-index=\"0.1.4\"><li data-node-index=\"0.1.4.1\">Unordered town A</li><li data-node-index=\"0.1.4.2\">Unordered town B</li><li data-node-index=\"0.1.4.3\">Unordered town C</li></ul><p data-node-index=\"0.1.5\">A paragraph with no list after an unordered list outside a paragraph.</p></div><div data-node-index=\"0.2\" id=\"unordered-lists\"><p data-node-index=\"0.2.1\">An ordered list inside a paragraph.</p><ol data-node-index=\"0.2.2\"><li data-node-index=\"0.2.2.1\">Ordered thing one</li><li data-node-index=\"0.2.2.2\">Ordered thing two</li><li data-node-index=\"0.2.2.3\">Ordered thing three</li></ol><p data-node-index=\"0.2.3\">A paragraph with no list before an ordered list outside a paragraph.</p><ol data-node-index=\"0.2.4\"><li data-node-index=\"0.2.4.1\">Ordered town A</li><li data-node-index=\"0.2.4.2\">Ordered town B</li><li data-node-index=\"0.2.4.3\">Ordered town C</li></ol><p data-node-index=\"0.2.5\">A paragraph with no list after an ordered list outside a paragraph.</p></div></div>",
 7 |   "plain_text": [
 8 |     {"node_index": "0.1.1", "text": "An unordered list inside a paragraph."},
 9 |     {"node_index": "0.1.2", "text": "* Unordered thing one, * Unordered thing two, * Unordered thing three,"},
10 |     {"node_index": "0.1.3", "text": "A paragraph with no list before an unordered list outside a paragraph."},
11 |     {"node_index": "0.1.4", "text": "* Unordered town A, * Unordered town B, * Unordered town C,"},
12 |     {"node_index": "0.1.5", "text": "A paragraph with no list after an unordered list outside a paragraph."},
13 |     {"node_index": "0.2.1", "text": "An ordered list inside a paragraph."},
14 |     {"node_index": "0.2.2", "text": "* Ordered thing one, * Ordered thing two, * Ordered thing three,"},
15 |     {"node_index": "0.2.3", "text": "A paragraph with no list before an ordered list outside a paragraph."},
16 |     {"node_index": "0.2.4", "text": "* Ordered town A, * Ordered town B, * Ordered town C,"},
17 |     {"node_index": "0.2.5", "text": "A paragraph with no list after an ordered list outside a paragraph."}
18 |   ]
19 | }
20 | 


--------------------------------------------------------------------------------
/tests/data/non_article_full_page.html:
--------------------------------------------------------------------------------
1 | <html>
2 |     <head></head>
3 |     <body>No article here!</body>
4 | </html>


--------------------------------------------------------------------------------
/tests/data/non_article_full_page.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "title": null,
 3 |   "byline": null,
 4 |   "date": null,
 5 |   "content": "<div><p>No article here!</p></div>",
 6 |   "plain_content": "<div><p>No article here!</p></div>",
 7 |   "plain_text": [
 8 |     {"text": "No article here!"}
 9 |   ]
10 | }
11 | 


--------------------------------------------------------------------------------
/tests/data/plain-content-test_full_article.html:
--------------------------------------------------------------------------------
 1 | <html>
 2 |     <head>
 3 | 
 4 |     </head>
 5 |     <body>
 6 |         <div class="page">
 7 |             <div class="content">
 8 |                 <article class="article">
 9 |                     <h1> Article title </h1>
10 |                     <div class="summary">
11 |                         Proin vulputate viverra dapibus. Duis hendrerit suscipit porta. Mauris arcu urna, placerat a
12 |                         lorem non, placerat sodales massa. Proin faucibus libero sollicitudin auctor tincidunt. Fusce
13 |                         malesuada massa quis enim cursus, at tincidunt dolor pretium. Donec sed felis et dolor semper
14 |                         molestie eget ut velit. Mauris ac ultricies arcu, nec vehicula felis. In eu mauris vel est
15 |                         tempor lobortis. Aenean sagittis molestie elit quis suscipit. Praesent volutpat mi in ipsum
16 |                         auctor, vitae dapibus ipsum cursus. Etiam iaculis lobortis mi ut accumsan. Vestibulum lacinia
17 |                         aliquam euismod. Cras venenatis dolor vel vulputate aliquet. Fusce cursus est quis nisi
18 |                         fringilla rhoncus. Maecenas lobortis nisi quis porttitor consectetur.
19 |                     </div>
20 |                     Inside div, after div, before paragraph.
21 |                     <p>Lorem ipsum dolor sit amet, consectetur adipiscing elit. Nullam iaculis urna nec eros iaculis
22 |                         pretium. Sed efficitur eleifend turpis. Quisque at ipsum interdum, sagittis diam id, malesuada
23 |                         enim. Ut in nibh vitae eros vestibulum aliquam. Fusce finibus, ligula sed ullamcorper aliquet,
24 |                         nisl diam mollis quam, eu varius ligula justo nec nulla. Sed ut purus sollicitudin, porttitor
25 |                         mi quis, suscipit quam. In sit amet gravida nisl.
26 |                         <blockquote>
27 |                             Vivamus sapien mi, ultricies sit amet dolor sit amet, accumsan accumsan lacus. Donec in
28 |                     elit sit amet nisl pulvinar commodo. Nam at libero a est blandit vestibulum. Nam quis tempor sem.
29 |                     Integer a lobortis erat, vel ultricies diam. Vivamus sit amet ipsum in sapien tempus semper.
30 |                     Proin luctus pulvinar velit, eget egestas neque condimentum sit amet. Nam ut fringilla magna.
31 |                     Donec blandit lobortis mattis.
32 |                     <br/><br/>
33 |                     Maecenas nec ipsum quis ipsum laoreet fermentum sit amet id est.
34 |                     Donec efficitur id est in iaculis. Morbi vel magna id ex efficitur tempus in in lectus. Fusce at
35 |                     orci vulputate, aliquet turpis eget, tincidunt leo. Aenean efficitur nibh vitae nisi volutpat
36 |                     iaculis. Etiam bibendum rhoncus vulputate.
37 |                         </blockquote>
38 |                         Nunc scelerisque, nibh ut porta cursus, ex
39 |                         orci molestie odio, in fringilla diam ex quis dui. Curabitur facilisis lacus ac congue finibus.
40 |                         Sed ipsum ex, blandit in dolor in, efficitur iaculis neque. Morbi gravida dapibus erat, eget
41 |                         cursus ipsum vehicula sit amet. Praesent feugiat bibendum felis venenatis pellentesque. Ut eu
42 |                         enim non nisl porttitor accumsan. Cras fringilla volutpat imperdiet.
43 |                     </p>
44 |                     Inside div, after parqagraph, before blockquote
45 |                     <blockquote>
46 |                         Inside blockquote before paragraph.
47 |                     <p>
48 |                         Donec sollicitudin, felis eu sodales semper, purus nisl mollis tellus, at bibendum diam sem non
49 |                         lectus. Nam vestibulum sagittis ultricies. Duis id metus lacus. Duis ac neque quis lectus ornare
50 |                         pharetra. Proin facilisis lectus sed urna tincidunt, ut cursus massa aliquam. Curabitur luctus
51 |                         aliquet sollicitudin. Nam dapibus purus maximus nulla semper, nec auctor augue interdum. Fusce
52 |                         ornare in velit vitae luctus.
53 |                     </p>
54 |                         Inside blockquote after paragraph.
55 |                     </blockquote>
56 |                     <div class="nested-parent">
57 |                         Inside div before div
58 |                         <div class="nested-child">
59 |                             Inside div before paragraph
60 |                             <p>
61 |                                 Nullam non ante nec felis rhoncus mattis. Mauris ornare eros et purus efficitur, in
62 |                                 euismod lorem vestibulum. Vestibulum euismod mattis ornare. Aenean a sem eu tellus
63 |                                 vestibulum sagittis. Vestibulum lectus nisi, tempor eget velit consectetur, vulputate
64 |                                 tincidunt erat. Integer eleifend sit amet risus a blandit. Sed sagittis et lorem eu
65 |                                 tincidunt. Integer eget tortor molestie, consequat nibh vitae, dignissim risus. Cras at
66 |                                 neque non dolor viverra volutpat eget id dui. Cras sed convallis dui. Vestibulum porta
67 |                                 odio nec risus consectetur condimentum. Curabitur commodo odio eget justo suscipit
68 |                                 mattis. Proin in turpis at nisl venenatis sagittis. Integer justo tortor, accumsan
69 |                                 blandit nulla at, molestie viverra enim. Duis pulvinar pulvinar massa, ac dapibus enim.
70 |                             </p>
71 |                         </div>
72 |                         Inside div after div.
73 |                     </div>
74 |                     <h2>Second level heading</h2>
75 |                     Text after second level heading and before third level heading.
76 |                     <h3> Third level heading </h3>
77 |                     Text after third level heading and before fourth level heading.
78 |                     <h4> Fourth level heading </h4>
79 |                     Text after fourth level heading and before fifth level heading.
80 |                     <h5> Fifth level heading </h5>
81 |                     Text after fifth level heading and before sixth level heading.
82 |                     <h6> Sixth level heading </h6>
83 |                     Text after Sixth level heading.
84 |                     <section>
85 |                         Cras ac orci faucibus, rhoncus erat ac, fermentum ligula. Suspendisse et elit sed justo mollis
86 |                         lobortis. Ut consequat aliquet varius. Morbi accumsan nulla eget lectus euismod elementum.
87 |                         Quisque tincidunt ornare nibh in sagittis. Integer lorem tortor, iaculis eget ex vitae, tempus
88 |                         porta turpis. Donec aliquam lorem at mauris condimentum pellentesque. Proin posuere cursus
89 |                         diam, in commodo enim dignissim eu. Pellentesque id libero at lorem dapibus laoreet eget nec
90 |                         ligula.
91 |                     </section>
92 |                 </article>
93 |             </div>
94 |         </div>
95 |     </body>
96 | </html>


--------------------------------------------------------------------------------
/tests/data/plain-content-test_full_article_javascript.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "title": null,
 3 |     "byline": null,
 4 |     "date": null,
 5 |     "content": "<div id=\"readability-page-1\" class=\"page\"><div class=\"page\">\n                <article>\n                    <h2> Article title </h2>\n                    <p>\n                        Proin vulputate viverra dapibus. Duis hendrerit suscipit porta. Mauris arcu urna, placerat a\n                        lorem non, placerat sodales massa. Proin faucibus libero sollicitudin auctor tincidunt. Fusce\n                        malesuada massa quis enim cursus, at tincidunt dolor pretium. Donec sed felis et dolor semper\n                        molestie eget ut velit. Mauris ac ultricies arcu, nec vehicula felis. In eu mauris vel est\n                        tempor lobortis. Aenean sagittis molestie elit quis suscipit. Praesent volutpat mi in ipsum\n                        auctor, vitae dapibus ipsum cursus. Etiam iaculis lobortis mi ut accumsan. Vestibulum lacinia\n                        aliquam euismod. Cras venenatis dolor vel vulputate aliquet. Fusce cursus est quis nisi\n                        fringilla rhoncus. Maecenas lobortis nisi quis porttitor consectetur.\n                    </p>\n                    Inside div, after div, before paragraph.\n                    <p>Lorem ipsum dolor sit amet, consectetur adipiscing elit. Nullam iaculis urna nec eros iaculis\n                        pretium. Sed efficitur eleifend turpis. Quisque at ipsum interdum, sagittis diam id, malesuada\n                        enim. Ut in nibh vitae eros vestibulum aliquam. Fusce finibus, ligula sed ullamcorper aliquet,\n                        nisl diam mollis quam, eu varius ligula justo nec nulla. Sed ut purus sollicitudin, porttitor\n                        mi quis, suscipit quam. In sit amet gravida nisl.\n                        </p><blockquote>\n                            Vivamus sapien mi, ultricies sit amet dolor sit amet, accumsan accumsan lacus. Donec in\n                    elit sit amet nisl pulvinar commodo. Nam at libero a est blandit vestibulum. 
Nam quis tempor sem.\n                    Integer a lobortis erat, vel ultricies diam. Vivamus sit amet ipsum in sapien tempus semper.\n                    Proin luctus pulvinar velit, eget egestas neque condimentum sit amet. Nam ut fringilla magna.\n                    Donec blandit lobortis mattis.\n                    <p>\n                    Maecenas nec ipsum quis ipsum laoreet fermentum sit amet id est.\n                    Donec efficitur id est in iaculis. Morbi vel magna id ex efficitur tempus in in lectus. Fusce at\n                    orci vulputate, aliquet turpis eget, tincidunt leo. Aenean efficitur nibh vitae nisi volutpat\n                    iaculis. Etiam bibendum rhoncus vulputate.\n                        </p></blockquote>\n                        Nunc scelerisque, nibh ut porta cursus, ex\n                        orci molestie odio, in fringilla diam ex quis dui. Curabitur facilisis lacus ac congue finibus.\n                        Sed ipsum ex, blandit in dolor in, efficitur iaculis neque. Morbi gravida dapibus erat, eget\n                        cursus ipsum vehicula sit amet. Praesent feugiat bibendum felis venenatis pellentesque. Ut eu\n                        enim non nisl porttitor accumsan. Cras fringilla volutpat imperdiet.\n                    \n                    Inside div, after parqagraph, before blockquote\n                    <blockquote>\n                        Inside blockquote before paragraph.\n                    <p>\n                        Donec sollicitudin, felis eu sodales semper, purus nisl mollis tellus, at bibendum diam sem non\n                        lectus. Nam vestibulum sagittis ultricies. Duis id metus lacus. Duis ac neque quis lectus ornare\n                        pharetra. Proin facilisis lectus sed urna tincidunt, ut cursus massa aliquam. Curabitur luctus\n                        aliquet sollicitudin. Nam dapibus purus maximus nulla semper, nec auctor augue interdum. 
Fusce\n                        ornare in velit vitae luctus.\n                    </p>\n                        Inside blockquote after paragraph.\n                    </blockquote>\n                    <div><p>\n                        Inside div before div\n                        </p><div><p>\n                            Inside div before paragraph\n                            </p><p>\n                                Nullam non ante nec felis rhoncus mattis. Mauris ornare eros et purus efficitur, in\n                                euismod lorem vestibulum. Vestibulum euismod mattis ornare. Aenean a sem eu tellus\n                                vestibulum sagittis. Vestibulum lectus nisi, tempor eget velit consectetur, vulputate\n                                tincidunt erat. Integer eleifend sit amet risus a blandit. Sed sagittis et lorem eu\n                                tincidunt. Integer eget tortor molestie, consequat nibh vitae, dignissim risus. Cras at\n                                neque non dolor viverra volutpat eget id dui. Cras sed convallis dui. Vestibulum porta\n                                odio nec risus consectetur condimentum. Curabitur commodo odio eget justo suscipit\n                                mattis. Proin in turpis at nisl venenatis sagittis. Integer justo tortor, accumsan\n                                blandit nulla at, molestie viverra enim. 
Duis pulvinar pulvinar massa, ac dapibus enim.\n                            </p>\n                        </div><p>\n                        Inside div after div.\n                    </p></div>\n                    <h2>Second level heading</h2>\n                    Text after second level heading and before third level heading.\n                    <h3> Third level heading </h3>\n                    Text after third level heading and before fourth level heading.\n                    <h4> Fourth level heading </h4>\n                    Text after fourth level heading and before fifth level heading.\n                    <h5> Fifth level heading </h5>\n                    Text after fifth level heading and before sixth level heading.\n                    <h6> Sixth level heading </h6>\n                    Text after Sixth level heading.\n                    <section>\n                        Cras ac orci faucibus, rhoncus erat ac, fermentum ligula. Suspendisse et elit sed justo mollis\n                        lobortis. Ut consequat aliquet varius. Morbi accumsan nulla eget lectus euismod elementum.\n                        Quisque tincidunt ornare nibh in sagittis. Integer lorem tortor, iaculis eget ex vitae, tempus\n                        porta turpis. Donec aliquam lorem at mauris condimentum pellentesque. Proin posuere cursus\n                        diam, in commodo enim dignissim eu. Pellentesque id libero at lorem dapibus laoreet eget nec\n                        ligula.\n                    </section>\n                </article>\n            </div></div>",
 6 |     "plain_content": "<div class=\"page\" id=\"readability-page-1\"><div class=\"page\"><article><h2>Article title</h2><p>Proin vulputate viverra dapibus. Duis hendrerit suscipit porta. Mauris arcu urna, placerat a lorem non, placerat sodales massa. Proin faucibus libero sollicitudin auctor tincidunt. Fusce malesuada massa quis enim cursus, at tincidunt dolor pretium. Donec sed felis et dolor semper molestie eget ut velit. Mauris ac ultricies arcu, nec vehicula felis. In eu mauris vel est tempor lobortis. Aenean sagittis molestie elit quis suscipit. Praesent volutpat mi in ipsum auctor, vitae dapibus ipsum cursus. Etiam iaculis lobortis mi ut accumsan. Vestibulum lacinia aliquam euismod. Cras venenatis dolor vel vulputate aliquet. Fusce cursus est quis nisi fringilla rhoncus. Maecenas lobortis nisi quis porttitor consectetur.</p>Inside div, after div, before paragraph.<p>Lorem ipsum dolor sit amet, consectetur adipiscing elit. Nullam iaculis urna nec eros iaculis pretium. Sed efficitur eleifend turpis. Quisque at ipsum interdum, sagittis diam id, malesuada enim. Ut in nibh vitae eros vestibulum aliquam. Fusce finibus, ligula sed ullamcorper aliquet, nisl diam mollis quam, eu varius ligula justo nec nulla. Sed ut purus sollicitudin, porttitor mi quis, suscipit quam. In sit amet gravida nisl.</p><blockquote>Vivamus sapien mi, ultricies sit amet dolor sit amet, accumsan accumsan lacus. Donec in elit sit amet nisl pulvinar commodo. Nam at libero a est blandit vestibulum. Nam quis tempor sem. Integer a lobortis erat, vel ultricies diam. Vivamus sit amet ipsum in sapien tempus semper. Proin luctus pulvinar velit, eget egestas neque condimentum sit amet. Nam ut fringilla magna. Donec blandit lobortis mattis.<p>Maecenas nec ipsum quis ipsum laoreet fermentum sit amet id est. Donec efficitur id est in iaculis. Morbi vel magna id ex efficitur tempus in in lectus. Fusce at orci vulputate, aliquet turpis eget, tincidunt leo. 
Aenean efficitur nibh vitae nisi volutpat iaculis. Etiam bibendum rhoncus vulputate.</p></blockquote>Nunc scelerisque, nibh ut porta cursus, ex orci molestie odio, in fringilla diam ex quis dui. Curabitur facilisis lacus ac congue finibus. Sed ipsum ex, blandit in dolor in, efficitur iaculis neque. Morbi gravida dapibus erat, eget cursus ipsum vehicula sit amet. Praesent feugiat bibendum felis venenatis pellentesque. Ut eu enim non nisl porttitor accumsan. Cras fringilla volutpat imperdiet. Inside div, after parqagraph, before blockquote<blockquote>Inside blockquote before paragraph.<p>Donec sollicitudin, felis eu sodales semper, purus nisl mollis tellus, at bibendum diam sem non lectus. Nam vestibulum sagittis ultricies. Duis id metus lacus. Duis ac neque quis lectus ornare pharetra. Proin facilisis lectus sed urna tincidunt, ut cursus massa aliquam. Curabitur luctus aliquet sollicitudin. Nam dapibus purus maximus nulla semper, nec auctor augue interdum. Fusce ornare in velit vitae luctus.</p>Inside blockquote after paragraph.</blockquote><div><p>Inside div before div</p><div><p>Inside div before paragraph</p><p>Nullam non ante nec felis rhoncus mattis. Mauris ornare eros et purus efficitur, in euismod lorem vestibulum. Vestibulum euismod mattis ornare. Aenean a sem eu tellus vestibulum sagittis. Vestibulum lectus nisi, tempor eget velit consectetur, vulputate tincidunt erat. Integer eleifend sit amet risus a blandit. Sed sagittis et lorem eu tincidunt. Integer eget tortor molestie, consequat nibh vitae, dignissim risus. Cras at neque non dolor viverra volutpat eget id dui. Cras sed convallis dui. Vestibulum porta odio nec risus consectetur condimentum. Curabitur commodo odio eget justo suscipit mattis. Proin in turpis at nisl venenatis sagittis. Integer justo tortor, accumsan blandit nulla at, molestie viverra enim. 
Duis pulvinar pulvinar massa, ac dapibus enim.</p></div><p>Inside div after div.</p></div><h2>Second level heading</h2>Text after second level heading and before third level heading.<h3>Third level heading</h3>Text after third level heading and before fourth level heading.<h4>Fourth level heading</h4>Text after fourth level heading and before fifth level heading.<h5>Fifth level heading</h5>Text after fifth level heading and before sixth level heading.<h6>Sixth level heading</h6>Text after Sixth level heading.<section>Cras ac orci faucibus, rhoncus erat ac, fermentum ligula. Suspendisse et elit sed justo mollis lobortis. Ut consequat aliquet varius. Morbi accumsan nulla eget lectus euismod elementum. Quisque tincidunt ornare nibh in sagittis. Integer lorem tortor, iaculis eget ex vitae, tempus porta turpis. Donec aliquam lorem at mauris condimentum pellentesque. Proin posuere cursus diam, in commodo enim dignissim eu. Pellentesque id libero at lorem dapibus laoreet eget nec ligula.</section></article></div></div>",
 7 |     "plain_text": [
 8 |         {"text": "Article title"},
 9 |         {"text": "Proin vulputate viverra dapibus. Duis hendrerit suscipit porta. Mauris arcu urna, placerat a lorem non, placerat sodales massa. Proin faucibus libero sollicitudin auctor tincidunt. Fusce malesuada massa quis enim cursus, at tincidunt dolor pretium. Donec sed felis et dolor semper molestie eget ut velit. Mauris ac ultricies arcu, nec vehicula felis. In eu mauris vel est tempor lobortis. Aenean sagittis molestie elit quis suscipit. Praesent volutpat mi in ipsum auctor, vitae dapibus ipsum cursus. Etiam iaculis lobortis mi ut accumsan. Vestibulum lacinia aliquam euismod. Cras venenatis dolor vel vulputate aliquet. Fusce cursus est quis nisi fringilla rhoncus. Maecenas lobortis nisi quis porttitor consectetur."},
10 |         {"text": "Inside div, after div, before paragraph."},
11 |         {"text": "Lorem ipsum dolor sit amet, consectetur adipiscing elit. Nullam iaculis urna nec eros iaculis pretium. Sed efficitur eleifend turpis. Quisque at ipsum interdum, sagittis diam id, malesuada enim. Ut in nibh vitae eros vestibulum aliquam. Fusce finibus, ligula sed ullamcorper aliquet, nisl diam mollis quam, eu varius ligula justo nec nulla. Sed ut purus sollicitudin, porttitor mi quis, suscipit quam. In sit amet gravida nisl."},
12 |         {"text": "Vivamus sapien mi, ultricies sit amet dolor sit amet, accumsan accumsan lacus. Donec in elit sit amet nisl pulvinar commodo. Nam at libero a est blandit vestibulum. Nam quis tempor sem. Integer a lobortis erat, vel ultricies diam. Vivamus sit amet ipsum in sapien tempus semper. Proin luctus pulvinar velit, eget egestas neque condimentum sit amet. Nam ut fringilla magna. Donec blandit lobortis mattis."},
13 |         {"text": "Maecenas nec ipsum quis ipsum laoreet fermentum sit amet id est. Donec efficitur id est in iaculis. Morbi vel magna id ex efficitur tempus in in lectus. Fusce at orci vulputate, aliquet turpis eget, tincidunt leo. Aenean efficitur nibh vitae nisi volutpat iaculis. Etiam bibendum rhoncus vulputate."},
14 |         {"text": "Nunc scelerisque, nibh ut porta cursus, ex orci molestie odio, in fringilla diam ex quis dui. Curabitur facilisis lacus ac congue finibus. Sed ipsum ex, blandit in dolor in, efficitur iaculis neque. Morbi gravida dapibus erat, eget cursus ipsum vehicula sit amet. Praesent feugiat bibendum felis venenatis pellentesque. Ut eu enim non nisl porttitor accumsan. Cras fringilla volutpat imperdiet. Inside div, after parqagraph, before blockquote"},
15 |         {"text": "Inside blockquote before paragraph."},
16 |         {"text": "Donec sollicitudin, felis eu sodales semper, purus nisl mollis tellus, at bibendum diam sem non lectus. Nam vestibulum sagittis ultricies. Duis id metus lacus. Duis ac neque quis lectus ornare pharetra. Proin facilisis lectus sed urna tincidunt, ut cursus massa aliquam. Curabitur luctus aliquet sollicitudin. Nam dapibus purus maximus nulla semper, nec auctor augue interdum. Fusce ornare in velit vitae luctus."},
17 |         {"text": "Inside blockquote after paragraph."},
18 |         {"text": "Inside div before div"},
19 |         {"text": "Inside div before paragraph"},
20 |         {"text": "Nullam non ante nec felis rhoncus mattis. Mauris ornare eros et purus efficitur, in euismod lorem vestibulum. Vestibulum euismod mattis ornare. Aenean a sem eu tellus vestibulum sagittis. Vestibulum lectus nisi, tempor eget velit consectetur, vulputate tincidunt erat. Integer eleifend sit amet risus a blandit. Sed sagittis et lorem eu tincidunt. Integer eget tortor molestie, consequat nibh vitae, dignissim risus. Cras at neque non dolor viverra volutpat eget id dui. Cras sed convallis dui. Vestibulum porta odio nec risus consectetur condimentum. Curabitur commodo odio eget justo suscipit mattis. Proin in turpis at nisl venenatis sagittis. Integer justo tortor, accumsan blandit nulla at, molestie viverra enim. Duis pulvinar pulvinar massa, ac dapibus enim."},
21 |         {"text": "Inside div after div."},
22 |         {"text": "Second level heading"},
23 |         {"text": "Text after second level heading and before third level heading."},
24 |         {"text": "Third level heading"},
25 |         {"text": "Text after third level heading and before fourth level heading."},
26 |         {"text": "Fourth level heading"},
27 |         {"text": "Text after fourth level heading and before fifth level heading."},
28 |         {"text": "Fifth level heading"},
29 |         {"text": "Text after fifth level heading and before sixth level heading."},
30 |         {"text": "Sixth level heading"},
31 |         {"text": "Text after Sixth level heading."},
32 |         {"text": "Cras ac orci faucibus, rhoncus erat ac, fermentum ligula. Suspendisse et elit sed justo mollis lobortis. Ut consequat aliquet varius. Morbi accumsan nulla eget lectus euismod elementum. Quisque tincidunt ornare nibh in sagittis. Integer lorem tortor, iaculis eget ex vitae, tempus porta turpis. Donec aliquam lorem at mauris condimentum pellentesque. Proin posuere cursus diam, in commodo enim dignissim eu. Pellentesque id libero at lorem dapibus laoreet eget nec ligula."}
33 |     ]
34 | }
35 | 


--------------------------------------------------------------------------------
/tests/test_article_extraction.py:
--------------------------------------------------------------------------------
  1 | """Test article extraction and plain-text paragraph extraction on sample articles"""
  2 | from checks import check_extract_article, check_extract_paragraphs_as_plain_text
  3 | 
  4 | 
  5 | # Test end-to-end article extraction
  6 | def test_extract_article_full_page():
  7 |     check_extract_article(
  8 |         "addictinginfo.com-1_full_page.html",
  9 |         "addictinginfo.com-1_simple_article_from_full_page.json"
 10 |     )
 11 | 
 12 | 
 13 | def test_extract_article_full_article():
 14 |     check_extract_article(
 15 |         "addictinginfo.com-1_full_article.html",
 16 |         "addictinginfo.com-1_simple_article_from_full_article.json"
 17 |     )
 18 | 
 19 | 
 20 | def test_extract_article_non_article():
 21 |     check_extract_article(
 22 |         "non_article_full_page.html",
 23 |         "non_article_full_page.json"
 24 |     )
 25 | 
 26 | 
 27 | def test_extract_article_unicode_normalisation():
 28 |     check_extract_article(
 29 |         "conservativehq.com-1_full_page.html",
 30 |         "conservativehq.com-1_simple_article_from_full_page.json"
 31 |     )
 32 | 
 33 | 
 34 | def test_extract_article_list_items():
 35 |     check_extract_article(
 36 |         "list_items_full_page.html",
 37 |         "list_items_simple_article_from_full_page.json"
 38 |     )
 39 | 
 40 | 
 41 | def test_extract_article_headers_and_non_paragraph_blockquote_text():
 42 |     check_extract_article(
 43 |         "davidwolfe.com-1_full_page.html",
 44 |         "davidwolfe.com-1_simple_article_from_full_page.json"
 45 |     )
 46 | 
 47 | 
 48 | def test_extract_article_list_items_content_digests():
 49 |     check_extract_article(
 50 |         "list_items_full_page.html",
 51 |         "list_items_simple_article_from_full_page_content_digests.json",
 52 |         content_digests=True
 53 |     )
 54 | 
 55 | 
 56 | def test_extract_article_list_items_node_indexes():
 57 |     check_extract_article(
 58 |         "list_items_full_page.html",
 59 |         "list_items_simple_article_from_full_page_node_indexes.json",
 60 |         node_indexes=True
 61 |     )
 62 | 
 63 | 
 64 | def test_extract_article_full_page_content_digest():
 65 |     check_extract_article(
 66 |         "addictinginfo.com-1_full_page.html",
 67 |         "addictinginfo.com-1_simple_article_from_full_page_content_digest.json",
 68 |         content_digests=True
 69 |     )
 70 | 
 71 | 
 72 | def test_extract_article_full_page_node_indexes():
 73 |     check_extract_article(
 74 |         "addictinginfo.com-1_full_page.html",
 75 |         "addictinginfo.com-1_simple_article_from_full_page_node_indexes.json",
 76 |         node_indexes=True
 77 |     )
 78 | 
 79 | 
 80 | def test_extract_article_full_page_content_digest_node_indexes():
 81 |     check_extract_article(
 82 |         "addictinginfo.com-1_full_page.html",
 83 |         "addictinginfo.com-1_simple_article_from_full_page_content_digest_node_indexes.json",
 84 |         content_digests=True,
 85 |         node_indexes=True
 86 |     )
 87 | 
 88 | 
 89 | # Test plain text extraction
 90 | def test_extract_paragraphs_as_plain_text():
 91 |     check_extract_paragraphs_as_plain_text(
 92 |         "addictinginfo.com-1_simple_article_from_full_article.json",
 93 |         "addictinginfo.com-1_plain_text_paragraphs_from_simple_article.json"
 94 |     )
 95 | 
 96 | 
 97 | def test_extract_paragraphs_as_plain_text_node_indexes():
 98 |     check_extract_paragraphs_as_plain_text(
 99 |         "list_items_simple_article_from_full_page_node_indexes.json",
100 |         "list_items_plain_text_paragraph_node_indexes.json"
101 |     )
102 | 


--------------------------------------------------------------------------------
/tests/test_benchmarking.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | from readabilipy import simple_json_from_html_string
 3 | from readabilipy.extractors import extract_date, extract_title
 4 | 
 5 | 
 6 | TEST_FILEPATH = os.path.join(os.path.dirname(__file__), "data", "benchmarkinghuge.html")
 7 | with open(TEST_FILEPATH, encoding="utf-8") as h:
 8 |     HTML = h.read()
 9 | 
10 | 
11 | def test_benchmark_simple_json_from_html_string(benchmark):
12 |     benchmark(simple_json_from_html_string, html=HTML)
13 | 
14 | 
15 | def test_benchmark_extract_title(benchmark):
16 |     benchmark(extract_title, html=HTML)
17 | 
18 | 
19 | def test_benchmark_extract_date(benchmark):
20 |     benchmark(extract_date, html=HTML)
21 | 


--------------------------------------------------------------------------------
/tests/test_date_functions.py:
--------------------------------------------------------------------------------
 1 | import pytest
 2 | from readabilipy.extractors import extract_date, ensure_iso_date_format
 3 | 
 4 | 
 5 | htmls_with_expected = [
 6 |     ("""<h1>No dates here</h1>""", None),
 7 |     ("""<meta property="article:published_time" content="2018-10-09T01:03:32" />""", "2018-10-09T01:03:32"),
 8 |     ("""<meta property="article:modified_time" content="2018-12-13T21:02:01+00:00" />""", "2018-12-13T21:02:01"),
 9 |     ("""<meta property="article:published" content="2019-01-30T09:39:20-0500" />""", "2019-01-30T09:39:20"),
10 |     ("""<meta property="og:updated_time" content="2019-01-30T09:39:21-0500" />""", "2019-01-30T09:39:21"),
11 |     ("""<meta itemprop="dateModified" content="2019-01-30T09:39:22-0500" />""", "2019-01-30T09:39:22"),
12 |     ("""<meta itemprop="datePublished" content="2019-01-30T09:39:23-0500" />""", "2019-01-30T09:39:23"),
13 |     ("""<meta property="og:article:published_time" content="2019-01-30T09:39:25-0500" />""", "2019-01-30T09:39:25"),
14 |     ("""<meta property="og:article:modified_time" content="2019-01-30T09:39:26-0500" />""", "2019-01-30T09:39:26"),
15 |     ("""<time datetime="2019-01-30T09:39:24-0500" />""", "2019-01-30T09:39:24"),
16 | ]
17 | 
18 | 
19 | @pytest.mark.parametrize("html, expected", htmls_with_expected)
20 | def test_extract_date(html, expected):
21 |     assert extract_date(html) == expected
22 | 
23 | 
24 | def test_extract_date_finds_isoformat_from_lower_scoring_xpath_when_highest_scoring_not_isoformat():
25 |     html = """<meta property="article:published_time" content="2017-01-01" />
26 |                 <meta property="article:modified_time" content="2019-01-01T00:00:00" />"""
27 |     expected = "2019-01-01T00:00:00"
28 |     assert extract_date(html) == expected
29 | 
30 | 
31 | def test_extract_date_all_dates_not_isoformat():
32 |     html = """<meta property="article:published_time" content="2017-01-01" />
33 |                 <meta property="article:modified_time" content="2019-01-01" />"""
34 |     expected = None
35 |     assert extract_date(html) == expected
36 | 
37 | 
38 | def test_ensure_iso_date_format_timezone_keep():
39 |     datetime_string = '2014-10-24T17:32:46+12:00'
40 |     expected_iso_string = '2014-10-24T17:32:46+12:00'
41 |     assert ensure_iso_date_format(datetime_string, ignoretz=False) == expected_iso_string
42 | 
43 | 
44 | def test_ensure_iso_date_format_timezone_drop():
45 |     datetime_string = '2014-10-24T17:32:46+12:00'
46 |     expected_iso_string = '2014-10-24T17:32:46'
47 |     assert ensure_iso_date_format(datetime_string) == expected_iso_string
48 | 
49 | 
50 | def test_ensure_iso_date_format_no_seconds():
51 |     datetime_string = '2014-10-24T17:32+12:00'
52 |     expected_iso_string = '2014-10-24T17:32:00'
53 |     assert ensure_iso_date_format(datetime_string) == expected_iso_string
54 | 
55 | 
56 | def test_ensure_iso_date_format_no_tz():
57 |     datetime_string = '2014-10-24T17:32:46'
58 |     expected_iso_string = '2014-10-24T17:32:46'
59 |     assert ensure_iso_date_format(datetime_string) == expected_iso_string
60 | 
61 | 
62 | def test_ensure_iso_date_format_000Z_format():
63 |     datetime_string = '2019-02-15T15:54:50.000Z'
64 |     expected_iso_string = '2019-02-15T15:54:50'
65 |     assert ensure_iso_date_format(datetime_string) == expected_iso_string
66 | 
67 | 
68 | def test_ensure_iso_date_format_hh_colon_mmZ_format():
69 |     datetime_string = '2019-02-18T17:52:10Z'
70 |     expected_iso_string = '2019-02-18T17:52:10'
71 |     assert ensure_iso_date_format(datetime_string) == expected_iso_string
72 | 
73 | 
74 | def test_ensure_iso_date_format_with_ms():
75 |     datetime_string = '2019-05-14T16:45:01.493'
76 |     expected_iso_string = '2019-05-14T16:45:01'
77 |     assert ensure_iso_date_format(datetime_string) == expected_iso_string
78 | 
79 | 
80 | @pytest.mark.parametrize("html, expected", [("Hello world", None), ("10/10/2019", None)])
81 | def test_ensure_iso_date_format_non_iso_string(html, expected):
82 |     assert ensure_iso_date_format(html) == expected
83 | 


--------------------------------------------------------------------------------
/tests/test_extract_element.py:
--------------------------------------------------------------------------------
 1 | from collections import defaultdict
 2 | from readabilipy.extractors.extract_element import extract_element
 3 | 
 4 | 
 5 | def test_extract_element():
 6 |     xpaths = [
 7 |         ('//h1[@class="entry-title"]//text()', 4),
 8 |         ('//h1[@itemprop="headline"]//text()', 3),
 9 |     ]
10 | 
11 |     html = """
12 |             <h1 class="entry-title">Title 1</h>
13 |             <p></p>
14 |             <h1 itemprop="headline">Title 2</h>
15 |             <p></p>
16 |             <h1 class="post__title">Title 2</h>
17 |     """
18 | 
19 |     expected_output_1 = defaultdict(dict)
20 |     expected_output_1['Title 1'] = {'score': 4, 'xpaths': ['//h1[@class="entry-title"]//text()']}
21 |     expected_output_1['Title 2'] = {'score': 3, 'xpaths': ['//h1[@itemprop="headline"]//text()']}
22 | 
23 |     assert extract_element(html, xpaths) == expected_output_1
24 | 
25 | 
26 | def test_extract_element_with_passed_func():
27 |     def process_dict(d):
28 |         d['Title 1'] = 7
29 |         return d
30 | 
31 |     xpaths = [
32 |         ('//h1[@class="entry-title"]//text()', 4),
33 |         ('//h1[@itemprop="headline"]//text()', 3),
34 |     ]
35 | 
36 |     html = """
37 |             <h1 class="entry-title">Title 1</h>
38 |             <p></p>
39 |             <h1 itemprop="headline">Title 2</h>
40 |             <p></p>
41 |             <h1 class="post__title">Title 2</h>
42 |     """
43 | 
44 |     expected_output_3 = defaultdict(dict)
45 |     expected_output_3['Title 1'] = 7
46 |     expected_output_3['Title 2'] = {'score': 3, 'xpaths': ['//h1[@itemprop="headline"]//text()']}
47 | 
48 |     assert extract_element(html, xpaths, process_dict_fn=process_dict) == expected_output_3
49 | 


--------------------------------------------------------------------------------
/tests/test_javascript.py:
--------------------------------------------------------------------------------
 1 | from checks import check_extract_article
 2 | 
 3 | 
 4 | def test_extract_simple_article_with_readability_js():
 5 |     check_extract_article(
 6 |         "plain-content-test_full_article.html",
 7 |         "plain-content-test_full_article_javascript.json",
 8 |         use_readability_js=True
 9 |     )
10 | 
11 | 
12 | def test_extract_article_from_page_with_readability_js():
13 |     check_extract_article(
14 |         "addictinginfo.com-1_full_page.html",
15 |         "addictinginfo.com-1_full_page_javascript.json",
16 |         use_readability_js=True
17 |     )
18 | 


--------------------------------------------------------------------------------
/tests/test_normal_html.py:
--------------------------------------------------------------------------------
  1 | """Test HTML simplification on small HTML fragments"""
  2 | from checks import check_exact_html_output, check_html_output_contains_text
  3 | 
  4 | 
  5 | # Test bare text behaviours
  6 | def test_html_bare_text():
  7 |     """Bare text should be wrapped in <p> tags."""
  8 |     check_html_output_contains_text(
  9 |         "Bare text here",
 10 |         "<p>Bare text here</p>"
 11 |     )
 12 | 
 13 | 
 14 | def test_html_bare_text_linebreaks():
 15 |     """Line breaks in bare text should be removed."""
 16 |     check_html_output_contains_text("""
 17 |         Bare text with
 18 |         some linebreaks here
 19 |     """, "<p>Bare text with some linebreaks here</p>")
 20 | 
 21 | 
 22 | def test_html_text_with_semantic_br():
 23 |     """Single <br> is sometimes used as a word separator so should be replaced
 24 |     with a space."""
 25 |     check_exact_html_output(
 26 |         """<a href="http://example.com">link</a><br />caption""",
 27 |         "<div><p>link caption</p></div>"
 28 |     )
 29 | 
 30 | 
 31 | def test_html_bare_text_double_br():
 32 |     """Double <br> in bare text should trigger a new paragraph."""
 33 |     check_html_output_contains_text("""
 34 |         Bare text with
 35 |         <br/><br/>
 36 |         some linebreaks here
 37 |     """, "<p>Bare text with</p><p>some linebreaks here</p>")
 38 | 
 39 | 
 40 | def test_html_space_separated_double_br():
 41 |     """Double <br> separated by whitespace should still trigger a new paragraph."""
 42 |     check_html_output_contains_text("""
 43 |         Bare text with
 44 |         <br/>
 45 |                <br/>
 46 |         some linebreaks here
 47 |     """, "<p>Bare text with</p><p>some linebreaks here</p>")
 48 | 
 49 | 
 50 | def test_html_space_separated_double_br_inside_div():
 51 |     """Double <br> separated by whitespace should still trigger a new div."""
 52 |     check_html_output_contains_text("""
 53 |         <div>
 54 |             Text with
 55 |             <br/>
 56 |                 <br/>
 57 |             some linebreaks here
 58 |         <div>
 59 |     """, "<div><p>Text with</p><p>some linebreaks here</p></div>")
 60 | 
 61 | 
 62 | def test_html_space_separated_double_br_inside_and_outside_div():
 63 |     """First double <br> should trigger a new <p>, second several <p> inside the div, third a new <p>"""
 64 |     check_exact_html_output("""
 65 |         <div>
 66 |             <p>Some <br/>
 67 |             <br/>example text here.</p>
 68 |         </div>
 69 |         <div>
 70 |         Text in a div. <br/>
 71 |         <br/> A new div.
 72 |         </div>
 73 |         Bare text. <br/>
 74 |         <br/> A new paragraph.
 75 |         """, "<div><div><p>Some</p><p>example text here.</p></div><div><p>Text in a div.</p><p>A new div.</p></div><p>Bare text.</p><p>A new paragraph.</p></div>")
 76 | 
 77 | 
 78 | # Test correct wrapping
 79 | def test_ensure_correct_outer_div_wrapping():
 80 |     """Do not wrap in a <div> if this is already a <div>."""
 81 |     check_exact_html_output("""
 82 |         <div>
 83 |             <p>
 84 |                 Some example text here.
 85 |             </p>
 86 |         </div>""", """<div><p>Some example text here.</p></div>""")
 87 | 
 88 | 
 89 | def test_ensure_correct_paragraph_wrapping():
 90 |     """Do not wrap bare text inside <div> with <p> tags."""
 91 |     check_exact_html_output("""
 92 |         <div>
 93 |             Some example text here.
 94 |         </div>""", """<div>Some example text here.</div>""")
 95 | 
 96 | 
 97 | # Test consecutive links
 98 | def test_consecutive_links():
 99 |     """Check that whitespace is preserved between consecutive <a> links."""
100 |     check_exact_html_output("""
101 |         <blockquote>
102 |             <p>First paragraph: <a href="https://example.com">first link</a> <a href="https://example.com">second link</a></p>
103 |             <p>Second paragraph: <a href="https://example.com">third link</a></p>
104 |         </blockquote>""", "<div><blockquote><p>First paragraph: first link second link</p><p>Second paragraph: third link</p></blockquote></div>")
105 | 
106 | 
107 | def test_consecutive_links_with_spaces():
108 |     """Check that extra whitespace is removed inside <a> links even when they are consecutive."""
109 |     check_exact_html_output("""
110 |         <blockquote>
111 |             <p>First paragraph: <a href="https://example.com">first link </a> <a href="https://example.com"> second link</a></p>
112 |             <p>Second paragraph: <a href="https://example.com">third link </a></p>
113 |             <p>Third paragraph: <a href="https://example.com">first link </a><a href="https://example.com">second link</a></p>
114 |         </blockquote>""", "<div><blockquote><p>First paragraph: first link second link</p><p>Second paragraph: third link</p><p>Third paragraph: first link second link</p></blockquote></div>")
115 | 
116 | 
117 | # Test text consolidation
118 | def test_span_removal_and_conversion():
119 |     """First <span> should be removed. Second should give bare text that will be wrapped."""
120 |     check_exact_html_output("""
121 |         <div>
122 |             <p>Some <span>example</span> text here.</p>
123 |             <span>More text in a span.</span>
124 |         </div>""", "<div><p>Some example text here.</p><p>More text in a span.</p></div>")
125 | 
126 | 
127 | def test_consolidating_string_between_tags():
128 |     """Text from a <span> should be joined with the bare text that follows it; double <br> should split paragraphs."""
129 |     check_exact_html_output("""
130 |         <div>
131 |             <p>Some <br><br>example text here.</p>
132 |             <span>More text in a span.</span>
133 |             Part of the same paragraph. <br>
134 |             <br> A new paragraph.
135 |         </div>""", "<div><p>Some</p><p>example text here.</p><p>More text in a span. Part of the same paragraph.</p><p>A new paragraph.</p></div>")
136 | 
137 | 
138 | def test_empty_element_removal():
139 |     """Empty elements should be removed."""
140 |     check_exact_html_output("""
141 |         <div>
142 |             <p>Text</p>
143 |             <p></p>
144 |             <span>Paragraphs</span>
145 |         </div>
146 |         Bare <span></span> t<a></a>ext
147 |         <div></div>
148 |     """, "<div><div><p>Text</p><p>Paragraphs</p></div><p>Bare text</p></div>")
149 | 
150 | 
151 | def test_single_br_with_semantic_space():
152 |     """A single <br> should be removed but the space after it should be kept."""
153 |     check_exact_html_output("""
154 |         <div>
155 |             <p>This tag<br> will be removed but the space after it is important.</p>
156 |         </div>
157 |     """, "<div><p>This tag will be removed but the space after it is important.</p></div>")
158 | 
159 | 
160 | def test_prune_div_with_one_populated_one_empty_span():
161 |     check_exact_html_output("""
162 |         <div>
163 |             <span>dfs</span>
164 |             <span></span>
165 |         </div>
166 |     """, "<div>dfs</div>")
167 | 
168 | 
169 | def test_prune_div_with_one_empty_span():  # a div holding only an empty <span> becomes an empty div
170 |     check_exact_html_output("""
171 |         <div>
172 |             <span></span>
173 |         </div>""", "<div></div>")
174 | 
175 | 
176 | def test_prune_div_with_one_whitespace_paragraph():  # a whitespace-only <p> is pruned like an empty one
177 |     check_exact_html_output(
178 |         """<div>
179 |             <p>        </p>
180 |         </div>
181 |         """,
182 |         "<div></div>"
183 |     )
184 | 


--------------------------------------------------------------------------------
/tests/test_simple_json.py:
--------------------------------------------------------------------------------
  1 | 
  2 | from subprocess import CompletedProcess
  3 | from unittest import mock
  4 | 
  5 | # from .checks import check_extract_article
  6 | from bs4 import BeautifulSoup
  7 | from readabilipy import simple_json_from_html_string
  8 | from readabilipy.simplifiers import normalise_text
  9 | from readabilipy.simple_json import plain_element, plain_text_leaf_node, add_node_indexes, content_digest, have_node
 10 | 
 11 | 
 12 | def test_empty_page():
 13 |     """Empty pages should return an empty <div>."""
 14 |     html = ""
 15 |     parsed_content = simple_json_from_html_string(html)
 16 |     assert parsed_content["content"] == "<div></div>"
 17 | 
 18 | 
 19 | def test_contentless_page():
 20 |     """Contentless pages should return an empty <div>."""
 21 |     html = "<html></html>"
 22 |     parsed_content = simple_json_from_html_string(html)
 23 |     assert parsed_content["content"] == "<div></div>"
 24 | 
 25 | 
 26 | def test_plain_element_with_comments():
 27 |     """Contents of comments should be stripped but the comment itself should be kept."""
 28 |     html = """
 29 |         <div>
 30 |             <p>Text</p>
 31 |             <!-- comment -->
 32 |         </div>
 33 |     """.strip()
 34 |     soup = BeautifulSoup(html, 'html.parser')
 35 |     elements = [str(plain_element(element, False, False)) for element in soup.contents]
 36 |     assert elements == ["<div><p>Text</p><!----></div>"]
 37 | 
 38 | 
 39 | def test_content_digest_on_filled_and_empty_elements():
 40 |     """Filled strings should get a digest but empty strings should not."""
 41 |     html = """
 42 |         <div>
 43 |             <p>Text</p>
 44 |             <p></p>
 45 |         </div>
 46 |     """.strip()
 47 |     soup = BeautifulSoup(html, 'html.parser')
 48 |     elements = [str(plain_element(element, True, True)) for element in soup.contents]
 49 |     assert elements == ['<div><p data-content-digest="71988c4d8e0803ba4519f0b2864c1331c14a1890bf8694e251379177bfedb5c3">Text</p><p data-content-digest=""></p></div>']
 50 | 
 51 | 
 52 | def test_leaf_nodes_without_text():
 53 |     """Leaf nodes with text should yield their text, while those without should yield None."""
 54 |     html = """
 55 |         <div>
 56 |             <p>Some text</p>
 57 |             <p></p>
 58 |             <p>Some more text</p>
 59 |         </div>
 60 |     """.strip()
 61 |     soup = BeautifulSoup(html, 'html.parser')
 62 |     text_blocks = [plain_text_leaf_node(paragraph) for paragraph in soup.find_all("p")]
 63 |     assert text_blocks == [{'text': 'Some text'}, {'text': None}, {'text': 'Some more text'}]
 64 | 
 65 | 
 66 | def test_node_index_assignment():
 67 |     """Whitelisted elements should get an appropriate index but bare strings should not."""
 68 |     html = """
 69 |         <div>
 70 |             <p>Some text</p>
 71 |             <p></p>
 72 |             Some bare text
 73 |         </div>
 74 |     """.strip()
 75 |     soup = BeautifulSoup(html, 'html.parser')
 76 |     normalised_strings = [normalise_text(str(add_node_indexes(elem))) for elem in soup.find_all("div")[0].children]
 77 |     normalised_strings = [s for s in normalised_strings if s]
 78 |     assert normalised_strings == ['<p data-node-index="0">Some text</p>', '<p data-node-index="0"></p>', 'Some bare text']
 79 | 
 80 | 
 81 | def test_content_digest_assignment():
 82 |     """No content digest hash should be assigned when no child elements exist."""
 83 |     html = """
 84 |         <div>
 85 |             <p>Some text</p>
 86 |             <p></p>
 87 |             Some bare text
 88 |         </div>
 89 |     """.strip()
 90 |     soup = BeautifulSoup(html, 'html.parser')
 91 |     digests = [content_digest(elem) for elem in soup.find_all()]
 92 |     assert digests == ['5271913f47bd4cbfda56ff8c0cddfc481d6bc4fe99725906068fbb6144bfeab4',
 93 |                        '4c2e9e6da31a64c70623619c449a040968cdbea85945bf384fa30ed2d5d24fa3',
 94 |                        '']
 95 | 
 96 | 
 97 | @mock.patch('subprocess.run')
 98 | def test_have_node_1(mock_subprocess_run):  # subprocess.run raising FileNotFoundError must give False
 99 |     mock_subprocess_run.side_effect = FileNotFoundError("No such file or directory: 'node'")
100 |     assert not have_node()
101 | 
102 | 
103 | @mock.patch('subprocess.run')
104 | def test_have_node_2(mock_subprocess_run):  # a non-zero return code from subprocess.run must give False
105 |     mock_subprocess_run.return_value = CompletedProcess("", 1)
106 |     assert not have_node()
107 | 
108 | 
109 | @mock.patch('subprocess.run')
110 | def test_have_node_3(mock_subprocess_run):  # NOTE(review): presumably v9.0.0 is below the supported Node version - confirm
111 |     mock_subprocess_run.return_value = CompletedProcess("", 0, stdout=b"v9.0.0\n")
112 |     assert not have_node()
113 | 
114 | 
115 | @mock.patch('os.path.exists')
116 | def test_have_node_4(mock_os_path_exists):  # NOTE(review): presumably simulates missing JS dependencies on disk - confirm
117 |     mock_os_path_exists.return_value = False
118 |     assert not have_node()
119 | 
120 | 
121 | def test_have_node_5():
122 |     # Environment-dependent: assumes we're running on a system with Node/Readability.js installed
123 |     assert have_node()
124 | 


--------------------------------------------------------------------------------
/tests/test_simple_tree.py:
--------------------------------------------------------------------------------
 1 | """Tests for simple_tree functions."""
 2 | from readabilipy import simple_tree_from_html_string
 3 | from readabilipy.simplifiers import strip_html_whitespace
 4 | 
 5 | 
 6 | def test_remove_cdata():
 7 |     """Test all possible methods of CData inclusion. Note that in the final
 8 |     example the '//' prefixes have no effect (since we are not in a <script>)
 9 |     tag and so we expect that the first will be displayed (tested in Chrome and
10 |     Safari)."""
11 |     html = """
12 |         <div>
13 |             <p>Some text <![CDATA[Text inside two tags]]></p>
14 |             <![CDATA[Text inside one tag]]>
15 |         </div>
16 |         <![CDATA[Text outside tags]]>
17 |         <script type="text/javascript">
18 |             //<![CDATA[
19 |             document.write("<");
20 |             //]]>
21 |         </script>
22 |         //<![CDATA[
23 |             invalid CDATA block
24 |         //]]>
25 |     """.strip()
26 |     parsed_html = str(simple_tree_from_html_string(html))
27 |     expected_output = "<div><div><p>Some text</p></div><p>//</p></div>"
28 |     assert strip_html_whitespace(parsed_html) == expected_output
29 | 


--------------------------------------------------------------------------------
/tests/test_simplifiers_html.py:
--------------------------------------------------------------------------------
 1 | """Tests for plain_html functions."""
 2 | from bs4 import BeautifulSoup
 3 | from readabilipy.simplifiers import html
 4 | 
 5 | 
 6 | def test_remove_metadata():
 7 |     HTML = """
 8 |         <!DOCTYPE html>
 9 |         <html>
10 |         <head></head>
11 |         <body>
12 |         <!-- Comment here -->
13 |         </body>
14 |         </html>
15 |     """
16 |     soup = BeautifulSoup(HTML, "html5lib")
17 |     html.remove_metadata(soup)
18 |     assert "<!-- Comment here -->" not in str(soup)
19 | 
20 | 
21 | def test_remove_blacklist():
22 |     HTML = """
23 |         <html>
24 |         <body>
25 |             <button type="button">Click Me!</button>
26 |             <p>Hello</p>
27 |         <body>
28 |         </html>
29 |     """
30 |     soup = BeautifulSoup(HTML, "html5lib")
31 |     html.remove_blacklist(soup)
32 |     assert "button" not in str(soup)
33 | 


--------------------------------------------------------------------------------
/tests/test_simplifiers_text.py:
--------------------------------------------------------------------------------
 1 | from pytest import mark
 2 | from checks import check_exact_html_output
 3 | 
 4 | from readabilipy.simplifiers import normalise_text, normalise_unicode, normalise_whitespace, strip_control_characters, strip_html_whitespace
 5 | from readabilipy.simplifiers import text
 6 | 
 7 | 
 8 | def test_unicode_normalisation():
 9 |     nfd_form = "Ame\u0301lie"
10 |     nfc_form = "Amélie"
11 |     assert normalise_unicode(nfd_form) == normalise_unicode(nfc_form)
12 | 
13 | 
14 | def test_all_whitespace_is_normalised_to_empty_string():
15 |     tab_space_new_line_tab_space = "\t \n\t \f \r\n"
16 |     assert normalise_whitespace(tab_space_new_line_tab_space) == ""
17 | 
18 | 
19 | def test_text_normalisation():
20 |     unnormalised_string = "Ame\u0301lie   Poulain"
21 |     assert normalise_text(unnormalised_string) == "Amélie Poulain"
22 | 
23 | 
24 | def test_strip_html_whitespace():
25 |     formatted_string = """
26 |     <html>
27 |         <body>
28 |             <p>Some text here</p>
29 |         </body>
30 |     </html>
31 |     """
32 |     assert strip_html_whitespace(formatted_string) == "<html><body><p>Some text here</p></body></html>"
33 | 
34 | 
35 | def test_strip_control_characters_non_printing_characters():
36 |     unnormalised_string = "A string with non-printing characters in\u200Bc\u200Bluded\ufeff"
37 |     assert strip_control_characters(unnormalised_string) == "A string with non-printing characters included"
38 |     assert normalise_text(unnormalised_string) == "A string with non-printing characters included"
39 | 
40 | 
41 | def test_strip_control_characters_cr():
42 |     unnormalised_string = "A string with new lines\rin\u200Bc\u200Bluded\ufeff"
43 |     assert strip_control_characters(unnormalised_string) == "A string with new lines\rincluded"
44 |     assert normalise_text(unnormalised_string) == "A string with new lines included"
45 | 
46 | 
47 | def test_strip_control_characters_lf():
48 |     unnormalised_string = "A string with new lines\ninc\u200Bluded\ufeff"
49 |     assert strip_control_characters(unnormalised_string) == "A string with new lines\nincluded"
50 |     assert normalise_text(unnormalised_string) == "A string with new lines included"
51 | 
52 | 
53 | def test_strip_control_characters_cr_lf():
54 |     unnormalised_string = "A string with new lines\r\nin\u200Bc\u200Bluded\ufeff"
55 |     assert strip_control_characters(unnormalised_string) == "A string with new lines\r\nincluded"
56 |     assert normalise_text(unnormalised_string) == "A string with new lines included"
57 | 
58 | 
59 | def test_strip_control_characters_ff():
60 |     unnormalised_string = "A string with form feed\fin\u200Bc\u200Bluded\ufeff"
61 |     assert strip_control_characters(unnormalised_string) == "A string with form feed\fincluded"
62 |     assert normalise_text(unnormalised_string) == "A string with form feed included"
63 | 
64 | 
65 | def test_strip_control_characters_tab():
66 |     unnormalised_string = "A string with tabs\tin\u200Bc\u200Bluded\ufeff"
67 |     assert strip_control_characters(unnormalised_string) == "A string with tabs\tincluded"
68 |     assert normalise_text(unnormalised_string) == "A string with tabs included"
69 | 
70 | 
71 | # Test whitespace around tags
72 | @mark.parametrize('terminal_punctuation', text.terminal_punctuation_marks)
73 | def test_ensure_correct_punctuation_joining(terminal_punctuation):
74 |     """Do not join with ' ' if the following character is a punctuation mark."""
75 |     input_html = f"""
76 |         <div>
77 |             <p>
78 |                 Some text <a href="example.com">like this</a>{terminal_punctuation} with punctuation.
79 |             </p>
80 |         </div>"""
81 |     expected_output = f"""<div><p>Some text like this{terminal_punctuation} with punctuation.</p></div>"""
82 |     check_exact_html_output(input_html, expected_output)
83 | 
84 | 
85 | @mark.parametrize('matched_pair', text.matched_punctuation_marks)
86 | def test_ensure_correct_bracket_quote_joining(matched_pair):
87 |     """Do not join with ' ' if we are inside matched punctuation marks."""
88 |     input_html = f"""
89 |         <div>
90 |             <p>
91 |                 Some text {matched_pair[0]}<a href="example.com">like this</a>{matched_pair[1]} with punctuation.
92 |             </p>
93 |         </div>"""
94 |     expected_output = f"""<div><p>Some text {matched_pair[0]}like this{matched_pair[1]} with punctuation.</p></div>"""
95 |     check_exact_html_output(input_html, expected_output)
96 | 


--------------------------------------------------------------------------------
/tests/test_title_functions.py:
--------------------------------------------------------------------------------
 1 | import pytest
 2 | from readabilipy.extractors.extract_title import extract_title
 3 | from readabilipy.extractors.extract_title import combine_similar_titles
 4 | 
 5 | 
 6 | htmls_with_expected = [
 7 |     ("""<meta name="fb_title" content="Example title 1" />""", "Example title 1"),
 8 |     ("""<meta property="og:title" content="Example title 2" />""", "Example title 2"),
 9 |     ("""<head><title>Example title 3</title></head>""", "Example title 3"),
10 |     ("""<head><title><p>Example title 4</p></title></head>""", "Example title 4"),
11 |     ("""<meta itemprop="headline" content="Example title 5" />)""", "Example title 5"),
12 |     ("""<meta name="sailthru.title" content="Example title 6" />)""", "Example title 6"),
13 |     ("""<meta name="dcterms.title" content="Example title 7" />)""", "Example title 7"),
14 |     ("""<meta name="title" content="Example title 8" />)""", "Example title 8"),
15 |     ("""<header name="entry-header"><h1 class="entry-title">Example title 9</h1></header>""", "Example title 9"),
16 |     ("""<h1 class="entry-title">Example title 10</h1>""", "Example title 10"),
17 |     ("""<header><h1>Example title 11</h1></header>""", "Example title 11"),
18 |     ("""<h1 class="title">Example title 12</h1>""", "Example title 12"),
19 |     ("""<h1 itemprop="headline">Example title 13</h2>""", "Example title 13"),
20 |     ("""<h2 itemprop="headline">Example title 14</h2>""", "Example title 14"),
21 |     ("""<h2 class="title">Example title 15</h2>""", None),  # not one of the xpaths in extract_title()
22 |     ("""<div class="postarea"><h2><a>Example title 16</a></h2></div>""", "Example title 16"),
23 |     ("""<body><title>Example title 17</title></body>""", "Example title 17"),
24 | ]
25 | 
26 | 
27 | @pytest.mark.parametrize("html, expected", htmls_with_expected)
28 | def test_extract_title(html, expected):  # runs once per (html, expected) pair in htmls_with_expected
29 |     assert extract_title(html) == expected
30 | 
31 | 
32 | def test_extract_title_prioritises_highest_score_xpath():
33 | 
34 |     html = """
35 |             <h2 class="title">Silly title</h2>
36 |             <h1 class="entry-title">Example title</h1>
37 |             <header><h1>Bad title</h1></header>
38 |             <p>Hello world</p>
39 |     """
40 |     expected = "Example title"
41 |     assert extract_title(html) == expected
42 | 
43 | 
44 | def test_extract_title_removes_unwanted_characters():
45 | 
46 |     html = """
47 |         <meta property="og:title" content="Trump Denies Charitable Donation He Promised If Elizabeth Warren Releases DNA Results And It&#8217;s On Video" />
48 |     """
49 |     expected = "Trump Denies Charitable Donation He Promised If Elizabeth Warren Releases DNA Results And It’s On Video"
50 |     assert extract_title(html) == expected
51 | 
52 | 
53 | def test_extract_title_gets_text_within_hyperlinks():
54 | 
55 |     html = """
56 |         <h1 class="entry-title">
57 |             <a href="http://addictinginfo.com/2018/10/15/trump-denies-charitable-donation-he-promised-if-elizabeth-warren-releases-dna-results-and-its-on-video/"
58 |                 title="Permalink to Trump Denies Charitable Donation He Promised If Elizabeth Warren Releases DNA Results And It&#8217;s On Video"
59 |                 rel="bookmark">Trump Denies Charitable Donation He Promised If Elizabeth Warren Releases DNA Results And
60 |                 It&#8217;s On Video</a>
61 |         </h1>
62 |     """
63 |     expected = "Trump Denies Charitable Donation He Promised If Elizabeth Warren Releases DNA Results And It’s On Video"
64 |     assert extract_title(html) == expected
65 | 
66 | 
67 | def test_extract_title_pick_shortest_version_of_equivalent_title():
68 | 
69 |     html = """
70 |             <h1 class="entry-title">Pamela Geller in Breitbart News: Dueling Billboards from CAIR, AFDI in Times Square</h1>
71 |             <meta property="og:title" content="Pamela Geller in Breitbart News: Dueling Billboards from CAIR, AFDI in Times Square - Geller Report" />
72 |         """
73 |     expected = "Pamela Geller in Breitbart News: Dueling Billboards from CAIR, AFDI in Times Square"
74 |     assert extract_title(html) == expected
75 | 
76 | 
77 | def test_combine_similar_titles():
78 | 
79 |     extracted_strings = {}
80 |     extracted_strings['title 1'] = {'score': 1, 'xpaths': ['a']}
81 |     extracted_strings['Title 1'] = {'score': 1, 'xpaths': ['b']}
82 |     extracted_strings['Title 1 - Extended'] = {'score': 1, 'xpaths': ['c']}
83 | 
84 |     expected_output = {}
85 |     expected_output['title 1'] = {'score': 1, 'xpaths': ['a']}
86 |     expected_output['Title 1'] = {'score': 3, 'xpaths': ['a', 'b', 'c']}
87 |     expected_output['Title 1 - Extended'] = {'score': 1, 'xpaths': ['c']}
88 | 
89 |     assert combine_similar_titles(extracted_strings) == expected_output
90 | 


--------------------------------------------------------------------------------
/tests/test_weird_html.py:
--------------------------------------------------------------------------------
  1 | """Tests for weird HTML input."""
  2 | from checks import check_exact_html_output
  3 | 
  4 | 
  5 | def test_non_printing_control_characters():
  6 |     """Non-printing characters should be removed, along with the paragraph they leave empty."""
  7 |     check_exact_html_output("""
  8 |         <div>
  9 |             <p>First paragraph.</p>
 10 |             <p><span>﻿</span></p>
 11 |             <p>Last paragraph.</p>
 12 |         </div>
 13 |     """, """
 14 |         <div>
 15 |             <p>First paragraph.</p>
 16 |             <p>Last paragraph.</p>
 17 |         </div>
 18 |     """)
 19 | 
 20 | 
 21 | def test_iframe_containing_tags():
 22 |     """At present we blacklist iframes completely, including any tags nested inside them."""
 23 |     check_exact_html_output("""
 24 |         <div>
 25 |             <iframe><span>text</span></iframe>
 26 |         </div>
 27 |         """, "<div></div>")
 28 | 
 29 | 
 30 | def test_iframe_with_source():
 31 |     """An iframe is blacklisted even when it has a src; we may want to extract the link in future."""
 32 |     check_exact_html_output(
 33 |         """<div><iframe src="https://www.youtube.com/embed/BgB5E91lD6s" width="640" height="355" frameborder="0" allowfullscreen="allowfullscreen"></iframe></div>""",
 34 |         "<div></div>"
 35 |     )
 36 | 
 37 | 
 38 | # Test comments inside tags
 39 | def test_comments_inside_tags():
 40 |     """Comments inside a tag, including empty ones, should be removed without disturbing the text."""
 41 |     check_exact_html_output(
 42 |         "<p>Some <!-- --> text <!-- with a comment --> here<!--or here-->.<!----></p>",
 43 |         "<div><p>Some text here.</p></div>"
 44 |     )
 45 | 
 46 | 
 47 | # Test tags inside words
 48 | def test_tags_inside_words():
 49 |     """Stripping a tag from the middle of a word must not split the word ('aisle' stays whole)."""
 50 |     check_exact_html_output(
 51 |         """a<a href="http://example.com">i</a>sle""",
 52 |         "<div><p>aisle</p></div>"
 53 |     )
 54 | 
 55 | 
 56 | # Test splitting for unclosed tags inside paragraphs
 57 | def test_paragraph_splitting_with_unclosed_tags():
 58 |     """An unclosed <meta> tag inside a paragraph must not stop <br><br> from splitting it in two."""
 59 |     check_exact_html_output(
 60 |         """
 61 |         <p>
 62 |             <meta charset="utf-8">First paragraph.
 63 |             <br><br>
 64 |             Second paragraph.
 65 |         </p>""",
 66 |         "<div><p>First paragraph.</p><p>Second paragraph.</p></div>"
 67 |     )
 68 | 
 69 | 
 70 | # Test (possibly illegal) nested elements
 71 | def test_nested_superscript():
 72 |     """Each level of nested <sup> should be rendered as text with its own leading '^'."""
 73 |     check_exact_html_output(
 74 |         "<p>Some text with <sup>nested <sup>superscripts</sup></sup> here.</p>",
 75 |         "<div><p>Some text with ^nested ^superscripts here.</p></div>"
 76 |     )
 77 | 
 78 | 
 79 | def test_nested_linebreaks_inside_superscript():
 80 |     """A <br/> inside a <sup> should become a space, keeping the text joined in one paragraph."""
 81 |     check_exact_html_output(
 82 |         "<p>Some text <sup>with<br/>superscripts</sup> that should be joined.</p>",
 83 |         "<div><p>Some text ^with superscripts that should be joined.</p></div>"
 84 |     )
 85 | 
 86 | 
 87 | def test_nested_superscript_with_linebreaks():
 88 |     """Single <br> tags around and inside nested superscripts should not split the paragraph."""
 89 |     check_exact_html_output(
 90 |         """
 91 |         <p>Some text<br>
 92 |         with linebreaks <sup><br>
 93 |         <sup>around a footnote</sup></sup>.
 94 |         </p>""",
 95 |         "<div><p>Some text with linebreaks ^ ^around a footnote.</p></div>"
 96 |     )
 97 | 
 98 | 
 99 | def test_nested_table_inside_paragraph():
100 |     """A <table> (illegally) nested in a <p> should be split out, with the text either side in its own <p>."""
101 |     check_exact_html_output(
102 |         """
103 |         <p>
104 |             First paragraph.
105 |             <br/><br/>
106 |             <table>
107 |                 <tbody>
108 |                     <tr>
109 |                         <td>Table text.</td>
110 |                     </tr>
111 |                 </tbody>
112 |             </table>
113 |             Second paragraph.
114 |         </p>""",
115 |         "<div><p>First paragraph.</p><table><tbody><tr><td>Table text.</td></tr></tbody></table><p>Second paragraph.</p></div>"
116 |     )
117 | 
118 | 
119 | def test_nested_span_inside_paragraph():
120 |     """Ensure that spans nested inside paragraphs are kept in."""
121 |     check_exact_html_output(
122 |         "<p>Some text <span>in a span</span> that should stay together.</p>""",
123 |         "<div><p>Some text in a span that should stay together.</p></div>"
124 |     )
125 | 


--------------------------------------------------------------------------------