├── .github
└── workflows
│ ├── lint.yml
│ └── test.yml
├── .gitignore
├── .pylintrc
├── CHANGELOG.md
├── LICENSE
├── MANIFEST.in
├── Makefile
├── README.md
├── benchmarks
├── Dockerfile
└── README.md
├── make_release.py
├── readabilipy
├── __init__.py
├── __main__.py
├── __version__.py
├── extractors
│ ├── __init__.py
│ ├── extract_date.py
│ ├── extract_element.py
│ └── extract_title.py
├── javascript
│ ├── ExtractArticle.js
│ └── package.json
├── simple_json.py
├── simple_tree.py
├── simplifiers
│ ├── __init__.py
│ ├── html.py
│ └── text.py
└── utils.py
├── setup.py
└── tests
├── checks.py
├── data
├── addictinginfo.com-1_full_article.html
├── addictinginfo.com-1_full_page.html
├── addictinginfo.com-1_full_page_javascript.json
├── addictinginfo.com-1_plain_text_paragraphs_from_simple_article.json
├── addictinginfo.com-1_simple_article_from_full_article.json
├── addictinginfo.com-1_simple_article_from_full_page.json
├── addictinginfo.com-1_simple_article_from_full_page_content_digest.json
├── addictinginfo.com-1_simple_article_from_full_page_content_digest_node_indexes.json
├── addictinginfo.com-1_simple_article_from_full_page_node_indexes.json
├── benchmarkinghuge.html
├── conservativehq.com-1_full_page.html
├── conservativehq.com-1_simple_article_from_full_page.json
├── davidwolfe.com-1_full_page.html
├── davidwolfe.com-1_simple_article_from_full_page.json
├── list_items_full_page.html
├── list_items_plain_text_paragraph_node_indexes.json
├── list_items_simple_article_from_full_page.json
├── list_items_simple_article_from_full_page_content_digests.json
├── list_items_simple_article_from_full_page_node_indexes.json
├── non_article_full_page.html
├── non_article_full_page.json
├── plain-content-test_full_article.html
└── plain-content-test_full_article_javascript.json
├── test_article_extraction.py
├── test_benchmarking.py
├── test_date_functions.py
├── test_extract_element.py
├── test_html_elements.py
├── test_javascript.py
├── test_normal_html.py
├── test_simple_json.py
├── test_simple_tree.py
├── test_simplifiers_html.py
├── test_simplifiers_text.py
├── test_title_functions.py
└── test_weird_html.py
/.github/workflows/lint.yml:
--------------------------------------------------------------------------------
1 | name: ReadabiliPy CI Linting
2 |
3 | on:
4 | push:
5 | branches: [ master ]
6 | pull_request:
7 |
8 | jobs:
9 | build:
10 |
11 | runs-on: "ubuntu-20.04"
12 | strategy:
13 | matrix:
14 | python-version: ["3.6", "3.7", "3.8", "3.9", "3.10", "3.11", "3.12"]
15 |
16 | steps:
17 | - uses: actions/checkout@v4
18 | - name: Set up Python ${{ matrix.python-version }}
19 | uses: actions/setup-python@v4
20 | with:
21 | python-version: ${{ matrix.python-version }}
22 |
23 | - name: Install dependencies
24 | run: |
25 | python -m pip install --upgrade pip
26 | pip install -e ".[test]"
27 |
28 | - name: Lint with pyflakes
29 | run: |
30 | pyflakes *.py readabilipy tests
31 |
32 | - name: check PEP8
33 | run: |
34 | pycodestyle --statistics --ignore=E501 --count *.py readabilipy tests
35 |
36 | - name: Run pylint for stricter error checking
37 | run: |
38 | pylint readabilipy
39 | pylint ./tests/*.py
40 |
--------------------------------------------------------------------------------
/.github/workflows/test.yml:
--------------------------------------------------------------------------------
1 | name: ReadabiliPy CI
2 |
3 | on:
4 | push:
5 | branches: [ master ]
6 | pull_request:
7 |
8 | jobs:
9 | build:
10 |
11 | runs-on: "ubuntu-20.04"
12 | strategy:
13 | matrix:
14 | python-version: ["3.6", "3.7", "3.8", "3.9", "3.10", "3.11", "3.12"]
15 | node-version: [18.x, 20.x, 22.x]
16 |
17 | steps:
18 | - uses: actions/checkout@v4
19 | - name: Set up Python ${{ matrix.python-version }}
20 | uses: actions/setup-python@v4
21 | with:
22 | python-version: ${{ matrix.python-version }}
23 |
24 | - name: Set up Node.js ${{ matrix.node-version }}
25 | uses: actions/setup-node@v4
26 | with:
27 | node-version: ${{ matrix.node-version }}
28 |
29 | - name: Install dependencies
30 | run: |
31 | python -m pip install --upgrade pip
32 | pip install -e ".[test]"
33 |
34 | - name: Test with pytest
35 | run: |
36 | pytest -v . --cov readabilipy --cov-report term-missing --benchmark-disable
37 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # PyCharm IDE stuff
2 | .idea/
3 |
4 | # OSX temporary files
5 | .DS_Store
6 |
7 | # ===== PYTHON STUFF ====
8 | # Byte-compiled / optimized / DLL files
9 | __pycache__/
10 | *.py[cod]
11 | *$py.class
12 |
13 | # C extensions
14 | *.so
15 |
16 | # Distribution / packaging
17 | .Python
18 | build/
19 | develop-eggs/
20 | dist/
21 | downloads/
22 | eggs/
23 | .eggs/
24 | lib/
25 | lib64/
26 | parts/
27 | sdist/
28 | var/
29 | wheels/
30 | *.egg-info/
31 | .installed.cfg
32 | *.egg
33 | MANIFEST
34 |
35 | # PyInstaller
36 | # Usually these files are written by a python script from a template
37 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
38 | *.manifest
39 | *.spec
40 |
41 | # Installer logs
42 | pip-log.txt
43 | pip-delete-this-directory.txt
44 |
45 | # Unit test / coverage reports
46 | htmlcov/
47 | .tox/
48 | .nox/
49 | .coverage
50 | .coverage.*
51 | .cache
52 | nosetests.xml
53 | coverage.xml
54 | *.cover
55 | .hypothesis/
56 | .pytest_cache/
57 |
58 | # Translations
59 | *.mo
60 | *.pot
61 |
62 | # Django stuff:
63 | *.log
64 | local_settings.py
65 | db.sqlite3
66 |
67 | # Flask stuff:
68 | instance/
69 | .webassets-cache
70 |
71 | # Scrapy stuff:
72 | .scrapy
73 |
74 | # Sphinx documentation
75 | docs/_build/
76 |
77 | # PyBuilder
78 | target/
79 |
80 | # Jupyter Notebook
81 | .ipynb_checkpoints
82 |
83 | # IPython
84 | profile_default/
85 | ipython_config.py
86 |
87 | # pyenv
88 | .python-version
89 |
90 | # celery beat schedule file
91 | celerybeat-schedule
92 |
93 | # SageMath parsed files
94 | *.sage.py
95 |
96 | # Environments
97 | .env
98 | .venv
99 | env/
100 | venv/
101 | ENV/
102 | env.bak/
103 | venv.bak/
104 |
105 | # Spyder project settings
106 | .spyderproject
107 | .spyproject
108 |
109 | # Rope project settings
110 | .ropeproject
111 |
112 | # mkdocs documentation
113 | /site
114 |
115 | # mypy
116 | .mypy_cache/
117 | .dmypy.json
118 | dmypy.json
119 |
120 | # Pyre type checker
121 | .pyre/
122 |
123 |
124 | # ===== NODE STUFF =====
125 | # Logs
126 | logs
127 | *.log
128 | npm-debug.log*
129 | yarn-debug.log*
130 | yarn-error.log*
131 |
132 | # Runtime data
133 | pids
134 | *.pid
135 | *.seed
136 | *.pid.lock
137 |
138 | # Directory for instrumented libs generated by jscoverage/JSCover
139 | lib-cov
140 |
141 | # Coverage directory used by tools like istanbul
142 | coverage
143 |
144 | # nyc test coverage
145 | .nyc_output
146 |
147 | # Grunt intermediate storage (http://gruntjs.com/creating-plugins#storing-task-files)
148 | .grunt
149 |
150 | # Bower dependency directory (https://bower.io/)
151 | bower_components
152 |
153 | # node-waf configuration
154 | .lock-wscript
155 |
156 | # Compiled binary addons (https://nodejs.org/api/addons.html)
157 | build/Release
158 |
159 | # Dependency directories
160 | node_modules/
161 | jspm_packages/
162 |
163 | # TypeScript v1 declaration files
164 | typings/
165 |
166 | # Optional npm cache directory
167 | .npm
168 |
169 | # Optional eslint cache
170 | .eslintcache
171 |
172 | # Optional REPL history
173 | .node_repl_history
174 |
175 | # Output of 'npm pack'
176 | *.tgz
177 |
178 | # Yarn Integrity file
179 | .yarn-integrity
180 |
181 | # dotenv environment variables file
182 | .env
183 |
184 | # next.js build output
185 | .next
186 |
187 | # package-lock
188 | package-lock.json
189 |
--------------------------------------------------------------------------------
/CHANGELOG.md:
--------------------------------------------------------------------------------
1 | # Change Log
2 |
3 | ## Version 0.3.0
4 | * Fixed some bugs. Updated supported Python versions.
5 |
6 | ## Version 0.2.0
7 | * Restructured project ready for initial PyPI upload.
8 |
9 | ## Version 0.1.0
10 | * Final version used by Misinformation project.
11 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2018 The Alan Turing Institute
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include readabilipy/javascript/*.js
2 | include readabilipy/javascript/package.json
3 |
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
1 | # Makefile for easier installation and cleanup.
2 | #
3 | # Uses self-documenting macros from here:
4 | # http://marmelab.com/blog/2016/02/29/auto-documented-makefile.html
5 |
6 | SHELL := bash
7 | .SHELLFLAGS := -eu -o pipefail -c
8 | MAKEFLAGS += --warn-undefined-variables --no-builtin-rules
9 |
10 | PACKAGE=readabilipy
11 | DOC_DIR=./docs
12 | VENV_DIR=/tmp/rdpy_venv
13 | TEST_DIR=./tests
14 |
15 | .PHONY: help
16 |
17 | .DEFAULT_GOAL := help
18 |
19 | # Display a help message when called without target, using the ## comments
20 | help:
21 | @grep -E '^[0-9a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) |\
22 | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[36m%-15s\033[0m\
23 | %s\n", $$1, $$2}'
24 |
25 | ################
26 | # Installation #
27 | ################
28 |
29 | .PHONY: install
30 |
31 | install: ## Install for the current user using the default python command
32 | python setup.py build_ext --inplace
33 | python setup.py install --user
34 |
35 |
36 | ################
37 | # Distribution #
38 | ################
39 |
40 | .PHONY: release dist
41 |
42 | release: ## Make a release
43 | python make_release.py
44 |
45 | dist: ## Make Python source distribution
46 | python setup.py sdist bdist_wheel
47 |
48 |
49 | ###########
50 | # Testing #
51 | ###########
52 |
53 | .PHONY: test
54 |
55 | test: venv ## Run unit tests
56 | source $(VENV_DIR)/bin/activate && cd $(TEST_DIR) && python -m pytest -v . --cov readabilipy --cov-report term-missing --benchmark-disable
57 | source $(VENV_DIR)/bin/activate && pyflakes *.py readabilipy $(TEST_DIR)
58 | source $(VENV_DIR)/bin/activate && pycodestyle --statistics --ignore=E501 --count *.py readabilipy $(TEST_DIR)
59 | source $(VENV_DIR)/bin/activate && pylint readabilipy $(TEST_DIR)/*.py
60 |
61 | #################
62 | # Documentation #
63 | #################
64 |
65 | .PHONY: docs
66 |
67 | docs: install ## Build documentation with Sphinx
68 | exit; # not implemented
69 | source $(VENV_DIR)/bin/activate && m2r README.md && mv README.rst $(DOC_DIR)
70 | source $(VENV_DIR)/bin/activate && m2r CHANGELOG.md && mv CHANGELOG.rst $(DOC_DIR)
71 | cd $(DOC_DIR) && \
72 | rm source/* && \
73 | source $(VENV_DIR)/bin/activate && \
74 | sphinx-apidoc -H 'ReadabiliPy API Documentation' -o source ../$(PACKAGE) && \
75 | touch source/AUTOGENERATED
76 | $(MAKE) -C $(DOC_DIR) html
77 |
78 | #######################
79 | # Virtual environment #
80 | #######################
81 |
82 | .PHONY: venv clean_venv
83 |
84 | venv: $(VENV_DIR)/bin/activate
85 |
86 | $(VENV_DIR)/bin/activate: setup.py
87 | test -d $(VENV_DIR) || python -m venv $(VENV_DIR)
88 | source $(VENV_DIR)/bin/activate && pip install .[dev]
89 | touch $(VENV_DIR)/bin/activate
90 |
91 | clean_venv:
92 | rm -rf $(VENV_DIR)
93 |
94 | ############
95 | # Clean up #
96 | ############
97 |
98 | .PHONY: clean
99 |
100 | clean: ## Clean build dist and egg directories left after install
101 | rm -rf ./dist
102 | rm -rf ./build
103 | rm -rf ./$(PACKAGE).egg-info
104 | rm -rf $(VENV_DIR)
105 | rm -f MANIFEST
106 | rm -rf $(PACKAGE)/javascript/node_modules
107 | find . -type f -iname '*.pyc' -delete
108 | find . -type d -name '__pycache__' -empty -delete
109 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # ReadabiliPy
2 |
3 | [![Coverage Status](https://coveralls.io/repos/github/alan-turing-institute/ReadabiliPy/badge.svg?branch=master)](https://coveralls.io/github/alan-turing-institute/ReadabiliPy?branch=master)
4 |
5 | `ReadabiliPy` contains a Python wrapper for Mozilla's [Readability.js](https://github.com/mozilla/readability) Node.js package, as well as article extraction routines written in pure Python.
6 |
7 | This package augments the output of `Readability.js` to also return a list of plain text representations of article paragraphs.
8 |
9 | `ReadabiliPy` comes with a handy command line application: ``readabilipy``.
10 |
11 | ## Installation
12 |
13 | To use the `Readability.js` wrapper you need to have a working [Node.js](https://nodejs.org/en/download/) installation of version 14 or higher.
14 | Make sure to install Node.js before installing this package, as this ensures Readability.js will be installed.
15 | If you only want to use the Python-based article extraction, you **do not need** to install Node.js.
16 |
17 | `ReadabiliPy` can be installed simply from PyPI:
18 |
19 | ```
20 | $ pip install readabilipy
21 | ```
22 |
23 | Note that to update to a new version of `Readability.js` you can simply reinstall `ReadabiliPy`.
24 |
25 | ## Usage
26 |
27 | `ReadabiliPy` can be used either as a command line application or as a Python library.
28 |
29 | ### Command line application
30 |
31 | The ``readabilipy`` command line application can be used to extract an article from an HTML source file.
32 |
33 | For example, if you have the article saved as ``input.html`` in the current directory then you can run:
34 |
35 | ```
36 | $ readabilipy -i ./input.html -o article.json
37 | ```
38 |
39 | The extracted article can then be found in the ``article.json`` file. By default ReadabiliPy will use the Readability.js functionality to extract the article, provided this is available. If instead you'd like to use the Python-based extraction, run:
40 |
41 | ```
42 | $ readabilipy -p -i ./input.html -o article.json
43 | ```
44 |
45 | The complete help text of the command line application is as follows:
46 |
47 | ```
48 | $ readabilipy -h
49 | usage: readabilipy [-h] -i INPUT_FILE -o OUTPUT_FILE [-c] [-n] [-p] [-V]
50 |
51 | Extract article data from a HTML file using either Mozilla's Readability.js
52 | package or a simplified python-only alternative.
53 |
54 | optional arguments:
55 | -h, --help show this help message and exit
56 | -i INPUT_FILE, --input-file INPUT_FILE
57 | Path to input file containing HTML.
58 | -o OUTPUT_FILE, --output-file OUTPUT_FILE
59 | Path to file to output the article data to as JSON.
60 | -c, --content-digests
61 | Add a 'data-content-digest' attribute containing a
62 | SHA256-based digest of the element's contents to each
63 | HTML element in the plain_content output.
64 | -n, --node-indexes Add a 'data-node-index' attribute containing a
65 | hierarchical representation of the element's position
66 | in the HTML structure each HTML element in the
67 | plain_content output.
68 | -p, --use-python-parser
69 | Use the pure-python 'plain_html' parser included in
70 | this project rather than Mozilla's Readability.js.
71 | -V, --version Show version and exit
72 | ```
73 |
74 | ## Library
75 |
76 | ReadabiliPy can also be used as a Python package.
77 | The main routine is called ``simple_json_from_html_string`` and expects the HTML article as a string.
78 | Here is an example of extracting an article after downloading the page using [requests](https://requests.readthedocs.io/en/master/):
79 |
80 | ```python
81 | >>> import requests
82 | >>> from readabilipy import simple_json_from_html_string
83 | >>> req = requests.get('https://en.wikipedia.org/wiki/Readability')
84 | >>> article = simple_json_from_html_string(req.text, use_readability=True)
85 | ```
86 |
87 | Note that you need to use the flag ``use_readability=True`` to use Readability.js, otherwise the Python-based extraction is used.
88 |
89 | The ``simple_json_from_html_string`` function returns a dictionary with the following fields:
90 |
91 | - `title`: The article title
92 | - `byline`: Author information
93 | - `content`: A simplified HTML representation of the article, with all article text contained in paragraph elements.
94 | - `plain_content`: A "plain" version of the simplified `Readability.js` article HTML present in the `content` field. This attempts to retain only the plain text content of the article, while preserving the HTML structure.
95 | - `plain_text`: A list containing plain text representations of each paragraph (`<p>`) or list (`<ul>` or `<ol>`) present in the simplified `Readability.js` article HTML in the `content` field. Each paragraph or list is represented as a single string. List strings look like `"* item 1, * item 2, * item 3,"` for both ordered and unordered lists (note the trailing `,`).
96 |
97 | Note further that:
98 |
99 | - All fields are guaranteed to be present. If individual fields are missing from the output of `Readability.js`, the value of these fields will be `None`. If no article data is returned by `Readability.js`, the value of all fields will be `None`.
100 | - All text in the `plain_content` and `plain_text` fields is encoded as unicode normalised using the "NFKC" normal form. This normal form is used to try and ensure as much as possible that things that appear visually the same are encoded with the same unicode representation (the K part) and characters are represented as a single composite character where possible (the C part).
101 | - An optional `content_digests` flag can be passed to the Python wrapper. When this is set to `True`, each HTML element in the `plain_content` field has a `data-content-digest` attribute, which holds the SHA-256 hash of its plain text content. For "leaf" nodes (containing only plain text in the output), this is the SHA-256 hash of their plain text content. For nodes containing other nodes, this is the SHA-256 hash of the concatenated SHA-256 hashes of their child nodes.
102 | - An optional `node_indexes` flag can be passed to the Python wrapper. When this is set to `True`, each HTML element in the `plain_content` field has a `data-node-indexes` attribute, which holds a hierarchical index describing the location of element within the `plain_content` HTML structure.
103 | - An optional `use_readability` flag can be passed to the Python wrapper. When this is set to `True`, Mozilla's `Readability.js` will be used as the parser. If it is set to `False` then the pure-python parser in `plain_html.py` will be used instead.
104 |
105 | The second top-level function exported by ReadabiliPy is ``simple_tree_from_html_string``. This returns a cleaned, parsed HTML tree of the article as a [BeautifulSoup](https://www.crummy.com/software/BeautifulSoup/bs4/doc/) object.
106 |
107 | ## Notes
108 |
109 | License: MIT License, see the `LICENSE` file.
110 |
111 | Copyright (c) 2018, The Alan Turing Institute
112 |
113 | If you encounter any issues or have any suggestions for improvement, please open an issue [on Github](https://github.com/alan-turing-institute/ReadabiliPy).
114 | You're helping to make this project better for everyone!
115 |
--------------------------------------------------------------------------------
/benchmarks/Dockerfile:
--------------------------------------------------------------------------------
FROM python:3

# Install system requirements.
# -y is required: docker builds are non-interactive, so a bare
# `apt install nodejs` would abort waiting for confirmation.
RUN apt-get update
RUN apt-get -y install curl git
RUN curl -sL https://deb.nodesource.com/setup_11.x | bash -
RUN apt-get -y install nodejs
RUN pip install --upgrade pip

# Clone ReadabiliPy and install python packages
RUN git clone https://github.com/alan-turing-institute/ReadabiliPy
WORKDIR "/ReadabiliPy"
RUN git pull
# npm install must run inside the cloned repo (where package.json lives),
# not in / as before, where there was nothing to install.
RUN npm install
RUN pip install -r requirements-dev.txt

# Run the benchmarks with Pytest
CMD pytest --benchmark-only
20 |
--------------------------------------------------------------------------------
/benchmarks/README.md:
--------------------------------------------------------------------------------
1 | Benchmarking [ReadabiliPy](https://github.com/alan-turing-institute/ReadabiliPy/tree/master) with containers
2 | ====
3 |
4 | This directory contains a Dockerfile to build a benchmarking image for [ReadabiliPy](https://github.com/alan-turing-institute/ReadabiliPy/tree/master) as per the guidelines specified by the [Benchmarking with containers](https://alan-turing-institute.github.io/data-science-benchmarking/) project, at the Alan Turing Institute.
5 |
6 | **Software:** [ReadabiliPy](https://github.com/alan-turing-institute/ReadabiliPy) - A simple HTML content extractor in Python. Can be run as a wrapper for Mozilla's Readability.js package or in pure-python mode.
7 |
8 | **Benchmarks:** Benchmark the speed of core package functions at extracting information from an input HTML with [pytest](https://pypi.org/project/pytest-benchmark/). See [test_benchmarking.py](https://github.com/alan-turing-institute/ReadabiliPy/blob/master/tests/test_benchmarking.py).
9 |
10 | Running benchmarks
11 | ----
12 |
13 | Using the [pytest-benchmark](https://pypi.org/project/pytest-benchmark/) package, we benchmark some of the package functions, including extraction of titles and dates from article HTML and the full article content in JSON format.
14 |
15 | Benchmarks can be run from the top directory of the package with the following command: ```pytest --benchmark-only```.
16 |
17 | Building a Docker image for Benchmarking ReadabiliPy
18 | ----
19 |
20 | The [Dockerfile](https://github.com/alan-turing-institute/ReadabiliPy/blob/master/benchmarks/Dockerfile) specifies an image that installs the requirements for ReadabiliPy, clones the package from GitHub, then runs the benchmarks with pytest.
21 |
22 | Docker Hub Automated build
23 | ----
24 |
25 | An image was built with this Dockerfile and pushed to [Docker Hub](https://cloud.docker.com/repository/docker/edwardchalstrey/readabilipy_benchmark) as ```edwardchalstrey/readabilipy_benchmark```. An automated build was set up so that the ```latest``` tag is built whenever the master branch of the ReadabiliPy GitHub repo has a new commit.
26 |
27 | Run the containerised benchmarks
28 | ----
29 |
30 | The benchmark image can be pulled from the remote registry (Docker Hub), and run on any computing platform with Docker. Benchmarks can be run whenever new features are added.
31 |
32 | ### Results
33 |
34 | I have benchmarked three of the html parsing features of ReadabiliPy on an example html file; see the tests in [ReadabiliPy](https://github.com/alan-turing-institute/ReadabiliPy/tree/master) repo within ```tests/test_benchmarking.py```.
35 |
36 | Benchmarks run on these dates are for the following [ReadabiliPy](https://github.com/alan-turing-institute/ReadabiliPy/tree/master) commits and measure **mean time ms**:
37 | 1. 2019-05-02 => [9ba2fdb7...](https://github.com/alan-turing-institute/ReadabiliPy/tree/9ba2fdb71b3b014f3252a29672ff41159203e45c)
38 | 2. 2019-05-14 => [d3b3c365...](https://github.com/alan-turing-institute/ReadabiliPy/tree/d3b3c365984aa26ce0a8f0fda6b3fd75b9e837a2)
39 | 3. 2019-05-31 => [73493922...](https://github.com/alan-turing-institute/ReadabiliPy/tree/734939221048041e545e3a4bd205a84e87631a3f)
40 |
41 | **Benchmarks on a Macbook:**
42 |
43 | | Date | Date parse | Title parse | Full parse |
44 | |---|---|---|---|
45 | | 2019-05-02 | 69.5056 | 55.5296 | 2140.0745 |
46 | | 2019-05-14 | 44.4991 | 54.8936 | 1942.1609 |
47 | | 2019-05-31 | 80.5528 | 94.9283 | 2290.3153 |
48 |
49 |
50 | **Benchmarks on a Macbook in Docker container:**
51 |
52 | | Date | Date parse | Title parse | Full parse |
53 | |---|---|---|---|
54 | | 2019-05-02 | 46.4389 | 40.2649 | 3065.2467 |
55 | | 2019-05-14 | 32.8276 | 39.7405 | 2642.1735 |
56 | | 2019-05-31 | 34.8774 | 41.2476 | 2838.9681 |
57 |
--------------------------------------------------------------------------------
/make_release.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 |
4 | """
5 | Do-nothing script for making a release
6 |
7 | This idea comes from here:
8 | https://blog.danslimmon.com/2019/07/15/do-nothing-scripting-the-key-to-gradual-automation/
9 |
10 | This file is part of ReadabiliPy.
11 |
12 | Copyright: 2020, The Alan Turing Institute
13 | License: See LICENSE file.
14 |
15 | """
16 |
17 | import os
18 | import sys
19 | import tempfile
20 | import webbrowser
21 |
22 | try:
23 | import colorama
24 | colorama.init()
25 | BE_COLOURFUL = True
26 | except ImportError:
27 | BE_COLOURFUL = False
28 |
29 |
30 | URLS = {
31 | "Travis": "https://travis-ci.org/alan-turing-institute/ReadabiliPy",
32 | }
33 |
34 |
def coloured(msg, colour=None, style=None):
    """Return *msg* wrapped in ANSI colour/style escape codes.

    When colorama is unavailable (BE_COLOURFUL is False) the message is
    returned unchanged. Unknown colour/style names raise KeyError, as in
    the original lookup tables.
    """
    if not BE_COLOURFUL:
        return msg

    colour_codes = {
        "red": colorama.Fore.RED,
        "green": colorama.Fore.GREEN,
        "cyan": colorama.Fore.CYAN,
        "yellow": colorama.Fore.YELLOW,
        "magenta": colorama.Fore.MAGENTA,
        None: "",
    }
    style_codes = {
        "bright": colorama.Style.BRIGHT,
        "dim": colorama.Style.DIM,
        None: "",
    }
    prefix = colour_codes[colour] + style_codes[style]
    suffix = colorama.Style.RESET_ALL
    return f"{prefix}{msg}{suffix}"
55 |
56 |
def cprint(msg, colour=None, style=None):
    """Print *msg* after applying the optional colour/style formatting."""
    text = coloured(msg, colour=colour, style=style)
    print(text)
59 |
60 |
def wait_for_enter():
    """Block until the user presses Enter, then emit a separating blank line."""
    prompt = coloured("\nPress Enter to continue", style="dim")
    input(prompt)
    print()
64 |
65 |
def get_package_name():
    """Read the package name from the ``NAME = "..."`` line in ./setup.py.

    Returns:
        The package name with surrounding quotes/whitespace stripped.

    Raises:
        RuntimeError: if setup.py contains no ``NAME = `` line. The original
            code would instead crash with an opaque AttributeError on None.
    """
    with open("./setup.py", "r") as fp:
        nameline = next(
            (line.strip() for line in fp if line.startswith("NAME = ")), None
        )
    if nameline is None:
        raise RuntimeError("Couldn't find a 'NAME = ' line in ./setup.py")
    return nameline.split("=")[-1].strip().strip('"')
72 |
73 |
def get_package_version(pkgname):
    """Extract ``__version__`` from ``<pkgname>/__version__.py``.

    The version file is executed in an isolated namespace instead of being
    imported, so the package itself never has to be importable.
    """
    namespace = {}
    version_path = f"{pkgname.lower()}/__version__.py"
    with open(version_path, "r") as fh:
        source = fh.read()
    # NOTE: exec of a trusted, repo-local file only — not untrusted input.
    exec(source, namespace)
    return namespace["__version__"]
79 |
80 |
class Step:
    """One interactive step of the release checklist.

    Subclasses implement ``action(context)``; ``run`` wraps it with the
    pre/post hooks and converts Ctrl-C into a clean non-zero exit.
    """

    def pre(self, context):
        """Hook executed before the action; no-op by default."""

    def post(self, context):
        """Hook executed after the action; pauses for operator confirmation."""
        wait_for_enter()

    def run(self, context):
        """Execute pre -> action -> post, exiting cleanly on Ctrl-C."""
        try:
            self.pre(context)
            self.action(context)
            self.post(context)
        except KeyboardInterrupt:
            cprint("\nInterrupted.", colour="red")
            raise SystemExit(1)

    def instruct(self, msg):
        """Display an instruction for the operator to carry out manually."""
        cprint(msg, colour="green")

    def print_run(self, msg):
        """Display a command the operator should run themselves."""
        cprint("Run:", colour="cyan", style="bright")
        self.print_cmd(msg)

    def print_cmd(self, msg):
        """Display a single indented command line."""
        cprint("\t" + msg, colour="cyan", style="bright")

    def do_cmd(self, cmd):
        """Announce *cmd*, wait for confirmation, then execute it via the shell."""
        cprint(f"Going to run: {cmd}", colour="magenta", style="bright")
        wait_for_enter()
        os.system(cmd)
111 |
112 |
class GitToMain(Step):
    """Remind the operator to switch to the main branch with changes merged."""
    def action(self, context):
        self.instruct("Make sure you're on main and changes are merged in")
        self.print_run("git checkout main")


class UpdateChangelog(Step):
    """Remind the operator to record the new release in CHANGELOG.md."""
    def action(self, context):
        self.instruct(f"Update change log for version {context['version']}")
        self.print_run("vi CHANGELOG.md")


class UpdateReadme(Step):
    """Remind the operator to refresh README.md if needed."""
    def action(self, context):
        self.instruct("Update readme if necessary")
        self.print_run("vi README.md")


class RunTests(Step):
    """Remind the operator to run the test suite via make."""
    def action(self, context):
        self.instruct("Run the unit tests")
        self.print_run("make test")
135 |
136 |
class BumpVersionPackage(Step):
    """Prompt for a version bump, then re-read the version into the context."""
    def action(self, context):
        self.instruct("Update __version__.py with the new version")

    def post(self, context):
        # Re-read the version only after the operator has edited the file.
        wait_for_enter()
        context["version"] = self._get_version(context)

    def _get_version(self, context):
        # Get the version from the version file
        return get_package_version(context["pkgname"])


class MakeClean(Step):
    """Run `make clean` to remove build artefacts."""
    def action(self, context):
        self.do_cmd("make clean")


class MakeDocs(Step):
    """Run `make docs` to build the documentation."""
    def action(self, context):
        self.do_cmd("make docs")


class MakeDist(Step):
    """Run `make dist` to build the source and wheel distributions."""
    def action(self, context):
        self.do_cmd("make dist")
164 |
class PushToTestPyPI(Step):
    """Upload the built distributions to TestPyPI via twine."""
    def action(self, context):
        self.do_cmd(
            "twine upload --repository-url https://test.pypi.org/legacy/ dist/*"
        )


class InstallFromTestPyPI(Step):
    """Install the just-uploaded release from TestPyPI into a throwaway venv."""
    def action(self, context):
        tmpvenv = tempfile.mkdtemp(prefix="rdpy_venv_")
        self.do_cmd(
            f"python -m venv {tmpvenv} && source {tmpvenv}/bin/activate && "
            "pip install --no-cache-dir --index-url "
            "https://test.pypi.org/simple/ "
            "--extra-index-url https://pypi.org/simple "
            f"{context['pkgname']}=={context['version']}"
        )
        # Remember the venv path so TestPackage/RemoveVenv can reuse it.
        context["tmpvenv"] = tmpvenv


class TestPackage(Step):
    """Smoke-test the installed package by checking its reported version."""
    def action(self, context):
        self.instruct(
            f"Ensure that the following command gives version {context['version']}"
        )
        self.do_cmd(f"source {context['tmpvenv']}/bin/activate && readabilipy -V")


class RemoveVenv(Step):
    """Delete the temporary venv created by InstallFromTestPyPI."""
    def action(self, context):
        self.do_cmd(f"rm -rf {context['tmpvenv']}")
196 |
197 |
class GitTagVersion(Step):
    """Tag the release commit as v<version>."""
    def action(self, context):
        self.do_cmd(f"git tag v{context['version']}")


class GitAdd(Step):
    """Remind the operator to stage and commit any outstanding changes."""
    def action(self, context):
        self.instruct("Add everything to git and commit")
        self.print_run("git gui")


class PushToPyPI(Step):
    """Upload the built distributions to the real PyPI via twine."""
    def action(self, context):
        self.do_cmd("twine upload dist/*")


class PushToGitHub(Step):
    """Push the current branch and all tags to GitHub."""
    def action(self, context):
        self.do_cmd("git push -u --tags origin main")


class WaitForTravis(Step):
    """Open the Travis dashboard and wait for CI to pass."""
    def action(self, context):
        webbrowser.open(URLS['Travis'])
        self.instruct(
            "Wait for Travis to complete and verify that its successful"
        )


class WaitForRTD(Step):
    """Wait for the ReadTheDocs build to complete successfully."""
    def action(self, context):
        self.instruct(
            "Wait for ReadTheDocs to complete and verify that its successful"
        )
232 |
233 |
def main(target=None):
    """Walk through the release checklist, optionally resuming at *target*.

    When *target* names a step, every step before it is skipped; otherwise
    the whole procedure runs in order.
    """
    procedure = [
        ("gittomain", GitToMain()),
        ("clean1", MakeClean()),
        ("tests1", RunTests()),
        ("gitadd1", GitAdd()),
        ("push1", PushToGitHub()),
        ("travis1", WaitForTravis()),
        ("bumpversion", BumpVersionPackage()),
        ("changelog", UpdateChangelog()),
        ("readme", UpdateReadme()),
        ("dist", MakeDist()),
        ("testpypi", PushToTestPyPI()),
        ("install", InstallFromTestPyPI()),
        ("testpkg", TestPackage()),
        ("remove_venv", RemoveVenv()),
        ("gitadd2", GitAdd()),
        ("pypi", PushToPyPI()),
        ("tag", GitTagVersion()),
        ("push2", PushToGitHub()),
    ]
    context = {}
    context["pkgname"] = get_package_name()
    context["version"] = get_package_version(context["pkgname"])
    # Skip steps until we reach the requested starting point (if any).
    skipping = bool(target)
    for name, step in procedure:
        if skipping and name != target:
            continue
        skipping = False
        step.run(context)
    cprint("\nDone!", colour="yellow", style="bright")
265 |
266 |
if __name__ == "__main__":
    # Optional first CLI argument names the step to resume the procedure from
    cli_args = sys.argv[1:]
    main(target=cli_args[0] if cli_args else None)
270 |
--------------------------------------------------------------------------------
/readabilipy/__init__.py:
--------------------------------------------------------------------------------
1 | from .simple_json import simple_json_from_html_string
2 | from .simple_tree import simple_tree_from_html_string
3 |
4 | __all__ = [
5 | 'simple_json_from_html_string',
6 | 'simple_tree_from_html_string',
7 | ]
8 |
--------------------------------------------------------------------------------
/readabilipy/__main__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | """Command line interface
4 |
5 | """
6 |
7 | import argparse
8 | import json
9 | import sys
10 |
11 | from .__version__ import __version__
12 | from .simple_json import simple_json_from_html_string, have_node
13 |
14 |
def main():
    """Command-line entry point.

    Parses arguments, reads HTML from a file or stdin, extracts article data
    via simple_json_from_html_string() and writes the result as JSON to a
    file or stdout.
    """
    parser = argparse.ArgumentParser(
        description="Extract article data from a HTML file using either Mozilla's Readability.js package or a simplified python-only alternative.",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    parser.add_argument(
        "-i",
        "--input-file",
        default="-",
        help="Path to input file containing HTML, use '-' for stdin.",
    )
    parser.add_argument(
        "-o",
        "--output-file",
        default="-",
        help="Path to file to output the article data to as JSON, use '-' for stdout.",
    )
    parser.add_argument(
        "-c",
        "--content-digests",
        action="store_true",
        help="Add a 'data-content-digest' attribute containing a SHA256-based digest of the element's contents to each HTML element in the plain_content output.",
    )
    parser.add_argument(
        "-n",
        "--node-indexes",
        action="store_true",
        help="Add a 'data-node-index' attribute containing a hierarchical representation of the element's position in the HTML structure each HTML element in the plain_content output.",
    )
    parser.add_argument(
        "-p",
        "--use-python-parser",
        action="store_true",
        help="Use the pure-python 'plain_html' parser included in this project rather than Mozilla's Readability.js.",
    )
    parser.add_argument(
        "-V",
        "--version",
        help="Show version and exit",
        action="version",
        version=f"{__version__} (Readability.js supported: {'yes' if have_node() else 'no'})",
    )

    args = parser.parse_args()

    # Open input file or stream
    if args.input_file == "-":
        # reconfigure() exists on Python >= 3.7; it makes malformed input bytes
        # non-fatal by replacing them rather than raising UnicodeDecodeError
        if hasattr(sys.stdin, "reconfigure"):
            sys.stdin.reconfigure(encoding="utf-8", errors="replace")
        input_file = sys.stdin
    else:
        input_file = open(args.input_file, encoding="utf-8", errors="replace")  # pylint: disable=consider-using-with
    # Read from input then close if appropriate
    # NOTE: only non-interactive streams (real files, pipes) are closed; an
    # interactive terminal stream is deliberately left open
    html = input_file.read()
    if not input_file.isatty():
        input_file.close()

    article = simple_json_from_html_string(
        html,
        content_digests=args.content_digests,
        node_indexes=args.node_indexes,
        use_readability=(not args.use_python_parser),
    )

    # Open output file or stream
    if args.output_file == "-":
        output_file = sys.stdout
    else:
        output_file = open(args.output_file, "w", encoding="utf-8")  # pylint: disable=consider-using-with
    # Write to output then close if appropriate (same isatty rule as input)
    json.dump(article, output_file, ensure_ascii=False)
    if not output_file.isatty():
        output_file.close()
90 |
91 |
if __name__ == "__main__":
    # Allows `python -m readabilipy` to invoke the CLI directly
    main()
94 |
--------------------------------------------------------------------------------
/readabilipy/__version__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
# Semantic version components: (major, minor, patch)
VERSION = (0, 3, 0)

# Dotted version string derived from VERSION, e.g. "0.3.0"
__version__ = ".".join(str(part) for part in VERSION)
6 |
--------------------------------------------------------------------------------
/readabilipy/extractors/__init__.py:
--------------------------------------------------------------------------------
1 | from .extract_date import extract_date, ensure_iso_date_format
2 | from .extract_title import extract_title
3 |
4 | __all__ = [
5 | 'extract_date',
6 | 'extract_title',
7 | 'ensure_iso_date_format',
8 | ]
9 |
--------------------------------------------------------------------------------
/readabilipy/extractors/extract_date.py:
--------------------------------------------------------------------------------
1 | from datetime import datetime
2 | from .extract_element import extract_element
3 |
4 |
def extract_date(html):
    """Return the article date from the article HTML, or None if no parseable date is found."""

    # Candidate xpaths for HTML tags that could contain a date.
    # The score reflects our confidence in each xpath and sets extraction preference.
    xpaths = [
        ('//meta[@property="article:published_time"]/@content', 13),
        ('//meta[@property="og:updated_time"]/@content', 10),
        ('//meta[@property="og:article:published_time"]/@content', 10),
        ('//meta[@property="og:article:modified_time"]/@content', 10),
        ('//meta[@property="article:published"]/@content', 7),
        ('//meta[@itemprop="datePublished"]/@content', 3),
        ('//time/@datetime', 3),
        ('//meta[@itemprop="dateModified"]/@content', 2),
        ('//meta[@property="article:modified_time"]/@content', 2),
    ]

    # Collect every candidate date string with its combined score
    candidates = extract_element(html, xpaths)
    if not candidates:
        return None

    # Walk the candidates from highest to lowest score and return the
    # first one that parses as a supported ISO-style date
    ranked = sorted(candidates, key=lambda key: candidates[key]["score"], reverse=True)
    for date_string in ranked:
        formatted = ensure_iso_date_format(date_string)
        if formatted:
            return formatted
    return None
33 |
34 |
def ensure_iso_date_format(date_string, ignoretz=True):
    """Check date_string is in one of our supported formats and return it.

    Args:
        date_string: Candidate date string to validate and normalise.
        ignoretz: If True (default), strip any timezone and microseconds
            from the result.

    Returns:
        The date in ISO format (str), or None if the string does not match
        any supported format.
    """
    supported_date_formats = [
        "%Y-%m-%dT%H:%M:%S",      # '2014-10-24T17:32:46'
        "%Y-%m-%dT%H:%M:%S%z",    # '2014-10-24T17:32:46+12:00'
        "%Y-%m-%dT%H:%M%z",       # '2014-10-24T17:32+12:00'
        "%Y-%m-%dT%H:%M:%SZ",     # '2014-10-24T17:32:46Z'
        "%Y-%m-%dT%H:%M:%S.%fZ",  # '2014-10-24T17:32:46.000Z'
        "%Y-%m-%dT%H:%M:%S.%f"    # '2014-10-24T17:32:46.493'
    ]

    for date_format in supported_date_formats:
        try:
            # For python < 3.7, strptime() is not able to parse timezones containing
            # colons (eg. 2014-10-24T17:32:46+12:00). By stripping the colon here,
            # we ensure that all versions of python can parse datetimes like these.
            # BUGFIX: guard against strings shorter than 3 characters, where
            # date_string[-3] would raise an uncaught IndexError.
            if date_format in ("%Y-%m-%dT%H:%M:%S%z", "%Y-%m-%dT%H:%M%z") and len(date_string) >= 3 and date_string[-3] == ':':
                isodate = datetime.strptime(date_string[:-3] + date_string[-2:], date_format)
            else:
                isodate = datetime.strptime(date_string, date_format)
            if ignoretz:
                # Drop timezone and sub-second precision for a canonical form
                isodate = isodate.replace(tzinfo=None, microsecond=0)
            return isodate.isoformat()
        except ValueError:
            pass
    return None
61 |
--------------------------------------------------------------------------------
/readabilipy/extractors/extract_element.py:
--------------------------------------------------------------------------------
1 | from collections import defaultdict
2 | import lxml.html
3 | from ..simplifiers import normalise_whitespace
4 |
5 |
def extract_element(html, xpaths, process_dict_fn=None):
    """Return the relevant elements (titles, dates or bylines) from article HTML, specified by xpaths.
    xpaths should be a list of tuples, each with the xpath and a reliability scores.
    Processing of the dictionary can be handled with the arg function.
    The returned dictionary should have the processed elements as keys and dicts with scores and the xpaths used as values
    """
    # Attempt to parse the html, aborting here if it is not parseable
    try:
        root = lxml.html.fromstring(html)
    except lxml.etree.ParserError:
        return None

    # Accumulate matches per normalised string, combining scores and xpaths
    extracted_strings = defaultdict(dict)
    for extraction_xpath, score in xpaths:
        matches = root.xpath(extraction_xpath)
        if not isinstance(matches, list):
            matches = [matches]
        for match in matches:
            text = normalise_whitespace(match)
            if not text:
                continue
            entry = extracted_strings[text]
            if 'score' in entry:
                # Seen before: accumulate the score and record this xpath too
                entry['score'] += score
                entry['xpaths'].append(extraction_xpath)
                entry['xpaths'].sort()
            else:
                # First sighting of this string
                entry['score'] = score
                entry['xpaths'] = [extraction_xpath]

    # Optionally post-process the accumulated dictionary
    if process_dict_fn:
        extracted_strings = process_dict_fn(extracted_strings)

    return extracted_strings
39 |
--------------------------------------------------------------------------------
/readabilipy/extractors/extract_title.py:
--------------------------------------------------------------------------------
1 | from itertools import permutations
2 | from .extract_element import extract_element
3 |
4 |
def extract_title(html):
    """Return the article title from the article HTML, or None if none is found."""

    # Candidate xpaths for HTML tags that could contain a title.
    # The score reflects our confidence in each xpath and sets extraction preference.
    xpaths = [
        ('//header[@class="entry-header"]/h1[@class="entry-title"]//text()', 4),
        ('//meta[@property="og:title"]/@content', 4),
        ('//h1[@class="entry-title"]//text()', 3),
        ('//h1[@itemprop="headline"]//text()', 3),
        ('//h2[@itemprop="headline"]//text()', 2),
        ('//meta[contains(@itemprop, "headline")]/@content', 2),
        ('//body/title//text()', 1),
        ('//div[@class="postarea"]/h2/a//text()', 1),
        ('//h1[@class="post__title"]//text()', 1),
        ('//h1[@class="title"]//text()', 1),
        ('//head/title//text()', 1),
        ('//header/h1//text()', 1),
        ('//meta[@name="dcterms.title"]/@content', 1),
        ('//meta[@name="fb_title"]/@content', 1),
        ('//meta[@name="sailthru.title"]/@content', 1),
        ('//meta[@name="title"]/@content', 1),
    ]

    # Extract candidates, merging near-duplicate titles before choosing
    titles = extract_element(html, xpaths, process_dict_fn=combine_similar_titles)
    if not titles:
        return None
    # Pick the highest-scoring title (ties resolved by first-seen order, as max() does)
    return max(titles, key=lambda title: titles[title].get('score'))
33 |
34 |
def combine_similar_titles(extracted_strings):
    """Take a dictionary with titles and nested dicts with scores and combine scores for titles which we decide are the same."""

    # Examine every ordered pair of titles (both directions of each pair)
    for first_title, second_title in permutations(extracted_strings, 2):
        if first_title in second_title:
            # First title is a substring of the second: fold the longer title's
            # score and xpaths into the shorter one
            extracted_strings[first_title]['score'] += extracted_strings[second_title]['score']
            extracted_strings[first_title]['xpaths'] += extracted_strings[second_title]['xpaths']
        elif first_title.lower() == second_title.lower():
            # Identical ignoring case: fold into whichever has more capitals
            first_caps = sum(1 for ch in first_title if ch.isupper())
            second_caps = sum(1 for ch in second_title if ch.isupper())
            if first_caps > second_caps:
                extracted_strings[first_title]['score'] += extracted_strings[second_title]['score']
                extracted_strings[first_title]['xpaths'] += extracted_strings[second_title]['xpaths']
    # Keep the xpath lists in a deterministic order
    for entry in extracted_strings.values():
        entry['xpaths'].sort()
    return extracted_strings
52 |
--------------------------------------------------------------------------------
/readabilipy/javascript/ExtractArticle.js:
--------------------------------------------------------------------------------
1 | /*
2 | * This file is part of ReadabiliPy
3 | */
4 |
5 | const fs = require('fs');
6 | const { Readability } = require('@mozilla/readability');
7 | const { JSDOM } = require('jsdom');
8 |
function readFile(filePath) {
    // Read the whole file as UTF-8 and strip surrounding whitespace
    const contents = fs.readFileSync(filePath, {encoding: "utf-8"});
    return contents.trim();
}
12 |
function writeFile(data, filePath) {
    // Write data to filePath as UTF-8; writeFileSync returns undefined
    const options = {encoding: "utf-8"};
    return fs.writeFileSync(filePath, data, options);
}
function main() {
    // Parse CLI flags: -i <input html> (required), -o <output json> (optional)
    const argv = require('minimist')(process.argv.slice(2));
    if (argv['i'] === undefined) {
        console.log("Input file required.");
        return 1;
    }

    const inFilePath = argv['i'];
    // Default the output path to "<input>.simple.json" when -o is absent
    const outFilePath = (typeof(argv['o']) !== 'undefined')
        ? argv['o']
        : inFilePath + ".simple.json";

    // Parse the HTML with jsdom, then run Readability over the document
    const html = readFile(inFilePath);
    const dom = new JSDOM(html);
    const reader = new Readability(dom.window.document);
    const article = reader.parse();

    writeFile(JSON.stringify(article), outFilePath);
    return 0;
}

main();
42 |
--------------------------------------------------------------------------------
/readabilipy/javascript/package.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "ReadabiliPy",
3 | "version": "0.1.0",
4 | "description": "An augmented Python wrapper for the Mozilla standalone Readability.js package.",
5 | "main": "ExtractArticle.js",
6 | "scripts": {},
7 | "repository": {
8 | "type": "git",
9 | "url": "https://github.com/alan-turing-institute/ReadabiliPy"
10 | },
11 | "author": "",
12 | "license": "Apache-2.0",
13 | "bugs": {
14 | "url": "https://github.com/alan-turing-institute/ReadabiliPy/issues"
15 | },
16 | "engines": {
17 | "node": ">=14.0.0"
18 | },
19 | "homepage": "https://github.com/alan-turing-institute/ReadabiliPy",
20 | "devDependencies": {},
21 | "dependencies": {
22 | "@mozilla/readability": ">=0.4.1",
23 | "jsdom": ">=12.2.0",
24 | "minimist": "^1.2.3"
25 | }
26 | }
27 |
--------------------------------------------------------------------------------
/readabilipy/simple_json.py:
--------------------------------------------------------------------------------
1 | import hashlib
2 | import json
3 | import os
4 | import tempfile
5 | import subprocess
6 | import sys
7 |
8 | from bs4 import BeautifulSoup
9 | from bs4.element import Comment, NavigableString, CData
10 | from .simple_tree import simple_tree_from_html_string
11 | from .extractors import extract_date, extract_title
12 | from .simplifiers import normalise_text
13 | from .utils import run_npm_install
14 |
15 |
def have_node():
    """Check that we can run node and have a new enough version """
    try:
        result = subprocess.run(['node', '-v'], stdout=subprocess.PIPE, stderr=subprocess.PIPE, check=False)
    except FileNotFoundError:
        # No node executable on PATH
        return False

    if result.returncode != 0:
        return False

    # `node -v` prints e.g. "v14.17.0"; extract the major version number
    major_version = int(result.stdout.split(b'.')[0].lstrip(b'v'))
    if major_version < 10:
        return False

    # Check that this package has a node_modules dir in the javascript
    # directory; if it doesn't, it wasn't installed with Node support
    javascript_dir = os.path.join(os.path.dirname(__file__), 'javascript')
    node_modules = os.path.join(javascript_dir, 'node_modules')
    if not os.path.exists(node_modules):
        # Try installing node dependencies.
        run_npm_install()
    return os.path.exists(node_modules)
38 |
39 |
def simple_json_from_html_string(html, content_digests=False, node_indexes=False, use_readability=False):
    """Extract article data from an HTML string, returning a dict with keys
    'title', 'byline', 'date', 'content', 'plain_content' and 'plain_text'.

    Args:
        html: The full-page HTML to extract from.
        content_digests: If True, annotate plain_content elements with
            'data-content-digest' attributes.
        node_indexes: If True, annotate plain_content elements with
            'data-node-index' attributes.
        use_readability: If True, use Mozilla's Readability.js via node;
            falls back to the pure-Python parser when node is unavailable.
    """
    if use_readability and not have_node():
        print("Warning: node executable not found, reverting to pure-Python mode. Install Node.js v10 or newer to use Readability.js.", file=sys.stderr)
        use_readability = False

    if use_readability:
        # Write input HTML to temporary file so it is available to the node.js script
        # It is important that this file be unique in case this function is called concurrently
        with tempfile.NamedTemporaryFile(delete=False, mode="w+", encoding="utf-8", prefix="readabilipy") as f_html:
            f_html.write(html)
            f_html.close()
        tmp_html_path = f_html.name

        # We assume appending ".json" to the html name will also be a unique filename
        tmp_json_path = tmp_html_path + ".json"

        # Call Mozilla's Readability.js Readability.parse() function via node, writing output to a temporary file
        jsdir = os.path.join(os.path.dirname(__file__), 'javascript')
        try:
            subprocess.run(
                ["node", "ExtractArticle.js", "-i", tmp_html_path, "-o", tmp_json_path],
                cwd=jsdir,
                check=True,
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
                universal_newlines=True)
        except subprocess.CalledProcessError as e:
            # Surface the node script's stderr before propagating the failure
            print(e.stderr)
            raise

        # Read output of call to Readability.parse() from JSON file as Python dictionary
        with open(tmp_json_path, "r", encoding="utf-8") as json_file:
            input_json = json.load(json_file)

        # Delete temporary input and output files after processing
        os.unlink(tmp_json_path)
        os.unlink(tmp_html_path)
    else:
        # Pure-Python extraction path
        input_json = {
            "title": extract_title(html),
            "date": extract_date(html),
            "content": str(simple_tree_from_html_string(html))
        }

    # Only keep the subset of Readability.js fields we are using (and therefore testing for accuracy of extraction)
    # NB: Need to add tests for additional fields and include them when we look at packaging this wrapper up for PyPI
    # Initialise output article to include all fields with null values
    article_json = {
        "title": None,
        "byline": None,
        "date": None,
        "content": None,
        "plain_content": None,
        "plain_text": None
    }
    # Populate article fields from readability fields where present
    if input_json:
        if "title" in input_json and input_json["title"]:
            article_json["title"] = input_json["title"]
        if "byline" in input_json and input_json["byline"]:
            article_json["byline"] = input_json["byline"]
        if "date" in input_json and input_json["date"]:
            article_json["date"] = input_json["date"]
        if "content" in input_json and input_json["content"]:
            article_json["content"] = input_json["content"]
            # plain_content/plain_text are only derived when content was extracted
            article_json["plain_content"] = plain_content(article_json["content"], content_digests, node_indexes)
            if use_readability:
                article_json["plain_text"] = extract_text_blocks_js(article_json["plain_content"])
            else:
                article_json["plain_text"] = extract_text_blocks_as_plain_text(article_json["plain_content"])

    return article_json
112 |
113 |
def extract_text_blocks_js(paragraph_html):
    """Return a list of {"text": ...} dicts, one per DOM text node in the given HTML."""
    # Parse the simplified article HTML
    soup = BeautifulSoup(paragraph_html, 'html.parser')
    # One entry per text node, in document order
    return [{"text": str(node)} for node in soup.find_all(string=True)]
120 |
121 |
def extract_text_blocks_as_plain_text(paragraph_html):
    """Return plain-text blocks from simplified article HTML, flattening lists into paragraphs."""
    # Parse the simplified article HTML
    soup = BeautifulSoup(paragraph_html, 'html.parser')
    # Flatten each list into a single paragraph whose text is the
    # concatenation of its (non-empty) "* item, "-style entries
    for list_element in soup.find_all(['ul', 'ol']):
        item_texts = [plain_text_leaf_node(li)["text"] for li in list_element.find_all('li')]
        list_element.string = "".join(text for text in item_texts if text)
        list_element.name = "p"
    # Convert the parent of each text node into a plain-text block
    blocks = [plain_text_leaf_node(node.parent) for node in soup.find_all(string=True)]
    # Drop blocks that normalised to no text at all
    return [block for block in blocks if block["text"] is not None]
138 |
139 |
def plain_text_leaf_node(element):
    """Return a dict describing the element's normalised plain text (and node index if present)."""
    # Extract all text, stripped of any child HTML elements, and normalise it
    text = normalise_text(element.get_text())
    # List items are rendered as "* item, " fragments for later joining
    if text != "" and element.name == "li":
        text = f"* {text}, "
    # Represent an empty result as None so callers can filter it out
    if text == "":
        text = None
    if "data-node-index" in element.attrs:
        return {"node_index": element["data-node-index"], "text": text}
    return {"text": text}
152 |
153 |
def plain_content(readability_content, content_digests, node_indexes):
    """Return the article HTML with all elements reduced to their plain-text form."""
    # Parse the extracted article HTML
    soup = BeautifulSoup(readability_content, 'html.parser')
    # Reduce every top-level element (and its descendants) to plain form
    simplified = plain_elements(soup.contents, content_digests, node_indexes)
    if node_indexes:
        # Annotate each element tree with hierarchical node indexes
        simplified = [add_node_indexes(element) for element in simplified]
    # Swap the original contents for the simplified ones and serialise
    soup.contents = simplified
    return str(soup)
165 |
166 |
def plain_elements(elements, content_digests, node_indexes):
    """Return plain-content versions of a sequence of elements."""
    simplified = [plain_element(element, content_digests, node_indexes)
                  for element in elements]
    if content_digests:
        # Annotate each element with a digest of its contents
        simplified = [add_content_digest(element) for element in simplified]
    return simplified
175 |
176 |
def plain_element(element, content_digests, node_indexes):
    """Reduce a single element to its plain-text form, recursing into children."""
    if is_leaf(element):
        # Leaf block elements (p, li): flatten to normalised text,
        # discarding any child HTML tags
        element.string = normalise_text(element.get_text())
    elif is_text(element):
        if is_non_printing(element):
            # The simplified HTML may have come from Readability.js so might
            # have non-printing text (e.g. Comment or CData). In this case, we
            # keep the structure, but ensure that the string is empty.
            element = type(element)("")
        else:
            # Plain text node: normalise its contents in place
            element = type(element)(normalise_text(element.string))
    else:
        # Container element: recurse into children and replace contents
        plain_children = plain_elements(element.contents, content_digests, node_indexes)
        element.clear()
        element.extend(plain_children)
    return element
203 |
204 |
def is_leaf(element):
    """Return True for block elements whose contents should be flattened to text."""
    return element.name in ('p', 'li')
207 |
208 |
def is_text(element):
    """Return True when the element is a BeautifulSoup text node."""
    return isinstance(element, NavigableString)
211 |
212 |
def is_non_printing(element):
    """Return True for markup that browsers do not render (comments, CDATA)."""
    return isinstance(element, (Comment, CData))
215 |
216 |
def add_node_indexes(element, node_index="0"):
    """Recursively annotate elements with hierarchical 'data-node-index' attributes."""
    # Attributes cannot be attached to string (text) nodes
    if is_text(element):
        return element
    # Record this element's position in the hierarchy
    element["data-node-index"] = node_index
    # Recurse into non-text children, numbering them from 1
    non_text_children = (child for child in element.contents if not is_text(child))
    for position, child in enumerate(non_text_children, start=1):
        add_node_indexes(child, node_index=f"{node_index}.{position}")
    return element
230 |
231 |
def add_content_digest(element):
    """Attach a 'data-content-digest' attribute to non-text elements (no-op for strings)."""
    if is_text(element):
        return element
    element["data-content-digest"] = content_digest(element)
    return element
236 |
237 |
def content_digest(element):
    """Return a SHA256 hex digest of the element's (trimmed) content, or "" when empty."""
    if is_text(element):
        # Text node: hash the trimmed string, or "" when nothing remains
        trimmed = element.string.strip()
        if trimmed == "":
            return ""
        return hashlib.sha256(trimmed.encode('utf-8')).hexdigest()

    children = element.contents
    if not children:
        # No hash when no child elements exist
        return ""
    if len(children) == 1:
        # Single child: this element's digest is its child's digest
        return content_digest(children[0])
    # Multiple children: fold the non-empty child digests into one hash
    hasher = hashlib.sha256()
    for child_digest in (content_digest(child) for child in children):
        if child_digest:
            hasher.update(child_digest.encode('utf-8'))
    return hasher.hexdigest()
264 |
--------------------------------------------------------------------------------
/readabilipy/simple_tree.py:
--------------------------------------------------------------------------------
1 | """Turn input HTML into a cleaned parsed tree."""
2 | from bs4 import BeautifulSoup
3 | from .simplifiers.html import consolidate_text, insert_paragraph_breaks, normalise_strings, process_special_elements, process_unknown_elements, recursively_prune_elements, remove_blacklist, remove_empty_strings_and_elements, remove_metadata, strip_attributes, structural_elements, unnest_paragraphs, unwrap_elements, wrap_bare_text
4 |
5 |
def simple_tree_from_html_string(html):
    """Turn input HTML into a cleaned parsed tree.

    Returns a BeautifulSoup tree whose outermost element is a single <div>.
    """
    # Insert space into non-spaced comments so that html5lib can interpret them correctly
    # NOTE(review): the replace() arguments below appear garbled (two empty strings
    # make this a no-op); presumably they should adjust "<!--"-style comment
    # markers — confirm against upstream source.
    html = html.replace("", "")

    # Convert the HTML into a Soup parse tree
    soup = BeautifulSoup(html, "html5lib")

    # Remove comments, CDATA (which is converted to comments) and DOCTYPE
    remove_metadata(soup)

    # Strip tag attributes apart from 'class' and 'style'
    strip_attributes(soup)

    # Remove blacklisted elements
    remove_blacklist(soup)

    # Unwrap elements where we want to keep the text but drop the containing tag
    unwrap_elements(soup)

    # Process elements with special innerText handling
    process_special_elements(soup)

    # Process unknown elements
    process_unknown_elements(soup)

    # Consolidate text, joining any consecutive NavigableStrings together.
    # Must come before any whitespace operations (eg. remove_empty_strings_and_elements or normalise_strings)
    consolidate_text(soup)

    # Remove empty string elements
    remove_empty_strings_and_elements(soup)

    # Split out block-level elements illegally contained inside paragraphs
    unnest_paragraphs(soup)

    # Replace <br> and <hr> elements with paragraph breaks
    # Must come after remove_empty_strings_and_elements so that consecutive breaks can be identified
    # Re-consolidates strings at the end, so must come before normalise_strings
    insert_paragraph_breaks(soup)

    # Wrap any remaining bare text in a suitable block level element
    # Must come after consolidate_text and identify_and_replace_break_elements
    # otherwise there may be multiple strings inside a <p> tag which would
    # create nested <p>s
    wrap_bare_text(soup)

    # Normalise all strings, removing whitespace and fixing unicode issues
    # Must come after consolidate_text and insert_paragraph_breaks which join
    # strings with semantic whitespace
    normalise_strings(soup)

    # Recursively replace any elements which have no children or only zero-length children
    recursively_prune_elements(soup)

    # Finally ensure that the whole tree is wrapped in a div
    # Strip out enclosing elements that cannot live inside a div
    while soup.contents and (soup.contents[0].name in structural_elements()):
        soup.contents[0].unwrap()
    # If the outermost tag is a single div then return it
    if len(soup.contents) == 1 and soup.contents[0].name == "div":
        return soup

    # ... otherwise wrap in a div and return that
    root = soup.new_tag("div")
    root.append(soup)
    return root
72 |
--------------------------------------------------------------------------------
/readabilipy/simplifiers/__init__.py:
--------------------------------------------------------------------------------
1 | from .text import normalise_text, normalise_unicode, normalise_whitespace, strip_control_characters, strip_html_whitespace
2 |
3 | __all__ = [
4 | "normalise_text",
5 | "normalise_unicode",
6 | "normalise_whitespace",
7 | "strip_control_characters",
8 | "strip_html_whitespace",
9 | ]
10 |
--------------------------------------------------------------------------------
/readabilipy/simplifiers/html.py:
--------------------------------------------------------------------------------
1 | """Common HTML cleaning functions."""
2 | from bs4 import Comment, Doctype, NavigableString
3 | from .text import normalise_text
4 |
5 |
def elements_to_delete():
    """Elements that will be deleted together with their contents."""
    return [
        # HTML5 form elements
        'button', 'datalist', 'fieldset', 'form', 'input', 'label',
        'legend', 'meter', 'optgroup', 'option', 'output', 'progress',
        'select', 'textarea',
        # HTML5 image elements
        'area', 'img', 'map', 'picture', 'source',
        # HTML5 media elements
        'audio', 'track', 'video',
        # HTML5 embedded content
        'embed', 'iframe', 'math', 'object', 'param', 'svg',
        # HTML5 interactive elements
        'details', 'dialog', 'summary',
        # HTML5 scripting elements
        'canvas', 'noscript', 'script', 'template',
        # HTML5 data elements
        'data', 'link',
        # HTML5 formatting elements
        'style',
        # HTML5 navigation elements
        'nav',
    ]
27 |
28 |
def elements_to_replace_with_contents():
    """Elements that we will discard while keeping their contents.

    Note: the keyboard-input element is 'kbd' (previously listed as 'kbs',
    which is not an HTML element).
    """
    elements = ['a', 'abbr', 'address', 'b', 'bdi', 'bdo', 'center', 'cite',
                'code', 'del', 'dfn', 'em', 'i', 'ins', 'kbd', 'mark',
                'rb', 'ruby', 'rp', 'rt', 'rtc', 's', 'samp', 'small', 'span',
                'strong', 'time', 'u', 'var', 'wbr']
    return elements
36 |
37 |
def special_elements():
    """Elements that we will discard while keeping their contents that need
    additional processing."""
    return [
        'q',
        'sub',
        'sup',
    ]
43 |
44 |
def block_level_whitelist():
    """Elements that we will always accept."""
    return [
        # Sectioning and grouping content
        'article', 'aside', 'blockquote', 'caption', 'colgroup', 'col',
        'div', 'dl', 'dt', 'dd', 'figure', 'figcaption', 'footer',
        # Headings and header
        'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'header', 'li', 'main',
        # Lists, paragraphs and tables
        'ol', 'p', 'pre', 'section', 'table', 'tbody', 'thead',
        'tfoot', 'tr', 'td', 'th', 'ul',
    ]
53 |
54 |
def structural_elements():
    """Structural elements we do no further processing on (though we do remove attributes and alter their contents)"""
    return [
        'html',
        'head',
        'body',
    ]
58 |
59 |
def metadata_elements():
    """Metadata elements we do no further processing on (though we do remove attributes and alter their contents)"""
    return [
        'meta',
        'link',
        'base',
        'title',
    ]
63 |
64 |
def linebreak_elements():
    """Elements that represent line or thematic breaks."""
    return [
        'br',
        'hr',
    ]
67 |
68 |
def known_elements():
    """All elements that we know by name."""
    # Concatenate the category lists in their canonical order
    groups = [
        structural_elements(),
        metadata_elements(),
        linebreak_elements(),
        elements_to_delete(),
        elements_to_replace_with_contents(),
        special_elements(),
        block_level_whitelist(),
    ]
    known = []
    for group in groups:
        known += group
    return known
74 |
75 |
76 | def remove_metadata(soup):
77 | """Remove comments, CData and doctype. These are not rendered by browsers.
78 | The lxml-based parsers automatically convert CData to comments unless it is
79 | inside
49 |
“The facts suggest that you absolutely have a Native American ancestor in your pedigree,” an
57 | Ancestry.com and 23andMe adviser told Warren.
58 |
Warren tweeted Trump to demand he pay what he promised and requested that he make the donation to the National
59 | Indigenous Women’s Resource Center.
60 |
61 |
By the way, @realDonaldTrump:
62 | Remember saying on 7/5 that you’d give $1M to a charity of my choice if my DNA showed Native American ancestry?
63 | I remember – and here’s the verdict. Please send the check to the National Indigenous Women’s Resource
64 | Center: https://t.co/I6YQ9hf7Tvpic.twitter.com/J4gBamaeeo
In response, White House counselor Kellyanne Conway called DNA testing “junk science.”
73 |
Then, Trump flat out denied ever promising to make that donation even though it’s on tape.
74 |
“Who cares?” Trump told reporters today. “I didn’t say that. Nope, you better read it again.”
75 |
Well, MSNBC has the receipts:
76 |
77 |
MSNBC put together video of Trump unequivocally telling Elizabeth Warren in July that
78 | “I’ll give you a million dollars, to your favorite charity, paid for by Trump, if you take [a DNA]
79 | test and it shows you’re an Indian” — followed by him denying he ever said that this morning.
80 | pic.twitter.com/zR3n2DqaiY
169 | Author: Conover
172 | KennardConover makes tea partiers cry as a hobby. She was Commander of Jade Helm15 during the failed
173 | takeover of the South. She's also one of the biggest arseholes on Twitter. At night, she can be found drinking
174 | Conservative tears while pulling off the wings of flies just because she can. She is the founder of a Marxist,
175 | Commie, Maoist, Socialist site and has contributed to several other sites, blah blah blah. She is an awful person
176 | but she doesn't like to brag about that.
177 |
178 |
179 |
180 |
--------------------------------------------------------------------------------
/tests/data/addictinginfo.com-1_full_page_javascript.json:
--------------------------------------------------------------------------------
1 | {
2 | "title": "Trump Denies Charitable Donation He Promised If Elizabeth Warren Releases DNA Results And It’s On Video",
3 | "byline": "Conover Kennard",
4 | "date": null,
5 | "content": "
\n\n\n\n
\n
Donald Trump has repeatedly lashed out at Sen. Elizabeth Warren (D-MA) at his rallies, calling her Pochahontas to cast doubt on her Native American Heritage. Trump even referred to Warren as Pocahontas at a November 2017 White House event honoring Navajo code-breakers. At a rally in Montana this past July, Trump offered Sen. Warren $1 million if she took a DNA test to prove her heritage. Trump spoke of bringing a DNA test to a debate if she challenges him.
\n
“We will take that little kit — but we have to do it gently. Because we’re in the #MeToo generation, we have to do it gently,” Trump said. “And we will very gently take that kit, and slowly toss it, hoping it doesn’t injure her arm, and we will say: I will give you a million dollars to your favorite charity, paid for by Trump, if you take the test and it shows you’re an Indian. I have a feeling she will say no.”
“The facts suggest that you absolutely have a Native American ancestor in your pedigree,” an Ancestry.com and 23andMe adviser told Warren.
\n
Warren tweeted Trump to demand he pay what he promised and requested that he make the donation to the National Indigenous Women’s Resource Center.
\n
\n
By the way, @realDonaldTrump: Remember saying on 7/5 that you’d give $1M to a charity of my choice if my DNA showed Native American ancestry? I remember – and here’s the verdict. Please send the check to the National Indigenous Women’s Resource Center: https://t.co/I6YQ9hf7Tvpic.twitter.com/J4gBamaeeo
In response, White House counselor Kellyanne Conway called DNA testing “junk science.”
\n
Then, Trump flat out denied ever promising to make that donation even though it’s on tape.
\n
“Who cares?” Trump told reporters today. “I didn’t say that. Nope, you better read it again.”
\n
Well, MSNBC has the receipts:
\n
\n
MSNBC put together video of Trump unequivocally telling Elizabeth Warren in July that “I’ll give you a million dollars, to your favorite charity, paid for by Trump, if you take [a DNA] test and it shows you’re an Indian” — followed by him denying he ever said that this morning. pic.twitter.com/zR3n2DqaiY
Author: Conover KennardConover makes tea partiers cry as a hobby. She was Commander of Jade Helm15 during the failed takeover of the South. She's also one of the biggest arseholes on Twitter. At night, she can be found drinking Conservative tears while pulling off the wings of flies just because she can. She is the founder of a Marxist, Commie, Maoist, Socialist site and has contributed to several other sites, blah blah blah. She is an awful person but she doesn't like to brag about that.\n
\n
\n
",
6 | "plain_content": "
Donald Trump has repeatedly lashed out at Sen. Elizabeth Warren (D-MA) at his rallies, calling her Pochahontas to cast doubt on her Native American Heritage. Trump even referred to Warren as Pocahontas at a November 2017 White House event honoring Navajo code-breakers. At a rally in Montana this past July, Trump offered Sen. Warren $1 million if she took a DNA test to prove her heritage. Trump spoke of bringing a DNA test to a debate if she challenges him.
“We will take that little kit — but we have to do it gently. Because we’re in the #MeToo generation, we have to do it gently,” Trump said. “And we will very gently take that kit, and slowly toss it, hoping it doesn’t injure her arm, and we will say: I will give you a million dollars to your favorite charity, paid for by Trump, if you take the test and it shows you’re an Indian. I have a feeling she will say no.”
Today the Massachusetts Senator released her DNA results.
“The facts suggest that you absolutely have a Native American ancestor in your pedigree,” an Ancestry.com and 23andMe adviser told Warren.
Warren tweeted Trump to demand he pay what he promised and requested that he make the donation to the National Indigenous Women’s Resource Center.
By the way, @realDonaldTrump: Remember saying on 7/5 that you’d give $1M to a charity of my choice if my DNA showed Native American ancestry? I remember – and here’s the verdict. Please send the check to the National Indigenous Women’s Resource Center: https://t.co/I6YQ9hf7Tv pic.twitter.com/J4gBamaeeo
— Elizabeth Warren (@elizabethforma) October 15, 2018
In response, White House counselor Kellyanne Conway called DNA testing “junk science.”
Then, Trump flat out denied ever promising to make that donation even though it’s on tape.
“Who cares?” Trump told reporters today. “I didn’t say that. Nope, you better read it again.”
Well, MSNBC has the receipts:
MSNBC put together video of Trump unequivocally telling Elizabeth Warren in July that “I’ll give you a million dollars, to your favorite charity, paid for by Trump, if you take [a DNA] test and it shows you’re an Indian” — followed by him denying he ever said that this morning. pic.twitter.com/zR3n2DqaiY
— Aaron Rupar (@atrupar) October 15, 2018
Image via screen capture.
Author: Conover KennardConover makes tea partiers cry as a hobby. She was Commander of Jade Helm15 during the failed takeover of the South. She's also one of the biggest arseholes on Twitter. At night, she can be found drinking Conservative tears while pulling off the wings of flies just because she can. She is the founder of a Marxist, Commie, Maoist, Socialist site and has contributed to several other sites, blah blah blah. She is an awful person but she doesn't like to brag about that.
",
7 | "plain_text": [
8 | {"text": "Donald Trump has repeatedly lashed out at Sen. Elizabeth Warren (D-MA) at his rallies, calling her Pochahontas to cast doubt on her Native American Heritage. Trump even referred to Warren as Pocahontas at a November 2017 White House event honoring Navajo code-breakers. At a rally in Montana this past July, Trump offered Sen. Warren $1 million if she took a DNA test to prove her heritage. Trump spoke of bringing a DNA test to a debate if she challenges him."},
9 | {"text": "“We will take that little kit — but we have to do it gently. Because we’re in the #MeToo generation, we have to do it gently,” Trump said. “And we will very gently take that kit, and slowly toss it, hoping it doesn’t injure her arm, and we will say: I will give you a million dollars to your favorite charity, paid for by Trump, if you take the test and it shows you’re an Indian. I have a feeling she will say no.”"},
10 | {"text": "Today the Massachusetts Senator released her DNA results."},
11 | {"text": "“The facts suggest that you absolutely have a Native American ancestor in your pedigree,” an Ancestry.com and 23andMe adviser told Warren."},
12 | {"text": "Warren tweeted Trump to demand he pay what he promised and requested that he make the donation to the National Indigenous Women’s Resource Center."},
13 | {"text": "By the way, @realDonaldTrump: Remember saying on 7/5 that you’d give $1M to a charity of my choice if my DNA showed Native American ancestry? I remember – and here’s the verdict. Please send the check to the National Indigenous Women’s Resource Center: https://t.co/I6YQ9hf7Tv pic.twitter.com/J4gBamaeeo"},
14 | {"text": "— Elizabeth Warren (@elizabethforma) October 15, 2018"},
15 | {"text": "In response, White House counselor Kellyanne Conway called DNA testing “junk science.”"},
16 | {"text": "Then, Trump flat out denied ever promising to make that donation even though it’s on tape."},
17 | {"text": "“Who cares?” Trump told reporters today. “I didn’t say that. Nope, you better read it again.”"},
18 | {"text": "Well, MSNBC has the receipts:"},
19 | {"text": "MSNBC put together video of Trump unequivocally telling Elizabeth Warren in July that “I’ll give you a million dollars, to your favorite charity, paid for by Trump, if you take [a DNA] test and it shows you’re an Indian” — followed by him denying he ever said that this morning. pic.twitter.com/zR3n2DqaiY"},
20 | {"text": "— Aaron Rupar (@atrupar) October 15, 2018"},
21 | {"text": "Image via screen capture."},
22 | {"text": "Author: Conover KennardConover makes tea partiers cry as a hobby. She was Commander of Jade Helm15 during the failed takeover of the South. She's also one of the biggest arseholes on Twitter. At night, she can be found drinking Conservative tears while pulling off the wings of flies just because she can. She is the founder of a Marxist, Commie, Maoist, Socialist site and has contributed to several other sites, blah blah blah. She is an awful person but she doesn't like to brag about that."}
23 | ]
24 | }
25 |
--------------------------------------------------------------------------------
/tests/data/addictinginfo.com-1_plain_text_paragraphs_from_simple_article.json:
--------------------------------------------------------------------------------
1 | [
2 | {"text": "Current Issues, News"},
3 | {"text": "Trump Denies Charitable Donation He Promised If Elizabeth Warren Releases DNA Results And It’s On Video"},
4 | {"text": "By Conover Kennard on October 15, 2018 12:13 pm ·"},
5 | {"text": "Donald Trump has repeatedly lashed out at Sen. Elizabeth Warren (D-MA) at his rallies, calling her Pochahontas to cast doubt on her Native American Heritage. Trump even referred to Warren as Pocahontas at a November 2017 White House event honoring Navajo code-breakers. At a rally in Montana this past July, Trump offered Sen. Warren $1 million if she took a DNA test to prove her heritage. Trump spoke of bringing a DNA test to a debate if she challenges him."},
6 | {"text": "“We will take that little kit — but we have to do it gently. Because we’re in the #MeToo generation, we have to do it gently,” Trump said. “And we will very gently take that kit, and slowly toss it, hoping it doesn’t injure her arm, and we will say: I will give you a million dollars to your favorite charity, paid for by Trump, if you take the test and it shows you’re an Indian. I have a feeling she will say no.”"},
7 | {"text": "Today the Massachusetts Senator released her DNA results."},
8 | {"text": "“The facts suggest that you absolutely have a Native American ancestor in your pedigree,” an Ancestry.com and 23andMe adviser told Warren."},
9 | {"text": "Warren tweeted Trump to demand he pay what he promised and requested that he make the donation to the National Indigenous Women’s Resource Center."},
10 | {"text": "By the way, @realDonaldTrump: Remember saying on 7/5 that you’d give $1M to a charity of my choice if my DNA showed Native American ancestry? I remember – and here’s the verdict. Please send the check to the National Indigenous Women’s Resource Center: https://t.co/I6YQ9hf7Tv pic.twitter.com/J4gBamaeeo"},
11 | {"text": "— Elizabeth Warren (@elizabethforma) October 15, 2018"},
12 | {"text": "In response, White House counselor Kellyanne Conway called DNA testing “junk science.”"},
13 | {"text": "Then, Trump flat out denied ever promising to make that donation even though it’s on tape."},
14 | {"text": "“Who cares?” Trump told reporters today. “I didn’t say that. Nope, you better read it again.”"},
15 | {"text": "Well, MSNBC has the receipts:"},
16 | {"text": "MSNBC put together video of Trump unequivocally telling Elizabeth Warren in July that “I’ll give you a million dollars, to your favorite charity, paid for by Trump, if you take [a DNA] test and it shows you’re an Indian” — followed by him denying he ever said that this morning. pic.twitter.com/zR3n2DqaiY"},
17 | {"text": "— Aaron Rupar (@atrupar) October 15, 2018"},
18 | {"text": "Image via screen capture."},
19 | {"text": "Share this Article!"},
20 | {"text": "Share on Facebook"},
21 | {"text": "Share on Twitter"},
22 | {"text": "Author: Conover KennardConover makes tea partiers cry as a hobby. She was Commander of Jade Helm15 during the failed takeover of the South. She's also one of the biggest arseholes on Twitter. At night, she can be found drinking Conservative tears while pulling off the wings of flies just because she can. She is the founder of a Marxist, Commie, Maoist, Socialist site and has contributed to several other sites, blah blah blah. She is an awful person but she doesn't like to brag about that."}
23 | ]
--------------------------------------------------------------------------------
/tests/data/addictinginfo.com-1_simple_article_from_full_article.json:
--------------------------------------------------------------------------------
1 | {
2 | "title": "Trump Denies Charitable Donation He Promised If Elizabeth Warren Releases DNA Results And It’s On Video",
3 | "byline": null,
4 | "date": "2018-10-15T12:13:54",
5 | "content": "
Current Issues, News
Trump Denies Charitable Donation He Promised If Elizabeth Warren Releases DNA Results And It’s On Video
By Conover Kennard on October 15, 2018 12:13 pm ·
Donald Trump has repeatedly lashed out at Sen. Elizabeth Warren (D-MA) at his rallies, calling her Pochahontas to cast doubt on her Native American Heritage. Trump even referred to Warren as Pocahontas at a November 2017 White House event honoring Navajo code-breakers. At a rally in Montana this past July, Trump offered Sen. Warren $1 million if she took a DNA test to prove her heritage. Trump spoke of bringing a DNA test to a debate if she challenges him.
“We will take that little kit — but we have to do it gently. Because we’re in the #MeToo generation, we have to do it gently,” Trump said. “And we will very gently take that kit, and slowly toss it, hoping it doesn’t injure her arm, and we will say: I will give you a million dollars to your favorite charity, paid for by Trump, if you take the test and it shows you’re an Indian. I have a feeling she will say no.”
Today the Massachusetts Senator released her DNA results.
“The facts suggest that you absolutely have a Native American ancestor in your pedigree,” an Ancestry.com and 23andMe adviser told Warren.
Warren tweeted Trump to demand he pay what he promised and requested that he make the donation to the National Indigenous Women’s Resource Center.
By the way, @realDonaldTrump: Remember saying on 7/5 that you’d give $1M to a charity of my choice if my DNA showed Native American ancestry? I remember – and here’s the verdict. Please send the check to the National Indigenous Women’s Resource Center: https://t.co/I6YQ9hf7Tv pic.twitter.com/J4gBamaeeo
— Elizabeth Warren (@elizabethforma) October 15, 2018
In response, White House counselor Kellyanne Conway called DNA testing “junk science.”
Then, Trump flat out denied ever promising to make that donation even though it’s on tape.
“Who cares?” Trump told reporters today. “I didn’t say that. Nope, you better read it again.”
Well, MSNBC has the receipts:
MSNBC put together video of Trump unequivocally telling Elizabeth Warren in July that “I’ll give you a million dollars, to your favorite charity, paid for by Trump, if you take [a DNA] test and it shows you’re an Indian” — followed by him denying he ever said that this morning. pic.twitter.com/zR3n2DqaiY
— Aaron Rupar (@atrupar) October 15, 2018
Image via screen capture.
Share this Article!
Share on Facebook
Share on Twitter
Author: Conover KennardConover makes tea partiers cry as a hobby. She was Commander of Jade Helm15 during the failed takeover of the South. She's also one of the biggest arseholes on Twitter. At night, she can be found drinking Conservative tears while pulling off the wings of flies just because she can. She is the founder of a Marxist, Commie, Maoist, Socialist site and has contributed to several other sites, blah blah blah. She is an awful person but she doesn't like to brag about that.
",
6 | "plain_content": "
Current Issues, News
Trump Denies Charitable Donation He Promised If Elizabeth Warren Releases DNA Results And It’s On Video
By Conover Kennard on October 15, 2018 12:13 pm ·
Donald Trump has repeatedly lashed out at Sen. Elizabeth Warren (D-MA) at his rallies, calling her Pochahontas to cast doubt on her Native American Heritage. Trump even referred to Warren as Pocahontas at a November 2017 White House event honoring Navajo code-breakers. At a rally in Montana this past July, Trump offered Sen. Warren $1 million if she took a DNA test to prove her heritage. Trump spoke of bringing a DNA test to a debate if she challenges him.
“We will take that little kit — but we have to do it gently. Because we’re in the #MeToo generation, we have to do it gently,” Trump said. “And we will very gently take that kit, and slowly toss it, hoping it doesn’t injure her arm, and we will say: I will give you a million dollars to your favorite charity, paid for by Trump, if you take the test and it shows you’re an Indian. I have a feeling she will say no.”
Today the Massachusetts Senator released her DNA results.
“The facts suggest that you absolutely have a Native American ancestor in your pedigree,” an Ancestry.com and 23andMe adviser told Warren.
Warren tweeted Trump to demand he pay what he promised and requested that he make the donation to the National Indigenous Women’s Resource Center.
By the way, @realDonaldTrump: Remember saying on 7/5 that you’d give $1M to a charity of my choice if my DNA showed Native American ancestry? I remember – and here’s the verdict. Please send the check to the National Indigenous Women’s Resource Center: https://t.co/I6YQ9hf7Tv pic.twitter.com/J4gBamaeeo
— Elizabeth Warren (@elizabethforma) October 15, 2018
In response, White House counselor Kellyanne Conway called DNA testing “junk science.”
Then, Trump flat out denied ever promising to make that donation even though it’s on tape.
“Who cares?” Trump told reporters today. “I didn’t say that. Nope, you better read it again.”
Well, MSNBC has the receipts:
MSNBC put together video of Trump unequivocally telling Elizabeth Warren in July that “I’ll give you a million dollars, to your favorite charity, paid for by Trump, if you take [a DNA] test and it shows you’re an Indian” — followed by him denying he ever said that this morning. pic.twitter.com/zR3n2DqaiY
— Aaron Rupar (@atrupar) October 15, 2018
Image via screen capture.
Share this Article!
Share on Facebook
Share on Twitter
Author: Conover KennardConover makes tea partiers cry as a hobby. She was Commander of Jade Helm15 during the failed takeover of the South. She's also one of the biggest arseholes on Twitter. At night, she can be found drinking Conservative tears while pulling off the wings of flies just because she can. She is the founder of a Marxist, Commie, Maoist, Socialist site and has contributed to several other sites, blah blah blah. She is an awful person but she doesn't like to brag about that.
",
7 | "plain_text": [
8 | {"text": "Current Issues, News"},
9 | {"text": "Trump Denies Charitable Donation He Promised If Elizabeth Warren Releases DNA Results And It’s On Video"},
10 | {"text": "By Conover Kennard on October 15, 2018 12:13 pm ·"},
11 | {"text": "Donald Trump has repeatedly lashed out at Sen. Elizabeth Warren (D-MA) at his rallies, calling her Pochahontas to cast doubt on her Native American Heritage. Trump even referred to Warren as Pocahontas at a November 2017 White House event honoring Navajo code-breakers. At a rally in Montana this past July, Trump offered Sen. Warren $1 million if she took a DNA test to prove her heritage. Trump spoke of bringing a DNA test to a debate if she challenges him."},
12 | {"text": "“We will take that little kit — but we have to do it gently. Because we’re in the #MeToo generation, we have to do it gently,” Trump said. “And we will very gently take that kit, and slowly toss it, hoping it doesn’t injure her arm, and we will say: I will give you a million dollars to your favorite charity, paid for by Trump, if you take the test and it shows you’re an Indian. I have a feeling she will say no.”"},
13 | {"text": "Today the Massachusetts Senator released her DNA results."},
14 | {"text": "“The facts suggest that you absolutely have a Native American ancestor in your pedigree,” an Ancestry.com and 23andMe adviser told Warren."},
15 | {"text": "Warren tweeted Trump to demand he pay what he promised and requested that he make the donation to the National Indigenous Women’s Resource Center."},
16 | {"text": "By the way, @realDonaldTrump: Remember saying on 7/5 that you’d give $1M to a charity of my choice if my DNA showed Native American ancestry? I remember – and here’s the verdict. Please send the check to the National Indigenous Women’s Resource Center: https://t.co/I6YQ9hf7Tv pic.twitter.com/J4gBamaeeo"},
17 | {"text": "— Elizabeth Warren (@elizabethforma) October 15, 2018"},
18 | {"text": "In response, White House counselor Kellyanne Conway called DNA testing “junk science.”"},
19 | {"text": "Then, Trump flat out denied ever promising to make that donation even though it’s on tape."},
20 | {"text": "“Who cares?” Trump told reporters today. “I didn’t say that. Nope, you better read it again.”"},
21 | {"text": "Well, MSNBC has the receipts:"},
22 | {"text": "MSNBC put together video of Trump unequivocally telling Elizabeth Warren in July that “I’ll give you a million dollars, to your favorite charity, paid for by Trump, if you take [a DNA] test and it shows you’re an Indian” — followed by him denying he ever said that this morning. pic.twitter.com/zR3n2DqaiY"},
23 | {"text": "— Aaron Rupar (@atrupar) October 15, 2018"},
24 | {"text": "Image via screen capture."},
25 | {"text": "Share this Article!"},
26 | {"text": "Share on Facebook"},
27 | {"text": "Share on Twitter"},
28 | {"text": "Author: Conover KennardConover makes tea partiers cry as a hobby. She was Commander of Jade Helm15 during the failed takeover of the South. She's also one of the biggest arseholes on Twitter. At night, she can be found drinking Conservative tears while pulling off the wings of flies just because she can. She is the founder of a Marxist, Commie, Maoist, Socialist site and has contributed to several other sites, blah blah blah. She is an awful person but she doesn't like to brag about that."}
29 | ]
30 | }
31 |
--------------------------------------------------------------------------------
/tests/data/addictinginfo.com-1_simple_article_from_full_page.json:
--------------------------------------------------------------------------------
1 | {
2 | "title": "Trump Denies Charitable Donation He Promised If Elizabeth Warren Releases DNA Results And It’s On Video",
3 | "byline": null,
4 | "date": "2018-10-15T12:13:54",
5 | "content": "
Trump Denies Charitable Donation He Promised If Elizabeth Warren Releases DNA Results And It’s On Video | Addicting Info | The Knowledge You Crave
Trump Denies Charitable Donation He Promised If Elizabeth Warren Releases DNA Results And It’s On Video
About Us
Contact Us
Creative Commons License
Privacy Policy
Terms of Service
Home
News
Important Information
Discredited Myths
Historical Information
Political Humor
Current Issues, News
Trump Denies Charitable Donation He Promised If Elizabeth Warren Releases DNA Results And It’s On Video
By Conover Kennard on October 15, 2018 12:13 pm ·
Donald Trump has repeatedly lashed out at Sen. Elizabeth Warren (D-MA) at his rallies, calling her Pochahontas to cast doubt on her Native American Heritage. Trump even referred to Warren as Pocahontas at a November 2017 White House event honoring Navajo code-breakers. At a rally in Montana this past July, Trump offered Sen. Warren $1 million if she took a DNA test to prove her heritage. Trump spoke of bringing a DNA test to a debate if she challenges him.
“We will take that little kit — but we have to do it gently. Because we’re in the #MeToo generation, we have to do it gently,” Trump said. “And we will very gently take that kit, and slowly toss it, hoping it doesn’t injure her arm, and we will say: I will give you a million dollars to your favorite charity, paid for by Trump, if you take the test and it shows you’re an Indian. I have a feeling she will say no.”
Today the Massachusetts Senator released her DNA results.
“The facts suggest that you absolutely have a Native American ancestor in your pedigree,” an Ancestry.com and 23andMe adviser told Warren.
Warren tweeted Trump to demand he pay what he promised and requested that he make the donation to the National Indigenous Women’s Resource Center.
By the way, @realDonaldTrump: Remember saying on 7/5 that you’d give $1M to a charity of my choice if my DNA showed Native American ancestry? I remember – and here’s the verdict. Please send the check to the National Indigenous Women’s Resource Center: https://t.co/I6YQ9hf7Tv pic.twitter.com/J4gBamaeeo
— Elizabeth Warren (@elizabethforma) October 15, 2018
In response, White House counselor Kellyanne Conway called DNA testing “junk science.”
Then, Trump flat out denied ever promising to make that donation even though it’s on tape.
“Who cares?” Trump told reporters today. “I didn’t say that. Nope, you better read it again.”
Well, MSNBC has the receipts:
MSNBC put together video of Trump unequivocally telling Elizabeth Warren in July that “I’ll give you a million dollars, to your favorite charity, paid for by Trump, if you take [a DNA] test and it shows you’re an Indian” — followed by him denying he ever said that this morning. pic.twitter.com/zR3n2DqaiY
— Aaron Rupar (@atrupar) October 15, 2018
Image via screen capture.
Share this Article!
Share on Facebook
Share on Twitter
Author: Conover KennardConover makes tea partiers cry as a hobby. She was Commander of Jade Helm15 during the failed takeover of the South. She's also one of the biggest arseholes on Twitter. At night, she can be found drinking Conservative tears while pulling off the wings of flies just because she can. She is the founder of a Marxist, Commie, Maoist, Socialist site and has contributed to several other sites, blah blah blah. She is an awful person but she doesn't like to brag about that.
Search
Archives
Share on Facebook
Comments
",
6 | "plain_content": "
Trump Denies Charitable Donation He Promised If Elizabeth Warren Releases DNA Results And It’s On Video | Addicting Info | The Knowledge You Crave
Trump Denies Charitable Donation He Promised If Elizabeth Warren Releases DNA Results And It’s On Video
About Us
Contact Us
Creative Commons License
Privacy Policy
Terms of Service
Home
News
Important Information
Discredited Myths
Historical Information
Political Humor
Current Issues, News
Trump Denies Charitable Donation He Promised If Elizabeth Warren Releases DNA Results And It’s On Video
By Conover Kennard on October 15, 2018 12:13 pm ·
Donald Trump has repeatedly lashed out at Sen. Elizabeth Warren (D-MA) at his rallies, calling her Pochahontas to cast doubt on her Native American Heritage. Trump even referred to Warren as Pocahontas at a November 2017 White House event honoring Navajo code-breakers. At a rally in Montana this past July, Trump offered Sen. Warren $1 million if she took a DNA test to prove her heritage. Trump spoke of bringing a DNA test to a debate if she challenges him.
“We will take that little kit — but we have to do it gently. Because we’re in the #MeToo generation, we have to do it gently,” Trump said. “And we will very gently take that kit, and slowly toss it, hoping it doesn’t injure her arm, and we will say: I will give you a million dollars to your favorite charity, paid for by Trump, if you take the test and it shows you’re an Indian. I have a feeling she will say no.”
Today the Massachusetts Senator released her DNA results.
“The facts suggest that you absolutely have a Native American ancestor in your pedigree,” an Ancestry.com and 23andMe adviser told Warren.
Warren tweeted Trump to demand he pay what he promised and requested that he make the donation to the National Indigenous Women’s Resource Center.
By the way, @realDonaldTrump: Remember saying on 7/5 that you’d give $1M to a charity of my choice if my DNA showed Native American ancestry? I remember – and here’s the verdict. Please send the check to the National Indigenous Women’s Resource Center: https://t.co/I6YQ9hf7Tv pic.twitter.com/J4gBamaeeo
— Elizabeth Warren (@elizabethforma) October 15, 2018
In response, White House counselor Kellyanne Conway called DNA testing “junk science.”
Then, Trump flat out denied ever promising to make that donation even though it’s on tape.
“Who cares?” Trump told reporters today. “I didn’t say that. Nope, you better read it again.”
Well, MSNBC has the receipts:
MSNBC put together video of Trump unequivocally telling Elizabeth Warren in July that “I’ll give you a million dollars, to your favorite charity, paid for by Trump, if you take [a DNA] test and it shows you’re an Indian” — followed by him denying he ever said that this morning. pic.twitter.com/zR3n2DqaiY
— Aaron Rupar (@atrupar) October 15, 2018
Image via screen capture.
Share this Article!
Share on Facebook
Share on Twitter
Author: Conover KennardConover makes tea partiers cry as a hobby. She was Commander of Jade Helm15 during the failed takeover of the South. She's also one of the biggest arseholes on Twitter. At night, she can be found drinking Conservative tears while pulling off the wings of flies just because she can. She is the founder of a Marxist, Commie, Maoist, Socialist site and has contributed to several other sites, blah blah blah. She is an awful person but she doesn't like to brag about that.
Trump Denies Charitable Donation He Promised If Elizabeth Warren Releases DNA Results And It’s On Video | Addicting Info | The Knowledge You Crave
Trump Denies Charitable Donation He Promised If Elizabeth Warren Releases DNA Results And It’s On Video
About Us
Contact Us
Creative Commons License
Privacy Policy
Terms of Service
Home
News
Important Information
Discredited Myths
Historical Information
Political Humor
Current Issues, News
Trump Denies Charitable Donation He Promised If Elizabeth Warren Releases DNA Results And It’s On Video
By Conover Kennard on October 15, 2018 12:13 pm ·
Donald Trump has repeatedly lashed out at Sen. Elizabeth Warren (D-MA) at his rallies, calling her Pochahontas to cast doubt on her Native American Heritage. Trump even referred to Warren as Pocahontas at a November 2017 White House event honoring Navajo code-breakers. At a rally in Montana this past July, Trump offered Sen. Warren $1 million if she took a DNA test to prove her heritage. Trump spoke of bringing a DNA test to a debate if she challenges him.
“We will take that little kit — but we have to do it gently. Because we’re in the #MeToo generation, we have to do it gently,” Trump said. “And we will very gently take that kit, and slowly toss it, hoping it doesn’t injure her arm, and we will say: I will give you a million dollars to your favorite charity, paid for by Trump, if you take the test and it shows you’re an Indian. I have a feeling she will say no.”
Today the Massachusetts Senator released her DNA results.
“The facts suggest that you absolutely have a Native American ancestor in your pedigree,” an Ancestry.com and 23andMe adviser told Warren.
Warren tweeted Trump to demand he pay what he promised and requested that he make the donation to the National Indigenous Women’s Resource Center.
By the way, @realDonaldTrump: Remember saying on 7/5 that you’d give $1M to a charity of my choice if my DNA showed Native American ancestry? I remember – and here’s the verdict. Please send the check to the National Indigenous Women’s Resource Center: https://t.co/I6YQ9hf7Tv pic.twitter.com/J4gBamaeeo
— Elizabeth Warren (@elizabethforma) October 15, 2018
In response, White House counselor Kellyanne Conway called DNA testing “junk science.”
Then, Trump flat out denied ever promising to make that donation even though it’s on tape.
“Who cares?” Trump told reporters today. “I didn’t say that. Nope, you better read it again.”
Well, MSNBC has the receipts:
MSNBC put together video of Trump unequivocally telling Elizabeth Warren in July that “I’ll give you a million dollars, to your favorite charity, paid for by Trump, if you take [a DNA] test and it shows you’re an Indian” — followed by him denying he ever said that this morning. pic.twitter.com/zR3n2DqaiY
— Aaron Rupar (@atrupar) October 15, 2018
Image via screen capture.
Share this Article!
Share on Facebook
Share on Twitter
Author: Conover KennardConover makes tea partiers cry as a hobby. She was Commander of Jade Helm15 during the failed takeover of the South. She's also one of the biggest arseholes on Twitter. At night, she can be found drinking Conservative tears while pulling off the wings of flies just because she can. She is the founder of a Marxist, Commie, Maoist, Socialist site and has contributed to several other sites, blah blah blah. She is an awful person but she doesn't like to brag about that.
Search
Archives
Share on Facebook
Comments
",
6 | "plain_content": "
Trump Denies Charitable Donation He Promised If Elizabeth Warren Releases DNA Results And It’s On Video | Addicting Info | The Knowledge You Crave
Trump Denies Charitable Donation He Promised If Elizabeth Warren Releases DNA Results And It’s On Video
About Us
Contact Us
Creative Commons License
Privacy Policy
Terms of Service
Home
News
Important Information
Discredited Myths
Historical Information
Political Humor
Current Issues, News
Trump Denies Charitable Donation He Promised If Elizabeth Warren Releases DNA Results And It’s On Video
By Conover Kennard on October 15, 2018 12:13 pm ·
Donald Trump has repeatedly lashed out at Sen. Elizabeth Warren (D-MA) at his rallies, calling her Pochahontas to cast doubt on her Native American Heritage. Trump even referred to Warren as Pocahontas at a November 2017 White House event honoring Navajo code-breakers. At a rally in Montana this past July, Trump offered Sen. Warren $1 million if she took a DNA test to prove her heritage. Trump spoke of bringing a DNA test to a debate if she challenges him.
“We will take that little kit — but we have to do it gently. Because we’re in the #MeToo generation, we have to do it gently,” Trump said. “And we will very gently take that kit, and slowly toss it, hoping it doesn’t injure her arm, and we will say: I will give you a million dollars to your favorite charity, paid for by Trump, if you take the test and it shows you’re an Indian. I have a feeling she will say no.”
Today the Massachusetts Senator released her DNA results.
“The facts suggest that you absolutely have a Native American ancestor in your pedigree,” an Ancestry.com and 23andMe adviser told Warren.
Warren tweeted Trump to demand he pay what he promised and requested that he make the donation to the National Indigenous Women’s Resource Center.
By the way, @realDonaldTrump: Remember saying on 7/5 that you’d give $1M to a charity of my choice if my DNA showed Native American ancestry? I remember – and here’s the verdict. Please send the check to the National Indigenous Women’s Resource Center: https://t.co/I6YQ9hf7Tv pic.twitter.com/J4gBamaeeo
— Elizabeth Warren (@elizabethforma) October 15, 2018
In response, White House counselor Kellyanne Conway called DNA testing “junk science.”
Then, Trump flat out denied ever promising to make that donation even though it’s on tape.
“Who cares?” Trump told reporters today. “I didn’t say that. Nope, you better read it again.”
Well, MSNBC has the receipts:
MSNBC put together video of Trump unequivocally telling Elizabeth Warren in July that “I’ll give you a million dollars, to your favorite charity, paid for by Trump, if you take [a DNA] test and it shows you’re an Indian” — followed by him denying he ever said that this morning. pic.twitter.com/zR3n2DqaiY
— Aaron Rupar (@atrupar) October 15, 2018
Image via screen capture.
Share this Article!
Share on Facebook
Share on Twitter
Author: Conover KennardConover makes tea partiers cry as a hobby. She was Commander of Jade Helm15 during the failed takeover of the South. She's also one of the biggest arseholes on Twitter. At night, she can be found drinking Conservative tears while pulling off the wings of flies just because she can. She is the founder of a Marxist, Commie, Maoist, Socialist site and has contributed to several other sites, blah blah blah. She is an awful person but she doesn't like to brag about that.
14 | A paragraph with no list before an unordered list outside a paragraph.
15 |
16 |
17 |
Unordered town A
18 |
Unordered town B
19 |
20 |
Unordered town C
21 |
22 |
23 | A paragraph with no list after an unordered list outside a paragraph.
24 |
25 |
26 |
27 |
28 | An ordered list inside a paragraph.
29 |
30 |
Ordered thing one
31 |
Ordered thing two
32 |
Ordered thing three
33 |
34 |
35 | A paragraph with no list before an ordered list outside a paragraph.
36 |
37 |
38 |
Ordered town A
39 |
Ordered town B
40 |
Ordered town C
41 |
42 |
43 | A paragraph with no list after an ordered list outside a paragraph.
44 |
45 |
46 |
47 |
--------------------------------------------------------------------------------
/tests/data/list_items_plain_text_paragraph_node_indexes.json:
--------------------------------------------------------------------------------
1 | [
2 | {"node_index": "0.1.1", "text": "An unordered list inside a paragraph."},
3 | {"node_index": "0.1.2", "text": "* Unordered thing one, * Unordered thing two, * Unordered thing three,"},
4 | {"node_index": "0.1.3", "text": "A paragraph with no list before an unordered list outside a paragraph."},
5 | {"node_index": "0.1.4", "text": "* Unordered town A, * Unordered town B, * Unordered town C,"},
6 | {"node_index": "0.1.5", "text": "A paragraph with no list after an unordered list outside a paragraph."},
7 | {"node_index": "0.2.1", "text": "An ordered list inside a paragraph."},
8 | {"node_index": "0.2.2", "text": "* Ordered thing one, * Ordered thing two, * Ordered thing three,"},
9 | {"node_index": "0.2.3", "text": "A paragraph with no list before an ordered list outside a paragraph."},
10 | {"node_index": "0.2.4", "text": "* Ordered town A, * Ordered town B, * Ordered town C,"},
11 | {"node_index": "0.2.5", "text": "A paragraph with no list after an ordered list outside a paragraph."}
12 | ]
--------------------------------------------------------------------------------
/tests/data/list_items_simple_article_from_full_page.json:
--------------------------------------------------------------------------------
1 | {
2 | "title": null,
3 | "byline": null,
4 | "date": null,
5 | "content": "
An unordered list inside a paragraph.
Unordered thing one
Unordered thing two
Unordered thing three
A paragraph with no list before an unordered list outside a paragraph.
Unordered town A
Unordered town B
Unordered town C
A paragraph with no list after an unordered list outside a paragraph.
An ordered list inside a paragraph.
Ordered thing one
Ordered thing two
Ordered thing three
A paragraph with no list before an ordered list outside a paragraph.
Ordered town A
Ordered town B
Ordered town C
A paragraph with no list after an ordered list outside a paragraph.
",
6 | "plain_content": "
An unordered list inside a paragraph.
Unordered thing one
Unordered thing two
Unordered thing three
A paragraph with no list before an unordered list outside a paragraph.
Unordered town A
Unordered town B
Unordered town C
A paragraph with no list after an unordered list outside a paragraph.
An ordered list inside a paragraph.
Ordered thing one
Ordered thing two
Ordered thing three
A paragraph with no list before an ordered list outside a paragraph.
Ordered town A
Ordered town B
Ordered town C
A paragraph with no list after an ordered list outside a paragraph.
",
7 | "plain_text": [
8 | {"text": "An unordered list inside a paragraph."},
9 | {"text": "* Unordered thing one, * Unordered thing two, * Unordered thing three,"},
10 | {"text": "A paragraph with no list before an unordered list outside a paragraph."},
11 | {"text": "* Unordered town A, * Unordered town B, * Unordered town C,"},
12 | {"text": "A paragraph with no list after an unordered list outside a paragraph."},
13 | {"text": "An ordered list inside a paragraph."},
14 | {"text": "* Ordered thing one, * Ordered thing two, * Ordered thing three,"},
15 | {"text": "A paragraph with no list before an ordered list outside a paragraph."},
16 | {"text": "* Ordered town A, * Ordered town B, * Ordered town C,"},
17 | {"text": "A paragraph with no list after an ordered list outside a paragraph."}
18 | ]
19 | }
20 |
--------------------------------------------------------------------------------
/tests/data/list_items_simple_article_from_full_page_content_digests.json:
--------------------------------------------------------------------------------
1 | {
2 | "title": null,
3 | "byline": null,
4 | "date": null,
5 | "content": "
An unordered list inside a paragraph.
Unordered thing one
Unordered thing two
Unordered thing three
A paragraph with no list before an unordered list outside a paragraph.
Unordered town A
Unordered town B
Unordered town C
A paragraph with no list after an unordered list outside a paragraph.
An ordered list inside a paragraph.
Ordered thing one
Ordered thing two
Ordered thing three
A paragraph with no list before an ordered list outside a paragraph.
Ordered town A
Ordered town B
Ordered town C
A paragraph with no list after an ordered list outside a paragraph.
",
6 | "plain_content": "
An unordered list inside a paragraph.
Unordered thing one
Unordered thing two
Unordered thing three
A paragraph with no list before an unordered list outside a paragraph.
Unordered town A
Unordered town B
Unordered town C
A paragraph with no list after an unordered list outside a paragraph.
An ordered list inside a paragraph.
Ordered thing one
Ordered thing two
Ordered thing three
A paragraph with no list before an ordered list outside a paragraph.
Ordered town A
Ordered town B
Ordered town C
A paragraph with no list after an ordered list outside a paragraph.
",
7 | "plain_text": [
8 | {"text": "An unordered list inside a paragraph."},
9 | {"text": "* Unordered thing one, * Unordered thing two, * Unordered thing three,"},
10 | {"text": "A paragraph with no list before an unordered list outside a paragraph."},
11 | {"text": "* Unordered town A, * Unordered town B, * Unordered town C,"},
12 | {"text": "A paragraph with no list after an unordered list outside a paragraph."},
13 | {"text": "An ordered list inside a paragraph."},
14 | {"text": "* Ordered thing one, * Ordered thing two, * Ordered thing three,"},
15 | {"text": "A paragraph with no list before an ordered list outside a paragraph."},
16 | {"text": "* Ordered town A, * Ordered town B, * Ordered town C,"},
17 | {"text": "A paragraph with no list after an ordered list outside a paragraph."}
18 | ]
19 | }
20 |
--------------------------------------------------------------------------------
/tests/data/list_items_simple_article_from_full_page_node_indexes.json:
--------------------------------------------------------------------------------
1 | {
2 | "title": null,
3 | "byline": null,
4 | "date": null,
5 | "content": "
An unordered list inside a paragraph.
Unordered thing one
Unordered thing two
Unordered thing three
A paragraph with no list before an unordered list outside a paragraph.
Unordered town A
Unordered town B
Unordered town C
A paragraph with no list after an unordered list outside a paragraph.
An ordered list inside a paragraph.
Ordered thing one
Ordered thing two
Ordered thing three
A paragraph with no list before an ordered list outside a paragraph.
Ordered town A
Ordered town B
Ordered town C
A paragraph with no list after an ordered list outside a paragraph.
",
6 | "plain_content": "
An unordered list inside a paragraph.
Unordered thing one
Unordered thing two
Unordered thing three
A paragraph with no list before an unordered list outside a paragraph.
Unordered town A
Unordered town B
Unordered town C
A paragraph with no list after an unordered list outside a paragraph.
An ordered list inside a paragraph.
Ordered thing one
Ordered thing two
Ordered thing three
A paragraph with no list before an ordered list outside a paragraph.
Ordered town A
Ordered town B
Ordered town C
A paragraph with no list after an ordered list outside a paragraph.
",
7 | "plain_text": [
8 | {"node_index": "0.1.1", "text": "An unordered list inside a paragraph."},
9 | {"node_index": "0.1.2", "text": "* Unordered thing one, * Unordered thing two, * Unordered thing three,"},
10 | {"node_index": "0.1.3", "text": "A paragraph with no list before an unordered list outside a paragraph."},
11 | {"node_index": "0.1.4", "text": "* Unordered town A, * Unordered town B, * Unordered town C,"},
12 | {"node_index": "0.1.5", "text": "A paragraph with no list after an unordered list outside a paragraph."},
13 | {"node_index": "0.2.1", "text": "An ordered list inside a paragraph."},
14 | {"node_index": "0.2.2", "text": "* Ordered thing one, * Ordered thing two, * Ordered thing three,"},
15 | {"node_index": "0.2.3", "text": "A paragraph with no list before an ordered list outside a paragraph."},
16 | {"node_index": "0.2.4", "text": "* Ordered town A, * Ordered town B, * Ordered town C,"},
17 | {"node_index": "0.2.5", "text": "A paragraph with no list after an ordered list outside a paragraph."}
18 | ]
19 | }
20 |
--------------------------------------------------------------------------------
/tests/data/non_article_full_page.html:
--------------------------------------------------------------------------------
1 |
2 |
3 | No article here!
4 |
--------------------------------------------------------------------------------
/tests/data/non_article_full_page.json:
--------------------------------------------------------------------------------
1 | {
2 | "title": null,
3 | "byline": null,
4 | "date": null,
5 | "content": "
11 | Proin vulputate viverra dapibus. Duis hendrerit suscipit porta. Mauris arcu urna, placerat a
12 | lorem non, placerat sodales massa. Proin faucibus libero sollicitudin auctor tincidunt. Fusce
13 | malesuada massa quis enim cursus, at tincidunt dolor pretium. Donec sed felis et dolor semper
14 | molestie eget ut velit. Mauris ac ultricies arcu, nec vehicula felis. In eu mauris vel est
15 | tempor lobortis. Aenean sagittis molestie elit quis suscipit. Praesent volutpat mi in ipsum
16 | auctor, vitae dapibus ipsum cursus. Etiam iaculis lobortis mi ut accumsan. Vestibulum lacinia
17 | aliquam euismod. Cras venenatis dolor vel vulputate aliquet. Fusce cursus est quis nisi
18 | fringilla rhoncus. Maecenas lobortis nisi quis porttitor consectetur.
19 |
20 | Inside div, after div, before paragraph.
21 |
Lorem ipsum dolor sit amet, consectetur adipiscing elit. Nullam iaculis urna nec eros iaculis
22 | pretium. Sed efficitur eleifend turpis. Quisque at ipsum interdum, sagittis diam id, malesuada
23 | enim. Ut in nibh vitae eros vestibulum aliquam. Fusce finibus, ligula sed ullamcorper aliquet,
24 | nisl diam mollis quam, eu varius ligula justo nec nulla. Sed ut purus sollicitudin, porttitor
25 | mi quis, suscipit quam. In sit amet gravida nisl.
26 |
27 | Vivamus sapien mi, ultricies sit amet dolor sit amet, accumsan accumsan lacus. Donec in
28 | elit sit amet nisl pulvinar commodo. Nam at libero a est blandit vestibulum. Nam quis tempor sem.
29 | Integer a lobortis erat, vel ultricies diam. Vivamus sit amet ipsum in sapien tempus semper.
30 | Proin luctus pulvinar velit, eget egestas neque condimentum sit amet. Nam ut fringilla magna.
31 | Donec blandit lobortis mattis.
32 |
33 | Maecenas nec ipsum quis ipsum laoreet fermentum sit amet id est.
34 | Donec efficitur id est in iaculis. Morbi vel magna id ex efficitur tempus in in lectus. Fusce at
35 | orci vulputate, aliquet turpis eget, tincidunt leo. Aenean efficitur nibh vitae nisi volutpat
36 | iaculis. Etiam bibendum rhoncus vulputate.
37 |
38 | Nunc scelerisque, nibh ut porta cursus, ex
39 | orci molestie odio, in fringilla diam ex quis dui. Curabitur facilisis lacus ac congue finibus.
40 | Sed ipsum ex, blandit in dolor in, efficitur iaculis neque. Morbi gravida dapibus erat, eget
41 | cursus ipsum vehicula sit amet. Praesent feugiat bibendum felis venenatis pellentesque. Ut eu
42 | enim non nisl porttitor accumsan. Cras fringilla volutpat imperdiet.
43 |
44 | Inside div, after parqagraph, before blockquote
45 |
46 | Inside blockquote before paragraph.
47 |
48 | Donec sollicitudin, felis eu sodales semper, purus nisl mollis tellus, at bibendum diam sem non
49 | lectus. Nam vestibulum sagittis ultricies. Duis id metus lacus. Duis ac neque quis lectus ornare
50 | pharetra. Proin facilisis lectus sed urna tincidunt, ut cursus massa aliquam. Curabitur luctus
51 | aliquet sollicitudin. Nam dapibus purus maximus nulla semper, nec auctor augue interdum. Fusce
52 | ornare in velit vitae luctus.
53 |
54 | Inside blockquote after paragraph.
55 |
56 |
57 | Inside div before div
58 |
59 | Inside div before paragraph
60 |
61 | Nullam non ante nec felis rhoncus mattis. Mauris ornare eros et purus efficitur, in
62 | euismod lorem vestibulum. Vestibulum euismod mattis ornare. Aenean a sem eu tellus
63 | vestibulum sagittis. Vestibulum lectus nisi, tempor eget velit consectetur, vulputate
64 | tincidunt erat. Integer eleifend sit amet risus a blandit. Sed sagittis et lorem eu
65 | tincidunt. Integer eget tortor molestie, consequat nibh vitae, dignissim risus. Cras at
66 | neque non dolor viverra volutpat eget id dui. Cras sed convallis dui. Vestibulum porta
67 | odio nec risus consectetur condimentum. Curabitur commodo odio eget justo suscipit
68 | mattis. Proin in turpis at nisl venenatis sagittis. Integer justo tortor, accumsan
69 | blandit nulla at, molestie viverra enim. Duis pulvinar pulvinar massa, ac dapibus enim.
70 |
71 |
72 | Inside div after div.
73 |
74 |
Second level heading
75 | Text after second level heading and before third level heading.
76 |
Third level heading
77 | Text after third level heading and before fourth level heading.
78 |
Fourth level heading
79 | Text after fourth level heading and before fifth level heading.
80 |
Fifth level heading
81 | Text after fifth level heading and before sixth level heading.
82 |
Sixth level heading
83 | Text after Sixth level heading.
84 |
85 | Cras ac orci faucibus, rhoncus erat ac, fermentum ligula. Suspendisse et elit sed justo mollis
86 | lobortis. Ut consequat aliquet varius. Morbi accumsan nulla eget lectus euismod elementum.
87 | Quisque tincidunt ornare nibh in sagittis. Integer lorem tortor, iaculis eget ex vitae, tempus
88 | porta turpis. Donec aliquam lorem at mauris condimentum pellentesque. Proin posuere cursus
89 | diam, in commodo enim dignissim eu. Pellentesque id libero at lorem dapibus laoreet eget nec
90 | ligula.
91 |
92 |
93 |
\n Proin vulputate viverra dapibus. Duis hendrerit suscipit porta. Mauris arcu urna, placerat a\n lorem non, placerat sodales massa. Proin faucibus libero sollicitudin auctor tincidunt. Fusce\n malesuada massa quis enim cursus, at tincidunt dolor pretium. Donec sed felis et dolor semper\n molestie eget ut velit. Mauris ac ultricies arcu, nec vehicula felis. In eu mauris vel est\n tempor lobortis. Aenean sagittis molestie elit quis suscipit. Praesent volutpat mi in ipsum\n auctor, vitae dapibus ipsum cursus. Etiam iaculis lobortis mi ut accumsan. Vestibulum lacinia\n aliquam euismod. Cras venenatis dolor vel vulputate aliquet. Fusce cursus est quis nisi\n fringilla rhoncus. Maecenas lobortis nisi quis porttitor consectetur.\n
\n Inside div, after div, before paragraph.\n
Lorem ipsum dolor sit amet, consectetur adipiscing elit. Nullam iaculis urna nec eros iaculis\n pretium. Sed efficitur eleifend turpis. Quisque at ipsum interdum, sagittis diam id, malesuada\n enim. Ut in nibh vitae eros vestibulum aliquam. Fusce finibus, ligula sed ullamcorper aliquet,\n nisl diam mollis quam, eu varius ligula justo nec nulla. Sed ut purus sollicitudin, porttitor\n mi quis, suscipit quam. In sit amet gravida nisl.\n
\n Vivamus sapien mi, ultricies sit amet dolor sit amet, accumsan accumsan lacus. Donec in\n elit sit amet nisl pulvinar commodo. Nam at libero a est blandit vestibulum. Nam quis tempor sem.\n Integer a lobortis erat, vel ultricies diam. Vivamus sit amet ipsum in sapien tempus semper.\n Proin luctus pulvinar velit, eget egestas neque condimentum sit amet. Nam ut fringilla magna.\n Donec blandit lobortis mattis.\n
\n Maecenas nec ipsum quis ipsum laoreet fermentum sit amet id est.\n Donec efficitur id est in iaculis. Morbi vel magna id ex efficitur tempus in in lectus. Fusce at\n orci vulputate, aliquet turpis eget, tincidunt leo. Aenean efficitur nibh vitae nisi volutpat\n iaculis. Etiam bibendum rhoncus vulputate.\n
\n Nunc scelerisque, nibh ut porta cursus, ex\n orci molestie odio, in fringilla diam ex quis dui. Curabitur facilisis lacus ac congue finibus.\n Sed ipsum ex, blandit in dolor in, efficitur iaculis neque. Morbi gravida dapibus erat, eget\n cursus ipsum vehicula sit amet. Praesent feugiat bibendum felis venenatis pellentesque. Ut eu\n enim non nisl porttitor accumsan. Cras fringilla volutpat imperdiet.\n \n Inside div, after parqagraph, before blockquote\n
\n Inside blockquote before paragraph.\n
\n Donec sollicitudin, felis eu sodales semper, purus nisl mollis tellus, at bibendum diam sem non\n lectus. Nam vestibulum sagittis ultricies. Duis id metus lacus. Duis ac neque quis lectus ornare\n pharetra. Proin facilisis lectus sed urna tincidunt, ut cursus massa aliquam. Curabitur luctus\n aliquet sollicitudin. Nam dapibus purus maximus nulla semper, nec auctor augue interdum. Fusce\n ornare in velit vitae luctus.\n
\n Inside blockquote after paragraph.\n
\n
\n Inside div before div\n
\n Inside div before paragraph\n
\n Nullam non ante nec felis rhoncus mattis. Mauris ornare eros et purus efficitur, in\n euismod lorem vestibulum. Vestibulum euismod mattis ornare. Aenean a sem eu tellus\n vestibulum sagittis. Vestibulum lectus nisi, tempor eget velit consectetur, vulputate\n tincidunt erat. Integer eleifend sit amet risus a blandit. Sed sagittis et lorem eu\n tincidunt. Integer eget tortor molestie, consequat nibh vitae, dignissim risus. Cras at\n neque non dolor viverra volutpat eget id dui. Cras sed convallis dui. Vestibulum porta\n odio nec risus consectetur condimentum. Curabitur commodo odio eget justo suscipit\n mattis. Proin in turpis at nisl venenatis sagittis. Integer justo tortor, accumsan\n blandit nulla at, molestie viverra enim. Duis pulvinar pulvinar massa, ac dapibus enim.\n
\n
\n Inside div after div.\n
\n
Second level heading
\n Text after second level heading and before third level heading.\n
Third level heading
\n Text after third level heading and before fourth level heading.\n
Fourth level heading
\n Text after fourth level heading and before fifth level heading.\n
Fifth level heading
\n Text after fifth level heading and before sixth level heading.\n
Sixth level heading
\n Text after Sixth level heading.\n \n Cras ac orci faucibus, rhoncus erat ac, fermentum ligula. Suspendisse et elit sed justo mollis\n lobortis. Ut consequat aliquet varius. Morbi accumsan nulla eget lectus euismod elementum.\n Quisque tincidunt ornare nibh in sagittis. Integer lorem tortor, iaculis eget ex vitae, tempus\n porta turpis. Donec aliquam lorem at mauris condimentum pellentesque. Proin posuere cursus\n diam, in commodo enim dignissim eu. Pellentesque id libero at lorem dapibus laoreet eget nec\n ligula.\n \n \n
",
6 | "plain_content": "
Article title
Proin vulputate viverra dapibus. Duis hendrerit suscipit porta. Mauris arcu urna, placerat a lorem non, placerat sodales massa. Proin faucibus libero sollicitudin auctor tincidunt. Fusce malesuada massa quis enim cursus, at tincidunt dolor pretium. Donec sed felis et dolor semper molestie eget ut velit. Mauris ac ultricies arcu, nec vehicula felis. In eu mauris vel est tempor lobortis. Aenean sagittis molestie elit quis suscipit. Praesent volutpat mi in ipsum auctor, vitae dapibus ipsum cursus. Etiam iaculis lobortis mi ut accumsan. Vestibulum lacinia aliquam euismod. Cras venenatis dolor vel vulputate aliquet. Fusce cursus est quis nisi fringilla rhoncus. Maecenas lobortis nisi quis porttitor consectetur.
Inside div, after div, before paragraph.
Lorem ipsum dolor sit amet, consectetur adipiscing elit. Nullam iaculis urna nec eros iaculis pretium. Sed efficitur eleifend turpis. Quisque at ipsum interdum, sagittis diam id, malesuada enim. Ut in nibh vitae eros vestibulum aliquam. Fusce finibus, ligula sed ullamcorper aliquet, nisl diam mollis quam, eu varius ligula justo nec nulla. Sed ut purus sollicitudin, porttitor mi quis, suscipit quam. In sit amet gravida nisl.
Vivamus sapien mi, ultricies sit amet dolor sit amet, accumsan accumsan lacus. Donec in elit sit amet nisl pulvinar commodo. Nam at libero a est blandit vestibulum. Nam quis tempor sem. Integer a lobortis erat, vel ultricies diam. Vivamus sit amet ipsum in sapien tempus semper. Proin luctus pulvinar velit, eget egestas neque condimentum sit amet. Nam ut fringilla magna. Donec blandit lobortis mattis.
Maecenas nec ipsum quis ipsum laoreet fermentum sit amet id est. Donec efficitur id est in iaculis. Morbi vel magna id ex efficitur tempus in in lectus. Fusce at orci vulputate, aliquet turpis eget, tincidunt leo. Aenean efficitur nibh vitae nisi volutpat iaculis. Etiam bibendum rhoncus vulputate.
Nunc scelerisque, nibh ut porta cursus, ex orci molestie odio, in fringilla diam ex quis dui. Curabitur facilisis lacus ac congue finibus. Sed ipsum ex, blandit in dolor in, efficitur iaculis neque. Morbi gravida dapibus erat, eget cursus ipsum vehicula sit amet. Praesent feugiat bibendum felis venenatis pellentesque. Ut eu enim non nisl porttitor accumsan. Cras fringilla volutpat imperdiet. Inside div, after parqagraph, before blockquote
Inside blockquote before paragraph.
Donec sollicitudin, felis eu sodales semper, purus nisl mollis tellus, at bibendum diam sem non lectus. Nam vestibulum sagittis ultricies. Duis id metus lacus. Duis ac neque quis lectus ornare pharetra. Proin facilisis lectus sed urna tincidunt, ut cursus massa aliquam. Curabitur luctus aliquet sollicitudin. Nam dapibus purus maximus nulla semper, nec auctor augue interdum. Fusce ornare in velit vitae luctus.
Inside blockquote after paragraph.
Inside div before div
Inside div before paragraph
Nullam non ante nec felis rhoncus mattis. Mauris ornare eros et purus efficitur, in euismod lorem vestibulum. Vestibulum euismod mattis ornare. Aenean a sem eu tellus vestibulum sagittis. Vestibulum lectus nisi, tempor eget velit consectetur, vulputate tincidunt erat. Integer eleifend sit amet risus a blandit. Sed sagittis et lorem eu tincidunt. Integer eget tortor molestie, consequat nibh vitae, dignissim risus. Cras at neque non dolor viverra volutpat eget id dui. Cras sed convallis dui. Vestibulum porta odio nec risus consectetur condimentum. Curabitur commodo odio eget justo suscipit mattis. Proin in turpis at nisl venenatis sagittis. Integer justo tortor, accumsan blandit nulla at, molestie viverra enim. Duis pulvinar pulvinar massa, ac dapibus enim.
Inside div after div.
Second level heading
Text after second level heading and before third level heading.
Third level heading
Text after third level heading and before fourth level heading.
Fourth level heading
Text after fourth level heading and before fifth level heading.
Fifth level heading
Text after fifth level heading and before sixth level heading.
Sixth level heading
Text after Sixth level heading.Cras ac orci faucibus, rhoncus erat ac, fermentum ligula. Suspendisse et elit sed justo mollis lobortis. Ut consequat aliquet varius. Morbi accumsan nulla eget lectus euismod elementum. Quisque tincidunt ornare nibh in sagittis. Integer lorem tortor, iaculis eget ex vitae, tempus porta turpis. Donec aliquam lorem at mauris condimentum pellentesque. Proin posuere cursus diam, in commodo enim dignissim eu. Pellentesque id libero at lorem dapibus laoreet eget nec ligula.
",
7 | "plain_text": [
8 | {"text": "Article title"},
9 | {"text": "Proin vulputate viverra dapibus. Duis hendrerit suscipit porta. Mauris arcu urna, placerat a lorem non, placerat sodales massa. Proin faucibus libero sollicitudin auctor tincidunt. Fusce malesuada massa quis enim cursus, at tincidunt dolor pretium. Donec sed felis et dolor semper molestie eget ut velit. Mauris ac ultricies arcu, nec vehicula felis. In eu mauris vel est tempor lobortis. Aenean sagittis molestie elit quis suscipit. Praesent volutpat mi in ipsum auctor, vitae dapibus ipsum cursus. Etiam iaculis lobortis mi ut accumsan. Vestibulum lacinia aliquam euismod. Cras venenatis dolor vel vulputate aliquet. Fusce cursus est quis nisi fringilla rhoncus. Maecenas lobortis nisi quis porttitor consectetur."},
10 | {"text": "Inside div, after div, before paragraph."},
11 | {"text": "Lorem ipsum dolor sit amet, consectetur adipiscing elit. Nullam iaculis urna nec eros iaculis pretium. Sed efficitur eleifend turpis. Quisque at ipsum interdum, sagittis diam id, malesuada enim. Ut in nibh vitae eros vestibulum aliquam. Fusce finibus, ligula sed ullamcorper aliquet, nisl diam mollis quam, eu varius ligula justo nec nulla. Sed ut purus sollicitudin, porttitor mi quis, suscipit quam. In sit amet gravida nisl."},
12 | {"text": "Vivamus sapien mi, ultricies sit amet dolor sit amet, accumsan accumsan lacus. Donec in elit sit amet nisl pulvinar commodo. Nam at libero a est blandit vestibulum. Nam quis tempor sem. Integer a lobortis erat, vel ultricies diam. Vivamus sit amet ipsum in sapien tempus semper. Proin luctus pulvinar velit, eget egestas neque condimentum sit amet. Nam ut fringilla magna. Donec blandit lobortis mattis."},
13 | {"text": "Maecenas nec ipsum quis ipsum laoreet fermentum sit amet id est. Donec efficitur id est in iaculis. Morbi vel magna id ex efficitur tempus in in lectus. Fusce at orci vulputate, aliquet turpis eget, tincidunt leo. Aenean efficitur nibh vitae nisi volutpat iaculis. Etiam bibendum rhoncus vulputate."},
14 | {"text": "Nunc scelerisque, nibh ut porta cursus, ex orci molestie odio, in fringilla diam ex quis dui. Curabitur facilisis lacus ac congue finibus. Sed ipsum ex, blandit in dolor in, efficitur iaculis neque. Morbi gravida dapibus erat, eget cursus ipsum vehicula sit amet. Praesent feugiat bibendum felis venenatis pellentesque. Ut eu enim non nisl porttitor accumsan. Cras fringilla volutpat imperdiet. Inside div, after parqagraph, before blockquote"},
15 | {"text": "Inside blockquote before paragraph."},
16 | {"text": "Donec sollicitudin, felis eu sodales semper, purus nisl mollis tellus, at bibendum diam sem non lectus. Nam vestibulum sagittis ultricies. Duis id metus lacus. Duis ac neque quis lectus ornare pharetra. Proin facilisis lectus sed urna tincidunt, ut cursus massa aliquam. Curabitur luctus aliquet sollicitudin. Nam dapibus purus maximus nulla semper, nec auctor augue interdum. Fusce ornare in velit vitae luctus."},
17 | {"text": "Inside blockquote after paragraph."},
18 | {"text": "Inside div before div"},
19 | {"text": "Inside div before paragraph"},
20 | {"text": "Nullam non ante nec felis rhoncus mattis. Mauris ornare eros et purus efficitur, in euismod lorem vestibulum. Vestibulum euismod mattis ornare. Aenean a sem eu tellus vestibulum sagittis. Vestibulum lectus nisi, tempor eget velit consectetur, vulputate tincidunt erat. Integer eleifend sit amet risus a blandit. Sed sagittis et lorem eu tincidunt. Integer eget tortor molestie, consequat nibh vitae, dignissim risus. Cras at neque non dolor viverra volutpat eget id dui. Cras sed convallis dui. Vestibulum porta odio nec risus consectetur condimentum. Curabitur commodo odio eget justo suscipit mattis. Proin in turpis at nisl venenatis sagittis. Integer justo tortor, accumsan blandit nulla at, molestie viverra enim. Duis pulvinar pulvinar massa, ac dapibus enim."},
21 | {"text": "Inside div after div."},
22 | {"text": "Second level heading"},
23 | {"text": "Text after second level heading and before third level heading."},
24 | {"text": "Third level heading"},
25 | {"text": "Text after third level heading and before fourth level heading."},
26 | {"text": "Fourth level heading"},
27 | {"text": "Text after fourth level heading and before fifth level heading."},
28 | {"text": "Fifth level heading"},
29 | {"text": "Text after fifth level heading and before sixth level heading."},
30 | {"text": "Sixth level heading"},
31 | {"text": "Text after Sixth level heading."},
32 | {"text": "Cras ac orci faucibus, rhoncus erat ac, fermentum ligula. Suspendisse et elit sed justo mollis lobortis. Ut consequat aliquet varius. Morbi accumsan nulla eget lectus euismod elementum. Quisque tincidunt ornare nibh in sagittis. Integer lorem tortor, iaculis eget ex vitae, tempus porta turpis. Donec aliquam lorem at mauris condimentum pellentesque. Proin posuere cursus diam, in commodo enim dignissim eu. Pellentesque id libero at lorem dapibus laoreet eget nec ligula."}
33 | ]
34 | }
35 |
--------------------------------------------------------------------------------
/tests/test_article_extraction.py:
--------------------------------------------------------------------------------
"""End-to-end tests for readabilipy article and plain-text extraction."""
from checks import check_extract_article, check_extract_paragraphs_as_plain_text


# --- End-to-end article extraction ---

def test_extract_article_full_page():
    """A full web page simplifies to the reference article JSON."""
    check_extract_article("addictinginfo.com-1_full_page.html",
                          "addictinginfo.com-1_simple_article_from_full_page.json")


def test_extract_article_full_article():
    """An already-extracted article body simplifies to the reference JSON."""
    check_extract_article("addictinginfo.com-1_full_article.html",
                          "addictinginfo.com-1_simple_article_from_full_article.json")


def test_extract_article_non_article():
    """A page with no article content still produces the expected JSON."""
    check_extract_article("non_article_full_page.html",
                          "non_article_full_page.json")


def test_extract_article_unicode_normalisation():
    """Pages needing unicode normalisation match the reference output."""
    check_extract_article("conservativehq.com-1_full_page.html",
                          "conservativehq.com-1_simple_article_from_full_page.json")


def test_extract_article_list_items():
    """List items are extracted as expected."""
    check_extract_article("list_items_full_page.html",
                          "list_items_simple_article_from_full_page.json")


def test_extract_article_headers_and_non_paragraph_blockquote_text():
    """Headings and blockquote text outside paragraphs are preserved."""
    check_extract_article("davidwolfe.com-1_full_page.html",
                          "davidwolfe.com-1_simple_article_from_full_page.json")


def test_extract_article_list_items_content_digests():
    """List-item extraction with content digests enabled."""
    check_extract_article("list_items_full_page.html",
                          "list_items_simple_article_from_full_page_content_digests.json",
                          content_digests=True)


def test_extract_article_list_items_node_indexes():
    """List-item extraction with node indexes enabled."""
    check_extract_article("list_items_full_page.html",
                          "list_items_simple_article_from_full_page_node_indexes.json",
                          node_indexes=True)


def test_extract_article_full_page_content_digest():
    """Full-page extraction with content digests enabled."""
    check_extract_article("addictinginfo.com-1_full_page.html",
                          "addictinginfo.com-1_simple_article_from_full_page_content_digest.json",
                          content_digests=True)


def test_extract_article_full_page_node_indexes():
    """Full-page extraction with node indexes enabled."""
    check_extract_article("addictinginfo.com-1_full_page.html",
                          "addictinginfo.com-1_simple_article_from_full_page_node_indexes.json",
                          node_indexes=True)


def test_extract_article_full_page_content_digest_node_indexes():
    """Full-page extraction with both digests and node indexes enabled."""
    check_extract_article("addictinginfo.com-1_full_page.html",
                          "addictinginfo.com-1_simple_article_from_full_page_content_digest_node_indexes.json",
                          content_digests=True,
                          node_indexes=True)


# --- Plain text extraction ---

def test_extract_paragraphs_as_plain_text():
    """Simplified article JSON flattens to the reference plain-text paragraphs."""
    check_extract_paragraphs_as_plain_text(
        "addictinginfo.com-1_simple_article_from_full_article.json",
        "addictinginfo.com-1_plain_text_paragraphs_from_simple_article.json")


def test_extract_paragraphs_as_plain_text_node_indexes():
    """Plain-text flattening preserves node indexes when present."""
    check_extract_paragraphs_as_plain_text(
        "list_items_simple_article_from_full_page_node_indexes.json",
        "list_items_plain_text_paragraph_node_indexes.json")
--------------------------------------------------------------------------------
/tests/test_benchmarking.py:
--------------------------------------------------------------------------------
"""Benchmarks for the main readabilipy entry points, driven by pytest-benchmark."""
import os

from readabilipy import simple_json_from_html_string
from readabilipy.extractors import extract_date, extract_title

# Read the large sample page once at import time so file I/O is not
# included in any of the timed sections below.
TEST_FILEPATH = os.path.join(os.path.dirname(__file__), "data", "benchmarkinghuge.html")
with open(TEST_FILEPATH, encoding="utf-8") as sample_file:
    HTML = sample_file.read()


def test_benchmark_simple_json_from_html_string(benchmark):
    """Time the full HTML-to-simple-JSON pipeline."""
    benchmark(simple_json_from_html_string, html=HTML)


def test_benchmark_extract_title(benchmark):
    """Time title extraction alone."""
    benchmark(extract_title, html=HTML)


def test_benchmark_extract_date(benchmark):
    """Time date extraction alone."""
    benchmark(extract_date, html=HTML)
--------------------------------------------------------------------------------
/tests/test_date_functions.py:
--------------------------------------------------------------------------------
1 | import pytest
2 | from readabilipy.extractors import extract_date, ensure_iso_date_format
3 |
4 |
5 | htmls_with_expected = [
6 | ("""
"
11 | )
12 |
13 |
14 | def test_html_bare_text_linebreaks():
15 | """Line breaks in bare text should be removed."""
16 | check_html_output_contains_text("""
17 | Bare text with
18 | some linebreaks here
19 | """, "
Bare text with some linebreaks here
")
20 |
21 |
22 | def test_html_text_with_semantic_br():
23 | """Single is sometimes used as a word separator so should be replaced
24 | with a space."""
25 | check_exact_html_output(
26 | """link caption""",
27 | "
link caption
"
28 | )
29 |
30 |
31 | def test_html_bare_text_double_br():
32 | """Double in bare text should trigger a new paragraph."""
33 | check_html_output_contains_text("""
34 | Bare text with
35 |
36 | some linebreaks here
37 | """, "
Bare text with
some linebreaks here
")
38 |
39 |
40 | def test_html_space_separated_double_br():
41 | """Double separated by whitespace should still trigger a new paragraph."""
42 | check_html_output_contains_text("""
43 | Bare text with
44 |
45 |
46 | some linebreaks here
47 | """, "
Bare text with
some linebreaks here
")
48 |
49 |
50 | def test_html_space_separated_double_br_inside_div():
51 | """Double separated by whitespace should still trigger a new div."""
52 | check_html_output_contains_text("""
53 |
54 | Text with
55 |
56 |
57 | some linebreaks here
58 |
59 | """, "
Text with
some linebreaks here
")
60 |
61 |
62 | def test_html_space_separated_double_br_inside_and_outside_div():
63 | """First double should trigger a new
, second several
inside the div, third a new
"""
64 | check_exact_html_output("""
65 |
66 |
Some
67 | example text here.
68 |
69 |
70 | Text in a div.
71 | A new div.
72 |
73 | Bare text.
74 | A new paragraph.
75 | """, "
Some
example text here.
Text in a div.
A new div.
Bare text.
A new paragraph.
")
76 |
77 |
78 | # Test correct wrapping
79 | def test_ensure_correct_outer_div_wrapping():
80 | """Do not wrap in a
if this is already a
."""
81 | check_exact_html_output("""
82 |
83 |
84 | Some example text here.
85 |
86 |
""", """
Some example text here.
""")
87 |
88 |
89 | def test_ensure_correct_paragraph_wrapping():
90 | """Do not wrap bare text inside
")
115 |
116 |
117 | # Test text consolidation
118 | def test_span_removal_and_conversion():
119 | """First should be removed. Second should give bare text that will be wrapped."""
120 | check_exact_html_output("""
121 |
122 |
Some example text here.
123 | More text in a span.
124 |
""", "
Some example text here.
More text in a span.
")
125 |
126 |
127 | def test_consolidating_string_between_tags():
128 | """First should be removed. Second should give bare text that will be wrapped."""
129 | check_exact_html_output("""
130 |
131 |
Some
example text here.
132 | More text in a span.
133 | Part of the same paragraph.
134 | A new paragraph.
135 |
""", "
Some
example text here.
More text in a span. Part of the same paragraph.
A new paragraph.
")
136 |
137 |
138 | def test_empty_element_removal():
139 | """Empty elements should be removed."""
140 | check_exact_html_output("""
141 |
142 |
Text
143 |
144 | Paragraphs
145 |
146 | Bare text
147 |
148 | """, "
Text
Paragraphs
Bare text
")
149 |
150 |
151 | def test_single_br_with_semantic_space():
152 | """Empty elements should be removed."""
153 | check_exact_html_output("""
154 |
155 |
This tag will be removed but the space after it is important.
156 |
157 | """, "
This tag will be removed but the space after it is important.
."""
21 | html = ""
22 | parsed_content = simple_json_from_html_string(html)
23 | assert parsed_content["content"] == ""
24 |
25 |
26 | def test_plain_element_with_comments():
27 | """Contents of comments should be stripped but the comment itself should be kept."""
28 | html = """
29 |
30 |
Text
31 |
32 |
33 | """.strip()
34 | soup = BeautifulSoup(html, 'html.parser')
35 | elements = [str(plain_element(element, False, False)) for element in soup.contents]
36 | assert elements == ["
Text
"]
37 |
38 |
39 | def test_content_digest_on_filled_and_empty_elements():
40 | """Filled strings should get a digest but empty strings should not."""
41 | html = """
42 |
43 |
Text
44 |
45 |
46 | """.strip()
47 | soup = BeautifulSoup(html, 'html.parser')
48 | elements = [str(plain_element(element, True, True)) for element in soup.contents]
49 | assert elements == ['
Text
']
50 |
51 |
52 | def test_leaf_nodes_without_text():
53 | """Leaf nodes with text should yield their text, while those without should yield None."""
54 | html = """
55 |
56 |
Some text
57 |
58 |
Some more text
59 |
60 | """.strip()
61 | soup = BeautifulSoup(html, 'html.parser')
62 | text_blocks = [plain_text_leaf_node(paragraph) for paragraph in soup.find_all("p")]
63 | assert text_blocks == [{'text': 'Some text'}, {'text': None}, {'text': 'Some more text'}]
64 |
65 |
66 | def test_node_index_assignment():
67 | """Whitelisted elements should get an appropriate index but bare strings should not."""
68 | html = """
69 |
70 |
Some text
71 |
72 | Some bare text
73 |
74 | """.strip()
75 | soup = BeautifulSoup(html, 'html.parser')
76 | normalised_strings = [normalise_text(str(add_node_indexes(elem))) for elem in soup.find_all("div")[0].children]
77 | normalised_strings = [s for s in normalised_strings if s]
78 | assert normalised_strings == ['
Some text
', '', 'Some bare text']
79 |
80 |
81 | def test_content_digest_assignment():
82 | """No content digest hash should be assigned when no child elements exist."""
83 | html = """
84 |
85 |
Some text
86 |
87 | Some bare text
88 |
89 | """.strip()
90 | soup = BeautifulSoup(html, 'html.parser')
91 | digests = [content_digest(elem) for elem in soup.find_all()]
92 | assert digests == ['5271913f47bd4cbfda56ff8c0cddfc481d6bc4fe99725906068fbb6144bfeab4',
93 | '4c2e9e6da31a64c70623619c449a040968cdbea85945bf384fa30ed2d5d24fa3',
94 | '']
95 |
96 |
@mock.patch('subprocess.run')
def test_have_node_1(mocked_run):
    """If the node binary cannot be found at all, have_node() is False."""
    mocked_run.side_effect = FileNotFoundError("No such file or directory: 'node'")
    assert not have_node()


@mock.patch('subprocess.run')
def test_have_node_2(mocked_run):
    """A non-zero exit status from node means have_node() is False."""
    mocked_run.return_value = CompletedProcess("", 1)
    assert not have_node()


@mock.patch('subprocess.run')
def test_have_node_3(mocked_run):
    """Node reporting v9.0.0 fails the check.

    NOTE(review): presumably v9 is below the minimum supported Node
    version — confirm against have_node()'s version check.
    """
    mocked_run.return_value = CompletedProcess("", 0, stdout=b"v9.0.0\n")
    assert not have_node()


@mock.patch('os.path.exists')
def test_have_node_4(mocked_exists):
    """If the bundled Readability.js path is missing, have_node() is False."""
    mocked_exists.return_value = False
    assert not have_node()


def test_have_node_5():
    """Unmocked check; assumes Node and Readability.js are installed on CI."""
    assert have_node()
124 |
--------------------------------------------------------------------------------
/tests/test_simple_tree.py:
--------------------------------------------------------------------------------
1 | """Tests for simple_tree functions."""
2 | from readabilipy import simple_tree_from_html_string
3 | from readabilipy.simplifiers import strip_html_whitespace
4 |
5 |
6 | def test_remove_cdata():
7 | """Test all possible methods of CData inclusion. Note that in the final
8 | example the '//' prefixes have no effect (since we are not in a
22 | //
25 | """.strip()
26 | parsed_html = str(simple_tree_from_html_string(html))
27 | expected_output = "
"
33 |
34 |
def test_strip_control_characters_non_printing_characters():
    """Zero-width spaces and BOMs are dropped by both functions."""
    raw = "A string with non-printing characters in\u200Bc\u200Bluded\ufeff"
    assert strip_control_characters(raw) == "A string with non-printing characters included"
    assert normalise_text(raw) == "A string with non-printing characters included"


def test_strip_control_characters_cr():
    """Carriage returns survive stripping but are collapsed by normalisation."""
    raw = "A string with new lines\rin\u200Bc\u200Bluded\ufeff"
    assert strip_control_characters(raw) == "A string with new lines\rincluded"
    assert normalise_text(raw) == "A string with new lines included"


def test_strip_control_characters_lf():
    """Line feeds survive stripping but are collapsed by normalisation."""
    raw = "A string with new lines\ninc\u200Bluded\ufeff"
    assert strip_control_characters(raw) == "A string with new lines\nincluded"
    assert normalise_text(raw) == "A string with new lines included"


def test_strip_control_characters_cr_lf():
    """CRLF pairs survive stripping but are collapsed by normalisation."""
    raw = "A string with new lines\r\nin\u200Bc\u200Bluded\ufeff"
    assert strip_control_characters(raw) == "A string with new lines\r\nincluded"
    assert normalise_text(raw) == "A string with new lines included"


def test_strip_control_characters_ff():
    """Form feeds survive stripping but are collapsed by normalisation."""
    raw = "A string with form feed\fin\u200Bc\u200Bluded\ufeff"
    assert strip_control_characters(raw) == "A string with form feed\fincluded"
    assert normalise_text(raw) == "A string with form feed included"


def test_strip_control_characters_tab():
    """Tabs survive stripping but are collapsed by normalisation."""
    raw = "A string with tabs\tin\u200Bc\u200Bluded\ufeff"
    assert strip_control_characters(raw) == "A string with tabs\tincluded"
    assert normalise_text(raw) == "A string with tabs included"
69 |
70 |
71 | # Test whitespace around tags
72 | @mark.parametrize('terminal_punctuation', text.terminal_punctuation_marks)
73 | def test_ensure_correct_punctuation_joining(terminal_punctuation):
74 | """Do not join with ' ' if the following character is a punctuation mark."""
75 | input_html = f"""
76 |
77 |
78 | Some text like this{terminal_punctuation} with punctuation.
79 |
80 |
"""
81 | expected_output = f"""
Some text like this{terminal_punctuation} with punctuation.
"""
82 | check_exact_html_output(input_html, expected_output)
83 |
84 |
85 | @mark.parametrize('matched_pair', text.matched_punctuation_marks)
86 | def test_ensure_correct_bracket_quote_joining(matched_pair):
87 | """Do not join with ' ' if we are inside matched punctuation marks."""
88 | input_html = f"""
89 |
90 |
91 | Some text {matched_pair[0]}like this{matched_pair[1]} with punctuation.
92 |
93 |
"""
94 | expected_output = f"""
Some text {matched_pair[0]}like this{matched_pair[1]} with punctuation.
"""
95 | check_exact_html_output(input_html, expected_output)
96 |
--------------------------------------------------------------------------------
/tests/test_title_functions.py:
--------------------------------------------------------------------------------
1 | import pytest
2 | from readabilipy.extractors.extract_title import extract_title
3 | from readabilipy.extractors.extract_title import combine_similar_titles
4 |
5 |
6 | htmls_with_expected = [
7 | ("""""", "Example title 1"),
8 | ("""""", "Example title 2"),
9 | ("""Example title 3""", "Example title 3"),
10 | ("""
Example title 4
""", "Example title 4"),
11 | (""")""", "Example title 5"),
12 | (""")""", "Example title 6"),
13 | (""")""", "Example title 7"),
14 | (""")""", "Example title 8"),
15 | ("""
Example title 9
""", "Example title 9"),
16 | ("""
Example title 10
""", "Example title 10"),
17 | ("""
Example title 11
""", "Example title 11"),
18 | ("""
Example title 12
""", "Example title 12"),
19 | ("""
Example title 13""", "Example title 13"),
20 | ("""
Example title 14
""", "Example title 14"),
21 | ("""
Example title 15
""", None), # not one of the xpaths in extract_title()
22 | ("""
27 | """, "")
28 |
29 |
30 | def test_iframe_with_source():
31 | """At present we blacklist iframes, but may want to extract the links in future."""
32 | check_exact_html_output(
33 | """""",
34 | ""
35 | )
36 |
37 |
38 | # Test comments inside tags
39 | def test_comments_inside_tags():
40 | """Ensure that comments inside tags are removed."""
41 | check_exact_html_output(
42 | "
Some text here.
",
43 | "
Some text here.
"
44 | )
45 |
46 |
47 | # Test tags inside words
48 | def test_tags_inside_words():
49 | """Ensure that words with tags inside them are kept together when the tags are stripped."""
50 | check_exact_html_output(
51 | """aisle""",
52 | "
aisle
"
53 | )
54 |
55 |
56 | # Test splitting for unclosed tags inside paragraphs
57 | def test_paragraph_splitting_with_unclosed_tags():
58 | """Ensure that paragraphs with unclosed tags inside them split correctly."""
59 | check_exact_html_output(
60 | """
61 |
62 | First paragraph.
63 |
64 | Second paragraph.
65 |
""",
66 | "
First paragraph.
Second paragraph.
"
67 | )
68 |
69 |
70 | # Test (possibly illegal) nested elements
71 | def test_nested_superscript():
72 | """Ensure that nested superscripts are correctly parsed."""
73 | check_exact_html_output(
74 | "