├── .github └── workflows │ ├── lint.yml │ └── test.yml ├── .gitignore ├── .pylintrc ├── CHANGELOG.md ├── LICENSE ├── MANIFEST.in ├── Makefile ├── README.md ├── benchmarks ├── Dockerfile └── README.md ├── make_release.py ├── readabilipy ├── __init__.py ├── __main__.py ├── __version__.py ├── extractors │ ├── __init__.py │ ├── extract_date.py │ ├── extract_element.py │ └── extract_title.py ├── javascript │ ├── ExtractArticle.js │ └── package.json ├── simple_json.py ├── simple_tree.py ├── simplifiers │ ├── __init__.py │ ├── html.py │ └── text.py └── utils.py ├── setup.py └── tests ├── checks.py ├── data ├── addictinginfo.com-1_full_article.html ├── addictinginfo.com-1_full_page.html ├── addictinginfo.com-1_full_page_javascript.json ├── addictinginfo.com-1_plain_text_paragraphs_from_simple_article.json ├── addictinginfo.com-1_simple_article_from_full_article.json ├── addictinginfo.com-1_simple_article_from_full_page.json ├── addictinginfo.com-1_simple_article_from_full_page_content_digest.json ├── addictinginfo.com-1_simple_article_from_full_page_content_digest_node_indexes.json ├── addictinginfo.com-1_simple_article_from_full_page_node_indexes.json ├── benchmarkinghuge.html ├── conservativehq.com-1_full_page.html ├── conservativehq.com-1_simple_article_from_full_page.json ├── davidwolfe.com-1_full_page.html ├── davidwolfe.com-1_simple_article_from_full_page.json ├── list_items_full_page.html ├── list_items_plain_text_paragraph_node_indexes.json ├── list_items_simple_article_from_full_page.json ├── list_items_simple_article_from_full_page_content_digests.json ├── list_items_simple_article_from_full_page_node_indexes.json ├── non_article_full_page.html ├── non_article_full_page.json ├── plain-content-test_full_article.html └── plain-content-test_full_article_javascript.json ├── test_article_extraction.py ├── test_benchmarking.py ├── test_date_functions.py ├── test_extract_element.py ├── test_html_elements.py ├── test_javascript.py ├── test_normal_html.py ├── 
test_simple_json.py ├── test_simple_tree.py ├── test_simplifiers_html.py ├── test_simplifiers_text.py ├── test_title_functions.py └── test_weird_html.py /.github/workflows/lint.yml: -------------------------------------------------------------------------------- 1 | name: ReadabiliPy CI Linting 2 | 3 | on: 4 | push: 5 | branches: [ master ] 6 | pull_request: 7 | 8 | jobs: 9 | build: 10 | 11 | runs-on: "ubuntu-20.04" 12 | strategy: 13 | matrix: 14 | python-version: ["3.6", "3.7", "3.8", "3.9", "3.10", "3.11", "3.12"] 15 | 16 | steps: 17 | - uses: actions/checkout@v4 18 | - name: Set up Python ${{ matrix.python-version }} 19 | uses: actions/setup-python@v4 20 | with: 21 | python-version: ${{ matrix.python-version }} 22 | 23 | - name: Install dependencies 24 | run: | 25 | python -m pip install --upgrade pip 26 | pip install -e ".[test]" 27 | 28 | - name: Lint with pyflakes 29 | run: | 30 | pyflakes *.py readabilipy tests 31 | 32 | - name: check PEP8 33 | run: | 34 | pycodestyle --statistics --ignore=E501 --count *.py readabilipy tests 35 | 36 | - name: Run pylint for stricter error checking 37 | run: | 38 | pylint readabilipy 39 | pylint ./tests/*.py 40 | -------------------------------------------------------------------------------- /.github/workflows/test.yml: -------------------------------------------------------------------------------- 1 | name: ReadabiliPy CI 2 | 3 | on: 4 | push: 5 | branches: [ master ] 6 | pull_request: 7 | 8 | jobs: 9 | build: 10 | 11 | runs-on: "ubuntu-20.04" 12 | strategy: 13 | matrix: 14 | python-version: ["3.6", "3.7", "3.8", "3.9", "3.10", "3.11", "3.12"] 15 | node-version: [18.x, 20.x, 22.x] 16 | 17 | steps: 18 | - uses: actions/checkout@v4 19 | - name: Set up Python ${{ matrix.python-version }} 20 | uses: actions/setup-python@v4 21 | with: 22 | python-version: ${{ matrix.python-version }} 23 | 24 | - name: Set up Node.js ${{ matrix.node-version }} 25 | uses: actions/setup-node@v4 26 | with: 27 | node-version: ${{ matrix.node-version 
}} 28 | 29 | - name: Install dependencies 30 | run: | 31 | python -m pip install --upgrade pip 32 | pip install -e ".[test]" 33 | 34 | - name: Test with pytest 35 | run: | 36 | pytest -v . --cov readabilipy --cov-report term-missing --benchmark-disable 37 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # PyCharm IDE stuff 2 | .idea/ 3 | 4 | # OSX temporary files 5 | .DS_Store 6 | 7 | # ===== PYTHON STUFF ==== 8 | # Byte-compiled / optimized / DLL files 9 | __pycache__/ 10 | *.py[cod] 11 | *$py.class 12 | 13 | # C extensions 14 | *.so 15 | 16 | # Distribution / packaging 17 | .Python 18 | build/ 19 | develop-eggs/ 20 | dist/ 21 | downloads/ 22 | eggs/ 23 | .eggs/ 24 | lib/ 25 | lib64/ 26 | parts/ 27 | sdist/ 28 | var/ 29 | wheels/ 30 | *.egg-info/ 31 | .installed.cfg 32 | *.egg 33 | MANIFEST 34 | 35 | # PyInstaller 36 | # Usually these files are written by a python script from a template 37 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
38 | *.manifest 39 | *.spec 40 | 41 | # Installer logs 42 | pip-log.txt 43 | pip-delete-this-directory.txt 44 | 45 | # Unit test / coverage reports 46 | htmlcov/ 47 | .tox/ 48 | .nox/ 49 | .coverage 50 | .coverage.* 51 | .cache 52 | nosetests.xml 53 | coverage.xml 54 | *.cover 55 | .hypothesis/ 56 | .pytest_cache/ 57 | 58 | # Translations 59 | *.mo 60 | *.pot 61 | 62 | # Django stuff: 63 | *.log 64 | local_settings.py 65 | db.sqlite3 66 | 67 | # Flask stuff: 68 | instance/ 69 | .webassets-cache 70 | 71 | # Scrapy stuff: 72 | .scrapy 73 | 74 | # Sphinx documentation 75 | docs/_build/ 76 | 77 | # PyBuilder 78 | target/ 79 | 80 | # Jupyter Notebook 81 | .ipynb_checkpoints 82 | 83 | # IPython 84 | profile_default/ 85 | ipython_config.py 86 | 87 | # pyenv 88 | .python-version 89 | 90 | # celery beat schedule file 91 | celerybeat-schedule 92 | 93 | # SageMath parsed files 94 | *.sage.py 95 | 96 | # Environments 97 | .env 98 | .venv 99 | env/ 100 | venv/ 101 | ENV/ 102 | env.bak/ 103 | venv.bak/ 104 | 105 | # Spyder project settings 106 | .spyderproject 107 | .spyproject 108 | 109 | # Rope project settings 110 | .ropeproject 111 | 112 | # mkdocs documentation 113 | /site 114 | 115 | # mypy 116 | .mypy_cache/ 117 | .dmypy.json 118 | dmypy.json 119 | 120 | # Pyre type checker 121 | .pyre/ 122 | 123 | 124 | # ===== NODE STUFF ===== 125 | # Logs 126 | logs 127 | *.log 128 | npm-debug.log* 129 | yarn-debug.log* 130 | yarn-error.log* 131 | 132 | # Runtime data 133 | pids 134 | *.pid 135 | *.seed 136 | *.pid.lock 137 | 138 | # Directory for instrumented libs generated by jscoverage/JSCover 139 | lib-cov 140 | 141 | # Coverage directory used by tools like istanbul 142 | coverage 143 | 144 | # nyc test coverage 145 | .nyc_output 146 | 147 | # Grunt intermediate storage (http://gruntjs.com/creating-plugins#storing-task-files) 148 | .grunt 149 | 150 | # Bower dependency directory (https://bower.io/) 151 | bower_components 152 | 153 | # node-waf configuration 154 | .lock-wscript 155 
| 156 | # Compiled binary addons (https://nodejs.org/api/addons.html) 157 | build/Release 158 | 159 | # Dependency directories 160 | node_modules/ 161 | jspm_packages/ 162 | 163 | # TypeScript v1 declaration files 164 | typings/ 165 | 166 | # Optional npm cache directory 167 | .npm 168 | 169 | # Optional eslint cache 170 | .eslintcache 171 | 172 | # Optional REPL history 173 | .node_repl_history 174 | 175 | # Output of 'npm pack' 176 | *.tgz 177 | 178 | # Yarn Integrity file 179 | .yarn-integrity 180 | 181 | # dotenv environment variables file 182 | .env 183 | 184 | # next.js build output 185 | .next 186 | 187 | # package-lock 188 | package-lock.json 189 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Change Log 2 | 3 | ## Version 0.3.0 4 | * Fixed some bugs. Updated supported Python versions. 5 | 6 | ## Version 0.2.0 7 | * Restructured project ready for initial PyPI upload. 8 | 9 | ## Version 0.1.0 10 | * Final version used by Misinformation project. 11 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 The Alan Turing Institute 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 
14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include readabilipy/javascript/*.js 2 | include readabilipy/javascript/package.json 3 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | # Makefile for easier installation and cleanup. 2 | # 3 | # Uses self-documenting macros from here: 4 | # http://marmelab.com/blog/2016/02/29/auto-documented-makefile.html 5 | 6 | SHELL := bash 7 | .SHELLFLAGS := -eu -o pipefail -c 8 | MAKEFLAGS += --warn-undefined-variables --no-builtin-rules 9 | 10 | PACKAGE=readabilipy 11 | DOC_DIR=./docs 12 | VENV_DIR=/tmp/rdpy_venv 13 | TEST_DIR=./tests 14 | 15 | .PHONY: help 16 | 17 | .DEFAULT_GOAL := help 18 | 19 | # Display a help message when called without target, using the ## comments 20 | help: 21 | @grep -E '^[0-9a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) |\ 22 | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[36m%-15s\033[0m\ 23 | %s\n", $$1, $$2}' 24 | 25 | ################ 26 | # Installation # 27 | ################ 28 | 29 | .PHONY: install 30 | 31 | install: ## Install for the current user using the default python command 32 | python setup.py build_ext --inplace 33 | python setup.py install --user 34 | 35 | 36 | ################ 37 | # Distribution # 38 | ################ 39 | 40 | .PHONY: 
release dist 41 | 42 | release: ## Make a release 43 | python make_release.py 44 | 45 | dist: ## Make Python source distribution 46 | python setup.py sdist bdist_wheel 47 | 48 | 49 | ########### 50 | # Testing # 51 | ########### 52 | 53 | .PHONY: test 54 | 55 | test: venv ## Run unit tests 56 | source $(VENV_DIR)/bin/activate && cd $(TEST_DIR) && python -m pytest -v . --cov readabilipy --cov-report term-missing --benchmark-disable 57 | source $(VENV_DIR)/bin/activate && pyflakes *.py readabilipy $(TEST_DIR) 58 | source $(VENV_DIR)/bin/activate && pycodestyle --statistics --ignore=E501 --count *.py readabilipy $(TEST_DIR) 59 | source $(VENV_DIR)/bin/activate && pylint readabilipy $(TEST_DIR)/*.py 60 | 61 | ################# 62 | # Documentation # 63 | ################# 64 | 65 | .PHONY: docs 66 | 67 | docs: install ## Build documentation with Sphinx 68 | exit; # not implemented 69 | source $(VENV_DIR)/bin/activate && m2r README.md && mv README.rst $(DOC_DIR) 70 | source $(VENV_DIR)/bin/activate && m2r CHANGELOG.md && mv CHANGELOG.rst $(DOC_DIR) 71 | cd $(DOC_DIR) && \ 72 | rm source/* && \ 73 | source $(VENV_DIR)/bin/activate && \ 74 | sphinx-apidoc -H 'ReadabiliPy API Documentation' -o source ../$(PACKAGE) && \ 75 | touch source/AUTOGENERATED 76 | $(MAKE) -C $(DOC_DIR) html 77 | 78 | ####################### 79 | # Virtual environment # 80 | ####################### 81 | 82 | .PHONY: venv clean_venv 83 | 84 | venv: $(VENV_DIR)/bin/activate 85 | 86 | $(VENV_DIR)/bin/activate: setup.py 87 | test -d $(VENV_DIR) || python -m venv $(VENV_DIR) 88 | source $(VENV_DIR)/bin/activate && pip install .[dev] 89 | touch $(VENV_DIR)/bin/activate 90 | 91 | clean_venv: 92 | rm -rf $(VENV_DIR) 93 | 94 | ############ 95 | # Clean up # 96 | ############ 97 | 98 | .PHONY: clean 99 | 100 | clean: ## Clean build dist and egg directories left after install 101 | rm -rf ./dist 102 | rm -rf ./build 103 | rm -rf ./$(PACKAGE).egg-info 104 | rm -rf $(VENV_DIR) 105 | rm -f MANIFEST 106 | rm -rf 
$(PACKAGE)/javascript/node_modules 107 | find . -type f -iname '*.pyc' -delete 108 | find . -type d -name '__pycache__' -empty -delete 109 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # ReadabiliPy 2 | 3 | [![Coverage Status](https://coveralls.io/repos/github/alan-turing-institute/ReadabiliPy/badge.svg?branch=master)](https://coveralls.io/github/alan-turing-institute/ReadabiliPy?branch=master) 4 | 5 | `ReadabiliPy` contains a Python wrapper for Mozilla's [Readability.js](https://github.com/mozilla/readability) Node.js package, as well as article extraction routines written in pure Python. 6 | 7 | This package augments the output of `Readability.js` to also return a list of plain text representations of article paragraphs. 8 | 9 | `ReadabiliPy` comes with a handy command line application: ``readabilipy``. 10 | 11 | ## Installation 12 | 13 | To use the `Readability.js` wrapper you need to have a working [Node.js](https://nodejs.org/en/download/) installation of version 14 or higher. 14 | Make sure to install Node.js before installing this package, as this ensures Readability.js will be installed. 15 | If you only want to use the Python-based article extraction, you **do not need** to install Node.js. 16 | 17 | `ReadabiliPy` can be installed simply from PyPI: 18 | 19 | ``` 20 | $ pip install readabilipy 21 | ``` 22 | 23 | Note that to update to a new version of `Readability.js` you can simply reinstall `ReadabiliPy`. 24 | 25 | ## Usage 26 | 27 | `ReadabiliPy` can be used either as a command line application or as a Python library. 28 | 29 | ### Command line application 30 | 31 | The ``readabilipy`` command line application can be used to extract an article from an HTML source file. 
32 | 33 | For example, if you have the article saved as ``input.html`` in the current directory then you can run: 34 | 35 | ``` 36 | $ readabilipy -i ./input.html -o article.json 37 | ``` 38 | 39 | The extracted article can then be found in the ``article.json`` file. By default ReadabiliPy will use the Readability.js functionality to extract the article, provided this is available. If instead you'd like to use the Python-based extraction, run: 40 | 41 | ``` 42 | $ readabilipy -p -i ./input.html -o article.json 43 | ``` 44 | 45 | The complete help text of the command line application is as follows: 46 | 47 | ``` 48 | $ readabilipy -h 49 | usage: readabilipy [-h] -i INPUT_FILE -o OUTPUT_FILE [-c] [-n] [-p] [-V] 50 | 51 | Extract article data from a HTML file using either Mozilla's Readability.js 52 | package or a simplified python-only alternative. 53 | 54 | optional arguments: 55 | -h, --help show this help message and exit 56 | -i INPUT_FILE, --input-file INPUT_FILE 57 | Path to input file containing HTML. 58 | -o OUTPUT_FILE, --output-file OUTPUT_FILE 59 | Path to file to output the article data to as JSON. 60 | -c, --content-digests 61 | Add a 'data-content-digest' attribute containing a 62 | SHA256-based digest of the element's contents to each 63 | HTML element in the plain_content output. 64 | -n, --node-indexes Add a 'data-node-index' attribute containing a 65 | hierarchical representation of the element's position 66 | in the HTML structure each HTML element in the 67 | plain_content output. 68 | -p, --use-python-parser 69 | Use the pure-python 'plain_html' parser included in 70 | this project rather than Mozilla's Readability.js. 71 | -V, --version Show version and exit 72 | ``` 73 | 74 | ### Library 75 | 76 | ReadabiliPy can also be used as a Python package. 77 | The main routine is called ``simple_json_from_html_string`` and expects the HTML article as a string. 
78 | Here is an example of extracting an article after downloading the page using [requests](https://requests.readthedocs.io/en/master/): 79 | 80 | ```python 81 | >>> import requests 82 | >>> from readabilipy import simple_json_from_html_string 83 | >>> req = requests.get('https://en.wikipedia.org/wiki/Readability') 84 | >>> article = simple_json_from_html_string(req.text, use_readability=True) 85 | ``` 86 | 87 | Note that you need to use the flag ``use_readability=True`` to use Readability.js, otherwise the Python-based extraction is used. 88 | 89 | The ``simple_json_from_html_string`` function returns a dictionary with the following fields: 90 | 91 | - `title`: The article title 92 | - `byline`: Author information 93 | - `content`: A simplified HTML representation of the article, with all article text contained in paragraph elements. 94 | - `plain_content`: A "plain" version of the simplified `Readability.js` article HTML present in the `content` field. This attempts to retain only the plain text content of the article, while preserving the HTML structure. 95 | - `plain_text`: A list containing plain text representations of each paragraph (`

<p>`) or list (`<ol>` or `