├── .coveragerc ├── .git-blame-ignore-revs ├── .github ├── dependabot.yml └── workflows │ ├── codeql-analysis.yml │ ├── create-container.yml │ ├── helm-release.yaml │ └── python-package.yml ├── .gitignore ├── .readthedocs.yaml ├── AUTHORS ├── CONTRIBUTING.md ├── Dockerfile ├── LICENSE ├── README.rst ├── RENDERING.md ├── TODO.txt ├── benchmarking ├── a ├── b ├── run_benchmarking.py ├── speed_comparisons.txt └── url_list.txt ├── docker-compose.yml ├── docs ├── Makefile ├── README.rst ├── benchmarking.rst ├── conf.py ├── contributing.md ├── images │ ├── stackoverflow-code-annotation.png │ ├── wikipedia-chur-entry-annotation.png │ ├── wikipedia-chur-table-annotation.png │ └── xda-posts-annotation.png ├── index.rst ├── inscriptis-module-documentation.rst ├── paper │ ├── Makefile │ ├── images │ │ ├── annotations.png │ │ ├── inscriptis-vs-lynx.png │ │ ├── inscriptis-vs-lynx.xcf │ │ └── raw │ │ │ ├── inscriptis.png │ │ │ └── lynx.png │ ├── paper.bib │ └── paper.md └── requirements.txt ├── examples ├── annotation │ ├── annotation-profile.json │ ├── stackoverflow.json │ ├── table-annotation-profile.json │ ├── unittest.json │ ├── wikipedia-entities-and-citations.json │ ├── wikipedia.json │ └── xda-developers.json └── custom-html-handling.py ├── img ├── nested-table-firefox.png ├── wikipedia-chur-firefox.png └── wikipedia-python-example.png ├── publish.sh ├── pyproject.toml ├── src └── inscriptis │ ├── __init__.py │ ├── annotation │ ├── __init__.py │ ├── output │ │ ├── __init__.py │ │ ├── html.py │ │ ├── surface.py │ │ └── xml.py │ └── parser.py │ ├── cli │ ├── __init__.py │ └── inscript.py │ ├── css_profiles.py │ ├── html_engine.py │ ├── html_properties.py │ ├── metadata.py │ ├── model │ ├── __init__.py │ ├── attribute.py │ ├── canvas │ │ ├── __init__.py │ │ ├── block.py │ │ └── prefix.py │ ├── config.py │ ├── css.py │ ├── html_document_state.py │ ├── html_element.py │ ├── table.py │ └── tag │ │ ├── __init__.py │ │ ├── a_tag.py │ │ ├── br_tag.py │ │ ├── img_tag.py │ │ ├── 
list_tag.py │ │ └── table_tag.py │ └── service │ ├── __init__.py │ └── web.py ├── tests ├── __init__.py ├── data │ └── annotation-profile-unittest.json ├── html │ ├── advanced-prefix-test.html │ ├── advanced-prefix-test.txt │ ├── br-in-table.html │ ├── br-in-table.txt │ ├── br-in-table2.html │ ├── br-li.html │ ├── br-li.txt │ ├── br.html │ ├── br.txt │ ├── direct-enumeration.html │ ├── direct-enumeration.txt │ ├── empty-table.html │ ├── empty-table.txt │ ├── enumerations.html │ ├── enumerations.txt │ ├── html-comment-ofuscation.html │ ├── html-comment-ofuscation.txt │ ├── invalid-table.html │ ├── invalid-table.txt │ ├── invalid-table2.html │ ├── invalid-table2.txt │ ├── invalid-table3.html │ ├── invalid-table3.txt │ ├── invisible.html │ ├── invisible.txt │ ├── invisible2.html │ ├── invisible2.txt │ ├── invisible3.html │ ├── invisible3.txt │ ├── nested-list.html │ ├── nested-list.txt │ ├── nested-table-alignment-css.html │ ├── nested-table-alignment-css.txt │ ├── nested-table-alignment.html │ ├── nested-table-alignment.txt │ ├── nested-table.html │ ├── nested-table.txt │ ├── p-br.html │ ├── p-br.txt │ ├── pre.html │ ├── pre.txt │ ├── real-world │ │ ├── avantec-team.html │ │ ├── naturgruen-team.html │ │ └── rswag-mitarbeiter.html │ ├── stackoverflow-list-snippet.html │ ├── stackoverflow-list-snippet.txt │ ├── subsequent-headings.html │ ├── subsequent-headings.json │ ├── subsequent-headings.txt │ ├── table-alignment.html │ ├── table-alignment.txt │ ├── table-empty-row.html │ ├── table-empty-row.txt │ ├── table-in-table.html │ ├── table-in-table.json │ ├── table-in-table.txt │ ├── table-itemize.html │ ├── table-itemize.txt │ ├── table-pre.html │ ├── table-pre.txt │ ├── table.html │ ├── table.json │ ├── table.txt │ ├── td-only-table.html │ ├── td-only-table.txt │ ├── test.html │ ├── tr-only-table.html │ ├── tr-only-table.txt │ ├── whitespace.html │ ├── whitespace.txt │ ├── wikipedia-code.html │ ├── wikipedia-code.txt │ ├── wikipedia-consequtive-links-and-umlauts.html │ 
├── wikipedia-consequtive-links-and-umlauts.txt │ ├── wikipedia-consequtive-tables.html │ ├── wikipedia-consequtive-tables.json │ ├── wikipedia-enumeration-annotation.html │ ├── wikipedia-enumeration-annotation.json │ ├── wikipedia-enumeration-annotation.txt │ ├── wikipedia-enumeration.html │ ├── wikipedia-enumeration.txt │ ├── wikipedia-equation.html │ ├── wikipedia-equation.txt │ ├── wikipedia-table-bordercase-verticial-alignmnet.html │ ├── wikipedia-table-bordercase-verticial-alignmnet.json │ ├── wikipedia-table-bordercase1.html │ ├── wikipedia-table-bordercase1.json │ ├── wikipedia-table.html │ ├── wikipedia-table.json │ └── wikipedia-table.txt ├── test_annotation.py ├── test_annotation_engine.py ├── test_annotation_output_processor.py ├── test_annotation_output_xml.py ├── test_annotation_rule_parsing.py ├── test_block.py ├── test_broken_table_handling.py ├── test_cli.py ├── test_custom_html_tag_handling.py ├── test_double_a.py ├── test_empty_string.py ├── test_engine.py ├── test_html_conversion_options.py ├── test_html_snippets.py ├── test_html_snippets_annotations.py ├── test_invalid_float_specification.py ├── test_limit_whitespace_affixes.py ├── test_list_div.py ├── test_margin_before_at_start.py ├── test_margin_handling.py ├── test_metadata.py ├── test_model_html_element_canvas.py ├── test_model_prefix.py ├── test_parse_css.py ├── test_strip_xml_header.py ├── test_style_parsing.py ├── test_table_cell.py ├── test_table_cell_formatting.py ├── test_table_row.py ├── test_web_service.py └── test_white_space_handling.py └── tox.ini /.coveragerc: -------------------------------------------------------------------------------- 1 | [run] 2 | omit = tests/ 3 | -------------------------------------------------------------------------------- /.git-blame-ignore-revs: -------------------------------------------------------------------------------- 1 | 55fa29ca39f9ed5895f9e88b2eb0f17e4d84245f 2 | 
-------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | 3 | updates: 4 | 5 | # Enable version updates for github actions. 6 | - package-ecosystem: "github-actions" 7 | directory: "/" 8 | schedule: 9 | # Check for updates to GitHub Actions every weekday 10 | interval: "weekly" 11 | 12 | # Enable version updates for Docker. 13 | - package-ecosystem: "docker" 14 | # Look for a `Dockerfile` in the `root` directory 15 | directory: "/" 16 | # Check for updates once a week 17 | schedule: 18 | interval: "weekly" 19 | -------------------------------------------------------------------------------- /.github/workflows/codeql-analysis.yml: -------------------------------------------------------------------------------- 1 | # For most projects, this workflow file will not need changing; you simply need 2 | # to commit it to your repository. 3 | # 4 | # You may wish to alter this file to override the set of languages analyzed, 5 | # or to provide custom queries or build logic. 6 | # 7 | # ******** NOTE ******** 8 | # We have attempted to detect the languages in your repository. Please check 9 | # the `language` matrix defined below to confirm you have the correct set of 10 | # supported CodeQL languages. 
11 | # 12 | name: "CodeQL" 13 | 14 | on: 15 | push: 16 | pull_request: 17 | schedule: 18 | - cron: '26 5 * * 2' 19 | 20 | jobs: 21 | analyze: 22 | name: Analyze 23 | runs-on: ubuntu-latest 24 | permissions: 25 | actions: read 26 | contents: read 27 | security-events: write 28 | 29 | strategy: 30 | fail-fast: false 31 | matrix: 32 | language: [ 'python' ] 33 | # CodeQL supports [ 'cpp', 'csharp', 'go', 'java', 'javascript', 'python' ] 34 | # Learn more: 35 | # https://docs.github.com/en/free-pro-team@latest/github/finding-security-vulnerabilities-and-errors-in-your-code/configuring-code-scanning#changing-the-languages-that-are-analyzed 36 | 37 | steps: 38 | - name: Checkout repository 39 | uses: actions/checkout@v3 40 | 41 | # Initializes the CodeQL tools for scanning. 42 | - name: Initialize CodeQL 43 | uses: github/codeql-action/init@v2 44 | with: 45 | languages: ${{ matrix.language }} 46 | # If you wish to specify custom queries, you can do so here or in a config file. 47 | # By default, queries listed here will override any specified in a config file. 48 | # Prefix the list here with "+" to use these queries and those in the config file. 49 | # queries: ./path/to/local/query, your-org/your-repo/queries@main 50 | 51 | # Autobuild attempts to build any compiled languages (C/C++, C#, or Java). 52 | # If this step fails, then you should remove it and run the build manually (see below) 53 | - name: Autobuild 54 | uses: github/codeql-action/autobuild@v2 55 | 56 | # ℹ️ Command-line programs to run using the OS shell. 
57 | # 📚 https://git.io/JvXDl 58 | 59 | # ✏️ If the Autobuild fails above, remove it and uncomment the following three lines 60 | # and modify them (or add more) to build your code if your project 61 | # uses a compiled language 62 | 63 | #- run: | 64 | # make bootstrap 65 | # make release 66 | 67 | - name: Perform CodeQL Analysis 68 | uses: github/codeql-action/analyze@v2 69 | -------------------------------------------------------------------------------- /.github/workflows/create-container.yml: -------------------------------------------------------------------------------- 1 | name: container 2 | 3 | on: 4 | push: 5 | tags: 6 | - '*' 7 | 8 | jobs: 9 | build: 10 | runs-on: ubuntu-latest 11 | steps: 12 | - name: Checkout code 13 | uses: actions/checkout@v3 14 | 15 | - name: get version 16 | id: version 17 | run: echo ::set-output name=APP_VERSION::${GITHUB_REF/refs\/tags\//} 18 | 19 | - name: init docker build 20 | uses: docker/setup-buildx-action@v2 21 | 22 | - name: login docker 23 | uses: docker/login-action@v2 24 | with: 25 | registry: ghcr.io 26 | username: ${{ github.actor }} 27 | password: ${{ secrets.GITHUB_TOKEN }} 28 | 29 | - name: publish container 30 | uses: docker/build-push-action@v4 31 | with: 32 | push: true 33 | tags: | 34 | ghcr.io/weblyzard/inscriptis:v${{ steps.version.outputs.APP_VERSION }} 35 | ghcr.io/weblyzard/inscriptis:latest 36 | -------------------------------------------------------------------------------- /.github/workflows/helm-release.yaml: -------------------------------------------------------------------------------- 1 | name: helm release 2 | 3 | on: 4 | push: 5 | branches: 6 | - PhilippKuntschik-patch-2 7 | tags: 8 | - '*' 9 | 10 | jobs: 11 | dispatch_helm_release: 12 | runs-on: ubuntu-latest 13 | steps: 14 | - name: dispatch inscriptis-helm 15 | uses: peter-evans/repository-dispatch@v2 16 | with: 17 | token: ${{ secrets.HELMREPO_ACCESS_TOKEN }} 18 | repository: weblyzard/inscriptis-helm 19 | event-type: tag-released 20 | 
client-payload: '{"ref": "${{ github.ref_name }}"}' 21 | -------------------------------------------------------------------------------- /.github/workflows/python-package.yml: -------------------------------------------------------------------------------- 1 | name: build 2 | 3 | on: 4 | push: 5 | pull_request: 6 | 7 | jobs: 8 | build: 9 | 10 | runs-on: ubuntu-24.04 11 | strategy: 12 | fail-fast: false 13 | matrix: 14 | python-version: [ '3.9', '3.10', '3.11', '3.12', '3.13' ] 15 | 16 | steps: 17 | - uses: actions/checkout@v3 18 | - name: Set up Python ${{ matrix.python-version }} 19 | uses: actions/setup-python@v4 20 | with: 21 | python-version: ${{ matrix.python-version }} 22 | - name: Install build environment 23 | run: | 24 | python -m pip install --upgrade pip 25 | python -m pip install tox setuptools pytest pytest-cov codecov 26 | - name: Build and test with tox. 27 | run: | 28 | tox 29 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | *.pyx 3 | .*.swp 4 | *.egg-info 5 | __pycache__/ 6 | benchmarking_results/ 7 | html_cache/ 8 | .tox 9 | build/ 10 | dist/ 11 | .cache/ 12 | .project 13 | .pydevproject 14 | .settings/ 15 | .pytest_cache/ 16 | .coverage 17 | _build/ 18 | .mypy_cache/ 19 | .idea/ 20 | venv/ 21 | tests/converted.txt 22 | tests/reference.txt 23 | *.c 24 | docs/paper/*.pdf 25 | htmlcov/ 26 | poetry.lock 27 | -------------------------------------------------------------------------------- /.readthedocs.yaml: -------------------------------------------------------------------------------- 1 | # Read the Docs configuration file for Sphinx projects 2 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details 3 | 4 | # Required 5 | version: 2 6 | 7 | # Set the OS, Python version and other tools you might need 8 | build: 9 | os: ubuntu-22.04 10 | tools: 11 | python: "3.12" 12 | # You can also 
specify other tool versions: 13 | # nodejs: "20" 14 | # rust: "1.70" 15 | # golang: "1.20" 16 | 17 | # Build documentation in the "docs/" directory with Sphinx 18 | sphinx: 19 | configuration: docs/conf.py 20 | # You can configure Sphinx to use a different builder, for instance use the dirhtml builder for simpler URLs 21 | # builder: "dirhtml" 22 | # Fail on all warnings to avoid broken references 23 | # fail_on_warning: true 24 | 25 | # Optionally build your docs in additional formats such as PDF and ePub 26 | formats: 27 | - pdf 28 | # - epub 29 | 30 | # Optional but recommended, declare the Python requirements required 31 | # to build your documentation 32 | # See https://docs.readthedocs.io/en/stable/guides/reproducible-builds.html 33 | python: 34 | install: 35 | - requirements: docs/requirements.txt 36 | -------------------------------------------------------------------------------- /AUTHORS: -------------------------------------------------------------------------------- 1 | Albert Weichselbraun 2 | Fabian Odoni 3 | 4 | The design of inscriptis has originally been inspired by SpiffWikiMarkup 5 | developed by Samuel Abels . 6 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing to Inscriptis 2 | 3 | First off, thank you for considering contributing to inscriptis. 4 | There are many ways how you can contribute to the project and these guidelines aim at supporting you in doing so. 5 | 6 | 1. [Reporting bugs and seeking support](#reporting-bugs-and-seeking-support) 7 | 2. [Suggesting enhancements](#suggesting-enhancements) 8 | 3. [Pull requests](#pull-requests) (contributing code) 9 | 4. [Python style guide](#python-style-guide) 10 | 11 | 12 | ## Reporting bugs and seeking support 13 | 14 | Bugs and support requests are tracked as GitHub issues. 
15 | 16 | To create an effective and high-quality ticket, please include the following information in your 17 | ticket: 18 | 19 | 1. **Use a clear and descriptive title** for the issue to identify the problem. This also helps other users to quickly locate bug reports that affect them. 20 | 2. **Describe the exact steps necessary for reproducing the problem** including at least information on 21 | - the affected URL 22 | - the command line parameters or function arguments you used 23 | 3. What would have been the **expected behavior**? 24 | 4. Describe the **observed behavior**. 25 | 5. Provide any additional information which might be helpful in reproducing and/or fixing this issue. 26 | 27 | 28 | ## Suggesting enhancements 29 | 30 | Enhancements are also tracked as GitHub issues and should contain the following information: 31 | 32 | 1. **A clear and descriptive title** helps other people to identify enhancements they like, so that they can also add their thoughts and suggestions. 33 | 2. **Provide a step-by-step description** of the suggested enhancement. 34 | 3. **Describe the current behavior** and **explain which behavior you expected to see instead** and why. 35 | 36 | 37 | ## Pull requests 38 | 39 | 1. Ensure that your code complies with our [Python style guide](#python-style-guide). 40 | 2. Write a unit test that covers your new code and put it into the `./tests` directory. 41 | 3. Execute `tox .` in the project's root directory to ensure that your code passes the static code analysis, coding style guidelines and security checks. 42 | 4. In addition, please document any new API functions in the Inscriptis documentation. 43 | 44 | 45 | ## Python style guide 46 | 47 | Inscriptis code should comply with 48 | - the [PEP8 Style Guide for Python Code](https://www.python.org/dev/peps/pep-0008/), and 49 | - the [Google Python Style Guide](https://google.github.io/styleguide/pyguide.html) 50 | 51 | Please also ensure that 52 | 1. 
functions are properly documented with docstrings that comply to the Google Python Style Guide, and 53 | 2. any new code is covered by unit tests. 54 | 55 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | # 2 | # Stage 1 - Install build dependencies 3 | # 4 | FROM python:3.11-slim-bullseye AS builder 5 | 6 | WORKDIR /inscriptis 7 | RUN python -m venv .venv && .venv/bin/python -m pip install --upgrade pip 8 | RUN .venv/bin/pip install --no-cache-dir inscriptis[web-service] && \ 9 | find /inscriptis/.venv \( -type d -a -name test -o -name tests \) -o \( -type f -a -name '*.pyc' -o -name '*.pyo' \) -exec rm -rf '{}' \+ 10 | 11 | # 12 | # Stage 2 - Copy only necessary files to the runner stage 13 | # 14 | FROM python:3.11-slim-bullseye 15 | LABEL maintainer="albert@weichselbraun.net" 16 | 17 | # Note: only copy the src directory, to prevent bloating the image with 18 | # irrelevant files from the project directory. 19 | WORKDIR /inscriptis 20 | COPY --from=builder /inscriptis /inscriptis 21 | 22 | ENV PATH="/inscriptis/.venv/bin:$PATH" 23 | CMD ["uvicorn", "inscriptis.service.web:app", "--port=5000", "--host=0.0.0.0"] 24 | EXPOSE 5000 25 | -------------------------------------------------------------------------------- /TODO.txt: -------------------------------------------------------------------------------- 1 | Please feel free to address any of the following issues 2 | 3 | - add a class that translates style sheets to the corresponding dictionary of `HtmlElement`s. 4 | - check: option to remove links with a one letter description (footnotes) 5 | - support for
tag (if needed) 6 | -------------------------------------------------------------------------------- /benchmarking/a: -------------------------------------------------------------------------------- 1 | justext is not available. Please install it in order to compare with justext. 2 | 3 | URL: www.watson.de 4 | Lynx : 0.15138936042785645 --> fastest 5 | Inscriptis : 0.20263218879699707 +0.051242828369140625 6 | BeautifulSoup: 0.3756422996520996 +0.22425293922424316 7 | Html2Text : 0.43219757080078125 +0.2808082103729248 8 | 9 | 10 | URL: www.watson.ch-Digital20&20Games-Android-134350872-Der-Monster-Akku-in-diesem-Smartphone-hC3A4lt-bis- 11 | Inscriptis : 0.07737088203430176 --> fastest 12 | BeautifulSoup: 0.1150212287902832 +0.037650346755981445 13 | Lynx : 0.1359405517578125 +0.05856966972351074 14 | Html2Text : 0.1448962688446045 +0.06752538681030273 15 | 16 | 17 | URL: www.heise.de 18 | Lynx : 0.15659260749816895 --> fastest 19 | Inscriptis : 0.20164966583251953 +0.045057058334350586 20 | BeautifulSoup: 0.29897594451904297 +0.14238333702087402 21 | Html2Text : 0.37505173683166504 +0.2184591293334961 22 | 23 | 24 | URL: www.heise.de-newsticker-meldung-Fairphone-2-im-Test-Das-erste-modulare-Smartphone-3043417.html 25 | Inscriptis : 0.09370565414428711 --> fastest 26 | Lynx : 0.15947198867797852 +0.0657663345336914 27 | BeautifulSoup: 0.16203570365905762 +0.06833004951477051 28 | Html2Text : 0.21861886978149414 +0.12491321563720703 29 | 30 | 31 | URL: www.nzz.de 32 | Lynx : 0.17096304893493652 --> fastest 33 | Inscriptis : 0.2877614498138428 +0.11679840087890625 34 | Html2Text : 0.4983334541320801 +0.32737040519714355 35 | BeautifulSoup: 0.5966424942016602 +0.42567944526672363 36 | 37 | 38 | URL: www.nzz.ch-mobilitaet-auto-mobil-bekenntnis-zum-stromauto-ld.3630 39 | Inscriptis : 0.1326134204864502 --> fastest 40 | Lynx : 0.14449405670166016 +0.011880636215209961 41 | BeautifulSoup: 0.16537070274353027 +0.03275728225708008 42 | Html2Text : 0.2061011791229248 
+0.07348775863647461 43 | 44 | 45 | URL: de.wikipedia.org-wiki-Wikipedia-Hauptseite 46 | Inscriptis : 0.0768730640411377 --> fastest 47 | BeautifulSoup: 0.1140899658203125 +0.037216901779174805 48 | Html2Text : 0.1279299259185791 +0.051056861877441406 49 | Lynx : 0.13344478607177734 +0.05657172203063965 50 | 51 | 52 | URL: de.wikipedia.org-wiki-Python_(Programmiersprache) 53 | Lynx : 0.15608739852905273 --> fastest 54 | Inscriptis : 0.2505784034729004 +0.09449100494384766 55 | BeautifulSoup: 0.3396627902984619 +0.18357539176940918 56 | Html2Text : 0.407498836517334 +0.25141143798828125 57 | 58 | 59 | URL: de.wikipedia.org-wiki-Chur 60 | Lynx : 0.19526290893554688 --> fastest 61 | Inscriptis : 0.4372870922088623 +0.24202418327331543 62 | BeautifulSoup: 0.5105750560760498 +0.31531214714050293 63 | Html2Text : 0.7925112247467041 +0.5972483158111572 64 | 65 | 66 | URL: jr-central.co.jp 67 | Inscriptis : 0.030536651611328125 --> fastest 68 | BeautifulSoup: 0.04150390625 +0.010967254638671875 69 | Html2Text : 0.05070781707763672 +0.020171165466308594 70 | Lynx : 0.1379244327545166 +0.10738778114318848 71 | 72 | 73 | URL: www.aljazeera.net-portal 74 | Lynx : 0.18790936470031738 --> fastest 75 | Inscriptis : 0.3582143783569336 +0.1703050136566162 76 | BeautifulSoup: 0.5611743927001953 +0.37326502799987793 77 | Html2Text : 0.6482110023498535 +0.46030163764953613 78 | 79 | 80 | URL: www.aljazeera.net-news-humanrights-2015-12-14-D8A3D988D8A8D8A7D985D8A7-D98AD8ACD8AFD8AF-D8A7D984D8AA 81 | Inscriptis : 0.13330984115600586 --> fastest 82 | Lynx : 0.14847993850708008 +0.015170097351074219 83 | BeautifulSoup: 0.17941498756408691 +0.046105146408081055 84 | Html2Text : 0.242262601852417 +0.10895276069641113 85 | 86 | 87 | URL: www.fhgr.ch 88 | Lynx : 0.20734667778015137 --> fastest 89 | Inscriptis : 0.5514888763427734 +0.34414219856262207 90 | BeautifulSoup: 0.7790236473083496 +0.5716769695281982 91 | Html2Text : 0.9708971977233887 +0.7635505199432373 92 | 93 | 94 | URL: 
www.diepresse.com 95 | Lynx : 0.18340134620666504 --> fastest 96 | Inscriptis : 0.2943253517150879 +0.11092400550842285 97 | BeautifulSoup: 0.48204803466796875 +0.2986466884613037 98 | Html2Text : 0.5474369525909424 +0.36403560638427734 99 | 100 | 101 | URL: derstandard.at 102 | Lynx : 0.17057490348815918 --> fastest 103 | Inscriptis : 0.3920929431915283 +0.22151803970336914 104 | BeautifulSoup: 0.4781017303466797 +0.3075268268585205 105 | Html2Text : 0.5499060153961182 +0.379331111907959 106 | 107 | 108 | URL: krone.at 109 | Lynx : 0.18678593635559082 --> fastest 110 | Inscriptis : 0.41831398010253906 +0.23152804374694824 111 | BeautifulSoup: 0.6808819770812988 +0.494096040725708 112 | Html2Text : 0.794529914855957 +0.6077439785003662 113 | 114 | -------------------------------------------------------------------------------- /benchmarking/b: -------------------------------------------------------------------------------- 1 | justext is not available. Please install it in order to compare with justext. 
2 | 3 | URL: www.watson.de 4 | -------------------------------------------------------------------------------- /benchmarking/speed_comparisons.txt: -------------------------------------------------------------------------------- 1 | 2 | URL: www.watson.de 3 | inscriptis : 0.0886073112487793 --> fastest 4 | lynx : 0.09243917465209961 +0.0038318634033203125 5 | html2text : 0.27269411087036133 +0.18408679962158203 6 | beautifulsoup: 0.3715205192565918 +0.2829132080078125 7 | 8 | 9 | URL: www.watson.ch-Digital20&20Games-Android-134350872-Der-Monster-Akku-in-diesem-Smartphone-hC3A4lt-bis- 10 | inscriptis : 0.031877756118774414 --> fastest 11 | lynx : 0.06591463088989258 +0.034036874771118164 12 | html2text : 0.09615325927734375 +0.06427550315856934 13 | beautifulsoup: 0.10839462280273438 +0.07651686668395996 14 | 15 | 16 | URL: www.heise.de 17 | inscriptis : 0.0771639347076416 --> fastest 18 | lynx : 0.0936579704284668 +0.016494035720825195 19 | html2text : 0.2419900894165039 +0.1648261547088623 20 | beautifulsoup: 0.29470372200012207 +0.21753978729248047 21 | 22 | 23 | URL: www.heise.de-newsticker-meldung-Fairphone-2-im-Test-Das-erste-modulare-Smartphone-3043417.html 24 | inscriptis : 0.036151885986328125 --> fastest 25 | lynx : 0.0704348087310791 +0.03428292274475098 26 | html2text : 0.10545611381530762 +0.06930422782897949 27 | beautifulsoup: 0.12367486953735352 +0.08752298355102539 28 | 29 | 30 | URL: www.nzz.de 31 | lynx : 0.10388016700744629 --> fastest 32 | inscriptis : 0.11366724967956543 +0.00978708267211914 33 | html2text : 0.34471607208251953 +0.24083590507507324 34 | beautifulsoup: 0.37203025817871094 +0.26815009117126465 35 | 36 | 37 | URL: www.nzz.ch-mobilitaet-auto-mobil-bekenntnis-zum-stromauto-ld.3630 38 | inscriptis : 0.05420851707458496 --> fastest 39 | lynx : 0.08396458625793457 +0.02975606918334961 40 | html2text : 0.15306854248046875 +0.09886002540588379 41 | beautifulsoup: 0.16551637649536133 +0.11130785942077637 42 | 43 | 44 | URL: 
de.wikipedia.org-wiki-Wikipedia-Hauptseite 45 | inscriptis : 0.029024839401245117 --> fastest 46 | lynx : 0.0713193416595459 +0.04229450225830078 47 | beautifulsoup: 0.08946847915649414 +0.06044363975524902 48 | html2text : 0.09077596664428711 +0.06175112724304199 49 | 50 | 51 | URL: de.wikipedia.org-wiki-Python_(Programmiersprache) 52 | inscriptis : 0.08830070495605469 --> fastest 53 | lynx : 0.09342122077941895 +0.005120515823364258 54 | html2text : 0.30716776847839355 +0.21886706352233887 55 | beautifulsoup: 0.3195374011993408 +0.23123669624328613 56 | 57 | 58 | URL: de.wikipedia.org-wiki-Chur 59 | lynx : 0.110748291015625 --> fastest 60 | inscriptis : 0.16320323944091797 +0.05245494842529297 61 | html2text : 0.4872932434082031 +0.3765449523925781 62 | beautifulsoup: 0.4883759021759033 +0.3776276111602783 63 | 64 | 65 | URL: jr-central.co.jp 66 | inscriptis : 0.012284517288208008 --> fastest 67 | html2text : 0.03157520294189453 +0.019290685653686523 68 | beautifulsoup: 0.04013681411743164 +0.027852296829223633 69 | lynx : 0.06790828704833984 +0.055623769760131836 70 | 71 | 72 | URL: www.aljazeera.net-portal 73 | lynx : 0.11873912811279297 --> fastest 74 | inscriptis : 0.13616037368774414 +0.017421245574951172 75 | html2text : 0.35196900367736816 +0.2332298755645752 76 | beautifulsoup: 0.5011019706726074 +0.38236284255981445 77 | 78 | 79 | URL: www.aljazeera.net-news-humanrights-2015-12-14-D8A3D988D8A8D8A7D985D8A7-D98AD8ACD8AFD8AF-D8A7D984D8AA 80 | inscriptis : 0.04958152770996094 --> fastest 81 | lynx : 0.08647871017456055 +0.03689718246459961 82 | html2text : 0.1424856185913086 +0.09290409088134766 83 | beautifulsoup: 0.21869587898254395 +0.169114351272583 84 | 85 | 86 | URL: www.htwchur.ch 87 | inscriptis : 0.04151415824890137 --> fastest 88 | lynx : 0.07280635833740234 +0.03129220008850098 89 | html2text : 0.11662626266479492 +0.07511210441589355 90 | beautifulsoup: 0.1333613395690918 +0.09184718132019043 91 | 92 | 93 | URL: www.diepresse.com 94 | lynx : 
0.10844087600708008 --> fastest 95 | inscriptis : 0.11291694641113281 +0.004476070404052734 96 | html2text : 0.3410661220550537 +0.23262524604797363 97 | beautifulsoup: 0.42446470260620117 +0.3160238265991211 98 | 99 | 100 | URL: derstandard.at 101 | lynx : 0.10470342636108398 --> fastest 102 | inscriptis : 0.14974093437194824 +0.04503750801086426 103 | html2text : 0.4319000244140625 +0.3271965980529785 104 | beautifulsoup: 0.4459238052368164 +0.3412203788757324 105 | 106 | 107 | URL: krone.at 108 | lynx : 0.11936330795288086 --> fastest 109 | inscriptis : 0.18073749542236328 +0.06137418746948242 110 | html2text : 0.571204662322998 +0.4518413543701172 111 | beautifulsoup: 0.6350071430206299 +0.515643835067749 112 | 113 | 114 | -------------------------------------------------------------------------------- /benchmarking/url_list.txt: -------------------------------------------------------------------------------- 1 | https://www.watson.de 2 | https://www.watson.ch/Digital%20&%20Games/Android/134350872-Der-Monster-Akku-in-diesem-Smartphone-h%C3%A4lt-bis-15-Tage 3 | https://www.heise.de 4 | https://www.heise.de/newsticker/meldung/Fairphone-2-im-Test-Das-erste-modulare-Smartphone-3043417.html 5 | http://www.nzz.de 6 | https://www.nzz.ch/mobilitaet/auto-mobil/bekenntnis-zum-stromauto-ld.3630 7 | https://de.wikipedia.org/wiki/Wikipedia:Hauptseite 8 | https://de.wikipedia.org/wiki/Python_(Programmiersprache) 9 | https://de.wikipedia.org/wiki/Chur 10 | http://jr-central.co.jp 11 | http://www.aljazeera.net/portal 12 | http://www.aljazeera.net/news/humanrights/2015/12/14/%D8%A3%D9%88%D8%A8%D8%A7%D9%85%D8%A7-%D9%8A%D8%AC%D8%AF%D8%AF-%D8%A7%D9%84%D8%AA%D8%B2%D8%A7%D9%85%D9%87-%D8%A8%D8%A5%D8%BA%D9%84%D8%A7%D9%82-%D8%BA%D9%88%D8%A7%D9%86%D8%AA%D8%A7%D9%86%D8%A7%D9%85%D9%88 13 | https://www.fhgr.ch 14 | https://www.diepresse.com 15 | https://derstandard.at 16 | https://krone.at 17 | https://stackoverflow.com/questions/328356/extracting-text-from-html-file-using-python/46921881 
18 | https://www.chur.ch/churinzahlen 19 | -------------------------------------------------------------------------------- /docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: '2' 2 | 3 | services: 4 | inscriptis: 5 | build: 6 | context: . 7 | dockerfile: Dockerfile 8 | ports: 9 | - 5000:5000 10 | volumes: 11 | - /etc/localtime:/etc/localtime:ro 12 | environment: 13 | - TZ=Europe/Berlin 14 | restart: always 15 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | SPHINXPROJ = inscriptis 8 | SOURCEDIR = . 9 | BUILDDIR = _build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 
19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) -------------------------------------------------------------------------------- /docs/README.rst: -------------------------------------------------------------------------------- 1 | ../README.rst -------------------------------------------------------------------------------- /docs/benchmarking.rst: -------------------------------------------------------------------------------- 1 | ==================================== 2 | Testing, benchmarking and evaluation 3 | ==================================== 4 | 5 | Unit tests 6 | ========== 7 | In addition to the standard unit tests that are located in the project's `tests` directory, Inscriptis also contains 8 | test cases that solely focus on the html to text conversion and are located in the `tests/html` directory. 9 | These tests consist of two files: 10 | 11 | 1. `test-name.html` and 12 | 2. `test-name.txt` 13 | 14 | The `.txt` file contains the reference text output for the given html file. 15 | 16 | Since Inscriptis 2.0 there may also be a third file named `test-name.json` in the `tests/html` directory which contains a JSON dictionary with keys 17 | 18 | 1. `annotation_rules` containing the annotation rules for extracting metadata from the corresponding html file, and 19 | 2. `result` which stores the surface forms of the extracted metadata. 20 | 21 | 22 | Example:: 23 | 24 | {"annotation_rules": { 25 | "h1": ["heading"], 26 | "b": ["emphasis"] 27 | }, 28 | "result": [ 29 | ["heading", "The first"], 30 | ["heading", "The second"], 31 | ["heading", "Subheading"] 32 | ] 33 | } 34 | 35 | 36 | Text conversion output comparison and benchmarking 37 | ================================================== 38 | The inscriptis project contains a benchmarking script that can compare different HTML to text conversion approaches. 
39 | The script will run the different approaches on a list of URLs, `url_list.txt`, and save the text output into a time stamped folder in `benchmarking/benchmarking_results` for manual comparison. 40 | Additionally the processing speed of every approach per URL is measured and saved in a text file called `speed_comparisons.txt` in the respective time stamped folder. 41 | 42 | To run the benchmarking script execute `run_benchmarking.py` from within the folder `benchmarking`. 43 | In `def pipeline()` select which HTML -> Text algorithms are executed by modifying:: 44 | 45 | run_lynx = True 46 | run_justext = True 47 | run_html2text = True 48 | run_beautifulsoup = True 49 | run_inscriptis = True 50 | 51 | In `url_list.txt` the URLs to be parsed can be specified by adding them to the file, one per line with no additional formatting. URLs need to be complete (including http:// or https://) 52 | e.g.:: 53 | 54 | http://www.informationscience.ch 55 | https://en.wikipedia.org/wiki/Information_science 56 | ... 
57 | 58 | -------------------------------------------------------------------------------- /docs/contributing.md: -------------------------------------------------------------------------------- 1 | ../CONTRIBUTING.md -------------------------------------------------------------------------------- /docs/images/stackoverflow-code-annotation.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/weblyzard/inscriptis/2ef7e3bdc428816a34e8f6e6280888c7d64eec26/docs/images/stackoverflow-code-annotation.png -------------------------------------------------------------------------------- /docs/images/wikipedia-chur-entry-annotation.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/weblyzard/inscriptis/2ef7e3bdc428816a34e8f6e6280888c7d64eec26/docs/images/wikipedia-chur-entry-annotation.png -------------------------------------------------------------------------------- /docs/images/wikipedia-chur-table-annotation.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/weblyzard/inscriptis/2ef7e3bdc428816a34e8f6e6280888c7d64eec26/docs/images/wikipedia-chur-table-annotation.png -------------------------------------------------------------------------------- /docs/images/xda-posts-annotation.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/weblyzard/inscriptis/2ef7e3bdc428816a34e8f6e6280888c7d64eec26/docs/images/xda-posts-annotation.png -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | .. inscriptis documentation master file, created by 2 | sphinx-quickstart on Sat Dec 14 06:42:31 2019. 
3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | .. include:: README.rst 7 | 8 | Documentation 9 | ============= 10 | 11 | Contents: 12 | 13 | .. toctree:: 14 | :maxdepth: 2 15 | 16 | Documentation 17 | benchmarking 18 | contributing 19 | inscriptis-module-documentation 20 | 21 | 22 | 23 | Indices and tables 24 | ================== 25 | 26 | * :ref:`genindex` 27 | * :ref:`modindex` 28 | * :ref:`search` 29 | 30 | -------------------------------------------------------------------------------- /docs/inscriptis-module-documentation.rst: -------------------------------------------------------------------------------- 1 | =============================== 2 | Inscriptis module documentation 3 | =============================== 4 | 5 | .. automodule:: inscriptis 6 | :members: 7 | 8 | Inscriptis model 9 | ================ 10 | 11 | Inscriptis HTML engine 12 | ---------------------- 13 | .. automodule:: inscriptis.html_engine 14 | :members: 15 | 16 | Inscriptis HTML properties 17 | -------------------------- 18 | .. automodule:: inscriptis.html_properties 19 | :members: 20 | 21 | Inscriptis CSS model 22 | -------------------- 23 | .. automodule:: inscriptis.model.css 24 | :members: 25 | 26 | Inscriptis canvas model 27 | ----------------------- 28 | .. automodule:: inscriptis.model.canvas 29 | :members: 30 | 31 | .. automodule:: inscriptis.model.canvas.block 32 | :members: 33 | 34 | .. automodule:: inscriptis.model.canvas.prefix 35 | :members: 36 | 37 | 38 | 39 | Inscriptis table model 40 | ---------------------- 41 | .. automodule:: inscriptis.model.table 42 | :members: 43 | 44 | 45 | .. _annotations: 46 | 47 | Inscriptis annotations 48 | ====================== 49 | 50 | .. automodule:: inscriptis.annotation 51 | :members: 52 | 53 | 54 | Annotation processors 55 | --------------------- 56 | 57 | .. 
automodule:: inscriptis.annotation.output 58 | :members: 59 | -------------------------------------------------------------------------------- /docs/paper/Makefile: -------------------------------------------------------------------------------- 1 | all: 2 | docker run --rm \ 3 | --volume `pwd`:/data \ 4 | --user $(id -u):$(id -g) \ 5 | --env JOURNAL=joss \ 6 | openjournals/paperdraft 7 | -------------------------------------------------------------------------------- /docs/paper/images/annotations.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/weblyzard/inscriptis/2ef7e3bdc428816a34e8f6e6280888c7d64eec26/docs/paper/images/annotations.png -------------------------------------------------------------------------------- /docs/paper/images/inscriptis-vs-lynx.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/weblyzard/inscriptis/2ef7e3bdc428816a34e8f6e6280888c7d64eec26/docs/paper/images/inscriptis-vs-lynx.png -------------------------------------------------------------------------------- /docs/paper/images/inscriptis-vs-lynx.xcf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/weblyzard/inscriptis/2ef7e3bdc428816a34e8f6e6280888c7d64eec26/docs/paper/images/inscriptis-vs-lynx.xcf -------------------------------------------------------------------------------- /docs/paper/images/raw/inscriptis.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/weblyzard/inscriptis/2ef7e3bdc428816a34e8f6e6280888c7d64eec26/docs/paper/images/raw/inscriptis.png -------------------------------------------------------------------------------- /docs/paper/images/raw/lynx.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/weblyzard/inscriptis/2ef7e3bdc428816a34e8f6e6280888c7d64eec26/docs/paper/images/raw/lynx.png -------------------------------------------------------------------------------- /docs/requirements.txt: -------------------------------------------------------------------------------- 1 | lxml 2 | requests 3 | inscriptis 4 | myst_parser 5 | -------------------------------------------------------------------------------- /examples/annotation/annotation-profile.json: -------------------------------------------------------------------------------- 1 | { 2 | "h1": ["heading"], 3 | "h2": ["heading"], 4 | "h3": ["heading"], 5 | "h4": ["heading"], 6 | "h5": ["heading"], 7 | "b": ["emphasis"], 8 | "div#class=toc": ["table-of-contents"], 9 | "#class=FactBox": ["fact-box"], 10 | "#class=shortdescription]": ["description"], 11 | "table": ["table"], 12 | "tr": ["row"], 13 | "td": ["cell"] 14 | } 15 | -------------------------------------------------------------------------------- /examples/annotation/stackoverflow.json: -------------------------------------------------------------------------------- 1 | { 2 | "h1": ["heading"], 3 | "h2": ["heading"], 4 | "h3": ["heading"], 5 | "b": ["emphasis"], 6 | "code": ["code"], 7 | "#itemprop=dateCreated": ["creation-date"], 8 | "#class=lang-py": ["code"], 9 | "#class=user-details": ["user"], 10 | "#class=reputation-score": ["reputation"], 11 | "#class=comment-user": ["comment-user"], 12 | "#class=comment-date": ["comment-date"], 13 | "#class=comment-copy": ["comment-comment"] 14 | } 15 | -------------------------------------------------------------------------------- /examples/annotation/table-annotation-profile.json: -------------------------------------------------------------------------------- 1 | { 2 | "table": ["table"], 3 | "th": ["table-heading"], 4 | "tr": ["table-row"], 5 | "td": ["table-cell"], 6 | "b": ["emphasis"] 7 | } 8 | 
-------------------------------------------------------------------------------- /examples/annotation/unittest.json: -------------------------------------------------------------------------------- 1 | { 2 | "h1": ["heading"], 3 | "h2": ["heading"], 4 | "h3": ["heading"], 5 | "b": ["emphasis"], 6 | "table": ["table"] 7 | } 8 | -------------------------------------------------------------------------------- /examples/annotation/wikipedia-entities-and-citations.json: -------------------------------------------------------------------------------- 1 | { 2 | "a#title": ["entity"], 3 | "a#class=new": ["missing entity"], 4 | "#class=reference": ["citation"] 5 | } 6 | -------------------------------------------------------------------------------- /examples/annotation/wikipedia.json: -------------------------------------------------------------------------------- 1 | { 2 | "h1": ["heading"], 3 | "h2": ["heading"], 4 | "h3": ["subheading"], 5 | "h4": ["subheading"], 6 | "h5": ["subheading"], 7 | "i": ["emphasis"], 8 | "b": ["bold"], 9 | "table": ["table"], 10 | "th": ["tableheading"], 11 | "a": ["link"] 12 | } 13 | -------------------------------------------------------------------------------- /examples/annotation/xda-developers.json: -------------------------------------------------------------------------------- 1 | { 2 | "article#class=message-body": ["article"], 3 | "li#class=u-concealed": ["time"], 4 | "#itemprop=name": ["user-name"], 5 | "#itemprop=jobTitle": ["user-title"] 6 | } 7 | -------------------------------------------------------------------------------- /examples/custom-html-handling.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | """ 4 | Custom HTML tag handling example. 5 | 6 | Add a custom HTML handler for the bold tag which encloses 7 | bold text with "**". 8 | 9 | Example: 10 | "Welcome to Chur" is rendered as "Welcome to **Chur**". 
11 | """ 12 | from typing import Dict 13 | 14 | from inscriptis import ParserConfig 15 | from inscriptis.html_engine import Inscriptis 16 | from inscriptis.model.html_document_state import HtmlDocumentState 17 | from inscriptis.model.tag import CustomHtmlTagHandlerMapping 18 | from lxml.html import fromstring 19 | 20 | 21 | def my_handle_start_b(state: HtmlDocumentState, _: Dict) -> None: 22 | """Handle the opening tag.""" 23 | state.tags[-1].write("**") 24 | 25 | 26 | def my_handle_end_b(state: HtmlDocumentState) -> None: 27 | """Handle the closing tag.""" 28 | state.tags[-1].write("**") 29 | 30 | 31 | MY_MAPPING = CustomHtmlTagHandlerMapping( 32 | start_tag_mapping={"b": my_handle_start_b}, 33 | end_tag_mapping={"b": my_handle_end_b}, 34 | ) 35 | 36 | 37 | HTML = "Welcome to Chur" 38 | 39 | html_tree = fromstring(HTML) 40 | inscriptis = Inscriptis( 41 | html_tree, ParserConfig(custom_html_tag_handler_mapping=MY_MAPPING) 42 | ) 43 | print(inscriptis.get_text()) 44 | -------------------------------------------------------------------------------- /img/nested-table-firefox.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/weblyzard/inscriptis/2ef7e3bdc428816a34e8f6e6280888c7d64eec26/img/nested-table-firefox.png -------------------------------------------------------------------------------- /img/wikipedia-chur-firefox.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/weblyzard/inscriptis/2ef7e3bdc428816a34e8f6e6280888c7d64eec26/img/wikipedia-chur-firefox.png -------------------------------------------------------------------------------- /img/wikipedia-python-example.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/weblyzard/inscriptis/2ef7e3bdc428816a34e8f6e6280888c7d64eec26/img/wikipedia-python-example.png 
-------------------------------------------------------------------------------- /publish.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Publishing sequence: 4 | # ==================== 5 | # 1. create pypi package 6 | # 2. publish docker container 7 | # 3. create github release (which runs the helm scripts) 8 | 9 | # publish the latest package to pypi 10 | # sources: 11 | # - https://packaging.python.org/guides/distributing-packages-using-setuptools/#packaging-your-project 12 | # - https://packaging.python.org/guides/making-a-pypi-friendly-readme/ 13 | 14 | VERSION=$(grep -oP '^version = "\K[^"]+' pyproject.toml) 15 | IMAGE_NAME=inscriptis 16 | 17 | case "$1" in 18 | python) 19 | # cleanup dist 20 | rm -rf ./dist 21 | 22 | # build and publish packages 23 | poetry publish --build 24 | ;; 25 | docker) 26 | echo "Publishing ${IMAGE_NAME} in version ${VERSION}" 27 | docker login ghcr.io -u AlbertWeichselbraun --password-stdin < ../github-token.txt 28 | docker build -t ${IMAGE_NAME}:${VERSION} . 29 | 30 | # Step 2: Tag 31 | docker tag ${IMAGE_NAME}:${VERSION} ghcr.io/weblyzard/${IMAGE_NAME}:${VERSION} 32 | docker tag ${IMAGE_NAME}:${VERSION} ghcr.io/weblyzard/${IMAGE_NAME}:latest 33 | 34 | # Step 3: Publish 35 | docker push ghcr.io/weblyzard/${IMAGE_NAME}:${VERSION} 36 | ;; 37 | esac 38 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "inscriptis" 3 | version = "2.6.0" 4 | authors = ["Albert Weichselbraun ", "Fabian Odoni "] 5 | description = "inscriptis - HTML to text converter." 
6 | keywords = ["HTML", "converter", "text"] 7 | classifiers = [ 8 | 'Development Status :: 5 - Production/Stable', 9 | 'Intended Audience :: Developers', 10 | 'License :: OSI Approved :: Apache Software License', 11 | 'Topic :: Text Processing', 12 | 'Topic :: Text Processing :: Markup :: HTML', 13 | 'Topic :: Utilities', 14 | 'Programming Language :: Python :: 3', 15 | 'Programming Language :: Python :: 3.9', 16 | 'Programming Language :: Python :: 3.10', 17 | 'Programming Language :: Python :: 3.11', 18 | 'Programming Language :: Python :: 3.12', 19 | 'Programming Language :: Python :: 3.13', 20 | ] 21 | homepage = "https://github.com/weblyzard/inscriptis" 22 | repository = "https://github.com/weblyzard/inscriptis" 23 | documentation = "https://inscriptis.readthedocs.io/en" 24 | license = "Apache-2.0" 25 | readme = "README.rst" 26 | 27 | packages = [ 28 | {include = "inscriptis", from="src"}, 29 | ] 30 | 31 | 32 | [tool.poetry.scripts] 33 | inscript = "inscriptis.cli.inscript:cli" 34 | inscriptis-api = "inscriptis.service.web:start" 35 | 36 | 37 | [tool.poetry.extras] 38 | web-service = ["fastapi", "uvicorn"] 39 | 40 | 41 | [tool.poetry.dependencies] 42 | python = "^3.9 || ^3.10 || ^3.11 || ^3.12 || ^3.13" 43 | requests = ">=2.32.2" 44 | lxml = ">=4.9.3" 45 | 46 | # optional dependencies 47 | fastapi = { version = "^0.115.11", optional = true } 48 | uvicorn = { version = "^0.34.0", optional = true } 49 | 50 | [tool.poetry.group.dev.dependencies] 51 | pytest = "^8.3.5" 52 | 53 | 54 | [build-system] 55 | requires = ["poetry-core"] 56 | build-backend = "poetry.core.masonry.api" 57 | 58 | 59 | # code formatting with black 60 | [tool.black] 61 | line-length = 88 62 | target-version = ["py39", "py310", "py311", "py312", "py313"] 63 | extend-exclude = '\.html$|\.json$|\.txt$|/a$|/b$' 64 | include = ''' 65 | ^/src/|^/tests/|^/benchmarking/|^/examples/ 66 | ''' 67 | -------------------------------------------------------------------------------- 
/src/inscriptis/__init__.py: -------------------------------------------------------------------------------- 1 | r"""Parse HTML content and convert it into a text representation. 2 | 3 | Inscriptis provides support for 4 | 5 | - nested HTML tables 6 | - basic Cascading Style Sheets 7 | - annotations 8 | 9 | The following example provides the text representation of 10 | `<https://www.fhgr.ch>`_. 11 | 12 | .. code:: 13 | 14 | import urllib.request 15 | from inscriptis import get_text 16 | 17 | url = 'https://www.fhgr.ch' 18 | html = urllib.request.urlopen(url).read().decode('utf-8') 19 | 20 | text = get_text(html) 21 | 22 | print(text) 23 | 24 | Use the method :meth:`~inscriptis.get_annotated_text` to obtain text and 25 | annotations. The method requires annotation rules as described in annotations_. 26 | 27 | .. code:: 28 | 29 | import urllib.request 30 | from inscriptis import get_annotated_text, ParserConfig 31 | 32 | url = "https://www.fhgr.ch" 33 | html = urllib.request.urlopen(url).read().decode('utf-8') 34 | 35 | # annotation rules specify the HTML elements and attributes to annotate. 36 | rules = {'h1': ['heading'], 37 | 'h2': ['heading'], 38 | '#class=FactBox': ['fact-box'], 39 | 'i': ['emphasis']} 40 | 41 | output = get_annotated_text(html, ParserConfig(annotation_rules=rules)) 42 | print("Text:", output['text']) 43 | print("Annotations:", output['label']) 44 | 45 | The method returns a dictionary with two keys: 46 | 47 | 1. `text` which contains the page's plain text and 48 | 2. `label` with the annotations in JSONL format that is used by annotators 49 | such as `doccano `_. 50 | 51 | Annotations in the `label` field are returned as a list of triples with 52 | `start index`, `end index` and `label` as indicated below: 53 | 54 | .. 
code-block:: json 55 | 56 | {"text": "Chur\n\nChur is the capital and largest town of the Swiss canton 57 | of the Grisons and lies in the Grisonian Rhine Valley.", 58 | "label": [[0, 4, "heading"], [6, 10, "emphasis"]]} 59 | 60 | """ 61 | 62 | import re 63 | from typing import Dict, Optional, Any 64 | from inscriptis.model.config import ParserConfig 65 | 66 | from lxml.etree import ParserError 67 | from lxml.html import fromstring, HtmlElement 68 | 69 | from inscriptis.html_engine import Inscriptis 70 | 71 | RE_STRIP_XML_DECLARATION = re.compile(r"^<\?xml [^>]+?\?>") 72 | 73 | 74 | def _get_html_tree(html_content: str) -> Optional[HtmlElement]: 75 | """Obtain the HTML parse tree for the given HTML content. 76 | 77 | Args: 78 | html_content: The content to parse. 79 | 80 | Returns: 81 | The corresponding HTML parse tree. 82 | """ 83 | html_content = html_content.strip() 84 | if not html_content: 85 | return None 86 | 87 | # strip XML declaration, if necessary 88 | if html_content.startswith("" + html_content + "") 95 | 96 | 97 | def get_text(html_content: str, config: ParserConfig = None) -> str: 98 | """Provide a text representation of the given HTML content. 99 | 100 | Args: 101 | html_content (str): The HTML content to convert. 102 | config: An optional ParserConfig object. 103 | 104 | Returns: 105 | The text representation of the HTML content. 106 | """ 107 | html_tree = _get_html_tree(html_content) 108 | return Inscriptis(html_tree, config).get_text() if html_tree is not None else "" 109 | 110 | 111 | def get_annotated_text( 112 | html_content: str, config: ParserConfig = None 113 | ) -> Dict[str, Any]: 114 | """Return a dictionary of the extracted text and annotations. 115 | 116 | Notes: 117 | - the text is stored under the key 'text'. 118 | - annotations are provided under the key 'label' which contains a 119 | list of :class:`Annotation`s. 
120 | 121 | Examples: 122 | {"text": "EU rejects German call to boycott British lamb.", " 123 | label": [ [0, 2, "strong"], ... ]} 124 | {"text": "Peter Blackburn", 125 | "label": [ [0, 15, "heading"] ]} 126 | 127 | Returns: 128 | A dictionary of text (key: 'text') and annotations (key: 'label') 129 | """ 130 | html_tree = _get_html_tree(html_content) 131 | if html_tree is None: 132 | return {} 133 | 134 | inscriptis = Inscriptis(html_tree, config) 135 | text = inscriptis.get_text() 136 | labels = [(a.start, a.end, a.metadata) for a in inscriptis.get_annotations()] 137 | return {"text": text, "label": labels} 138 | -------------------------------------------------------------------------------- /src/inscriptis/annotation/__init__.py: -------------------------------------------------------------------------------- 1 | """The model used for saving annotations.""" 2 | 3 | from typing import List 4 | from typing import NamedTuple 5 | 6 | from inscriptis.html_properties import HorizontalAlignment 7 | 8 | 9 | class Annotation(NamedTuple): 10 | """An Inscriptis annotation which provides metadata on the extracted text. 11 | 12 | The :attr:`start` and :attr:`end` indices indicate the span of the text 13 | to which the metadata refers, and the attribute :attr:`metadata` contains 14 | the tuple of tags describing this span. 15 | 16 | Example:: 17 | 18 | Annotation(0, 10, ('heading', )) 19 | 20 | The annotation above indicates that the text span between the 1st (index 0) 21 | and 11th (index 10) character of the extracted text contains a *heading*. 
22 | """ 23 | 24 | start: int 25 | """the annotation's start index within the text output.""" 26 | end: int 27 | """the annotation's end index within the text output.""" 28 | metadata: str 29 | """the tag to be attached to the annotation.""" 30 | 31 | 32 | def horizontal_shift( 33 | annotations: List[Annotation], 34 | content_width: int, 35 | line_width: int, 36 | align: HorizontalAlignment, 37 | shift: int = 0, 38 | ) -> List[Annotation]: 39 | r"""Shift annotations based on the given line's formatting. 40 | 41 | Adjusts the start and end indices of annotations based on the line's 42 | formatting and width. 43 | 44 | Args: 45 | annotations: a list of Annotations. 46 | content_width: the width of the actual content 47 | line_width: the width of the line in which the content is placed. 48 | align: the horizontal alignment (left, right, center) to assume for 49 | the adjustment 50 | shift: an optional additional shift 51 | 52 | Returns: 53 | A list of :class:`Annotation`\s with the adjusted start and end 54 | positions. 55 | """ 56 | if align == HorizontalAlignment.left: 57 | h_align = shift 58 | elif align == HorizontalAlignment.right: 59 | h_align = shift + line_width - content_width 60 | else: 61 | h_align = shift + (line_width - content_width) // 2 62 | 63 | return [ 64 | Annotation(a.start + h_align, a.end + h_align, a.metadata) for a in annotations 65 | ] 66 | -------------------------------------------------------------------------------- /src/inscriptis/annotation/output/__init__.py: -------------------------------------------------------------------------------- 1 | r""":class:`AnnotationProcessor`\s transform annotations to an output format. 2 | 3 | All AnnotationProcessor's implement the :class:`AnnotationProcessor` interface 4 | by overwrite the class's :meth:`AnnotationProcessor.__call__` method. 5 | 6 | .. note:: 7 | 1. 
The AnnotationExtractor class must be put into a package with the 8 | extractor's name (e.g., :mod:`inscriptis.annotation.output.*package*`) 9 | and be named :class:`*PackageExtractor*` (see the examples below). 10 | 2. The overwritten :meth:`__call__` method may either extend the original 11 | dictionary which contains the extracted text and annotations (e.g., 12 | :class:`~inscriptis.annotation.output.surface.SurfaceExtractor`) or 13 | may replace it with a custom output (e.g., 14 | :class:`~inscriptis.annotation.output.html.HtmlExtractor` and 15 | :class:`~inscriptis.annotation.output.xml.XmlExtractor`). 16 | 17 | Currently, Inscriptis supports the following built-in AnnotationProcessors: 18 | 19 | 1. :class:`~inscriptis.annotation.output.html.HtmlExtractor` provides an 20 | annotated HTML output format. 21 | 2. :class:`~inscriptis.annotation.output.xml.XmlExtractor` yields an output 22 | which marks annotations with XML tags. 23 | 3. :class:`~inscriptis.annotation.output.surface.SurfaceExtractor` adds the 24 | key `surface` to the result dictionary which contains the surface forms 25 | of the extracted annotations. 26 | 27 | """ 28 | 29 | from typing import Dict, Any 30 | 31 | 32 | class AnnotationProcessor: 33 | """An AnnotationProcessor is called for formatting annotations.""" 34 | 35 | def __call__(self, annotated_text: Dict[str, str]) -> Any: 36 | """Format the given text and annotations. 37 | 38 | Args: 39 | annotated_text: a dictionary that contains the converted text and 40 | all annotations that have been found. 41 | 42 | Returns: 43 | An output representation that has been changed according to the 44 | AnnotationProcessor's design. 
45 | """ 46 | raise NotImplementedError 47 | -------------------------------------------------------------------------------- /src/inscriptis/annotation/output/html.py: -------------------------------------------------------------------------------- 1 | """HTML Annotation Processor.""" 2 | 3 | from collections import defaultdict 4 | from itertools import cycle 5 | from typing import Dict, Any, List 6 | 7 | from inscriptis.annotation.output import AnnotationProcessor 8 | 9 | COLOR_SCHEMA = ("#D8115980", "#8F2D5680", "#21838080", "#FBB13C80", "#73D2DE80") 10 | 11 | 12 | class HtmlExtractor(AnnotationProcessor): 13 | """Provides an HTML version of the extracted text. 14 | 15 | The generated HTML colors annotations based on the COLOR_SCHEMA 16 | constant. 17 | """ 18 | 19 | verbatim = True 20 | 21 | def __call__(self, annotated_text: Dict[str, Any]) -> str: 22 | tag_dict = defaultdict(list) 23 | 24 | for start, end, label in reversed(annotated_text["label"]): 25 | tag_dict[start].append( 26 | f'{label}' 27 | ) 28 | tag_dict[end].insert(0, "") 29 | 30 | tagged_content = [ 31 | "
",
34 |         ]
35 | 
36 |         text = annotated_text["text"]
37 |         current_idx = 0
38 |         for idx, tags in sorted(tag_dict.items()):
39 |             tagged_content.append(text[current_idx:idx].replace("\n", "
\n
"))
40 |             current_idx = idx
41 |             tagged_content.extend(tags)
42 |         tagged_content.append(text[current_idx:].replace("\n", "
\n")) 43 | return "".join(tagged_content) + "" 44 | 45 | @staticmethod 46 | def _get_label_colors(labels: List[str]) -> Dict[str, str]: 47 | """Compute the mapping between annotation labels and colors. 48 | 49 | The used color schema is available in the global variable COLOR_SCHEMA. 50 | 51 | Args: 52 | labels: a list of the annotations classes (e.g., heading, etc.) 53 | that need to be color-coded. 54 | Returns: 55 | A mapping between the available labels and the corresponding color 56 | from the COLOR_SCHEMA. 57 | """ 58 | return dict(zip({a[2] for a in sorted(labels)}, cycle(COLOR_SCHEMA))) 59 | 60 | def _get_css(self, labels: List[str]) -> str: 61 | """Compute the CSS to be included into the HTML output. 62 | 63 | Args: 64 | labels: a list of the annotations classes (e.g., heading, etc.) 65 | that need to be color-coded. 66 | 67 | Returns: 68 | A string containing the CSS to be embedded into the HTML output. 69 | 70 | """ 71 | css = [] 72 | for label, color in sorted(self._get_label_colors(labels).items()): 73 | css.append( 74 | "pre{{" 75 | " position: relative;\n" 76 | "}}\n" 77 | ".{label} {{\n" 78 | " background-color: {color};\n" 79 | " border-radius: 0.4em;\n" 80 | "}}\n" 81 | ".{label}-label {{\n" 82 | " top: -1.0em;\n" 83 | ' content: "{label}";\n' 84 | " position: absolute;\n" 85 | " background-color: {color};\n" 86 | " font-size: 75%; }}\n".format(label=label, color=color) 87 | ) 88 | return "\n".join(css) 89 | -------------------------------------------------------------------------------- /src/inscriptis/annotation/output/surface.py: -------------------------------------------------------------------------------- 1 | """Surface Form Annotation Processor.""" 2 | from typing import Dict, Any 3 | 4 | from inscriptis.annotation.output import AnnotationProcessor 5 | 6 | 7 | class SurfaceExtractor(AnnotationProcessor): 8 | """Extracts the surface form of all annotated labels.""" 9 | 10 | verbatim = False 11 | 12 | def __call__(self, annotated_text: 
Dict[str, Any]) -> Dict[str, Any]: 13 | """ 14 | Add information on the surface forms to the annotated_text dictionary. 15 | 16 | Args: 17 | annotated_text: a dictionary containing the plain text and the 18 | extracted annotations. 19 | 20 | Returns: 21 | An extended dictionary which contains the extracted surface-forms 22 | of the annotations under the key 'surface'. 23 | """ 24 | surface_forms = [ 25 | (label, annotated_text["text"][start:end]) 26 | for start, end, label in annotated_text["label"] 27 | ] 28 | annotated_text["surface"] = surface_forms 29 | return annotated_text 30 | -------------------------------------------------------------------------------- /src/inscriptis/annotation/output/xml.py: -------------------------------------------------------------------------------- 1 | """XML Annotation processor.""" 2 | 3 | from collections import defaultdict 4 | from typing import Dict, Any 5 | 6 | from inscriptis.annotation.output import AnnotationProcessor 7 | 8 | 9 | class XmlExtractor(AnnotationProcessor): 10 | """Provide the converted text with XML-style annotations.""" 11 | 12 | verbatim = True 13 | 14 | def __call__(self, annotated_text: Dict[str, Any], root_element="content"): 15 | tag_dict = defaultdict(list) 16 | for start, end, tag in reversed(annotated_text["label"]): 17 | tag_dict[start].append(f"<{tag}>") 18 | tag_dict[end].insert(0, f"") 19 | 20 | current_idx = 0 21 | text = annotated_text["text"] 22 | tagged_content = ['\n', "\n"] 23 | for idx, tags in sorted(tag_dict.items()): 24 | tagged_content.append(text[current_idx:idx]) 25 | current_idx = idx 26 | tagged_content.extend(tags) 27 | 28 | tagged_content.append(text[current_idx:]) 29 | tagged_content.append("\n") 30 | return "".join(tagged_content) 31 | -------------------------------------------------------------------------------- /src/inscriptis/annotation/parser.py: -------------------------------------------------------------------------------- 1 | """Parse annotation configuration files. 
2 | 3 | Annotation configuration files contain a dictionary that maps tags and 4 | attributes to the corresponding annotation. 5 | 6 | - tags are referenced by their name 7 | - attributes by a `#` (e.g., `#class`) and an optional selector (e.g., 8 | `#class=short-description`) 9 | 10 | Example:: 11 | 12 | { 13 | "h1": ["heading"], 14 | "b": ["emphasis"], 15 | "div#class=toc": ["table-of-contents"], 16 | "#class=short-description]": ["description"] 17 | } 18 | """ 19 | from collections import defaultdict 20 | from copy import copy 21 | from typing import Dict, Tuple, List 22 | 23 | from inscriptis.model.html_element import HtmlElement, DEFAULT_HTML_ELEMENT 24 | 25 | 26 | class ApplyAnnotation: 27 | """Apply an Annotation to the given attribute. 28 | 29 | Arguments: 30 | annotations: a tuple of annotations to be applied to the attribute. 31 | attr: the name of the attribute. 32 | match_tag: only apply annotations to attributes that belong to the 33 | given match_tag. 34 | match_value: only apply annotations to attribute with the given 35 | match_value. 36 | """ 37 | 38 | __slots__ = ("annotations", "match_tag", "match_value", "attr", "matcher") 39 | 40 | def __init__( 41 | self, 42 | annotations: tuple, 43 | attr: str, 44 | match_tag: str = None, 45 | match_value: str = None, 46 | ): 47 | self.annotations = tuple(annotations) 48 | self.attr = attr 49 | self.match_tag = match_tag 50 | self.match_value = match_value 51 | 52 | def apply(self, attr_value: str, html_element: HtmlElement): 53 | """Apply the annotation to HtmlElements with matching tags.""" 54 | if (self.match_tag and self.match_tag != html_element.tag) or ( 55 | self.match_value and self.match_value not in attr_value.split() 56 | ): 57 | return 58 | 59 | html_element.annotation += self.annotations 60 | 61 | def __str__(self): 62 | return " Tuple[Dict, List]: 90 | """Compute the AnnotationModel from a model dictionary. 91 | 92 | Returns: 93 | the AnnotationModel matching the input dictionary. 
94 | """ 95 | tags = defaultdict(list) 96 | attrs = [] 97 | for key, annotations in model.items(): 98 | if "#" in key: 99 | tag, attr = key.split("#") 100 | if "=" in attr: 101 | attr, value = attr.split("=") 102 | else: 103 | value = None 104 | attrs.append(ApplyAnnotation(annotations, attr, tag, value)) 105 | else: 106 | tags[key].extend(annotations) 107 | return tags, attrs 108 | -------------------------------------------------------------------------------- /src/inscriptis/cli/__init__.py: -------------------------------------------------------------------------------- 1 | """Inscriptis command line interface clients.""" 2 | -------------------------------------------------------------------------------- /src/inscriptis/css_profiles.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # coding: utf-8 3 | """Standard CSS profiles shipped with inscriptis. 4 | 5 | - `strict`: this profile corresponds to the defaults used by Firefox 6 | - `relaxed`: this profile is more suited for text analytics, since it ensures 7 | that whitespaces are inserted between span and div elements 8 | preventing cases where two words stick together. 
9 | """ 10 | 11 | from inscriptis.html_properties import Display, WhiteSpace 12 | from inscriptis.model.html_element import HtmlElement 13 | 14 | STRICT_CSS_PROFILE = { 15 | "body": HtmlElement(display=Display.inline, whitespace=WhiteSpace.normal), 16 | "head": HtmlElement(display=Display.none), 17 | "link": HtmlElement(display=Display.none), 18 | "meta": HtmlElement(display=Display.none), 19 | "script": HtmlElement(display=Display.none), 20 | "title": HtmlElement(display=Display.none), 21 | "style": HtmlElement(display=Display.none), 22 | "p": HtmlElement(display=Display.block, margin_before=1, margin_after=1), 23 | "figure": HtmlElement(display=Display.block, margin_before=1, margin_after=1), 24 | "h1": HtmlElement(display=Display.block, margin_before=1, margin_after=1), 25 | "h2": HtmlElement(display=Display.block, margin_before=1, margin_after=1), 26 | "h3": HtmlElement(display=Display.block, margin_before=1, margin_after=1), 27 | "h4": HtmlElement(display=Display.block, margin_before=1, margin_after=1), 28 | "h5": HtmlElement(display=Display.block, margin_before=1, margin_after=1), 29 | "h6": HtmlElement(display=Display.block, margin_before=1, margin_after=1), 30 | "ul": HtmlElement( 31 | display=Display.block, margin_before=0, margin_after=0, padding_inline=4 32 | ), 33 | "ol": HtmlElement( 34 | display=Display.block, margin_before=0, margin_after=0, padding_inline=4 35 | ), 36 | "li": HtmlElement(display=Display.block), 37 | "address": HtmlElement(display=Display.block), 38 | "article": HtmlElement(display=Display.block), 39 | "aside": HtmlElement(display=Display.block), 40 | "div": HtmlElement(display=Display.block), 41 | "footer": HtmlElement(display=Display.block), 42 | "header": HtmlElement(display=Display.block), 43 | "hgroup": HtmlElement(display=Display.block), 44 | "layer": HtmlElement(display=Display.block), 45 | "main": HtmlElement(display=Display.block), 46 | "nav": HtmlElement(display=Display.block), 47 | "figcaption": 
HtmlElement(display=Display.block), 48 | "blockquote": HtmlElement(display=Display.block), 49 | "q": HtmlElement(prefix='"', suffix='"'), 50 | # Handling of
51 |     "pre": HtmlElement(display=Display.block, whitespace=WhiteSpace.pre),
52 |     "xmp": HtmlElement(display=Display.block, whitespace=WhiteSpace.pre),
53 |     "listing": HtmlElement(display=Display.block, whitespace=WhiteSpace.pre),
54 |     "plaintext": HtmlElement(display=Display.block, whitespace=WhiteSpace.pre),
55 | }
56 | 
 57 | RELAXED_CSS_PROFILE = STRICT_CSS_PROFILE.copy()  # shallow copy: untouched tags share the strict entries
 58 | RELAXED_CSS_PROFILE["div"] = HtmlElement(display=Display.block, padding_inline=2)  # indent div content
 59 | RELAXED_CSS_PROFILE["span"] = HtmlElement(
 60 |     display=Display.inline, prefix=" ", suffix=" ", limit_whitespace_affixes=True
 61 | )  # surround spans with a space so that adjacent words do not stick together
 62 | 
 63 | 
 64 | CSS_PROFILES = {"strict": STRICT_CSS_PROFILE, "relaxed": RELAXED_CSS_PROFILE}  # profile name -> tag mapping
65 | 


--------------------------------------------------------------------------------
/src/inscriptis/html_engine.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | # coding:utf-8
  3 | """The HTML Engine is responsible for converting HTML to text."""
  4 | from typing import List, Dict, Callable
  5 | 
  6 | import lxml.html
  7 | from lxml.etree import Comment
  8 | 
  9 | from inscriptis.annotation import Annotation
 10 | from inscriptis.model.canvas import Canvas
 11 | from inscriptis.model.config import ParserConfig
 12 | from inscriptis.model.html_document_state import HtmlDocumentState
 13 | from inscriptis.model.tag.a_tag import a_start_handler, a_end_handler
 14 | from inscriptis.model.tag.br_tag import br_start_handler
 15 | from inscriptis.model.tag.img_tag import img_start_handler
 16 | from inscriptis.model.tag.list_tag import (
 17 |     ul_start_handler,
 18 |     ol_start_handler,
 19 |     li_start_handler,
 20 |     ul_end_handler,
 21 |     ol_end_handler,
 22 | )
 23 | from inscriptis.model.tag.table_tag import (
 24 |     table_start_handler,
 25 |     tr_start_handler,
 26 |     td_start_handler,
 27 |     table_end_handler,
 28 |     td_end_handler,
 29 | )
 30 | 
 31 | 
 32 | class Inscriptis:
 33 |     """Translate an lxml HTML tree to the corresponding text representation.
 34 | 
 35 |     Args:
 36 |       html_tree: the lxml HTML tree to convert.
 37 |       config: an optional ParserConfig configuration object.
 38 | 
 39 |     Example::
 40 | 
 41 |       from lxml.html import fromstring
 42 |       from inscriptis.html_engine import Inscriptis
 43 | 
 44 |       html_content = "

Test

" 45 | 46 | # create an HTML tree from the HTML content. 47 | html_tree = fromstring(html_content) 48 | 49 | # transform the HTML tree to text. 50 | parser = Inscriptis(html_tree) 51 | text = parser.get_text() 52 | """ 53 | 54 | def __init__( 55 | self, html_tree: lxml.html.HtmlElement, config: ParserConfig = None 56 | ) -> None: 57 | # use the default configuration, if no config object is provided 58 | config = config or ParserConfig() 59 | 60 | # setup start and end tag call tables 61 | self.start_tag_handler_dict: Dict[ 62 | str, Callable[[HtmlDocumentState, Dict], None] 63 | ] = { 64 | "table": table_start_handler, 65 | "tr": tr_start_handler, 66 | "td": td_start_handler, 67 | "th": td_start_handler, 68 | "ul": ul_start_handler, 69 | "ol": ol_start_handler, 70 | "li": li_start_handler, 71 | "br": br_start_handler, 72 | "a": a_start_handler if config.parse_a() else None, 73 | "img": img_start_handler if config.display_images else None, 74 | } 75 | self.end_tag_handler_dict: Dict[str, Callable[[HtmlDocumentState], None]] = { 76 | "table": table_end_handler, 77 | "ul": ul_end_handler, 78 | "ol": ol_end_handler, 79 | "td": td_end_handler, 80 | "th": td_end_handler, 81 | "a": a_end_handler if config.parse_a() else None, 82 | } 83 | 84 | if config.custom_html_tag_handler_mapping: 85 | self.start_tag_handler_dict.update( 86 | config.custom_html_tag_handler_mapping.start_tag_mapping 87 | ) 88 | self.end_tag_handler_dict.update( 89 | config.custom_html_tag_handler_mapping.end_tag_mapping 90 | ) 91 | 92 | # parse the HTML tree 93 | self.canvas = self._parse_html_tree(HtmlDocumentState(config), html_tree) 94 | 95 | def _parse_html_tree(self, state: HtmlDocumentState, tree) -> Canvas: 96 | """Parse the HTML tree. 97 | 98 | Args: 99 | tree: the HTML tree to parse. 
100 | """ 101 | if isinstance(tree.tag, str): 102 | state.apply_starttag_layout(tree.tag, tree.attrib) 103 | 104 | if handler := self.start_tag_handler_dict.get(tree.tag): 105 | handler(state, tree.attrib) 106 | cur = state.tags[-1] 107 | cur.canvas.open_tag(cur) 108 | 109 | state.tags[-1].write(tree.text) 110 | 111 | for node in tree: 112 | self._parse_html_tree(state, node) 113 | 114 | # handle the endtag 115 | if handler := self.end_tag_handler_dict.get(tree.tag): 116 | handler(state) 117 | prev = state.tags.pop() 118 | prev.canvas.close_tag(prev) 119 | 120 | # write the tail text to the element's container 121 | state.tags[-1].write(tree.tail) 122 | 123 | elif tree.tag is Comment and tree.tail: 124 | state.tags[-1].canvas.write(state.tags[-1], tree.tail) 125 | 126 | return state.canvas 127 | 128 | def get_text(self) -> str: 129 | """Return the text extracted from the HTML page.""" 130 | return self.canvas.get_text() 131 | 132 | def get_annotations(self) -> List[Annotation]: 133 | """Return the annotations extracted from the HTML page.""" 134 | return self.canvas.annotations 135 | -------------------------------------------------------------------------------- /src/inscriptis/html_properties.py: -------------------------------------------------------------------------------- 1 | r"""Provide properties used for rendering HTML pages. 2 | 3 | Supported attributes:: 4 | 1. :class:`Display` properties. 5 | 2. :class:`WhiteSpace` properties. 6 | 3. :class:`HorizontalAlignment` properties. 7 | 4. :class:`VerticalAlignment` properties. 8 | """ 9 | 10 | from enum import Enum 11 | 12 | 13 | class Display(Enum): 14 | """Specify whether content will be rendered as inline, block or none. 15 | 16 | .. note:: 17 | A display attribute on none indicates, that the content should not be 18 | rendered at all. 19 | """ 20 | 21 | inline = 1 22 | block = 2 23 | none = 3 24 | 25 | 26 | class WhiteSpace(Enum): 27 | """Specify the HTML element's whitespace handling. 
28 | 29 | Inscriptis supports the following handling strategies outlined in the 30 | `Cascading Style Sheets `_ specification. 31 | """ 32 | 33 | normal = 1 34 | """Collapse multiple whitespaces into a single one.""" 35 | pre = 3 36 | """Preserve sequences of whitespaces.""" 37 | 38 | 39 | class HorizontalAlignment(Enum): 40 | """Specify the content's horizontal alignment.""" 41 | 42 | left = "<" 43 | """Left alignment of the block's content.""" 44 | right = ">" 45 | """Right alignment of the block's content.""" 46 | center = "^" 47 | """Center the block's content.""" 48 | 49 | 50 | class VerticalAlignment(Enum): 51 | """Specify the content's vertical alignment.""" 52 | 53 | top = 1 54 | """Align all content at the top.""" 55 | middle = 2 56 | """Align all content in the middle.""" 57 | bottom = 3 58 | """Align all content at the bottom.""" 59 | -------------------------------------------------------------------------------- /src/inscriptis/metadata.py: -------------------------------------------------------------------------------- 1 | """Inscriptis metadata information.""" 2 | 3 | import importlib.metadata as metadata 4 | 5 | PACKAGE = "inscriptis" 6 | 7 | __author__ = "Albert Weichselbraun, Fabian Odoni" 8 | __author_email__ = "albert.weichselbraun@fhgr.ch, fabian.odoni@fhgr.ch" 9 | __copyright__ = ( 10 | f"{metadata.metadata(PACKAGE)['Name']} " 11 | + f"{metadata.metadata(PACKAGE)['Version']} © 2016-2025 {__author__}" 12 | ) 13 | __license__ = metadata.metadata(PACKAGE)["License"] 14 | __version__ = metadata.metadata(PACKAGE)["Version"] 15 | -------------------------------------------------------------------------------- /src/inscriptis/model/__init__.py: -------------------------------------------------------------------------------- 1 | """The model used for HTML rendering. 2 | 3 | - :mod:`inscriptis.model.canvas`: classes required for rendering parts of 4 | the HTML page. 5 | - :mod:`inscriptis.model.css`: classes required for the CSS support. 
6 | - :mod:`inscriptis.model.table`: support for rendering HTML tables. 7 | """ 8 | -------------------------------------------------------------------------------- /src/inscriptis/model/attribute.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # encoding: utf-8 3 | 4 | """HTML attribute handling.""" 5 | from copy import copy 6 | from typing import Dict, List 7 | 8 | from inscriptis.annotation.parser import ApplyAnnotation 9 | from inscriptis.model.css import CssParse 10 | from inscriptis.model.html_element import HtmlElement 11 | 12 | DEFAULT_ATTRIBUTE_MAP = { 13 | "style": CssParse.attr_style, 14 | "align": CssParse.attr_horizontal_align, 15 | "valign": CssParse.attr_vertical_align, 16 | } 17 | 18 | 19 | def merge_function(func1, func2): 20 | """Merge two functions with the same arguments into a single one. 21 | 22 | This function is used for cascading functions that operate on HtmlElements 23 | and attributes. 24 | 25 | Args: 26 | func1: the first function 27 | func2: the second function 28 | """ 29 | 30 | def merged(*args): 31 | func1(*args) 32 | func2(*args) 33 | 34 | return merged 35 | 36 | 37 | class Attribute: 38 | """Handle HTML attributes such as `align`, and `valign`. 39 | 40 | This class handles HTML attributes by mapping them to the corresponding 41 | functions in the :class:`~inscriptis.model.css.CssParse` class. 42 | 43 | Attributes: 44 | attribute_mapping: a mapping of attributes to the corresponding handler 45 | functions. 46 | """ 47 | 48 | def __init__(self): 49 | self.attribute_mapping = DEFAULT_ATTRIBUTE_MAP 50 | 51 | def apply_attributes( 52 | self, attributes: Dict[str, str], html_element: HtmlElement 53 | ) -> HtmlElement: 54 | """Apply the attributes to the given HTML element. 
55 | 56 | Args: 57 | attributes: the list of attributes 58 | html_element: the HTML element for which the attributes are parsed 59 | """ 60 | for attr_name, attr_value in attributes.items(): 61 | if attr_name in self.attribute_mapping: 62 | self.attribute_mapping[attr_name](attr_value, html_element) 63 | return html_element 64 | 65 | def merge_attribute_map(self, annotations: List[ApplyAnnotation] = None) -> None: 66 | attributes = copy(self.attribute_mapping) 67 | for a in annotations: 68 | attributes[a.attr] = ( 69 | a.apply 70 | if a.attr not in attributes 71 | else merge_function(attributes[a.attr], a.apply) 72 | ) 73 | self.attribute_mapping = attributes 74 | -------------------------------------------------------------------------------- /src/inscriptis/model/canvas/block.py: -------------------------------------------------------------------------------- 1 | """Representation of a text block within the HTML canvas.""" 2 | from __future__ import annotations 3 | 4 | from html import unescape 5 | from typing import TYPE_CHECKING 6 | 7 | from inscriptis.html_properties import WhiteSpace 8 | 9 | if TYPE_CHECKING: 10 | from inscriptis.model.canvas import Prefix 11 | 12 | 13 | class Block: 14 | """The current block of text. 15 | 16 | A block usually refers to one line of output text. 17 | 18 | .. note:: 19 | If pre-formatted content is merged with a block, it may also contain 20 | multiple lines. 21 | 22 | Args: 23 | idx: the current block's start index. 24 | prefix: prefix used within the current block. 25 | """ 26 | 27 | __slots__ = ("idx", "prefix", "_content", "collapsable_whitespace") 28 | 29 | def __init__(self, idx: int, prefix: Prefix): 30 | self.idx = idx 31 | self.prefix = prefix 32 | self._content = "" 33 | self.collapsable_whitespace = True 34 | 35 | def merge(self, text: str, whitespace: WhiteSpace) -> None: 36 | """Merge the given text with the current block. 37 | 38 | Args: 39 | text: the text to merge. 40 | whitespace: whitespace handling. 
41 | """ 42 | if whitespace == WhiteSpace.pre: 43 | self.merge_pre_text(text) 44 | else: 45 | self.merge_normal_text(text) 46 | 47 | def merge_normal_text(self, text: str) -> None: 48 | """Merge the given text with the current block. 49 | 50 | Args: 51 | text: the text to merge 52 | 53 | Note: 54 | If the previous text ended with a whitespace and text starts with one, both 55 | will automatically collapse into a single whitespace. 56 | """ 57 | normalized_text = [] 58 | 59 | for ch in text: 60 | if not ch.isspace(): 61 | normalized_text.append(ch) 62 | self.collapsable_whitespace = False 63 | elif not self.collapsable_whitespace: 64 | normalized_text.append(" ") 65 | self.collapsable_whitespace = True 66 | 67 | if normalized_text: 68 | text = ( 69 | "".join((self.prefix.first, *normalized_text)) 70 | if not self._content 71 | else "".join(normalized_text) 72 | ) 73 | text = unescape(text) 74 | self._content += text 75 | self.idx += len(text) 76 | 77 | def merge_pre_text(self, text: str) -> None: 78 | """Merge the given pre-formatted text with the current block. 
79 | 80 | Args: 81 | text: the text to merge 82 | """ 83 | text = "".join((self.prefix.first, text.replace("\n", "\n" + self.prefix.rest))) 84 | text = unescape(text) 85 | self._content += text 86 | self.idx += len(text) 87 | self.collapsable_whitespace = False 88 | 89 | def is_empty(self) -> bool: 90 | return len(self.content) == 0 91 | 92 | @property 93 | def content(self): 94 | if not self.collapsable_whitespace: 95 | return self._content 96 | 97 | if self._content.endswith(" "): 98 | self._content = self._content[:-1] 99 | self.idx -= 1 100 | return self._content 101 | 102 | def new_block(self) -> "Block": 103 | """Return a new Block based on the current one.""" 104 | self.prefix.consumed = False 105 | return Block(idx=self.idx + 1, prefix=self.prefix) 106 | -------------------------------------------------------------------------------- /src/inscriptis/model/canvas/prefix.py: -------------------------------------------------------------------------------- 1 | """Manage the horizontal prefix (left-indentation, bullets) of canvas lines.""" 2 | 3 | from contextlib import suppress 4 | 5 | 6 | class Prefix: 7 | """Class Prefix manages paddings and bullets that prefix an HTML block. 8 | 9 | Attributes: 10 | current_padding: the number of characters used for the current 11 | left-indentation. 12 | paddings: the list of paddings for the current and all previous tags. 13 | bullets: the list of bullets in the current and all previous tags. 14 | consumed: whether the current bullet has already been consumed. 15 | """ 16 | 17 | __slots__ = ("current_padding", "paddings", "bullets", "consumed") 18 | 19 | def __init__(self): 20 | self.current_padding = 0 21 | self.paddings = [] 22 | self.bullets = [] 23 | self.consumed = False 24 | 25 | def register_prefix(self, padding_inline: int, bullet: str) -> None: 26 | """Register the given prefix. 27 | 28 | Args: 29 | padding_inline: the number of characters used for padding_inline 30 | bullet: an optional bullet. 
31 | """ 32 | self.current_padding += padding_inline 33 | self.paddings.append(padding_inline) 34 | self.bullets.append(bullet if bullet else "") 35 | 36 | def remove_last_prefix(self) -> None: 37 | """Remove the last prefix from the list.""" 38 | with suppress(IndexError): 39 | self.current_padding -= self.paddings.pop() 40 | del self.bullets[-1] 41 | 42 | def pop_next_bullet(self) -> str: 43 | """Pop the next bullet to use, if any bullet is available.""" 44 | next_bullet_idx = ( 45 | next((-idx for idx, val in enumerate(reversed(self.bullets)) if val), 1) - 1 46 | ) 47 | 48 | if not next_bullet_idx: 49 | return "" 50 | 51 | bullet = self.bullets[next_bullet_idx] 52 | self.bullets[next_bullet_idx] = "" 53 | return bullet 54 | 55 | @property 56 | def first(self) -> str: 57 | """Return the prefix used at the beginning of a tag. 58 | 59 | Note:: 60 | A new block needs to be prefixed by the current padding and bullet. 61 | Once this has happened (i.e., :attr:`consumed` is set to `True`) no 62 | further prefixes should be used for a line. 63 | """ 64 | if self.consumed: 65 | return "" 66 | 67 | self.consumed = True 68 | bullet = self.pop_next_bullet() 69 | return " " * (self.current_padding - len(bullet)) + bullet 70 | 71 | @property 72 | def unconsumed_bullet(self) -> str: 73 | """Yield any yet unconsumed bullet. 74 | 75 | Note:: 76 | This function yields the previous element's bullets, if they have 77 | not been consumed yet. 78 | """ 79 | if self.consumed: 80 | return "" 81 | 82 | bullet = self.pop_next_bullet() 83 | if not bullet: 84 | return "" 85 | 86 | padding = self.current_padding - self.paddings[-1] 87 | return " " * (padding - len(bullet)) + bullet 88 | 89 | @property 90 | def rest(self) -> str: 91 | """Return the prefix used for new lines within a block. 92 | 93 | This prefix is used for pre-text that contains newlines. The lines 94 | need to be prefixed with the right padding to preserve the 95 | indentation. 
96 | """ 97 | return " " * self.current_padding 98 | -------------------------------------------------------------------------------- /src/inscriptis/model/config.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """Provide configuration objects for the Inscriptis HTML to text converter.""" 3 | from __future__ import annotations 4 | 5 | from copy import deepcopy 6 | from typing import Dict, List 7 | 8 | from inscriptis.annotation.parser import AnnotationModel 9 | from inscriptis.css_profiles import CSS_PROFILES 10 | from inscriptis.model.attribute import Attribute 11 | from inscriptis.model.html_element import HtmlElement 12 | from inscriptis.model.tag import CustomHtmlTagHandlerMapping 13 | 14 | DEFAULT_CSS_PROFILE_NAME = "relaxed" 15 | 16 | 17 | class ParserConfig: 18 | """Encapsulate configuration options and CSS definitions.""" 19 | 20 | def __init__( 21 | self, 22 | css: Dict[str, HtmlElement] = None, 23 | display_images: bool = False, 24 | deduplicate_captions: bool = False, 25 | display_links: bool = False, 26 | display_anchors: bool = False, 27 | annotation_rules: Dict[str, List[str]] = None, 28 | table_cell_separator: str = " ", 29 | custom_html_tag_handler_mapping: CustomHtmlTagHandlerMapping = None, 30 | ): 31 | """Create a ParserConfig configuration. 32 | 33 | Args: 34 | css: an optional custom CSS definition. 35 | display_images: whether to include image titles/alt texts. 36 | deduplicate_captions: whether to deduplicate captions such as image 37 | titles (many newspapers include images and video previews with 38 | identical titles). 39 | display_links: whether to display link targets 40 | (e.g. `[Python](https://www.python.org)`). 41 | display_anchors: whether to display anchors (e.g. `[here](#here)`). 42 | annotation_rules: an optional dictionary of annotation rules which 43 | specify tags and attributes to annotate. 44 | table_cell_separator: separator to use between table cells. 
45 | custom_html_tag_handler_mapping: an optional CustomHtmlTagHandler 46 | """ 47 | self.display_images = display_images 48 | self.deduplicate_captions = deduplicate_captions 49 | self.display_links = display_links 50 | self.display_anchors = display_anchors 51 | self.css = css or CSS_PROFILES[DEFAULT_CSS_PROFILE_NAME] 52 | self.attribute_handler = Attribute() 53 | self.table_cell_separator = table_cell_separator 54 | self.custom_html_tag_handler_mapping = custom_html_tag_handler_mapping 55 | 56 | if annotation_rules: 57 | # ensure that we do not modify the original model or its 58 | # members. 59 | annotation_model = AnnotationModel(deepcopy(self.css), annotation_rules) 60 | # css with annotation support 61 | self.css = annotation_model.css 62 | # attribute handler with annotation support 63 | self.attribute_handler.merge_attribute_map(annotation_model.css_attr) 64 | 65 | def parse_a(self) -> bool: 66 | """Indicate whether the text output should contain links or anchors. 67 | 68 | Returns 69 | Whether we need to parse tags. 70 | """ 71 | return self.display_links or self.display_anchors 72 | -------------------------------------------------------------------------------- /src/inscriptis/model/css.py: -------------------------------------------------------------------------------- 1 | """Implement basic CSS support for inscriptis. 2 | 3 | - The :class:`~inscriptis.model.html_element.HtmlElement` class 4 | encapsulates all CSS properties of a single HTML element. 5 | - :class:`CssParse` parses CSS specifications and translates them into the 6 | corresponding HtmlElements used by Inscriptis for rendering HTML pages. 
7 | """ 8 | from contextlib import suppress 9 | from re import compile as re_compile 10 | 11 | from inscriptis.html_properties import ( 12 | Display, 13 | WhiteSpace, 14 | HorizontalAlignment, 15 | VerticalAlignment, 16 | ) 17 | from inscriptis.model.html_element import HtmlElement 18 | 19 | 20 | class CssParse: 21 | """Parse CSS specifications and applies them to HtmlElements. 22 | 23 | The attribute `display: none`, for instance, is translated to 24 | :attr:`HtmlElement.display=Display.none`. 25 | """ 26 | 27 | # used to separate value and unit from each other 28 | RE_UNIT = re_compile(r"(-?[0-9.]+)(\w+)") 29 | 30 | @staticmethod 31 | def attr_style(style_attribute: str, html_element: HtmlElement): 32 | """Apply the provided style attributes to the given HtmlElement. 33 | 34 | Args: 35 | style_attribute: The attribute value of the given style sheet. 36 | Example: display: none 37 | html_element: The HtmlElement to which the given style is applied. 38 | """ 39 | for style_directive in style_attribute.lower().split(";"): 40 | if ":" not in style_directive: 41 | continue 42 | key, value = (s.strip() for s in style_directive.split(":", 1)) 43 | 44 | try: 45 | apply_style = getattr( 46 | CssParse, "attr_" + key.replace("-webkit-", "").replace("-", "_") 47 | ) 48 | apply_style(value, html_element) 49 | except AttributeError: 50 | pass 51 | 52 | @staticmethod 53 | def _get_em(length: str) -> int: 54 | """Convert length specifications into em. 55 | 56 | This function takes a length specification (e.g., 2em, 2px, etc.) and 57 | transforms it into em. 58 | 59 | Args: 60 | length: the length specification. 61 | 62 | Returns: 63 | the length in em. 
64 | """ 65 | _m = CssParse.RE_UNIT.search(length) 66 | value = float(_m.group(1)) 67 | unit = _m.group(2) 68 | 69 | if unit not in ("em", "qem", "rem"): 70 | return int(round(value / 8)) 71 | return int(round(value)) 72 | 73 | # ------------------------------------------------------------------------ 74 | # css styles 75 | # ------------------------------------------------------------------------ 76 | 77 | @staticmethod 78 | def attr_display(value: str, html_element: HtmlElement): 79 | """Apply the given display value.""" 80 | if html_element.display == Display.none: 81 | return 82 | 83 | if value == "block": 84 | html_element.display = Display.block 85 | elif value == "none": 86 | html_element.display = Display.none 87 | else: 88 | html_element.display = Display.inline 89 | 90 | @staticmethod 91 | def attr_white_space(value: str, html_element: HtmlElement): 92 | """Apply the given white-space value.""" 93 | if value in ("normal", "nowrap"): 94 | html_element.whitespace = WhiteSpace.normal 95 | elif value in ("pre", "pre-line", "pre-wrap"): 96 | html_element.whitespace = WhiteSpace.pre 97 | 98 | @staticmethod 99 | def attr_margin_top(value: str, html_element: HtmlElement): 100 | """Apply the given top margin.""" 101 | with suppress(ValueError): 102 | html_element.margin_before = CssParse._get_em(value) 103 | 104 | @staticmethod 105 | def attr_margin_bottom(value: str, html_element: HtmlElement): 106 | """Apply the provided bottom margin.""" 107 | with suppress(ValueError): 108 | html_element.margin_after = CssParse._get_em(value) 109 | 110 | @staticmethod 111 | def attr_padding_left(value: str, html_element: HtmlElement): 112 | """Apply the given left padding_inline.""" 113 | with suppress(ValueError): 114 | html_element.padding_inline = CssParse._get_em(value) 115 | 116 | @staticmethod 117 | def attr_horizontal_align(value: str, html_element: HtmlElement): 118 | """Apply the provided horizontal alignment.""" 119 | with suppress(KeyError): 120 | html_element.align 
= HorizontalAlignment[value] 121 | 122 | @staticmethod 123 | def attr_vertical_align(value: str, html_element: HtmlElement): 124 | """Apply the given vertical alignment.""" 125 | with suppress(KeyError): 126 | html_element.valign = VerticalAlignment[value] 127 | 128 | # register aliases 129 | attr_margin_before = attr_margin_top 130 | attr_margin_after = attr_margin_bottom 131 | attr_padding_start = attr_padding_left 132 | -------------------------------------------------------------------------------- /src/inscriptis/model/html_document_state.py: -------------------------------------------------------------------------------- 1 | """Represents the state of an HTML document. 2 | 3 | The provided `HtmlDocumentState` class contains and exposes all fields required for 4 | representing the current state of the HTML to text conversion. 5 | """ 6 | 7 | from inscriptis import ParserConfig 8 | from inscriptis.model.canvas import Canvas 9 | from inscriptis.model.html_element import DEFAULT_HTML_ELEMENT 10 | 11 | 12 | class HtmlDocumentState: 13 | """Represents the state of the parsed html document.""" 14 | 15 | def __init__(self, config: ParserConfig): 16 | # instance variables 17 | self.canvas = Canvas() 18 | self.config = config 19 | self.css = config.css 20 | self.apply_attributes = config.attribute_handler.apply_attributes 21 | 22 | self.tags = [self.css["body"].set_canvas(self.canvas)] 23 | self.current_table = [] 24 | self.li_counter = [] 25 | self.last_caption = None 26 | 27 | # used if display_links is enabled 28 | self.link_target = "" 29 | 30 | def apply_starttag_layout(self, tag, attrs): 31 | """Compute the layout of the tag. 32 | 33 | Compute the style of the current :class:`HtmlElement`, based on 34 | 35 | 1. the used :attr:`css`, 36 | 2. apply attributes and css with :meth:`~Attribute.apply_attributes` 37 | 3. add the `HtmlElement` to the list of open tags. 38 | 39 | Args: 40 | tag: the HTML start tag to process. 
41 | attrs: a dictionary of HTML attributes and their respective values. 42 | """ 43 | # use the css to handle tags known to it :) 44 | cur = self.tags[-1].get_refined_html_element( 45 | self.apply_attributes( 46 | attrs, 47 | html_element=self.css.get(tag, DEFAULT_HTML_ELEMENT) 48 | .__copy__() 49 | .set_tag(tag), 50 | ) 51 | ) 52 | self.tags.append(cur) 53 | -------------------------------------------------------------------------------- /src/inscriptis/model/tag/__init__.py: -------------------------------------------------------------------------------- 1 | """HTML Tag handlers and classes for designing custom HTML tag handlers.""" 2 | from __future__ import annotations 3 | 4 | from typing import Dict, Callable, NamedTuple 5 | from typing import TYPE_CHECKING 6 | 7 | if TYPE_CHECKING: 8 | from inscriptis.model.html_document_state import HtmlDocumentState 9 | 10 | 11 | class CustomHtmlTagHandlerMapping(NamedTuple): 12 | """Refine the standard HTML Tag handling with the provided mapping. 13 | 14 | Attributes: 15 | start_tag_mapping: a dictionary of custom start tag handlers. 16 | end_tag_mapping: a dictionary of custom end tag handlers. 
17 | """ 18 | 19 | start_tag_mapping: Dict[str, Callable[[HtmlDocumentState, Dict], None]] 20 | end_tag_mapping: Dict[str, Callable[[HtmlDocumentState], None]] 21 | -------------------------------------------------------------------------------- /src/inscriptis/model/tag/a_tag.py: -------------------------------------------------------------------------------- 1 | """Handle the tag.""" 2 | from typing import Dict 3 | 4 | from inscriptis.model.html_document_state import HtmlDocumentState 5 | 6 | 7 | def a_start_handler(state: HtmlDocumentState, attrs: Dict) -> None: 8 | """Handle the tag.""" 9 | state.link_target = "" 10 | if state.config.display_links: 11 | state.link_target = attrs.get("href", "") 12 | if state.config.display_anchors: 13 | state.link_target = state.link_target or attrs.get("name", "") 14 | 15 | if state.link_target: 16 | state.tags[-1].write("[") 17 | 18 | 19 | def a_end_handler(state: HtmlDocumentState) -> None: 20 | """Handle the tag.""" 21 | if state.link_target: 22 | state.tags[-1].write(f"]({state.link_target})") 23 | -------------------------------------------------------------------------------- /src/inscriptis/model/tag/br_tag.py: -------------------------------------------------------------------------------- 1 | """Handle the
def br_start_handler(state: HtmlDocumentState, _: Dict) -> None:
    """Handle the ``<br>`` tag by writing a newline to the current canvas.

    Args:
        state: the document state holding the tag stack.
        _: the element's attributes (unused).
    """
    current_tag = state.tags[-1]
    current_tag.canvas.write_newline()


def img_start_handler(state: HtmlDocumentState, attrs: Dict) -> None:
    """Handle the ``<img>`` tag by writing its caption as ``[caption]``.

    The caption is the ``alt`` attribute or, if that is empty, the ``title``
    attribute.  Nothing is written for images without a caption, nor for a
    caption equal to ``state.last_caption`` when
    ``state.config.deduplicate_captions`` is enabled.

    Args:
        state: the document state holding the configuration, the tag stack
            and the most recently written caption.
        attrs: the attributes of the ``<img>`` element.
    """
    caption = attrs.get("alt", "") or attrs.get("title", "")
    if not caption:
        return
    if state.config.deduplicate_captions and caption == state.last_caption:
        # the same caption has just been written - skip the duplicate
        return

    state.tags[-1].write(f"[{caption}]")
    state.last_caption = caption
  • ,
      ,
        tags.""" 2 | from typing import Dict 3 | 4 | from inscriptis.model.html_document_state import HtmlDocumentState 5 | 6 | UL_COUNTER = ("* ", "+ ", "o ", "- ") 7 | UL_COUNTER_LEN = len(UL_COUNTER) 8 | 9 | 10 | def get_bullet(state: HtmlDocumentState) -> str: 11 | """Return the bullet that correspond to the given index.""" 12 | return UL_COUNTER[len(state.li_counter) % UL_COUNTER_LEN] 13 | 14 | 15 | def li_start_handler(state: HtmlDocumentState, _: Dict) -> None: 16 | """Handle the
      • tag.""" 17 | bullet = state.li_counter[-1] if state.li_counter else "* " 18 | if isinstance(bullet, int): 19 | state.li_counter[-1] += 1 20 | state.tags[-1].list_bullet = f"{bullet}. " 21 | else: 22 | state.tags[-1].list_bullet = bullet 23 | 24 | state.tags[-1].write("") 25 | 26 | 27 | def ul_start_handler(state: HtmlDocumentState, _: Dict) -> None: 28 | """Handle the
          tag.""" 29 | state.li_counter.append(get_bullet(state)) 30 | 31 | 32 | def ul_end_handler(state: HtmlDocumentState) -> None: 33 | """Handle the
        tag.""" 34 | state.li_counter.pop() 35 | 36 | 37 | def ol_start_handler(state: HtmlDocumentState, _: Dict) -> None: 38 | """Handle the
          tag.""" 39 | state.li_counter.append(1) 40 | 41 | 42 | def ol_end_handler(state: HtmlDocumentState) -> None: 43 | """Handle the
        tag.""" 44 | state.li_counter.pop() 45 | -------------------------------------------------------------------------------- /src/inscriptis/model/tag/table_tag.py: -------------------------------------------------------------------------------- 1 | """Handle the , and tag.""" 21 | if state.current_table: 22 | state.current_table[-1].add_row() 23 | 24 | 25 | def table_start_handler(state: HtmlDocumentState, _: Dict) -> None: 26 | """Handle the
        tags.""" 2 | from typing import Dict 3 | 4 | from inscriptis.annotation import Annotation 5 | from inscriptis.model.canvas import Canvas 6 | from inscriptis.model.html_document_state import HtmlDocumentState 7 | from inscriptis.model.table import Table, TableCell 8 | 9 | 10 | def td_start_handler(state: HtmlDocumentState, _: Dict) -> None: 11 | """Handle the tag.""" 12 | if state.current_table: 13 | # open td tag 14 | table_cell = TableCell(align=state.tags[-1].align, valign=state.tags[-1].valign) 15 | state.tags[-1].canvas = table_cell 16 | state.current_table[-1].add_cell(table_cell) 17 | 18 | 19 | def tr_start_handler(state: HtmlDocumentState, _: Dict) -> None: 20 | """Handle the
        tag.""" 27 | state.tags[-1].set_canvas(Canvas()) 28 | state.current_table.append( 29 | Table( 30 | left_margin_len=state.tags[-1].canvas.left_margin, 31 | cell_separator=state.config.table_cell_separator, 32 | ) 33 | ) 34 | 35 | 36 | def td_end_handler(state: HtmlDocumentState) -> None: 37 | """Handle the tag.""" 38 | if state.current_table: 39 | state.tags[-1].canvas.close_tag(state.tags[-1]) 40 | 41 | 42 | def table_end_handler(state: HtmlDocumentState) -> None: 43 | """Handle the
        tag.""" 44 | if state.current_table: 45 | td_end_handler(state) 46 | table = state.current_table.pop() 47 | # last tag before the table: self.tags[-2] 48 | # table tag: self.tags[-1] 49 | 50 | out_of_table_text = state.tags[-1].canvas.get_text().strip() 51 | if out_of_table_text: 52 | state.tags[-2].write(out_of_table_text) 53 | state.tags[-2].canvas.write_newline() 54 | 55 | start_idx = state.tags[-2].canvas.current_block.idx 56 | state.tags[-2].write_verbatim_text(table.get_text()) 57 | state.tags[-2].canvas.flush_inline() 58 | 59 | # transfer annotations from the current tag 60 | if state.tags[-1].annotation: 61 | end_idx = state.tags[-2].canvas.current_block.idx 62 | for a in state.tags[-1].annotation: 63 | state.tags[-2].canvas.annotations.append(Annotation(start_idx, end_idx, a)) 64 | 65 | # transfer in-table annotations 66 | state.tags[-2].canvas.annotations.extend( 67 | table.get_annotations(start_idx, state.tags[-2].canvas.left_margin) 68 | ) 69 | -------------------------------------------------------------------------------- /src/inscriptis/service/__init__.py: -------------------------------------------------------------------------------- 1 | """The Inscriptis Web service.""" 2 | -------------------------------------------------------------------------------- /src/inscriptis/service/web.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # coding:utf-8 3 | """Inscriptis Web Service.""" 4 | 5 | from fastapi import FastAPI, Request 6 | from fastapi.responses import PlainTextResponse 7 | 8 | from inscriptis import get_text 9 | from inscriptis.css_profiles import RELAXED_CSS_PROFILE 10 | from inscriptis.metadata import __version__ 11 | from inscriptis.model.config import ParserConfig 12 | 13 | app = FastAPI() 14 | CONFIG = ParserConfig( 15 | css=RELAXED_CSS_PROFILE, 16 | display_images=True, 17 | deduplicate_captions=True, 18 | display_links=False, 19 | ) 20 | 21 | 22 | @app.get("/") 
def index():
    """Print a short status message for the Web service's base URL."""
    return PlainTextResponse("Inscriptis text to HTML Web service.")


@app.post("/get_text", response_class=PlainTextResponse)
async def get_text_call(request: Request):
    """Return the text representation of the given HTML content.

    The request body is decoded with the charset announced in the
    ``Content-Type`` header.  UTF-8 is used when the header is missing,
    carries no ``charset`` parameter, or names a codec Python does not know.

    Args:
        request: the incoming request; its body contains the HTML document.

    Returns:
        The text representation of the posted HTML.
    """
    # The header is optional; a missing header must not crash the endpoint.
    content_type = request.headers.get("Content-type") or ""

    encoding = "UTF-8"
    # Skip the media type itself and scan the parameters.  Parameter names
    # are case-insensitive and the value may be quoted or followed by
    # further parameters.
    for parameter in content_type.split(";")[1:]:
        name, _, value = parameter.partition("=")
        if name.strip().lower() == "charset":
            charset = value.strip().strip("\"'")
            if charset:
                encoding = charset
            break

    html_content = await request.body()
    try:
        html = html_content.decode(encoding, errors="ignore")
    except LookupError:
        # unknown charset announced by the client: fall back to the default
        html = html_content.decode("UTF-8", errors="ignore")
    return get_text(html, CONFIG)


@app.get("/version", response_class=PlainTextResponse)
def get_version_call():
    """Return the used inscriptis version."""
    return __version__


def start():
    """Start the webservice."""
    # imported lazily so that uvicorn is only required for running the server
    import uvicorn

    print("Starting Web service based on Inscriptis", __version__)
    uvicorn.run(app, host="127.0.0.1", port=5000)


if __name__ == "__main__":
    start()
          2 |
        1. first
        2. 3 |
        3. 4 |
            5 |
          • y=0
             6 | for x in range(3,10):
             7 |    print(x)
             8 |    y += x
             9 | print(y)
            10 |
          • 11 |
          • print("Hallo")
            12 | print("Echo")
            13 | print("123")
            14 |             
          • 15 |
          • 16 |
          17 |
        4. third
        5. 18 |
        19 | 20 | -------------------------------------------------------------------------------- /tests/html/advanced-prefix-test.txt: -------------------------------------------------------------------------------- 1 | 1. first 2 | 2. 3 | + y=0 4 | for x in range(3,10): 5 | print(x) 6 | y += x 7 | print(y) 8 | + print("Hallo") 9 | print("Echo") 10 | print("123") 11 | 12 | + 13 | 3. third 14 | -------------------------------------------------------------------------------- /tests/html/br-in-table.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 |
        First
        a special line
        SecondThird
        abc
        11 | -------------------------------------------------------------------------------- /tests/html/br-in-table.txt: -------------------------------------------------------------------------------- 1 | First Second Third 2 | a special line 3 | a b c 4 | -------------------------------------------------------------------------------- /tests/html/br-in-table2.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 |
        First
        a special line
        SecondThird
        abc
        a2c2
        • first
        • second
        • third
        c3
        last1last2last3
        23 | -------------------------------------------------------------------------------- /tests/html/br-li.html: -------------------------------------------------------------------------------- 1 | List 2 |
          3 |
        • first line
          4 | second line 5 |
        • third line 6 |
        7 | -------------------------------------------------------------------------------- /tests/html/br-li.txt: -------------------------------------------------------------------------------- 1 | List 2 | * first line 3 | second line 4 | * third line 5 | -------------------------------------------------------------------------------- /tests/html/br.html: -------------------------------------------------------------------------------- 1 | First line
        2 | second line 3 | -------------------------------------------------------------------------------- /tests/html/br.txt: -------------------------------------------------------------------------------- 1 | First line 2 | second line 3 | -------------------------------------------------------------------------------- /tests/html/direct-enumeration.html: -------------------------------------------------------------------------------- 1 |
          2 |
        1. First 3 |
        2. Second 4 |
            5 |
          1. Sec, First 6 |
          2. Sec, Second 7 |
              8 |
            • item 9 |
            • item2 10 |
            11 |
          12 |
        3. Third 13 |
        14 | -------------------------------------------------------------------------------- /tests/html/direct-enumeration.txt: -------------------------------------------------------------------------------- 1 | 1. First 2 | 2. Second 3 | 1. Sec, First 4 | 2. Sec, Second 5 | o item 6 | o item2 7 | 3. Third 8 | -------------------------------------------------------------------------------- /tests/html/empty-table.html: -------------------------------------------------------------------------------- 1 | 2 | 1 3 |
        4 | -------------------------------------------------------------------------------- /tests/html/empty-table.txt: -------------------------------------------------------------------------------- 1 | 1 2 | -------------------------------------------------------------------------------- /tests/html/enumerations.html: -------------------------------------------------------------------------------- 1 | Hallo 2 |
          3 |
        1. First 4 |
        2. Second 5 |
            6 |
          1. Second, First 7 |
          2. Second, Second 8 |
              9 |
            • item 10 |
            • item2 11 |
            12 |
          13 |
        3. Third 14 |
        15 | -------------------------------------------------------------------------------- /tests/html/enumerations.txt: -------------------------------------------------------------------------------- 1 | Hallo 2 | 1. First 3 | 2. Second 4 | 1. Second, First 5 | 2. Second, Second 6 | o item 7 | o item2 8 | 3. Third 9 | -------------------------------------------------------------------------------- /tests/html/html-comment-ofuscation.html: -------------------------------------------------------------------------------- 1 | $90.74 2 | -------------------------------------------------------------------------------- /tests/html/html-comment-ofuscation.txt: -------------------------------------------------------------------------------- 1 | $90.74 2 | -------------------------------------------------------------------------------- /tests/html/invalid-table.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 |
        FirstSecondThird
        anybetacharly
        long timeshort timemedium time
        15 | -------------------------------------------------------------------------------- /tests/html/invalid-table.txt: -------------------------------------------------------------------------------- 1 | First Second Third 2 | any beta charly 3 | long time short time medium time 4 | -------------------------------------------------------------------------------- /tests/html/invalid-table2.html: -------------------------------------------------------------------------------- 1 | Good day 2 | 3 | first second third 4 |
        5 | forth 6 | 7 | beta
        alpha 8 | epsilon 9 |
        gamma 10 |
        11 | -------------------------------------------------------------------------------- /tests/html/invalid-table2.txt: -------------------------------------------------------------------------------- 1 | Good day first second third 2 | forth beta 3 | alpha epsilon 4 | gamma 5 | -------------------------------------------------------------------------------- /tests/html/invalid-table3.html: -------------------------------------------------------------------------------- 1 | Good day 2 | 3 | first second third 4 |
        5 | forth 6 | 7 | oho 8 | beta
        alphaepsilon 9 |
        gamma 10 |
        11 | -------------------------------------------------------------------------------- /tests/html/invalid-table3.txt: -------------------------------------------------------------------------------- 1 | Good day first second third 2 | forth oho beta 3 | alpha epsilon 4 | gamma -------------------------------------------------------------------------------- /tests/html/invisible.html: -------------------------------------------------------------------------------- 1 | <ul>hallo 2 | 3 |

        Title

        4 | noch mehr text 5 | -------------------------------------------------------------------------------- /tests/html/invisible.txt: -------------------------------------------------------------------------------- 1 | Title 2 | -------------------------------------------------------------------------------- /tests/html/invisible2.html: -------------------------------------------------------------------------------- 1 |

        Leertest

        2 | halloecho 3 | -------------------------------------------------------------------------------- /tests/html/invisible2.txt: -------------------------------------------------------------------------------- 1 | Leertest 2 | -------------------------------------------------------------------------------- /tests/html/invisible3.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | -------------------------------------------------------------------------------- /tests/html/invisible3.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/weblyzard/inscriptis/2ef7e3bdc428816a34e8f6e6280888c7d64eec26/tests/html/invisible3.txt -------------------------------------------------------------------------------- /tests/html/nested-list.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | First 4 |
          5 |
        • 6 |
        • 7 |
        • 1 8 |
        • 2 9 |
        • 3 10 |
        • 11 |
        • 12 |
        13 | 14 | Second 15 |
          16 |
        • 17 |
        • 18 |
        • 19 |
            20 |
          • 1 21 |
          • 22 |
              23 |
            • a 24 |
            • b 25 |
            • c 26 |
            27 |
          • 28 |
          • 3 29 |
          30 |
        • 31 |
        • 32 |
        • 33 |
            34 | 35 | 36 | 37 | -------------------------------------------------------------------------------- /tests/html/nested-list.txt: -------------------------------------------------------------------------------- 1 | First 2 | * 3 | * 4 | * 1 5 | * 2 6 | * 3 7 | * 8 | * 9 | Second 10 | * 11 | * 12 | * 13 | + 1 14 | + 15 | o a 16 | o b 17 | o c 18 | + 3 19 | * 20 | * 21 | * 22 | -------------------------------------------------------------------------------- /tests/html/nested-table-alignment-css.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 |
            column with nested tablecolumn 2column 3column 4
            8 | 9 | 10 | 11 | 12 | 13 |
            nestedtable
            1112
            2122
            3132
            14 |
            TomJoeSue
            lastline
            24 | -------------------------------------------------------------------------------- /tests/html/nested-table-alignment-css.txt: -------------------------------------------------------------------------------- 1 | column with nested table column 2 column 3 column 4 2 | nested table Tom 3 | 11 12 4 | 21 22 Joe 5 | 31 32 6 | Sue 7 | last line -------------------------------------------------------------------------------- /tests/html/nested-table-alignment.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 |
            column with nested tablecolumn 2column 3column 4
            8 | 9 | 10 | 11 | 12 | 13 |
            nestedtable
            1112
            2122
            3132
            14 |
            TomJoeSue
            lastline
            24 | -------------------------------------------------------------------------------- /tests/html/nested-table-alignment.txt: -------------------------------------------------------------------------------- 1 | column with nested table column 2 column 3 column 4 2 | nested table Tom 3 | 11 12 4 | 21 22 Joe 5 | 31 32 6 | Sue 7 | last line -------------------------------------------------------------------------------- /tests/html/nested-table.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 |
            column with nested tablecolumn 2column 3
            7 | 8 | 9 | 10 |
            nestedtable
            12
            11 |
            TomJoe
            lastline
            20 | -------------------------------------------------------------------------------- /tests/html/nested-table.txt: -------------------------------------------------------------------------------- 1 | column with nested table column 2 column 3 2 | nested table 3 | 1 2 Tom Joe 4 | 5 | last line -------------------------------------------------------------------------------- /tests/html/p-br.html: -------------------------------------------------------------------------------- 1 | L


            2 | B 3 |

            4 | Line 5 |
            6 |

            Another line
            7 | Third line

            8 |
            9 | Forth line 10 | -------------------------------------------------------------------------------- /tests/html/p-br.txt: -------------------------------------------------------------------------------- 1 | L 2 | 3 | 4 | B 5 | 6 | Line 7 | 8 | Another line 9 | Third line 10 | 11 | Forth line 12 | -------------------------------------------------------------------------------- /tests/html/pre.html: -------------------------------------------------------------------------------- 1 |

            Pre elements

            2 | 3 |
             4 | b = 1
             5 | for a in range(10):
             6 |    print(a)
             7 |    b *= a
             8 |    print(b)
             9 | 
            10 | 11 |

            A pre block within an enumeration

            12 | 13 |
              14 |
            • Hallo
            • 15 |
            • b = 1
              16 | for a in range(10):
              17 |    print(a)
              18 |    b *= a
              19 |    print(b)
              20 | 
            • 21 |
            • Echo
            • 22 | 23 | -------------------------------------------------------------------------------- /tests/html/pre.txt: -------------------------------------------------------------------------------- 1 | Pre elements 2 | 3 | 4 | b = 1 5 | for a in range(10): 6 | print(a) 7 | b *= a 8 | print(b) 9 | 10 | 11 | A pre block within an enumeration 12 | 13 | * Hallo 14 | * b = 1 15 | for a in range(10): 16 | print(a) 17 | b *= a 18 | print(b) 19 | 20 | * Echo 21 | -------------------------------------------------------------------------------- /tests/html/real-world/naturgruen-team.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/weblyzard/inscriptis/2ef7e3bdc428816a34e8f6e6280888c7d64eec26/tests/html/real-world/naturgruen-team.html -------------------------------------------------------------------------------- /tests/html/stackoverflow-list-snippet.html: -------------------------------------------------------------------------------- 1 |
            • 2 |
              3 |
              4 |
              5 |
              6 |
              7 |
              8 | 9 | I obtain "NameError: name 'NamedTuple' is not defined" 10 | 11 | – nbedou 14 | Jul 6 '18 at 12:45 15 |
              16 |
              17 |
            • 18 |
            • 19 |
              20 |
              21 |
              22 |
              23 |
              24 |
              25 | 26 | @nbedou docs.python.org/3/library/typing.html#typing.NamedTuple 27 | 28 | – nodakai 31 | Oct 3 '18 at 7:44 32 |
              33 |
              34 |
            • 35 | -------------------------------------------------------------------------------- /tests/html/stackoverflow-list-snippet.txt: -------------------------------------------------------------------------------- 1 | * I obtain "NameError: name 'NamedTuple' is not defined" – nbedou Jul 6 '18 at 12:45 2 | * @nbedou docs.python.org/3/library/typing.html#typing.NamedTuple – nodakai Oct 3 '18 at 7:44 3 | -------------------------------------------------------------------------------- /tests/html/subsequent-headings.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | Test the spacing between subsequent headings 4 | 5 |

              The first

              6 | 7 | And text, concerning the first heading. 8 | 9 |

              The second

              10 | Text concerning the second heading. 11 | 12 |

              Subheading

              13 | Sub1 14 | 15 |

              This is a subsubtopic

              16 | 17 |

              Another subheading

              18 | Sub2 19 | 20 |

              The third

              21 | The third and final heading. 22 | 23 | 24 | 25 | -------------------------------------------------------------------------------- /tests/html/subsequent-headings.json: -------------------------------------------------------------------------------- 1 | {"annotation_rules": { 2 | "h1": ["heading"], 3 | "h2": ["heading"], 4 | "h3": ["heading"], 5 | "b": ["emphasis"], 6 | "table": ["table"], 7 | "th": ["table-heading"], 8 | "td": ["table-cell"] 9 | }, 10 | "result": [ 11 | ["heading", "The first\n\n"], 12 | ["heading", "\nThe second\n\n"], 13 | ["heading", "\nSubheading\n\n"], 14 | ["heading", "\nThis is a subsubtopic\n\n"], 15 | ["heading", "Another subheading\n\n"], 16 | ["heading", "\nThe third\n\n"] 17 | ] 18 | } 19 | -------------------------------------------------------------------------------- /tests/html/subsequent-headings.txt: -------------------------------------------------------------------------------- 1 | The first 2 | 3 | And text, concerning the first heading. 4 | 5 | The second 6 | 7 | Text concerning the second heading. 8 | 9 | Subheading 10 | 11 | Sub1 12 | 13 | This is a subsubtopic 14 | 15 | Another subheading 16 | 17 | Sub2 18 | 19 | The third 20 | 21 | The third and final heading. 22 | -------------------------------------------------------------------------------- /tests/html/table-alignment.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 |
              TitelBeschreibungLänge
              123
              Der Prinz von ÄgyptenBasierend auf dem Buch Exodus99 min
              Leo Lausemaus Der Läusebub99 min
              8 | 9 | -------------------------------------------------------------------------------- /tests/html/table-alignment.txt: -------------------------------------------------------------------------------- 1 | Titel Beschreibung Länge 2 | 1 2 3 3 | Der Prinz von Ägypten Basierend auf dem Buch Exodus 99 min 4 | Leo Lausemaus Der Läusebub 99 min 5 | -------------------------------------------------------------------------------- /tests/html/table-empty-row.html: -------------------------------------------------------------------------------- 1 | 2 | 5 | Leer 6 | 9 |
              Hallo 3 | Echo 4 |
              (1) 7 | (2) 8 |
              10 | -------------------------------------------------------------------------------- /tests/html/table-empty-row.txt: -------------------------------------------------------------------------------- 1 | Leer 2 | Hallo Echo 3 | 4 | (1) (2) 5 | 6 | -------------------------------------------------------------------------------- /tests/html/table-in-table.html: -------------------------------------------------------------------------------- 1 |

              Single

              2 | 3 |

              First

              4 | 5 | 6 |
              redgreen
              blue
              redgreen
              7 | 8 |

              Second

              9 | 10 | 11 |
              blue
              red?green
              blue
              12 | 13 |

              Nested

              14 | 15 | 16 | 19 | 22 | 25 | 26 | 27 | 30 | 33 | 36 | 37 | 38 | 41 | 44 | 47 | 48 |
              17 | 18 |
              redgreen.
              blue
              redgreen
              20 | 21 |
              blue
              redgreen
              blue
              23 | 24 |
              blue
              redgreen
              blue
              28 | 29 |
              blue.
              redgreen
              blue
              31 | 32 |
              redgreen
              blue
              redgreen!
              34 | 35 |
              blue
              redgreen
              blue
              39 | 40 |
              redgreen
              blue
              redgreen
              42 | 43 |
              blue
              redgreen
              blue
              45 | 46 |
              blue
              redgreen
              blue!
              49 | -------------------------------------------------------------------------------- /tests/html/table-in-table.json: -------------------------------------------------------------------------------- 1 | {"annotation_rules": { 2 | "h1": ["heading"], 3 | "h2": ["heading"], 4 | "h3": ["heading"], 5 | "table#border": ["table"], 6 | "b": ["bold"], 7 | "i": ["italic"] 8 | }, 9 | "result": [ 10 | ["heading", "Single\n\n"], 11 | ["heading", "First\n\n"], 12 | ["table", "red green\n blue \nred green\n\n"], 13 | ["heading", "\nSecond\n\n"], 14 | ["table", " blue \nred? green\n blue \n\n"], 15 | ["bold", "red?"], 16 | ["heading", "\nNested\n\n"], 17 | ["table", "red green. blue blue \n blue red green red green \nred green blue blue \n \n blue. red green blue \nred green blue red green \n blue red green! blue \n \nred green blue blue \n blue red green red green\nred green blue blue! \n \n"], 18 | ["italic", "green."], 19 | ["italic", "blue."], 20 | ["bold", "green!"], 21 | ["bold", "blue!"] 22 | ] 23 | } 24 | 25 | 26 | -------------------------------------------------------------------------------- /tests/html/table-in-table.txt: -------------------------------------------------------------------------------- 1 | Single 2 | 3 | First 4 | 5 | red green 6 | blue 7 | red green 8 | 9 | 10 | Second 11 | 12 | blue 13 | red? green 14 | blue 15 | 16 | 17 | Nested 18 | 19 | red green. blue blue 20 | blue red green red green 21 | red green blue blue 22 | 23 | blue. red green blue 24 | red green blue red green 25 | blue red green! blue 26 | 27 | red green blue blue 28 | blue red green red green 29 | red green blue blue! -------------------------------------------------------------------------------- /tests/html/table-itemize.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 7 |

              An der Gewerbeausstellung vom 1.-3.September sind wir nicht persönlich anwesend. 5 |

              6 |
              8 | -------------------------------------------------------------------------------- /tests/html/table-itemize.txt: -------------------------------------------------------------------------------- 1 | * aktuell An der Gewerbeausstellung vom 1.-3.September sind wir nicht persönlich anwesend. 2 | * projekte 3 | * zu verkaufen 4 | * offene stelle 5 | -------------------------------------------------------------------------------- /tests/html/table-pre.html: -------------------------------------------------------------------------------- 1 |

              Pre elements that have been nested in a table.

              2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 18 | 28 | 31 | 34 | 35 |
              PythonJava
              10 |
              11 | b = 1
              12 | for a in range(10):
              13 |    print(a)
              14 |    b *= a
              15 |    print(b)
              16 | 
              17 |
              19 |
              20 | int b = 1;
              21 | for (int a=0; a<10; a++) {
              22 |    System.out.println(a);
              23 |    b = b * a;
              24 |    System.out.println(b);
              25 | }
              26 | 
              27 |
              29 | 3.8 30 | 32 | 14 33 |
              36 | 37 | -------------------------------------------------------------------------------- /tests/html/table-pre.txt: -------------------------------------------------------------------------------- 1 | Pre elements that have been nested in a table. 2 | 3 | Python Java 4 | 5 | b = 1 int b = 1; 6 | for a in range(10): for (int a=0; a<10; a++) { 7 | print(a) System.out.println(a); 8 | b *= a b = b * a; 9 | print(b) System.out.println(b); 10 | } 11 | 12 | 3.8 14 13 | -------------------------------------------------------------------------------- /tests/html/table.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 |
              FirstSecondThird
              abc
              11 | -------------------------------------------------------------------------------- /tests/html/table.json: -------------------------------------------------------------------------------- 1 | {"annotation_rules": { 2 | "h1": ["heading"], 3 | "h2": ["heading"], 4 | "h3": ["heading"], 5 | "b": ["emphasis"], 6 | "table": ["table"], 7 | "th": ["table-heading"], 8 | "td": ["table-cell"] 9 | }, 10 | "result": [ 11 | ["table", "First Second Third\na b c \n"], 12 | ["table-heading", "First"], 13 | ["table-heading", "Second"], 14 | ["table-heading", "Third"], 15 | ["table-cell", "a"], 16 | ["emphasis", "b"], 17 | ["table-cell", "b"], 18 | ["table-cell", "c"] 19 | ] 20 | } 21 | -------------------------------------------------------------------------------- /tests/html/table.txt: -------------------------------------------------------------------------------- 1 | First Second Third 2 | a b c 3 | -------------------------------------------------------------------------------- /tests/html/td-only-table.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 |
              123
              6 | -------------------------------------------------------------------------------- /tests/html/td-only-table.txt: -------------------------------------------------------------------------------- 1 | 1 2 3 2 | -------------------------------------------------------------------------------- /tests/html/test.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 11 | 12 |

              Test Cases

              13 | 14 | Thomas
              • Anton
                Maria
              15 | 16 | Thomas
              • Anton
                Maria
              17 | 18 | Thomas
              • a
                Anton
                Maria
              19 | 20 |

              Other examples

              21 | 22 | 23 | The first enumeration 24 |
                25 |
              • first line

                second line 26 |
              • third line 27 |
              28 | 29 | 30 | The second enumeration 31 |
                32 |
              • first 33 |
              • second

                34 |
              • third 35 |
              • forth 36 |
              37 | 38 | The third enumeration 39 |
                40 |
              • first line

                second line

                third line 41 |
              • last line 42 |
              43 | 44 | The forth enumeration (div rather than p) 45 |
                46 |
              • first line
                second line
                third line 47 |
              • last line 48 |
              49 | 50 | 51 | Spaces between enumerated items? 52 |
                53 |
              • first line


                54 |
              • second line 55 |
              • third line

                56 |
              • last line 57 |
              58 | 59 | Normal enumeration 60 |
                61 |
              • first 62 |
              • second 63 |
              • third 64 |
              65 |
                66 |
              1. first 67 |
              2. second 68 |
              3. first 69 |
              4. second 70 |
              5. first 71 |
              6. second 72 |
              7. first 73 |
              8. second 74 |
              9. first 75 |
              10. second 76 |
              11. first 77 |
              12. second 78 |
              79 | 80 | 81 | 82 | Amen, amen ich sage euch - Ehre sei Gott in der Höhe! 83 | 84 |
              85 | 86 | 87 | Davor... 88 | Inline alles drunter? Weiter geht's? 89 | 90 | 92 | 93 |

              Block elements

              94 | This is the first line
              95 |
              Block Element - is there a space to the previous line?
              96 | 97 |

              Whitespaces

              98 | White space space 99 | und
                mehr  ...
              . 100 | 101 |

              Divs

              102 | Thomas
              Anton
              Maria 103 | 104 |

              One versus two divs

              105 | One 106 |
              Anna
              107 | Div. 108 | 109 |
              110 | 111 | Two 112 |
              Anna
              113 | Div. 114 | 115 |
              116 | 117 | Empty 118 |
              119 | Div. 120 | 121 | 122 | 123 | 124 | -------------------------------------------------------------------------------- /tests/html/tr-only-table.html: -------------------------------------------------------------------------------- 1 | 2 | 1 3 | 2 4 | 3 5 |
              6 | -------------------------------------------------------------------------------- /tests/html/tr-only-table.txt: -------------------------------------------------------------------------------- 1 | 1 2 3 2 | 3 | 4 | -------------------------------------------------------------------------------- /tests/html/whitespace.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Das 5 | ist 6 | interessant 7 | 8 | 9 | 10 | -------------------------------------------------------------------------------- /tests/html/whitespace.txt: -------------------------------------------------------------------------------- 1 | 2 | Das 3 | ist 4 | interessant 5 | -------------------------------------------------------------------------------- /tests/html/wikipedia-code.html: -------------------------------------------------------------------------------- 1 |

              Pythons Schlüsselwort lambda könnte manche Anhänger der funktionalen Programmierung fehlleiten. Solche lambda-Blöcke in Python können nur Ausdrücke enthalten, aber keine Anweisungen. Damit werden solche Anweisungen generell nicht verwendet, um eine Funktion zurückzugeben. Die übliche Vorgehensweise ist stattdessen, den Namen einer lokalen Funktion zurückzugeben. Das folgende Beispiel zeigt dies anhand einer einfachen Funktion nach den Ideen von Haskell Brooks Curry: 2 |

              3 |
              def add_and_print_maker(x):
               4 |     def temp(y):
               5 |         print("{} + {} = {}".format(x, y, x + y))
               6 | 
               7 |     return temp
               8 | 
              9 |

              Damit ist auch Currying auf einfache Art möglich, um generische Funktionsobjekte auf problemspezifische herunterzubrechen. Hier ein einfaches Beispiel: 10 |

              11 |
              def curry(func, known_argument):
              12 |     return lambda unknown_argument: func(unknown_argument, known_argument)
              13 | 
              14 |

              Wird die curry-Funktion aufgerufen, erwartet diese eine Funktion mit zwei notwendigen Parametern sowie die Parameterbelegung für den zweiten Parameter dieser Funktion. Der Rückgabewert von curry ist eine Funktion, die das Gleiche tut wie func, aber nur noch einen Parameter benötigt. 15 | -------------------------------------------------------------------------------- /tests/html/wikipedia-code.txt: -------------------------------------------------------------------------------- 1 | Pythons Schlüsselwort lambda könnte manche Anhänger der funktionalen Programmierung fehlleiten. Solche lambda-Blöcke in Python können nur Ausdrücke enthalten, aber keine Anweisungen. Damit werden solche Anweisungen generell nicht verwendet, um eine Funktion zurückzugeben. Die übliche Vorgehensweise ist stattdessen, den Namen einer lokalen Funktion zurückzugeben. Das folgende Beispiel zeigt dies anhand einer einfachen Funktion nach den Ideen von Haskell Brooks Curry: 2 | 3 | def add_and_print_maker(x): 4 | def temp(y): 5 | print("{} + {} = {}".format(x, y, x + y)) 6 | 7 | return temp 8 | 9 | 10 | Damit ist auch Currying auf einfache Art möglich, um generische Funktionsobjekte auf problemspezifische herunterzubrechen. Hier ein einfaches Beispiel: 11 | 12 | def curry(func, known_argument): 13 | return lambda unknown_argument: func(unknown_argument, known_argument) 14 | 15 | 16 | Wird die curry-Funktion aufgerufen, erwartet diese eine Funktion mit zwei notwendigen Parametern sowie die Parameterbelegung für den zweiten Parameter dieser Funktion. Der Rückgabewert von curry ist eine Funktion, die das Gleiche tut wie func, aber nur noch einen Parameter benötigt. 17 | -------------------------------------------------------------------------------- /tests/html/wikipedia-consequtive-links-and-umlauts.html: -------------------------------------------------------------------------------- 1 |

              Araschgen | 2 | Chur City | 3 | Dreibündenquartier | 4 | Fürstenwald | 5 | Giacomettiquartier | 6 | Kornquader | 7 | Lacunaquartier | 8 | Masans | 9 | Niederlachen-Untere Au | 10 | Rheinquartier | 11 | Rossboden | 12 | Sand | 13 | Sommerau | 14 | Tittwiesen | 15 | Wiesental 16 |

              17 | -------------------------------------------------------------------------------- /tests/html/wikipedia-consequtive-links-and-umlauts.txt: -------------------------------------------------------------------------------- 1 | Araschgen | Chur City | Dreibündenquartier | Fürstenwald | Giacomettiquartier | Kornquader | Lacunaquartier | Masans | Niederlachen-Untere Au | Rheinquartier | Rossboden | Sand | Sommerau | Tittwiesen | Wiesental 2 | -------------------------------------------------------------------------------- /tests/html/wikipedia-consequtive-tables.json: -------------------------------------------------------------------------------- 1 | { 2 | "annotation_rules": { 3 | "h1": ["heading"], 4 | "h2": ["heading"], 5 | "h3": ["subheading"], 6 | "h4": ["subheading"], 7 | "h5": ["subheading"], 8 | "i": ["emphasis"], 9 | "b": ["bold"], 10 | "th": ["tableheading"], 11 | "a": ["link"] 12 | }, 13 | "result": [ 14 | ["bold", "Monatliche Durchschnittstemperaturen und -niederschl\u00e4ge f\u00fcr Chur 1981\u20132010"], 15 | ["link", "Temperatur"], 16 | ["bold", "\u00d8"], 17 | ["bold", "15,1"], 18 | ["bold", "\u00d8"], 19 | ["bold", "5,6"], 20 | ["bold", "\u00d8"], 21 | ["bold", "10"], 22 | ["link", "Niederschlag"], 23 | ["bold", "\u03a3"], 24 | ["bold", "848"], 25 | ["link", "Sonnenstunden"], 26 | ["bold", "\u00d8"], 27 | ["bold", "4,6"], 28 | ["link", "Regentage"], 29 | ["bold", "\u03a3"], 30 | ["bold", "104,6"] 31 | ] 32 | } 33 | -------------------------------------------------------------------------------- /tests/html/wikipedia-enumeration-annotation.html: -------------------------------------------------------------------------------- 1 |
              2 |

              Inhaltsverzeichnis

              3 |
              4 | Another marker. 5 | 63 | 64 |

              End of enumeration

              65 | 66 | Closing remarks and an emphasized text portion. 67 | -------------------------------------------------------------------------------- /tests/html/wikipedia-enumeration-annotation.json: -------------------------------------------------------------------------------- 1 | {"annotation_rules": { 2 | "h1": ["heading"], 3 | "h2": ["heading"], 4 | "h3": ["heading"], 5 | "b": ["emphasis"], 6 | "table": ["table"], 7 | "th": ["table-heading"], 8 | "td": ["table-cell"] 9 | }, 10 | "result": [ 11 | ["heading", "Inhaltsverzeichnis\n\n"], 12 | ["emphasis", "marker"], 13 | ["emphasis", "marker2"], 14 | ["emphasis", "marker3"], 15 | ["emphasis", "marker31"], 16 | ["heading", "\nEnd of enumeration\n\n"], 17 | ["emphasis", "emphasized text portion"] 18 | ] 19 | } 20 | -------------------------------------------------------------------------------- /tests/html/wikipedia-enumeration-annotation.txt: -------------------------------------------------------------------------------- 1 | Inhaltsverzeichnis 2 | 3 | Another marker. 4 | * 1 Name und Aussprache - marker2 5 | * 2 Geographie - marker3 6 | + 2.1 Stadtquartiere - marker31 7 | + 2.2 Klima 8 | * 3 Geschichte 9 | + 3.1 Vorrömische Zeit 10 | + 3.2 Antike 11 | + 3.3 Mittelalter 12 | + 3.4 Wende zur Neuzeit 13 | + 3.5 Reformation und Dreissigjähriger Krieg 14 | + 3.6 19. 
Jahrhundert 15 | + 3.7 Moderne und Gegenwart 16 | * 4 Bevölkerung 17 | + 4.1 Sprachen 18 | + 4.2 Religionen 19 | * 5 Wappen 20 | * 6 Politik 21 | + 6.1 Stadtpräsidenten 22 | + 6.2 Partnerstädte 23 | * 7 Wirtschaft und Infrastruktur 24 | + 7.1 Wirtschaft 25 | + 7.2 Land- und Alpwirtschaft 26 | + 7.3 Verkehr 27 | + 7.4 Bildung 28 | + 7.5 Medien 29 | + 7.6 Kultur 30 | + 7.7 Justiz 31 | + 7.8 Friedhöfe 32 | + 7.9 Sportvereine 33 | * 8 Sehenswürdigkeiten und Tourismus 34 | + 8.1 Tourismus 35 | * 9 Besonderes 36 | * 10 Galerie 37 | * 11 Persönlichkeiten 38 | * 12 Siehe auch 39 | * 13 Literatur 40 | * 14 Weblinks 41 | * 15 Einzelnachweise 42 | 43 | End of enumeration 44 | 45 | Closing remarks and an emphasized text portion. 46 | -------------------------------------------------------------------------------- /tests/html/wikipedia-enumeration.html: -------------------------------------------------------------------------------- 1 |
              2 | Inhaltsverzeichnis 3 |
              4 | 62 | -------------------------------------------------------------------------------- /tests/html/wikipedia-enumeration.txt: -------------------------------------------------------------------------------- 1 | Inhaltsverzeichnis 2 | * 1 Name und Aussprache 3 | * 2 Geographie 4 | + 2.1 Stadtquartiere 5 | + 2.2 Klima 6 | * 3 Geschichte 7 | + 3.1 Vorrömische Zeit 8 | + 3.2 Antike 9 | + 3.3 Mittelalter 10 | + 3.4 Wende zur Neuzeit 11 | + 3.5 Reformation und Dreissigjähriger Krieg 12 | + 3.6 19. Jahrhundert 13 | + 3.7 Moderne und Gegenwart 14 | * 4 Bevölkerung 15 | + 4.1 Sprachen 16 | + 4.2 Religionen 17 | * 5 Wappen 18 | * 6 Politik 19 | + 6.1 Stadtpräsidenten 20 | + 6.2 Partnerstädte 21 | * 7 Wirtschaft und Infrastruktur 22 | + 7.1 Wirtschaft 23 | + 7.2 Land- und Alpwirtschaft 24 | + 7.3 Verkehr 25 | + 7.4 Bildung 26 | + 7.5 Medien 27 | + 7.6 Kultur 28 | + 7.7 Justiz 29 | + 7.8 Friedhöfe 30 | + 7.9 Sportvereine 31 | * 8 Sehenswürdigkeiten und Tourismus 32 | + 8.1 Tourismus 33 | * 9 Besonderes 34 | * 10 Galerie 35 | * 11 Persönlichkeiten 36 | * 12 Siehe auch 37 | * 13 Literatur 38 | * 14 Weblinks 39 | * 15 Einzelnachweise 40 | -------------------------------------------------------------------------------- /tests/html/wikipedia-equation.html: -------------------------------------------------------------------------------- 1 | 2 | 3 |
              int factorial(int x) {
               4 |     if (x <= 1)
               5 |             return 1;
               6 | 
               7 |                 return x * factorial(x - 1);
               8 |                 }
               9 | 
              10 | 11 | -------------------------------------------------------------------------------- /tests/html/wikipedia-equation.txt: -------------------------------------------------------------------------------- 1 | int factorial(int x) { 2 | if (x <= 1) 3 | return 1; 4 | 5 | return x * factorial(x - 1); 6 | } 7 | 8 | -------------------------------------------------------------------------------- /tests/html/wikipedia-table-bordercase-verticial-alignmnet.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 13 | 15 | 27 |
              4 | 12 | 14 | 16 | 26 | 28 |
              29 | -------------------------------------------------------------------------------- /tests/html/wikipedia-table-bordercase-verticial-alignmnet.json: -------------------------------------------------------------------------------- 1 | { 2 | "annotation_rules": { 3 | "h1": ["heading"], 4 | "h2": ["heading"], 5 | "h3": ["subheading"], 6 | "h4": ["subheading"], 7 | "h5": ["subheading"], 8 | "i": ["emphasis"], 9 | "b": ["bold"], 10 | "th": ["tableheading"], 11 | "a": ["link"] 12 | }, 13 | "result": [ 14 | ["link", " * Araschgen"], 15 | ["link", " * F\u00fcrstenwald"], 16 | ["link", " * Masans"], 17 | ["link", " * Niederlachen-Untere Au"], 18 | ["link", " * Lacuna"], 19 | ["link", " * Giacomettiquartier"], 20 | ["link", " * Chur West"], 21 | ["link", " * Dreib\u00fcnden"], 22 | ["link", " * Sand"], 23 | ["link", " * Kornquader"], 24 | ["link", " * Rheinquartier"], 25 | ["link", " * Rossboden"], 26 | ["link", "Sommerau"], 27 | ["link", " * Wiesental"], 28 | ["link", " * Tittwiesen"], 29 | ["link", "[8]"] 30 | ] 31 | } 32 | -------------------------------------------------------------------------------- /tests/html/wikipedia-table-bordercase1.html: -------------------------------------------------------------------------------- 1 |
              2 | Dieser Artikel behandelt den Bündner Hauptort. Für andere Bedeutungen siehe Chur (Begriffsklärung).
              3 |
              4 | 5 | 6 | 8 | 9 | 11 | 12 | 14 | 16 | 17 | 19 | 21 |
              Chur 7 |
              Wappen von Chur
              10 |
              Staat: 13 | SchweizSchweiz Schweiz 15 |
              Kanton: 18 | Kanton GraubündenKanton Graubünden Graubünden (GR) 20 |
              22 | -------------------------------------------------------------------------------- /tests/html/wikipedia-table-bordercase1.json: -------------------------------------------------------------------------------- 1 | { 2 | "annotation_rules": { 3 | "h1": ["heading"], 4 | "h2": ["heading"], 5 | "h3": ["subheading"], 6 | "h4": ["subheading"], 7 | "h5": ["subheading"], 8 | "i": ["emphasis"], 9 | "b": ["bold"], 10 | "th": ["tableheading"], 11 | "a": ["link"] 12 | }, 13 | "result": [ 14 | ["link", "Chur (Begriffskl\u00e4rung)"], 15 | ["tableheading", "Chur "], 16 | ["link", "Staat"], 17 | ["link", "Schweiz"], 18 | ["link", "Kanton"], 19 | ["link", "Graub\u00fcnden"] 20 | ] 21 | } 22 | -------------------------------------------------------------------------------- /tests/html/wikipedia-table.html: -------------------------------------------------------------------------------- 1 |

              Ehre sei Gott in der Höhe!

              2 | und Friede den Menschen, die guten Willens sind. 3 | 4 |

              Bevölkerung[Bearbeiten]

              5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 |
              Bevölkerungsentwicklung[6]
              Jahr150018601900195019702000200520112012
              Einwohnerca. 1500399011'53219'38231'19332'98932'40936'69037'036
              34 | -------------------------------------------------------------------------------- /tests/html/wikipedia-table.json: -------------------------------------------------------------------------------- 1 | { 2 | "annotation_rules": { 3 | "h1": ["heading"], 4 | "h2": ["heading"], 5 | "h3": ["subheading"], 6 | "h4": ["subheading"], 7 | "h5": ["subheading"], 8 | "i": ["emphasis"], 9 | "b": ["bold"], 10 | "table": ["table"], 11 | "th": ["tableheading"], 12 | "a": ["link"] 13 | }, 14 | "result": [ 15 | ["heading", "Ehre sei Gott in der H\u00f6he!\n\n"], 16 | ["link", "Bearbeiten"], 17 | ["heading", "\nBev\u00f6lkerung[Bearbeiten]\n\n"], 18 | ["table", "Bev\u00f6lkerungsentwicklung[6]\nJahr 1500 1860 1900 1950 1970 2000 2005 2011 2012 \nEinwohner ca. 1500 3990 11'532 19'382 31'193 32'989 32'409 36'690 37'036\n"], 19 | ["link", "[6]"], 20 | ["tableheading", "Bev\u00f6lkerungsentwicklung[6]"], 21 | ["tableheading", "Jahr"], 22 | ["tableheading", "1500"], 23 | ["tableheading", "1860"], 24 | ["tableheading", "1900"], 25 | ["tableheading", "1950"], 26 | ["tableheading", "1970"], 27 | ["tableheading", "2000"], 28 | ["tableheading", "2005"], 29 | ["tableheading", "2011"], 30 | ["tableheading", "2012"], 31 | ["tableheading", "Einwohner"] 32 | ] 33 | } 34 | -------------------------------------------------------------------------------- /tests/html/wikipedia-table.txt: -------------------------------------------------------------------------------- 1 | Ehre sei Gott in der Höhe! 2 | 3 | und Friede den Menschen, die guten Willens sind. 4 | 5 | Bevölkerung[Bearbeiten] 6 | 7 | Bevölkerungsentwicklung[6] 8 | Jahr 1500 1860 1900 1950 1970 2000 2005 2011 2012 9 | Einwohner ca. 
1500 3990 11'532 19'382 31'193 32'989 32'409 36'690 37'036 -------------------------------------------------------------------------------- /tests/test_annotation.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | 4 | """ 5 | Tests the Table formatting with different parameters such as width and 6 | alignment 7 | """ 8 | 9 | from inscriptis.annotation import Annotation, horizontal_shift 10 | from inscriptis.html_properties import HorizontalAlignment 11 | 12 | 13 | def test_horizontal_shift(): 14 | a = [Annotation(0, 4, "test")] 15 | 16 | # no shift 17 | assert horizontal_shift( 18 | a, content_width=5, line_width=10, align=HorizontalAlignment.left, shift=0 19 | ).pop() == Annotation(0, 4, "test") 20 | 21 | # shift 22 | assert horizontal_shift( 23 | a, content_width=5, line_width=10, align=HorizontalAlignment.left, shift=3 24 | ).pop() == Annotation(3, 7, "test") 25 | 26 | # realignment to the right 27 | assert horizontal_shift( 28 | a, 29 | content_width=len("test"), 30 | line_width=10, 31 | align=HorizontalAlignment.right, 32 | shift=0, 33 | ).pop() == Annotation(6, 10, "test") 34 | assert "{:>10}".format("test")[6:10] == "test" 35 | 36 | # shift + realignment to the right 37 | assert horizontal_shift( 38 | a, 39 | content_width=len("test"), 40 | line_width=10, 41 | align=HorizontalAlignment.right, 42 | shift=3, 43 | ).pop() == Annotation(9, 13, "test") 44 | 45 | # realignment to the center 46 | assert horizontal_shift( 47 | a, 48 | content_width=len("test"), 49 | line_width=10, 50 | align=HorizontalAlignment.center, 51 | shift=0, 52 | ).pop() == Annotation(3, 7, "test") 53 | assert "{:^10}".format("test")[3:7] == "test" 54 | 55 | assert horizontal_shift( 56 | a, 57 | content_width=len("test"), 58 | line_width=11, 59 | align=HorizontalAlignment.center, 60 | shift=0, 61 | ).pop() == Annotation(3, 7, "test") 62 | assert "{:^11}".format("test")[3:7] == "test" 63 | 64 | # realignment 
+ shift 65 | assert horizontal_shift( 66 | a, 67 | content_width=len("test"), 68 | line_width=11, 69 | align=HorizontalAlignment.center, 70 | shift=7, 71 | ).pop() == Annotation(10, 14, "test") 72 | -------------------------------------------------------------------------------- /tests/test_annotation_engine.py: -------------------------------------------------------------------------------- 1 | # test the annotation handling 2 | 3 | import pytest 4 | 5 | from inscriptis.annotation import Annotation 6 | from inscriptis.html_engine import Inscriptis 7 | from inscriptis.model.config import ParserConfig 8 | from lxml.html import fromstring 9 | 10 | 11 | def test_get_annotation(): 12 | """Test get_anntation from the Inscriptis class""" 13 | html = "Chur is a City in Switzerland" 14 | rules = {"b": ["bold"]} 15 | 16 | inscriptis = Inscriptis(fromstring(html), ParserConfig(annotation_rules=rules)) 17 | 18 | assert inscriptis.get_text() == "Chur is a City in Switzerland" 19 | assert inscriptis.get_annotations() == [ 20 | Annotation(start=0, end=4, metadata="bold"), 21 | Annotation(start=18, end=29, metadata="bold"), 22 | ] 23 | -------------------------------------------------------------------------------- /tests/test_annotation_output_processor.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | """ 4 | Test the annotation output formatter. 
5 | """ 6 | 7 | import pytest 8 | 9 | from inscriptis.annotation.output import AnnotationProcessor 10 | from inscriptis.annotation.output.html import HtmlExtractor 11 | from inscriptis.annotation.output.surface import SurfaceExtractor 12 | from inscriptis.annotation.output.xml import XmlExtractor 13 | 14 | EXAMPLE_OUTPUT = { 15 | "text": "Chur\n\nChur is the capital and largest town of " 16 | "the Swiss canton of the Grisons and lies in the " 17 | "Grisonian Rhine Valley.", 18 | "label": [[0, 4, "h1"], [0, 4, "heading"], [6, 10, "emphasis"]], 19 | } 20 | 21 | 22 | def test_abstract_class(): 23 | processor = AnnotationProcessor() 24 | 25 | with pytest.raises(NotImplementedError): 26 | result = processor(EXAMPLE_OUTPUT) 27 | 28 | 29 | def test_surface_annotator(): 30 | processor = SurfaceExtractor() 31 | result = processor(EXAMPLE_OUTPUT) 32 | 33 | # the old keys haven't been changed 34 | assert "text" in result 35 | assert "label" in result 36 | 37 | # and we have additional information on surface forms :) 38 | assert result["surface"] == [ 39 | ("h1", "Chur"), 40 | ("heading", "Chur"), 41 | ("emphasis", "Chur"), 42 | ] 43 | 44 | 45 | def test_xml_annotator(): 46 | processor = XmlExtractor() 47 | result = processor(EXAMPLE_OUTPUT) 48 | 49 | # and we have additional information on surface forms :) 50 | assert result == ( 51 | '\n\n' 52 | "

              Chur

              \n\n" 53 | "Chur is the capital and largest town " 54 | "of the Swiss canton of the Grisons and lies in " 55 | "the Grisonian Rhine Valley.\n
              " 56 | ) 57 | 58 | 59 | def test_html_annotator(): 60 | processor = HtmlExtractor() 61 | result = processor(EXAMPLE_OUTPUT) 62 | 63 | assert result.startswith("")[1] == ( 65 | "" 66 | '
              heading'
              67 |         ''
              68 |         'h1'
              69 |         "Chur
              \n" 70 | "
              \n"
              71 |         '
              emphasis'
              72 |         'Chur is the capital '
              73 |         "and largest town of the Swiss canton of the "
              74 |         "Grisons and lies in the Grisonian Rhine Valley."
              75 |         "
              " 76 | ) 77 | 78 | 79 | def test_trailing_tag_annotation(): 80 | processor = XmlExtractor() 81 | result = processor({"text": "Ehre sei Gott!", "label": [[9, 14, "emphasis"]]}) 82 | 83 | assert result == ( 84 | '\n\n' 85 | "Ehre sei Gott!\n" 86 | ) 87 | -------------------------------------------------------------------------------- /tests/test_annotation_output_xml.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | """ 4 | Test the annotation XmlExtractor. 5 | """ 6 | from lxml.html import fromstring 7 | 8 | from inscriptis import Inscriptis, ParserConfig 9 | from inscriptis.annotation.output.xml import XmlExtractor 10 | 11 | 12 | def test_tag_error_issue_93(): 13 | """ 14 | Test for the correct tag order in the XmlOutput as described in Issue #93. 15 | """ 16 | html_issue_93 = """ 17 | 18 |
              19 | Item1 20 | Item2 21 | Item3 22 | Item4 23 |
              24 | 25 | """ 26 | 27 | expected_output_issue_93 = ( 28 | """\n\n""" 29 | " Item1 Item2 Item3 " 30 | "Item4\n" 31 | ) 32 | rules = {"div#class=a": ["outer"], "span#class=b": ["inner"]} 33 | 34 | inscriptis = Inscriptis( 35 | fromstring(html_issue_93), ParserConfig(annotation_rules=rules) 36 | ) 37 | annotated_html = { 38 | "text": inscriptis.get_text(), 39 | "label": inscriptis.get_annotations(), 40 | } 41 | result = XmlExtractor()(annotated_html) 42 | assert result == expected_output_issue_93 43 | 44 | 45 | def test_tag_folding_issue_93_extended(): 46 | html_issue_93 = """ 47 | 48 |
              49 | Some Test to add :) 50 | Item1 51 | Item2 52 | Item3 53 | Item4 54 |
              55 | 56 | """ 57 | 58 | expected_output_issue_93 = ( 59 | """\n""" 60 | """\n""" 61 | """ Some Test to add :) Item 1 Item2 """ 62 | """Item3 It e m4\n""" 63 | """""" 64 | ) 65 | rules = {"div#class=a": ["outer"], "span#class=b": ["inner"], "b": ["bold"]} 66 | 67 | inscriptis = Inscriptis( 68 | fromstring(html_issue_93), ParserConfig(annotation_rules=rules) 69 | ) 70 | annotated_html = { 71 | "text": inscriptis.get_text(), 72 | "label": inscriptis.get_annotations(), 73 | } 74 | result = XmlExtractor()(annotated_html) 75 | assert result == expected_output_issue_93 76 | -------------------------------------------------------------------------------- /tests/test_annotation_rule_parsing.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | 4 | """ 5 | Tests the Table formatting with different parameters such as width and 6 | alignment 7 | """ 8 | 9 | from copy import deepcopy 10 | 11 | from inscriptis.css_profiles import CSS_PROFILES 12 | from inscriptis.annotation.parser import AnnotationModel, ApplyAnnotation 13 | from inscriptis.model.attribute import Attribute 14 | from inscriptis.model.html_element import HtmlElement 15 | 16 | 17 | def test_parse(): 18 | """ 19 | basic rule parsing. 20 | """ 21 | rules = {"table#border=1": ["table"], "hr": ["horizontal-line"]} 22 | tags, attrs = AnnotationModel._parse(rules) 23 | 24 | assert tags == {"hr": ["horizontal-line"]} 25 | 26 | apply_annotation = attrs[0] 27 | assert apply_annotation.match_tag == "table" 28 | assert apply_annotation.match_value == "1" 29 | assert apply_annotation.attr == "border" 30 | 31 | e = HtmlElement(tag="table") 32 | apply_annotation.apply("1", e) 33 | assert e.annotation == ("table",) 34 | 35 | 36 | def test_apply_annotation(): 37 | """ 38 | rule application. 
39 | """ 40 | rules = { 41 | "table#border=1": ["table"], 42 | "hr": ["horizontal-line"], 43 | "#color=red": ["red"], 44 | "#bgcolor": ["bgcolor"], 45 | } 46 | 47 | css = deepcopy(CSS_PROFILES["strict"]) 48 | annotation_model = AnnotationModel(css, rules) 49 | assert annotation_model.css["hr"].annotation == ("horizontal-line",) 50 | 51 | attribute_handler = Attribute() 52 | attribute_handler.merge_attribute_map(annotation_model.css_attr) 53 | assert "table#border=1" in str(attribute_handler.attribute_mapping["border"]) 54 | assert "{any}#color=red" in str(attribute_handler.attribute_mapping["color"]) 55 | assert "{any}#bgcolor={any}" in str(attribute_handler.attribute_mapping["bgcolor"]) 56 | 57 | 58 | def test_merged_attribute(): 59 | """ 60 | test multiple rules per attribute 61 | """ 62 | rules = {"#color=white": ["white"], "#color=yellow": ["yellow"]} 63 | css = deepcopy(CSS_PROFILES["strict"]) 64 | annotation_model = AnnotationModel(css, rules) 65 | 66 | attribute_handler = Attribute() 67 | attribute_handler.merge_attribute_map(annotation_model.css_attr) 68 | 69 | e = HtmlElement() 70 | attribute_handler.attribute_mapping["color"]("green", e) 71 | assert e.annotation == () 72 | attribute_handler.attribute_mapping["color"]("yellow", e) 73 | assert e.annotation == ("yellow",) 74 | attribute_handler.attribute_mapping["color"]("white", e) 75 | assert e.annotation == ("yellow", "white") 76 | -------------------------------------------------------------------------------- /tests/test_block.py: -------------------------------------------------------------------------------- 1 | """ 2 | Test cases for the Block class. 
3 | """ 4 | 5 | from inscriptis.model.canvas.block import Block 6 | from inscriptis.model.canvas.prefix import Prefix 7 | 8 | 9 | def test_merge_normal_text_collapsable_whitespaces(): 10 | """ 11 | test cases where the block has collapsable whitespaces 12 | """ 13 | b = Block(0, Prefix()) 14 | b.merge_normal_text("Hallo") 15 | assert b._content == "Hallo" 16 | assert not b.collapsable_whitespace 17 | 18 | b = Block(0, Prefix()) 19 | b.merge_normal_text(" Hallo ") 20 | assert b._content == "Hallo " 21 | assert b.collapsable_whitespace 22 | 23 | b = Block(0, Prefix()) 24 | b.merge_normal_text("") 25 | assert b._content == "" 26 | assert b.collapsable_whitespace 27 | 28 | b.merge_normal_text(" ") 29 | assert b._content == "" 30 | assert b.collapsable_whitespace 31 | 32 | b.merge_normal_text(" ") 33 | assert b._content == "" 34 | assert b.collapsable_whitespace 35 | 36 | 37 | def test_merge_normal_non_collapsable_whitespaces(): 38 | b = Block(0, Prefix()) 39 | b.collapsable_whitespace = False 40 | b.merge_normal_text("Hallo") 41 | assert b._content == "Hallo" 42 | assert not b.collapsable_whitespace 43 | 44 | b = Block(0, Prefix()) 45 | b.collapsable_whitespace = False 46 | b.merge_normal_text(" Hallo ") 47 | assert b._content == " Hallo " 48 | assert b.collapsable_whitespace 49 | 50 | b = Block(0, Prefix()) 51 | b.collapsable_whitespace = False 52 | b.merge_normal_text("") 53 | assert b._content == "" 54 | assert not b.collapsable_whitespace 55 | 56 | b = Block(0, Prefix()) 57 | b.collapsable_whitespace = False 58 | b.merge_normal_text(" ") 59 | assert b._content == " " 60 | assert b.collapsable_whitespace 61 | 62 | b = Block(0, Prefix()) 63 | b.collapsable_whitespace = False 64 | b.merge_normal_text(" ") 65 | assert b._content == " " 66 | assert b.collapsable_whitespace 67 | -------------------------------------------------------------------------------- /tests/test_broken_table_handling.py: 
-------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | 4 | """ 5 | Tests the handling of tables that do not properly close all column tags. 6 | """ 7 | 8 | from inscriptis import get_text 9 | from inscriptis.css_profiles import CSS_PROFILES 10 | from inscriptis.model.config import ParserConfig 11 | 12 | config = ParserConfig(css=CSS_PROFILES["strict"]) 13 | 14 | 15 | def test_forgotten_td_close_tag(): 16 | # one line (i.e., missing before the next and the next 17 | html = "hallo" "" "
              12
              echo" 18 | print(html) 19 | # assert get_text(html, config) == u'hallo\n1 2\necho' 20 | 21 | # two lines (i.e. missing before the and before the 22 | html = "hallo" "
              12" "
              34" "
              echo" 23 | print(html) 24 | assert get_text(html, config) == "hallo\n1 2\n3 4\n\necho" 25 | -------------------------------------------------------------------------------- /tests/test_cli.py: -------------------------------------------------------------------------------- 1 | """ 2 | Tests the Inscriptis CLI client. 3 | """ 4 | 5 | from io import StringIO 6 | from pathlib import Path 7 | from json import loads 8 | from unittest.mock import Mock, mock_open, patch, call 9 | 10 | import pytest 11 | 12 | from inscriptis.cli.inscript import cli 13 | 14 | INPUT_DATA = """Hello World!""" 15 | 16 | 17 | def test_cli_read_from_stdin(monkeypatch, capsys): 18 | """Test converting HTML from standard input with the command line client.""" 19 | # Use monkeypatch to replace the 'input' function 20 | monkeypatch.setattr("sys.argv", ["inscript"]) 21 | monkeypatch.setattr("sys.stdin", StringIO(INPUT_DATA)) 22 | cli() 23 | 24 | # Capture the printed output 25 | captured = capsys.readouterr() 26 | assert captured.out.strip() == "Hello World!" 
27 | 28 | 29 | def test_cli_read_from_stdin_write_to_file(monkeypatch, capsys): 30 | """Test converting HTML from standard input with the command line client and 31 | writing it to a file.""" 32 | # Use monkeypatch to replace the 'input' function 33 | monkeypatch.setattr("sys.argv", ["inscript", "--output", "test.txt"]) 34 | monkeypatch.setattr("sys.stdin", StringIO(INPUT_DATA)) 35 | with patch("pathlib.Path.open", create=True) as mock_file: 36 | cli() 37 | 38 | # Capture the printed output 39 | captured = capsys.readouterr() 40 | assert captured.out.strip() == "" 41 | # Capture the text written to the mock output file 42 | assert call().__enter__().write("Hello World!") in mock_file.mock_calls 43 | 44 | 45 | def test_cli_read_from_file(monkeypatch, capsys): 46 | """Test converting HTML from a file with the command line client.""" 47 | # Use monkeypatch to replace the 'input' function 48 | monkeypatch.setattr("sys.argv", ["inscript", "test.html"]) 49 | monkeypatch.setattr("pathlib.Path.is_file", lambda _: True) 50 | monkeypatch.setattr("pathlib.Path.open", mock_open(read_data=INPUT_DATA)) 51 | cli() 52 | 53 | # Capture the printed output 54 | captured = capsys.readouterr() 55 | assert captured.out.strip() == "Hello World!" 56 | 57 | 58 | def test_cli_read_from_url(monkeypatch, capsys): 59 | """Test converting HTML from a URL with the command line client.""" 60 | # Use monkeypatch to replace the 'input' function 61 | monkeypatch.setattr("sys.argv", ["inscript", "https://www.fhgr.ch/test.html"]) 62 | 63 | mock_request = Mock() 64 | mock_request.content = INPUT_DATA.encode("utf8") 65 | mock_request.encoding = "utf-8" 66 | monkeypatch.setattr("requests.get", lambda url, timeout=0: mock_request) 67 | cli() 68 | 69 | # Capture the printed output 70 | captured = capsys.readouterr() 71 | assert captured.out.strip() == "Hello World!"
72 | 73 | 74 | def test_cli_annotations(monkeypatch, capsys): 75 | """Test annotation handling in the command line client.""" 76 | # Prepare input data for the test 77 | annotation_rule_path = ( 78 | Path(__file__).parent / "data" / "annotation-profile-unittest.json" 79 | ) 80 | 81 | # Use monkeypatch to replace the 'input' function 82 | monkeypatch.setattr( 83 | "sys.argv", ["inscript", "-p", "surface", "-r", str(annotation_rule_path)] 84 | ) 85 | monkeypatch.setattr("sys.stdin", StringIO(INPUT_DATA)) 86 | cli() 87 | 88 | # Capture the printed json data and convert it to an object 89 | captured = loads(capsys.readouterr().out.strip()) 90 | assert captured["text"].strip() == "Hello World!" 91 | assert captured["label"] == [[6, 11, "emphasis"]] 92 | assert captured["surface"] == [["emphasis", "World"]] 93 | 94 | 95 | def test_help(monkeypatch, capsys): 96 | monkeypatch.setattr("sys.argv", ["inscript", "--version"]) 97 | 98 | # the cli should exit with exit code 0 99 | with pytest.raises(SystemExit) as exit_info: 100 | cli() 101 | assert exit_info.value.code == 0 102 | 103 | captured = capsys.readouterr().out 104 | assert captured.startswith("Inscript HTML to text conversion") 105 | assert "Inscript comes with ABSOLUTELY NO WARRANTY." 
in captured 106 | 107 | 108 | def test_missing_input_file(monkeypatch, capsys): 109 | monkeypatch.setattr("sys.argv", ["inscript", "test.html"]) 110 | with pytest.raises(SystemExit) as exit_info: 111 | cli() 112 | 113 | captured = capsys.readouterr() 114 | assert exit_info.value.code == -1 115 | assert captured.out.strip().startswith("ERROR: Cannot open input file") 116 | 117 | 118 | def test_missing_annotation_file(monkeypatch, capsys): 119 | monkeypatch.setattr("sys.argv", ["inscript", "--annotation-rules", "rules.json"]) 120 | monkeypatch.setattr("sys.stdin", StringIO(INPUT_DATA)) 121 | with pytest.raises(SystemExit) as exit_info: 122 | cli() 123 | 124 | captured = capsys.readouterr() 125 | assert exit_info.value.code == -1 126 | assert captured.out.strip().startswith("ERROR: Cannot open annotation rule file") 127 | -------------------------------------------------------------------------------- /tests/test_custom_html_tag_handling.py: -------------------------------------------------------------------------------- 1 | """Test the custom HTML tag handling.""" 2 | 3 | from lxml.html import fromstring 4 | 5 | from inscriptis import Inscriptis, ParserConfig 6 | from inscriptis.model.html_document_state import HtmlDocumentState 7 | from inscriptis.model.tag import CustomHtmlTagHandlerMapping 8 | 9 | 10 | def test_custom_html_handler(): 11 | def my_handle_start_b(state: HtmlDocumentState, _): 12 | """Handle the opening tag.""" 13 | state.tags[-1].write("**") 14 | 15 | def my_handle_end_b(state: HtmlDocumentState): 16 | """Handle the closing tag.""" 17 | state.tags[-1].write("**") 18 | 19 | custom_mapping = CustomHtmlTagHandlerMapping( 20 | start_tag_mapping={"b": my_handle_start_b}, 21 | end_tag_mapping={"b": my_handle_end_b}, 22 | ) 23 | 24 | html_tree = fromstring("Welcome to Chur") 25 | inscriptis = Inscriptis( 26 | html_tree, ParserConfig(custom_html_tag_handler_mapping=custom_mapping) 27 | ) 28 | 29 | # custom HTML Handler 30 | assert 
inscriptis.get_text().strip() == "Welcome to **Chur**" 31 | # standard HTML handler 32 | assert Inscriptis(html_tree).get_text().strip() == "Welcome to Chur" 33 | -------------------------------------------------------------------------------- /tests/test_double_a.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | """ ensures that two successive text contain 4 | a space between each other, if there is a linebreak 5 | or space between the tags. 6 | """ 7 | 8 | from inscriptis import get_text 9 | 10 | 11 | def test_successive_a(): 12 | html = ( 13 | 'first' 14 | 'second' 15 | ) 16 | assert get_text(html) == "firstsecond" 17 | 18 | html = ( 19 | 'first\n' 20 | 'second' 21 | ) 22 | assert get_text(html) == "first second" 23 | -------------------------------------------------------------------------------- /tests/test_empty_string.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | """ ensures that two successive text contain 4 | a space between each other, if there is a linebreak 5 | or space between the tags. 6 | """ 7 | 8 | from inscriptis import get_text 9 | 10 | 11 | def test_empty_and_corrupt(): 12 | assert get_text("test").strip() == "test" 13 | assert get_text(" ") == "" 14 | assert get_text("") == "" 15 | # test for the behaviour of older and recent lxml versions. 
16 | assert get_text("<<<").strip() in ("<<<", "<<", "") 17 | -------------------------------------------------------------------------------- /tests/test_engine.py: -------------------------------------------------------------------------------- 1 | # test borderline cases 2 | 3 | from inscriptis import get_text, get_annotated_text 4 | 5 | 6 | def test_text_from_empty_content(): 7 | assert get_text("") == "" 8 | 9 | 10 | def test_annotations_from_empty_content(): 11 | assert get_annotated_text("") == {} 12 | -------------------------------------------------------------------------------- /tests/test_html_conversion_options.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | """ 4 | Tests different HTML to text conversion options. 5 | """ 6 | 7 | from inscriptis import get_text 8 | from inscriptis.model.config import ParserConfig 9 | 10 | 11 | def test_display_links(): 12 | html = """ 13 | 14 | first 15 | second 16 | third 17 | 18 | 19 | """ 20 | config = ParserConfig(display_links=True) 21 | assert get_text(html, config).strip() == "[first](first) [second](second) third" 22 | 23 | 24 | def test_display_anchors(): 25 | html = """ 26 | 27 | first 28 | second 29 | 30 | 31 | """ 32 | config = ParserConfig(display_anchors=True) 33 | assert get_text(html, config).strip() == "[first](first) second" 34 | 35 | 36 | def test_display_links_and_anchors(): 37 | html = """ 38 | 39 | first 40 | second 41 | third 42 | 43 | 44 | """ 45 | config = ParserConfig(display_links=True, display_anchors=True) 46 | assert ( 47 | get_text(html, config).strip() 48 | == "[first](first) [second](second) [third](third)" 49 | ) 50 | 51 | 52 | def test_display_images(): 53 | html = """ 54 | 55 | Ein Test Bild 56 | Ein Test Bild 57 | Ein zweites Bild 58 | 59 | 60 | """ 61 | config = ParserConfig(display_images=True) 62 | assert ( 63 | get_text(html, config).strip() 64 | == "[Ein Test Bild] [Ein Test Bild] [Ein zweites Bild]" 65 | ) 
66 | 67 | 68 | def test_display_images_deduplicated(): 69 | html = """ 70 | 71 | Ein Test Bild 72 | Ein Test Bild 73 | Ein zweites Bild 74 | 75 | 76 | """ 77 | config = ParserConfig(display_images=True, deduplicate_captions=True) 78 | assert get_text(html, config).strip() == "[Ein Test Bild] [Ein zweites Bild]" 79 | -------------------------------------------------------------------------------- /tests/test_html_snippets.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | """ 4 | Test HTML snippets in the project's HTML directory. The corresponding .txt file 5 | contains the reference conversion. 6 | """ 7 | from os.path import dirname, join 8 | from glob import glob 9 | 10 | from inscriptis import get_text 11 | from inscriptis.css_profiles import CSS_PROFILES 12 | from inscriptis.model.config import ParserConfig 13 | 14 | TESTCASE_PATTERN = join(dirname(__file__), "html/*.txt") 15 | 16 | 17 | def test_html_snippets(filter_str=""): 18 | for testcase_txt in glob(TESTCASE_PATTERN): 19 | if filter_str not in testcase_txt: 20 | continue 21 | 22 | with open(testcase_txt) as f: 23 | reference_txt = f.read().rstrip() 24 | 25 | with open(testcase_txt.replace(".txt", ".html")) as f: 26 | print(f.name) 27 | html = "{}".format(f.read()) 28 | 29 | converted_txt = get_text( 30 | html, ParserConfig(css=CSS_PROFILES["strict"]) 31 | ).rstrip() 32 | 33 | if converted_txt != reference_txt: 34 | print( 35 | "File:{}\nHTML:\n{}\n\nReference:\n{}\n\nConverted:\n{}".format( 36 | testcase_txt, html, reference_txt, converted_txt 37 | ) 38 | ) 39 | print("HTML file:", testcase_txt.replace(".txt", ".html")) 40 | print("Visualize differences with `vimdiff reference.txt " "converted.txt`") 41 | open("reference.txt", "w").write(reference_txt) 42 | open("converted.txt", "w").write(converted_txt) 43 | 44 | assert converted_txt == reference_txt 45 | 46 | 47 | if __name__ == "__main__": 48 | from sys import argv 49 | 50 | 
filter_str = argv[1] if len(argv) > 1 else "" 51 | test_html_snippets(filter_str) 52 | -------------------------------------------------------------------------------- /tests/test_html_snippets_annotations.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | """ 4 | This test case verifies that annotation are correctly computed. 5 | """ 6 | import os 7 | from json import load 8 | from glob import glob 9 | from typing import List 10 | 11 | from inscriptis import get_annotated_text 12 | from inscriptis.css_profiles import CSS_PROFILES 13 | from inscriptis.model.config import ParserConfig 14 | 15 | TESTCASE_PATTERN = os.path.join(os.path.dirname(__file__), "html/*.json") 16 | 17 | 18 | def assert_equal_ignoring_whitespace( 19 | reference: List[str], converted: List[str] 20 | ) -> bool: 21 | for (ref_tag, ref_str), (conv_tag, conv_str) in zip(reference, converted): 22 | assert ref_tag == conv_tag 23 | assert "".join(ref_str.split()) == "".join(conv_str.split()) 24 | 25 | 26 | def test_html_annotations(filter_str=""): 27 | for annotation_file in glob(TESTCASE_PATTERN): 28 | if filter_str not in annotation_file: 29 | continue 30 | 31 | with open(annotation_file) as f: 32 | reference = load(f) 33 | 34 | with open(annotation_file.replace(".json", ".html")) as f: 35 | print(f.name) 36 | html = "{}".format(f.read()) 37 | 38 | for indentation_strategy in ("strict", "relaxed"): 39 | result = get_annotated_text( 40 | html, 41 | ParserConfig( 42 | css=CSS_PROFILES[indentation_strategy], 43 | annotation_rules=reference["annotation_rules"], 44 | ), 45 | ) 46 | 47 | converted = [[a[2], result["text"][a[0] : a[1]]] for a in result["label"]] 48 | 49 | if reference["result"] != converted: 50 | print("Reference:") 51 | print(reference["result"]) 52 | print( 53 | "\nConverted (indentation strategy: {})".format( 54 | indentation_strategy 55 | ) 56 | ) 57 | print(converted) 58 | 59 | if indentation_strategy == "strict": 60 
| assert reference["result"] == converted 61 | else: 62 | assert_equal_ignoring_whitespace(reference["result"], converted) 63 | 64 | 65 | if __name__ == "__main__": 66 | from sys import argv 67 | 68 | filter_str = argv[1] if len(argv) > 1 else "" 69 | test_html_annotations(filter_str) 70 | -------------------------------------------------------------------------------- /tests/test_invalid_float_specification.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | 4 | """ 5 | Tests handling of invalid length specifications. 6 | (https://github.com/weblyzard/inscriptis/issues/63) 7 | """ 8 | 9 | from inscriptis import get_text 10 | 11 | 12 | def test_invalid_length_specification_handling(): 13 | html = """

              """ 14 | print(get_text(html)) 15 | -------------------------------------------------------------------------------- /tests/test_limit_whitespace_affixes.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | """ 4 | Tests different HTML to text conversion options. 5 | """ 6 | 7 | from copy import copy 8 | from inscriptis import get_text 9 | from inscriptis.css_profiles import RELAXED_CSS_PROFILE 10 | from inscriptis.html_properties import Display, WhiteSpace 11 | from inscriptis.model.config import ParserConfig 12 | from inscriptis.model.html_element import HtmlElement 13 | 14 | 15 | def test_html_element_refinement(): 16 | new = HtmlElement( 17 | "span", 18 | display=Display.inline, 19 | prefix=" ", 20 | suffix=" ", 21 | limit_whitespace_affixes=True, 22 | ) 23 | pre = HtmlElement("pre", display=Display.block, whitespace=WhiteSpace.pre) 24 | code = HtmlElement("code") 25 | 26 | # refinement with pre and whitespaces 27 | refined = pre.get_refined_html_element(copy(new)) 28 | assert refined.prefix == "" 29 | assert refined.suffix == "" 30 | 31 | # refinement with code and whitespaces 32 | refined = code.get_refined_html_element(copy(new)) 33 | assert refined.prefix == " " 34 | assert refined.suffix == " " 35 | 36 | # refinement with pre and non-whitespaces 37 | new.prefix = " 1. " 38 | new.suffix = "<" 39 | refined = pre.get_refined_html_element(copy(new)) 40 | assert refined.prefix == " 1. " 41 | assert refined.suffix == "<" 42 | 43 | # refinement with code and non-whitespaces 44 | refined = code.get_refined_html_element(copy(new)) 45 | assert refined.prefix == " 1. " 46 | assert refined.suffix == "<" 47 | 48 | 49 | def test_limit_whitespace_affixes(): 50 | html = """ 51 | 52 | halloecho 53 |

              54 | def hallo():
              55 |    print("echo")
              56 |                    
              57 | 58 | 59 | """ 60 | config = ParserConfig(css=RELAXED_CSS_PROFILE) 61 | assert ( 62 | get_text(html, config).strip() == "hallo echo\n\n" 63 | "def hallo():\n" 64 | ' print("echo")' 65 | ) 66 | -------------------------------------------------------------------------------- /tests/test_list_div.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | 4 | """ ensures that two successive text contain 5 | a space between each other, if there is a linebreak 6 | or space between the tags. 7 | """ 8 | 9 | from inscriptis import get_text 10 | from inscriptis.css_profiles import CSS_PROFILES 11 | from inscriptis.model.config import ParserConfig 12 | 13 | config = ParserConfig(css=CSS_PROFILES["strict"]) 14 | 15 | 16 | def test_divs(): 17 | html = "Thomas
              Anton
              Maria" 18 | assert get_text(html, config) == "Thomas\nAnton\nMaria" 19 | 20 | html = "Thomas
              Anna läuft weit weg.
              " 21 | assert get_text(html, config) == "Thomas\nAnna läuft weit weg." 22 | 23 | html = "Thomas
              • Anton
                Maria
              " 24 | assert get_text(html, config) == "Thomas\n * Anton\n Maria" 25 | 26 | html = "Thomas
              • Anton
                Maria
              " 27 | assert get_text(html, config) == "Thomas\n * Anton\n Maria" 28 | 29 | html = "Thomas
              • a
                Anton
                Maria
              " 30 | assert get_text(html, config) == "Thomas\n * a\n Anton\n Maria" 31 | -------------------------------------------------------------------------------- /tests/test_margin_before_at_start.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | """ ensures that two successive text contain 4 | a space between each other, if there is a linebreak 5 | or space between the tags. 6 | """ 7 | 8 | from inscriptis import get_text 9 | 10 | 11 | def test_content(): 12 | html = "first" 13 | assert get_text(html) == "first" 14 | 15 | 16 | def test_margin_before(): 17 | html = "

              first

              " 18 | assert get_text(html) == "first\n" 19 | 20 | html = "first

              " "second

              " 21 | assert get_text(html) == "first\n\nsecond\n" 22 | 23 | 24 | def test_br(): 25 | html = "
              " "first

              " 26 | assert get_text(html) == "\nfirst" 27 | -------------------------------------------------------------------------------- /tests/test_margin_handling.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | 4 | """ 5 | Tests different white-space handling. 6 | """ 7 | 8 | from inscriptis import get_text 9 | from inscriptis.css_profiles import CSS_PROFILES 10 | from inscriptis.model.config import ParserConfig 11 | 12 | config = ParserConfig(css=CSS_PROFILES["strict"]) 13 | 14 | 15 | def test_margin_handling(): 16 | html = """Hallo 17 |
              Echo 18 |
              Mecho
              19 |
              20 | sei Gott 21 | """ 22 | assert get_text(html, config) == "Hallo\n\nEcho\n\n\nMecho\n\nsei Gott" 23 | 24 | html = """Hallo 25 |
              Echo
              26 |
              Mecho
              27 | sei Gott 28 | """ 29 | assert get_text(html, config) == "Hallo\n\nEcho\n\n\nMecho\nsei Gott" 30 | 31 | html = """Hallo 32 |
              33 |
              Ehre
              34 |
              35 | sei Gott 36 | """ 37 | assert get_text(html, config) == "Hallo\n\n\nEhre\n\nsei Gott" 38 | -------------------------------------------------------------------------------- /tests/test_metadata.py: -------------------------------------------------------------------------------- 1 | from inscriptis.metadata import ( 2 | __author__, 3 | __author_email__, 4 | __copyright__, 5 | __license__, 6 | __version__, 7 | ) 8 | 9 | 10 | def test_metadata(): 11 | """Test inscriptis package metadata.""" 12 | assert "Albert Weichselbraun" in __author__ 13 | assert "Fabian Odoni" in __author__ 14 | 15 | assert "Albert Weichselbraun" in __copyright__ 16 | assert "Fabian Odoni" in __copyright__ 17 | 18 | assert "@" in __author_email__ 19 | assert __license__ == "Apache-2.0" 20 | assert __version__[0].isnumeric() 21 | assert "." in __version__ 22 | -------------------------------------------------------------------------------- /tests/test_model_html_element_canvas.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | 4 | """ 5 | Tests the rendering of a single table line. 6 | """ 7 | 8 | from inscriptis.model.canvas import Canvas 9 | from inscriptis.model.html_element import HtmlElement 10 | from inscriptis.html_properties import Display 11 | 12 | 13 | def _get_text(html_element): 14 | """ 15 | Returns 16 | the text formatted based on the current HTML element. 
17 | """ 18 | c = Canvas() 19 | html_element.canvas = c 20 | 21 | HtmlElement().set_canvas(c).write("first") 22 | 23 | c.open_tag(html_element) 24 | html_element.write("Ehre sei Gott!") 25 | c.close_tag(html_element) 26 | 27 | HtmlElement().set_canvas(c).write("last") 28 | c.flush_inline() 29 | return "\n".join(c.blocks) 30 | 31 | 32 | def test_formatting(): 33 | # standard line 34 | 35 | h = HtmlElement() 36 | assert _get_text(h) == "firstEhre sei Gott!last" 37 | 38 | h.display = Display.block 39 | h.margin_before = 1 40 | h.margin_after = 2 41 | print(h) 42 | print(_get_text(h)) 43 | assert _get_text(h) == "first\n\nEhre sei Gott!\n\n\nlast" 44 | 45 | # list bullet without padding_inline 46 | h.list_bullet = "* " 47 | assert _get_text(h) == "first\n\n* Ehre sei Gott!\n\n\nlast" 48 | 49 | # add a padding_inline 50 | h.padding_inline = 3 51 | assert _get_text(h) == "first\n\n * Ehre sei Gott!\n\n\nlast" 52 | 53 | # and prefixes + suffixes 54 | h.prefix = ">>" 55 | h.suffix = "<<" 56 | assert _get_text(h) == "first\n\n * >>Ehre sei Gott!<<\n\n\nlast" 57 | -------------------------------------------------------------------------------- /tests/test_model_prefix.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | 4 | """ 5 | Tests the rendering of a single table line. 6 | """ 7 | 8 | from inscriptis.model.canvas import Prefix 9 | 10 | 11 | def test_simple_prefix(): 12 | p = Prefix() 13 | 14 | p.register_prefix(5, "1. ") 15 | 16 | # first use 17 | assert p.first == " 1. " 18 | 19 | # the prefix has been consumed 20 | assert p.first == "" 21 | 22 | # prefix used to indent lines separated with newlines 23 | assert p.rest == " " 24 | 25 | 26 | def test_combined_prefix(): 27 | p = Prefix() 28 | 29 | p.register_prefix(5, "1. ") 30 | p.register_prefix(2, "") 31 | 32 | assert p.first == " 1. 
" 33 | assert p.first == "" 34 | 35 | p.remove_last_prefix() 36 | assert p.first == "" 37 | 38 | p.remove_last_prefix() 39 | # final consumption - no prefix 40 | assert p.first == "" 41 | 42 | # ensure that there are no interactions between different runs with 43 | # bullets 44 | p.consumed = False 45 | p.register_prefix(5, "2. ") 46 | p.register_prefix(2, "- ") 47 | 48 | assert p.first == " - " 49 | assert p.first == "" 50 | assert p.rest == " " 51 | 52 | p.consumed = False 53 | p.remove_last_prefix() 54 | assert p.first == " 2. " 55 | assert p.rest == " " 56 | -------------------------------------------------------------------------------- /tests/test_parse_css.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | 4 | """ 5 | Tests HtmlElement and the parsing of CSS style definitiosn 6 | """ 7 | 8 | from copy import copy 9 | from inscriptis.css_profiles import CSS_PROFILES 10 | from inscriptis.html_properties import ( 11 | Display, 12 | WhiteSpace, 13 | VerticalAlignment, 14 | HorizontalAlignment, 15 | ) 16 | from inscriptis.model.css import CssParse 17 | from inscriptis.model.html_element import HtmlElement 18 | 19 | 20 | def test_css_parsing(): 21 | html_element = copy(CSS_PROFILES["strict"]["div"]) 22 | CssParse.attr_style("padding_left: 8px; display: block", html_element) 23 | assert html_element.padding_inline == 1 24 | assert html_element.display == Display.block 25 | 26 | CssParse.attr_style("margin_before: 8em; display: inline", html_element) 27 | assert html_element.margin_before == 8 28 | assert html_element.display == Display.inline 29 | 30 | 31 | def test_html_element_str(): 32 | """ 33 | Tests the string representation of an HtmlElement. 34 | """ 35 | html_element = HtmlElement( 36 | "div", "", "", Display.inline, 0, 0, 0, "", WhiteSpace.pre 37 | ) 38 | assert str(html_element) == ( 39 | "
              " 47 | ) 48 | 49 | 50 | def test_parse_vertical_align(): 51 | html_element = HtmlElement() 52 | CssParse.attr_vertical_align("top", html_element) 53 | assert html_element.valign == VerticalAlignment.top 54 | 55 | # invalid value 56 | CssParse.attr_vertical_align("unknown", html_element) 57 | assert html_element.valign == VerticalAlignment.top 58 | 59 | 60 | def test_parse_horizontal_align(): 61 | html_element = HtmlElement() 62 | CssParse.attr_horizontal_align("center", html_element) 63 | assert html_element.align == HorizontalAlignment.center 64 | 65 | # invalid value 66 | CssParse.attr_horizontal_align("unknown", html_element) 67 | assert html_element.align == HorizontalAlignment.center 68 | -------------------------------------------------------------------------------- /tests/test_strip_xml_header.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | """ ensures that xml declaration headers are correctly stripped""" 4 | 5 | from inscriptis import get_text 6 | 7 | 8 | def test_successive_a(): 9 | html = ' Hallo?>' 10 | assert get_text(html).strip() == "Hallo?>" 11 | -------------------------------------------------------------------------------- /tests/test_style_parsing.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | """ 4 | Tests inscriptis' parsing of CSS style definitions. 
5 | """ 6 | 7 | from inscriptis.model.css import CssParse 8 | from inscriptis.model.html_element import HtmlElement 9 | 10 | 11 | def test_style_unit_parsing(): 12 | html_element = HtmlElement() 13 | CssParse.attr_style( 14 | "margin-top:2.666666667em;margin-bottom: 2.666666667em", html_element 15 | ) 16 | assert html_element.margin_before == 3 17 | assert html_element.margin_after == 3 18 | -------------------------------------------------------------------------------- /tests/test_table_cell.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | 4 | """ 5 | Tests the Table formatting with different parameters such as width and 6 | alignment 7 | """ 8 | 9 | from inscriptis.model.table import TableCell 10 | from inscriptis.html_properties import HorizontalAlignment, VerticalAlignment 11 | 12 | 13 | def test_height(): 14 | cell = TableCell(HorizontalAlignment.left, VerticalAlignment.top) 15 | 16 | cell.blocks = ["hallo"] 17 | cell.normalize_blocks() 18 | assert cell.height == len("\n".join(cell.blocks).split("\n")) 19 | 20 | cell.blocks = ["hallo", "echo"] 21 | cell.normalize_blocks() 22 | assert cell.height == 2 23 | 24 | cell.blocks = ["hallo\necho"] 25 | cell.normalize_blocks() 26 | assert cell.height == 2 27 | 28 | cell.blocks = ["hallo\necho", "Ehre sei Gott", "Jump\n&\nRun!\n\n\n"] 29 | cell.normalize_blocks() 30 | assert cell.height == 9 31 | assert cell.height == len("\n".join(cell.blocks).split("\n")) 32 | 33 | 34 | def test_width(): 35 | cell = TableCell(HorizontalAlignment.left, VerticalAlignment.top) 36 | 37 | cell.blocks = ["hallo"] 38 | cell.normalize_blocks() 39 | assert cell.width == len(cell.blocks[0]) 40 | 41 | cell.blocks = ["hallo\necho", "Ehre sei Gott", "Jump\n&\nRun!\n\n\n"] 42 | cell.normalize_blocks() 43 | assert cell.width == len("Ehre sei Gott") 44 | 45 | # fixed set width 46 | cell.width = 95 47 | cell.normalize_blocks() 48 | assert cell.width == 95 49 | 
-------------------------------------------------------------------------------- /tests/test_table_cell_formatting.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | 4 | """ 5 | Tests the Table formatting with different parameters such as width and 6 | alignment 7 | """ 8 | 9 | from inscriptis.model.table import TableCell 10 | from inscriptis.html_properties import HorizontalAlignment, VerticalAlignment 11 | 12 | 13 | def test_horizontal_cell_formatting(): 14 | cell = TableCell(align=HorizontalAlignment.left, valign=VerticalAlignment.top) 15 | # left alignment 16 | cell.blocks = ["Ehre sei Gott!"] 17 | cell.width = 16 18 | assert cell.blocks == ["Ehre sei Gott! "] 19 | 20 | # right alignment 21 | cell.align = HorizontalAlignment.right 22 | cell.blocks = ["Ehre sei Gott!"] 23 | cell.width = 16 24 | assert cell.blocks == [" Ehre sei Gott!"] 25 | 26 | 27 | def test_vertical_cell_formatting(): 28 | cell = TableCell(align=HorizontalAlignment.left, valign=VerticalAlignment.top) 29 | 30 | # default top alignment 31 | cell.blocks = ["Ehre sei Gott!"] 32 | cell.width = 16 33 | cell.height = 4 34 | assert cell.blocks == ["Ehre sei Gott! ", "", "", ""] 35 | 36 | # bottom alignment 37 | cell.blocks = ["Ehre sei Gott!"] 38 | cell.valign = VerticalAlignment.bottom 39 | cell.width = 16 40 | cell.height = 4 41 | assert cell.blocks == ["", "", "", "Ehre sei Gott! "] 42 | 43 | # middle alignment 44 | cell.blocks = ["Ehre sei Gott!"] 45 | cell.valign = VerticalAlignment.middle 46 | cell.width = 16 47 | cell.height = 4 48 | assert cell.blocks == ["", "Ehre sei Gott! 
", "", ""] 49 | -------------------------------------------------------------------------------- /tests/test_table_row.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | 4 | """ 5 | Test borderline cases for table rows 6 | """ 7 | 8 | from inscriptis import get_text 9 | from inscriptis.model.config import ParserConfig 10 | from inscriptis.model.table import TableRow 11 | 12 | 13 | def test_empty_row(): 14 | tr = TableRow(cell_separator=" ") 15 | 16 | assert tr.width == 0 17 | assert tr.get_text() == "" 18 | 19 | 20 | def test_table_cell_separator(): 21 | html = "
              Hallo
              Eins
              Echo
              Zwei
              " 22 | 23 | config = ParserConfig() 24 | assert get_text(html, config) == "Hallo Echo\nEins Zwei\n" 25 | 26 | config = ParserConfig(table_cell_separator="\t") 27 | assert get_text(html, config) == "Hallo\tEcho\nEins \tZwei\n" 28 | -------------------------------------------------------------------------------- /tests/test_web_service.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from fastapi.testclient import TestClient 3 | from inscriptis.service.web import app 4 | from inscriptis.metadata import __version__ 5 | 6 | 7 | @pytest.fixture 8 | def client(): 9 | return TestClient(app) 10 | 11 | 12 | def test_index(client): 13 | response = client.get("/") 14 | assert response.status_code == 200 15 | assert response.text == "Inscriptis text to HTML Web service." 16 | 17 | 18 | def test_get_text_call_with_content_type(client): 19 | html_content = "Österliche Freuden!" 20 | response = client.post( 21 | "/get_text", 22 | content=html_content, 23 | headers={"Content-type": "text/html; charset=UTF-8"}, 24 | ) 25 | assert response.status_code == 200 26 | assert response.text == "Österliche Freuden!" 27 | 28 | 29 | def test_get_text_call_without_content_type(client): 30 | html_content = "Hello World!" 31 | response = client.post( 32 | "/get_text", 33 | content=html_content, 34 | headers={"Content-type": "text/html"}, 35 | ) 36 | assert response.status_code == 200 37 | assert response.text == "Hello World!" 38 | 39 | 40 | def test_get_version_call(client): 41 | response = client.get("/version") 42 | assert response.status_code == 200 43 | assert response.text == __version__ 44 | -------------------------------------------------------------------------------- /tests/test_white_space_handling.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | 4 | """ 5 | Tests different white-space handling. 
6 | """ 7 | 8 | from inscriptis import get_text 9 | from inscriptis.css_profiles import CSS_PROFILES 10 | from inscriptis.model.config import ParserConfig 11 | 12 | config = ParserConfig(css=CSS_PROFILES["strict"]) 13 | 14 | 15 | def test_white_space(): 16 | html = '12\n3' "" 17 | assert get_text(html, config) == "12 3" 18 | 19 | html = '12\n3' "" 20 | assert get_text(html, config) == "12 3" 21 | 22 | html = '12\n3' "" 23 | assert get_text(html, config) == "12\n3" 24 | 25 | html = '12\n3' "" 26 | assert get_text(html, config) == "12\n3" 27 | 28 | html = '12\n3' "" 29 | assert get_text(html, config) == "12\n3" 30 | 31 | 32 | def test_borderline_cases(): 33 | """ 34 | testing of borderline cases based on the behavior found in Firefox and 35 | Google Chrome. 36 | """ 37 | # change of whitespace handling between terms; no whitespace 38 | # between the terms 39 | html = 'Halloecho versus' 40 | assert get_text(html, config) == "Halloecho versus" 41 | 42 | # change of whitespace handling between terms; one whitespace 43 | # between the terms; option 1 44 | html = 'Hallo echo versus' 45 | assert get_text(html, config) == "Hallo echo versus" 46 | 47 | # change of whitespace handling between terms; one whitespace 48 | # between the terms; option 2 49 | html = 'Hallo echo versus' 50 | assert get_text(html, config) == "Hallo echo versus" 51 | 52 | # change of whitespace handling between terms; two whitespaces 53 | # between the terms 54 | html = 'Hallo echo versus' 55 | assert get_text(html, config) == "Hallo echo versus" 56 | 57 | # change of whitespace handling between terms; multiple whitespaces 58 | # between the terms 59 | html = 'Hallo echo versus' 60 | assert get_text(html, config) == "Hallo echo versus" 61 | 62 | # change of whitespace handling between terms; multiple whitespaces 63 | # between the terms 64 | html = 'Hallo echo versus' 65 | assert get_text(html, config) == "Hallo echo versus" 66 | 67 | 68 | def test_tail(): 69 | """ 70 | ensure that the tail elements 
are formatted based on the container element. 71 | """ 72 | html = 'Hi 1 3 ' " versus 1 3" 73 | assert get_text(html, config) == "Hi 1 3 versus 1 3" 74 | -------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | [tox] 2 | envlist = pytest, pyroma, flake8 3 | 4 | # standard unit tests 5 | [testenv:pytest] 6 | deps = pytest ~= 7.4.4 7 | pytest-cov ~= 4.1.0 8 | fastapi ~= 0.109.2 9 | httpx ~= 0.26.0 10 | commands = pytest --cov-config=.coveragerc --cov=inscriptis ./tests 11 | 12 | # python packaging best practices 13 | [testenv:pyroma] 14 | deps = pyroma 15 | commands = pyroma . 16 | 17 | [testenv:flake8] 18 | deps = flake8 ~= 7.0.0 19 | dlint ~= 0.14.1 20 | flake8-bandit ~= 4.1.1 21 | flake8-blind-except ~= 0.2.1 22 | flake8-bugbear ~= 24.2.6 23 | flake8-builtins ~= 2.2.0 24 | flake8-cognitive-complexity ~= 0.1.0 25 | flake8-colors ~= 0.1.9 26 | flake8-comprehensions ~= 3.14.0 27 | flake8-docstrings ~= 1.7.0 28 | flake8-eradicate ~= 1.5.0 29 | flake8-encodings ~= 0.5.1 30 | flake8-expression-complexity ~= 0.0.11 31 | flake8-logging-format ~= 0.9.0 32 | flake8-mutable ~= 1.2.0 33 | flake8-pie ~= 0.16.0 34 | flake8-pytest ~= 1.4 35 | flake8-raise ~= 0.0.5 36 | flake8-simplify ~= 0.21.0 37 | flake8-string-format ~= 0.3.0 38 | flake8-tuple ~= 0.4.1 39 | flake8-use-pathlib ~= 0.3.0 40 | flake8-warnings ~= 0.4.1 41 | pep8-naming ~= 0.13.3 42 | 43 | # S410 - do not clean up XML data prior to processing 44 | # S104 - bind to all IPs is okay in the case of the Web service, since it is 45 | # aimed for use with docker. 
46 | # W503 - replaced with W504 47 | # D102 - missing docstring in public method 48 | # D105 - missing docstring in magic method (e.g., __str__) 49 | # D107 - missing docstring in __init__ 50 | # E203, E704 - conflict with black's formatting (NOTE(review): the --ignore list below names E708, not E704 - confirm) 51 | commands = flake8 --exclude=".tox, setup.py, tests, venv, docs, benchmarking, build" \ 52 | --show-source \ 53 | --max-line-length=88 \ 54 | --ignore="DUO107, W503, D107, D105, D102, S104, S410, E203, E708" \ 55 | --max-cognitive-complexity=13 56 | 57 | # --ignore="S104, S410, W503, D107, D105, D102" \ 58 | # --enable-extensions=G \ 59 | # --max-cognitive-complexity=13 60 | --------------------------------------------------------------------------------