├── .coveragerc
├── .git-blame-ignore-revs
├── .github
    ├── dependabot.yml
    └── workflows
    │   ├── codeql-analysis.yml
    │   ├── create-container.yml
    │   ├── helm-release.yaml
    │   └── python-package.yml
├── .gitignore
├── .readthedocs.yaml
├── AUTHORS
├── CONTRIBUTING.md
├── Dockerfile
├── LICENSE
├── README.rst
├── RENDERING.md
├── TODO.txt
├── benchmarking
    ├── a
    ├── b
    ├── run_benchmarking.py
    ├── speed_comparisons.txt
    └── url_list.txt
├── docker-compose.yml
├── docs
    ├── Makefile
    ├── README.rst
    ├── benchmarking.rst
    ├── conf.py
    ├── contributing.md
    ├── images
    │   ├── stackoverflow-code-annotation.png
    │   ├── wikipedia-chur-entry-annotation.png
    │   ├── wikipedia-chur-table-annotation.png
    │   └── xda-posts-annotation.png
    ├── index.rst
    ├── inscriptis-module-documentation.rst
    ├── paper
    │   ├── Makefile
    │   ├── images
    │   │   ├── annotations.png
    │   │   ├── inscriptis-vs-lynx.png
    │   │   ├── inscriptis-vs-lynx.xcf
    │   │   └── raw
    │   │   │   ├── inscriptis.png
    │   │   │   └── lynx.png
    │   ├── paper.bib
    │   └── paper.md
    └── requirements.txt
├── examples
    ├── annotation
    │   ├── annotation-profile.json
    │   ├── stackoverflow.json
    │   ├── table-annotation-profile.json
    │   ├── unittest.json
    │   ├── wikipedia-entities-and-citations.json
    │   ├── wikipedia.json
    │   └── xda-developers.json
    └── custom-html-handling.py
├── img
    ├── nested-table-firefox.png
    ├── wikipedia-chur-firefox.png
    └── wikipedia-python-example.png
├── publish.sh
├── pyproject.toml
├── src
    └── inscriptis
    │   ├── __init__.py
    │   ├── annotation
    │       ├── __init__.py
    │       ├── output
    │       │   ├── __init__.py
    │       │   ├── html.py
    │       │   ├── surface.py
    │       │   └── xml.py
    │       └── parser.py
    │   ├── cli
    │       ├── __init__.py
    │       └── inscript.py
    │   ├── css_profiles.py
    │   ├── html_engine.py
    │   ├── html_properties.py
    │   ├── metadata.py
    │   ├── model
    │       ├── __init__.py
    │       ├── attribute.py
    │       ├── canvas
    │       │   ├── __init__.py
    │       │   ├── block.py
    │       │   └── prefix.py
    │       ├── config.py
    │       ├── css.py
    │       ├── html_document_state.py
    │       ├── html_element.py
    │       ├── table.py
    │       └── tag
    │       │   ├── __init__.py
    │       │   ├── a_tag.py
    │       │   ├── br_tag.py
    │       │   ├── img_tag.py
    │       │   ├── list_tag.py
    │       │   └── table_tag.py
    │   └── service
    │       ├── __init__.py
    │       └── web.py
├── tests
    ├── __init__.py
    ├── data
    │   └── annotation-profile-unittest.json
    ├── html
    │   ├── advanced-prefix-test.html
    │   ├── advanced-prefix-test.txt
    │   ├── br-in-table.html
    │   ├── br-in-table.txt
    │   ├── br-in-table2.html
    │   ├── br-li.html
    │   ├── br-li.txt
    │   ├── br.html
    │   ├── br.txt
    │   ├── direct-enumeration.html
    │   ├── direct-enumeration.txt
    │   ├── empty-table.html
    │   ├── empty-table.txt
    │   ├── enumerations.html
    │   ├── enumerations.txt
    │   ├── html-comment-ofuscation.html
    │   ├── html-comment-ofuscation.txt
    │   ├── invalid-table.html
    │   ├── invalid-table.txt
    │   ├── invalid-table2.html
    │   ├── invalid-table2.txt
    │   ├── invalid-table3.html
    │   ├── invalid-table3.txt
    │   ├── invisible.html
    │   ├── invisible.txt
    │   ├── invisible2.html
    │   ├── invisible2.txt
    │   ├── invisible3.html
    │   ├── invisible3.txt
    │   ├── nested-list.html
    │   ├── nested-list.txt
    │   ├── nested-table-alignment-css.html
    │   ├── nested-table-alignment-css.txt
    │   ├── nested-table-alignment.html
    │   ├── nested-table-alignment.txt
    │   ├── nested-table.html
    │   ├── nested-table.txt
    │   ├── p-br.html
    │   ├── p-br.txt
    │   ├── pre.html
    │   ├── pre.txt
    │   ├── real-world
    │   │   ├── avantec-team.html
    │   │   ├── naturgruen-team.html
    │   │   └── rswag-mitarbeiter.html
    │   ├── stackoverflow-list-snippet.html
    │   ├── stackoverflow-list-snippet.txt
    │   ├── subsequent-headings.html
    │   ├── subsequent-headings.json
    │   ├── subsequent-headings.txt
    │   ├── table-alignment.html
    │   ├── table-alignment.txt
    │   ├── table-empty-row.html
    │   ├── table-empty-row.txt
    │   ├── table-in-table.html
    │   ├── table-in-table.json
    │   ├── table-in-table.txt
    │   ├── table-itemize.html
    │   ├── table-itemize.txt
    │   ├── table-pre.html
    │   ├── table-pre.txt
    │   ├── table.html
    │   ├── table.json
    │   ├── table.txt
    │   ├── td-only-table.html
    │   ├── td-only-table.txt
    │   ├── test.html
    │   ├── tr-only-table.html
    │   ├── tr-only-table.txt
    │   ├── whitespace.html
    │   ├── whitespace.txt
    │   ├── wikipedia-code.html
    │   ├── wikipedia-code.txt
    │   ├── wikipedia-consequtive-links-and-umlauts.html
    │   ├── wikipedia-consequtive-links-and-umlauts.txt
    │   ├── wikipedia-consequtive-tables.html
    │   ├── wikipedia-consequtive-tables.json
    │   ├── wikipedia-enumeration-annotation.html
    │   ├── wikipedia-enumeration-annotation.json
    │   ├── wikipedia-enumeration-annotation.txt
    │   ├── wikipedia-enumeration.html
    │   ├── wikipedia-enumeration.txt
    │   ├── wikipedia-equation.html
    │   ├── wikipedia-equation.txt
    │   ├── wikipedia-table-bordercase-verticial-alignmnet.html
    │   ├── wikipedia-table-bordercase-verticial-alignmnet.json
    │   ├── wikipedia-table-bordercase1.html
    │   ├── wikipedia-table-bordercase1.json
    │   ├── wikipedia-table.html
    │   ├── wikipedia-table.json
    │   └── wikipedia-table.txt
    ├── test_annotation.py
    ├── test_annotation_engine.py
    ├── test_annotation_output_processor.py
    ├── test_annotation_output_xml.py
    ├── test_annotation_rule_parsing.py
    ├── test_block.py
    ├── test_broken_table_handling.py
    ├── test_cli.py
    ├── test_custom_html_tag_handling.py
    ├── test_double_a.py
    ├── test_empty_string.py
    ├── test_engine.py
    ├── test_html_conversion_options.py
    ├── test_html_snippets.py
    ├── test_html_snippets_annotations.py
    ├── test_invalid_float_specification.py
    ├── test_limit_whitespace_affixes.py
    ├── test_list_div.py
    ├── test_margin_before_at_start.py
    ├── test_margin_handling.py
    ├── test_metadata.py
    ├── test_model_html_element_canvas.py
    ├── test_model_prefix.py
    ├── test_parse_css.py
    ├── test_strip_xml_header.py
    ├── test_style_parsing.py
    ├── test_table_cell.py
    ├── test_table_cell_formatting.py
    ├── test_table_row.py
    ├── test_web_service.py
    └── test_white_space_handling.py
└── tox.ini


/.coveragerc:
--------------------------------------------------------------------------------
1 | [run]
2 | omit = tests/
3 | 


--------------------------------------------------------------------------------
/.git-blame-ignore-revs:
--------------------------------------------------------------------------------
1 | 55fa29ca39f9ed5895f9e88b2eb0f17e4d84245f
2 | 


--------------------------------------------------------------------------------
/.github/dependabot.yml:
--------------------------------------------------------------------------------
 1 | version: 2
 2 | 
 3 | updates:
 4 | 
 5 |   # Enable version updates for github actions.
 6 |   - package-ecosystem: "github-actions"
 7 |     directory: "/"
 8 |     schedule:
 9 |       # Check for updates to GitHub Actions once a week
10 |       interval: "weekly"
11 | 
12 |   # Enable version updates for Docker.
13 |   - package-ecosystem: "docker"
14 |     # Look for a `Dockerfile` in the `root` directory
15 |     directory: "/"
16 |     # Check for updates once a week
17 |     schedule:
18 |       interval: "weekly"
19 | 


--------------------------------------------------------------------------------
/.github/workflows/codeql-analysis.yml:
--------------------------------------------------------------------------------
 1 | # For most projects, this workflow file will not need changing; you simply need
 2 | # to commit it to your repository.
 3 | #
 4 | # You may wish to alter this file to override the set of languages analyzed,
 5 | # or to provide custom queries or build logic.
 6 | #
 7 | # ******** NOTE ********
 8 | # We have attempted to detect the languages in your repository. Please check
 9 | # the `language` matrix defined below to confirm you have the correct set of
10 | # supported CodeQL languages.
11 | #
12 | name: "CodeQL"
13 | 
14 | on:
15 |   push:
16 |   pull_request:
17 |   schedule:
18 |     - cron: '26 5 * * 2'
19 | 
20 | jobs:
21 |   analyze:
22 |     name: Analyze
23 |     runs-on: ubuntu-latest
24 |     permissions:
25 |       actions: read
26 |       contents: read
27 |       security-events: write
28 | 
29 |     strategy:
30 |       fail-fast: false
31 |       matrix:
32 |         language: [ 'python' ]
33 |         # CodeQL supports [ 'cpp', 'csharp', 'go', 'java', 'javascript', 'python' ]
34 |         # Learn more:
35 |         # https://docs.github.com/en/free-pro-team@latest/github/finding-security-vulnerabilities-and-errors-in-your-code/configuring-code-scanning#changing-the-languages-that-are-analyzed
36 | 
37 |     steps:
38 |     - name: Checkout repository
39 |       uses: actions/checkout@v3
40 | 
41 |     # Initializes the CodeQL tools for scanning.
42 |     - name: Initialize CodeQL
43 |       uses: github/codeql-action/init@v2
44 |       with:
45 |         languages: ${{ matrix.language }}
46 |         # If you wish to specify custom queries, you can do so here or in a config file.
47 |         # By default, queries listed here will override any specified in a config file.
48 |         # Prefix the list here with "+" to use these queries and those in the config file.
49 |         # queries: ./path/to/local/query, your-org/your-repo/queries@main
50 | 
51 |     # Autobuild attempts to build any compiled languages  (C/C++, C#, or Java).
52 |     # If this step fails, then you should remove it and run the build manually (see below)
53 |     - name: Autobuild
54 |       uses: github/codeql-action/autobuild@v2
55 | 
56 |     # ℹ️ Command-line programs to run using the OS shell.
57 |     # 📚 https://git.io/JvXDl
58 | 
59 |     # ✏️ If the Autobuild fails above, remove it and uncomment the following three lines
60 |     #    and modify them (or add more) to build your code if your project
61 |     #    uses a compiled language
62 | 
63 |     #- run: |
64 |     #   make bootstrap
65 |     #   make release
66 | 
67 |     - name: Perform CodeQL Analysis
68 |       uses: github/codeql-action/analyze@v2
69 | 


--------------------------------------------------------------------------------
/.github/workflows/create-container.yml:
--------------------------------------------------------------------------------
 1 | # Builds the container image and pushes it to ghcr.io whenever a tag is pushed.
 2 | name: container
 3 | 
 4 | on:
 5 |   push:
 6 |     tags:
 7 |       - '*'
 8 | 
 9 | jobs:
10 |   build:
11 |     runs-on: ubuntu-latest
12 |     steps:
13 |     - name: Checkout code
14 |       uses: actions/checkout@v3
15 | 
16 |     # Derive the version from the pushed tag (refs/tags/<tag> -> <tag>) and
17 |     # expose it as the step output APP_VERSION. Writing to $GITHUB_OUTPUT
18 |     # replaces the deprecated `::set-output` workflow command.
19 |     - name: get version
20 |       id: version
21 |       run: echo "APP_VERSION=${GITHUB_REF/refs\/tags\//}" >> "$GITHUB_OUTPUT"
22 | 
23 |     - name: init docker build
24 |       uses: docker/setup-buildx-action@v2
25 | 
26 |     - name: login docker
27 |       uses: docker/login-action@v2
28 |       with:
29 |         registry: ghcr.io
30 |         username: ${{ github.actor }}
31 |         password: ${{ secrets.GITHUB_TOKEN }}
32 | 
33 |     # Push the image under the tag-derived version and as `latest`.
34 |     - name: publish container
35 |       uses: docker/build-push-action@v4
36 |       with:
37 |         push: true
38 |         tags: |
39 |           ghcr.io/weblyzard/inscriptis:v${{ steps.version.outputs.APP_VERSION }}
40 |           ghcr.io/weblyzard/inscriptis:latest
41 | 


--------------------------------------------------------------------------------
/.github/workflows/helm-release.yaml:
--------------------------------------------------------------------------------
 1 | name: helm release
 2 | 
 3 | on:
 4 |   push:
 5 |     branches:
 6 |       - PhilippKuntschik-patch-2
 7 |     tags:
 8 |       - '*'
 9 | 
10 | jobs:
11 |   dispatch_helm_release:
12 |     runs-on: ubuntu-latest
13 |     steps:
14 |       - name: dispatch inscriptis-helm
15 |         uses: peter-evans/repository-dispatch@v2
16 |         with:
17 |           token: ${{ secrets.HELMREPO_ACCESS_TOKEN }}
18 |           repository: weblyzard/inscriptis-helm
19 |           event-type: tag-released
20 |           client-payload: '{"ref": "${{ github.ref_name }}"}'
21 | 


--------------------------------------------------------------------------------
/.github/workflows/python-package.yml:
--------------------------------------------------------------------------------
 1 | name: build
 2 | 
 3 | on:
 4 |   push:
 5 |   pull_request:
 6 | 
 7 | jobs:
 8 |   build:
 9 | 
10 |     runs-on: ubuntu-24.04
11 |     strategy:
12 |       fail-fast: false
13 |       matrix:
14 |         python-version: [ '3.9', '3.10', '3.11', '3.12', '3.13' ]
15 | 
16 |     steps:
17 |     - uses: actions/checkout@v3
18 |     - name: Set up Python ${{ matrix.python-version }}
19 |       uses: actions/setup-python@v4
20 |       with:
21 |         python-version: ${{ matrix.python-version }}
22 |     - name: Install build environment
23 |       run: |
24 |         python -m pip install --upgrade pip
25 |         python -m pip install tox setuptools pytest pytest-cov codecov
26 |     - name: Build and test with tox.
27 |       run: |
28 |         tox
29 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | *.pyc
 2 | *.pyx
 3 | .*.swp
 4 | *.egg-info
 5 | __pycache__/
 6 | benchmarking_results/
 7 | html_cache/
 8 | .tox
 9 | build/
10 | dist/
11 | .cache/
12 | .project
13 | .pydevproject
14 | .settings/
15 | .pytest_cache/
16 | .coverage
17 | _build/
18 | .mypy_cache/
19 | .idea/
20 | venv/
21 | tests/converted.txt
22 | tests/reference.txt
23 | *.c
24 | docs/paper/*.pdf
25 | htmlcov/
26 | poetry.lock
27 | 


--------------------------------------------------------------------------------
/.readthedocs.yaml:
--------------------------------------------------------------------------------
 1 | # Read the Docs configuration file for Sphinx projects
 2 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details
 3 | 
 4 | # Required
 5 | version: 2
 6 | 
 7 | # Set the OS, Python version and other tools you might need
 8 | build:
 9 |   os: ubuntu-22.04
10 |   tools:
11 |     python: "3.12"
12 |     # You can also specify other tool versions:
13 |     # nodejs: "20"
14 |     # rust: "1.70"
15 |     # golang: "1.20"
16 | 
17 | # Build documentation in the "docs/" directory with Sphinx
18 | sphinx:
19 |   configuration: docs/conf.py
20 |   # You can configure Sphinx to use a different builder, for instance use the dirhtml builder for simpler URLs
21 |   # builder: "dirhtml"
22 |   # Fail on all warnings to avoid broken references
23 |   # fail_on_warning: true
24 | 
25 | # Optionally build your docs in additional formats such as PDF and ePub
26 | formats:
27 |   - pdf
28 | #   - epub
29 | 
30 | # Optional but recommended, declare the Python requirements required
31 | # to build your documentation
32 | # See https://docs.readthedocs.io/en/stable/guides/reproducible-builds.html
33 | python:
34 |   install:
35 |     - requirements: docs/requirements.txt
36 | 


--------------------------------------------------------------------------------
/AUTHORS:
--------------------------------------------------------------------------------
1 | Albert Weichselbraun <albert.weichselbraun@fhgr.ch>
2 | Fabian Odoni <fabian.odoni@fhgr.ch>
3 | 
4 | The design of inscriptis has originally been inspired by SpiffWikiMarkup
5 | developed by Samuel Abels <spam2, debain, org>.
6 | 


--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
 1 | # Contributing to Inscriptis
 2 | 
 3 | First off, thank you for considering contributing to inscriptis. 
 4 | There are many ways you can contribute to the project, and these guidelines aim to support you in doing so.
 5 | 
 6 | 1. [Reporting bugs and seeking support](#reporting-bugs-and-seeking-support)
 7 | 2. [Suggesting enhancements](#suggesting-enhancements)
 8 | 3. [Pull requests](#pull-requests) (contributing code)
 9 | 4. [Python style guide](#python-style-guide)
10 | 
11 | 
12 | ## Reporting bugs and seeking support
13 | 
14 | Bugs and support requests are tracked as GitHub issues.
15 | 
16 | To create an effective and high quality ticket, please include the following information in your
17 | ticket:
18 | 
19 |  1. **Use a clear and descriptive title** for the issue to identify the problem. This also helps other users to quickly locate bug reports that affect them.
20 |  2. **Describe the exact steps necessary for reproducing the problem** including at least information on
21 |     - the affected URL
22 |     - the command line parameters or function arguments you used
23 |  3. What would have been the **expected behavior**?
24 |  4. Describe the **observed behavior**.
25 |  5. Provide any additional information which might be helpful in reproducing and/or fixing this issue. 
26 | 
27 | 
28 | ## Suggesting enhancements
29 | 
30 | Enhancements are also tracked as GitHub issues and should contain the following information:
31 | 
32 |  1. **A clear and descriptive title** helps other people to identify enhancements they like, so that they can also add their thoughts and suggestions.
33 |  2. **Provide a step-by-step description** of the suggested enhancement.
34 |  3. **Describe the current behavior** and **explain which behavior you expected to see instead** and why.
35 | 
36 | 
37 | ## Pull requests
38 | 
39 | 1. Ensure that your code complies with our [Python style guide](#python-style-guide).
40 | 2. Write a unit test that covers your new code and put it into the `./tests` directory.
41 | 3. Execute `tox .` in the project's root directory to ensure that your code passes the static code analysis, coding style guidelines and security checks.
42 | 4. In addition, please document any new API functions in the Inscriptis documentation.
43 | 
44 | 
45 | ## Python style guide
46 | 
47 | Inscriptis code should comply with
48 | - the [PEP8 Style Guide for Python Code](https://www.python.org/dev/peps/pep-0008/), and
49 | - the [Google Python Style Guide](https://google.github.io/styleguide/pyguide.html).
50 | 
51 | Please also ensure that 
52 | 1. functions are properly documented with docstrings that comply with the Google Python Style Guide, and
53 | 2. any new code is covered by unit tests.
54 | 
55 | 


--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
 1 | #
 2 | # Stage 1 - Install build dependencies
 3 | #
 4 | FROM python:3.11-slim-bullseye AS builder
 5 | 
 6 | WORKDIR /inscriptis
 7 | RUN python -m venv .venv && .venv/bin/python -m pip install --upgrade pip
 8 | # Install inscriptis into the venv, then strip test directories and byte code.
 9 | # The outer parentheses are required: `-o` binds weaker than the implicit AND,
10 | # so without them `-exec` would only apply to the last alternative.
11 | # `-prune` keeps find from descending into directories queued for removal.
12 | RUN .venv/bin/pip install --no-cache-dir inscriptis[web-service] && \
13 |     find /inscriptis/.venv \( \( -type d -a \( -name test -o -name tests \) \) -o \( -type f -a \( -name '*.pyc' -o -name '*.pyo' \) \) \) -prune -exec rm -rf '{}' \+
14 | 
15 | #
16 | # Stage 2 - Copy only necessary files to the runner stage
17 | #
18 | FROM python:3.11-slim-bullseye
19 | LABEL maintainer="albert@weichselbraun.net"
20 | 
21 | # Note: only the /inscriptis directory (containing the virtual environment) is
22 | #       copied from the builder stage, to prevent bloating the image with
23 | #       irrelevant files from the project directory.
24 | WORKDIR /inscriptis
25 | COPY --from=builder /inscriptis /inscriptis
26 | 
27 | # Put the venv first on PATH so that `uvicorn` resolves to the venv's copy.
28 | ENV PATH="/inscriptis/.venv/bin:$PATH"
29 | CMD ["uvicorn", "inscriptis.service.web:app", "--port=5000", "--host=0.0.0.0"]
30 | EXPOSE 5000
31 | 


--------------------------------------------------------------------------------
/TODO.txt:
--------------------------------------------------------------------------------
1 | Please feel free to address any of the following issues
2 | 
3 | - add a class that translates style sheets to the corresponding dictionary of `HtmlElement`s.
4 | - check: option to remove links with a one letter description (footnotes)
5 | - support for <hr /> tag (if needed)
6 | 


--------------------------------------------------------------------------------
/benchmarking/a:
--------------------------------------------------------------------------------
  1 | justext is not available. Please install it in order to compare with justext.
  2 | 
  3 | URL: www.watson.de
  4 | Lynx         : 0.15138936042785645 --> fastest
  5 | Inscriptis   : 0.20263218879699707 +0.051242828369140625
  6 | BeautifulSoup: 0.3756422996520996  +0.22425293922424316
  7 | Html2Text    : 0.43219757080078125 +0.2808082103729248
  8 | 
  9 | 
 10 | URL: www.watson.ch-Digital20&20Games-Android-134350872-Der-Monster-Akku-in-diesem-Smartphone-hC3A4lt-bis-
 11 | Inscriptis   : 0.07737088203430176 --> fastest
 12 | BeautifulSoup: 0.1150212287902832  +0.037650346755981445
 13 | Lynx         : 0.1359405517578125  +0.05856966972351074
 14 | Html2Text    : 0.1448962688446045  +0.06752538681030273
 15 | 
 16 | 
 17 | URL: www.heise.de
 18 | Lynx         : 0.15659260749816895 --> fastest
 19 | Inscriptis   : 0.20164966583251953 +0.045057058334350586
 20 | BeautifulSoup: 0.29897594451904297 +0.14238333702087402
 21 | Html2Text    : 0.37505173683166504 +0.2184591293334961
 22 | 
 23 | 
 24 | URL: www.heise.de-newsticker-meldung-Fairphone-2-im-Test-Das-erste-modulare-Smartphone-3043417.html
 25 | Inscriptis   : 0.09370565414428711 --> fastest
 26 | Lynx         : 0.15947198867797852 +0.0657663345336914
 27 | BeautifulSoup: 0.16203570365905762 +0.06833004951477051
 28 | Html2Text    : 0.21861886978149414 +0.12491321563720703
 29 | 
 30 | 
 31 | URL: www.nzz.de
 32 | Lynx         : 0.17096304893493652 --> fastest
 33 | Inscriptis   : 0.2877614498138428  +0.11679840087890625
 34 | Html2Text    : 0.4983334541320801  +0.32737040519714355
 35 | BeautifulSoup: 0.5966424942016602  +0.42567944526672363
 36 | 
 37 | 
 38 | URL: www.nzz.ch-mobilitaet-auto-mobil-bekenntnis-zum-stromauto-ld.3630
 39 | Inscriptis   : 0.1326134204864502  --> fastest
 40 | Lynx         : 0.14449405670166016 +0.011880636215209961
 41 | BeautifulSoup: 0.16537070274353027 +0.03275728225708008
 42 | Html2Text    : 0.2061011791229248  +0.07348775863647461
 43 | 
 44 | 
 45 | URL: de.wikipedia.org-wiki-Wikipedia-Hauptseite
 46 | Inscriptis   : 0.0768730640411377  --> fastest
 47 | BeautifulSoup: 0.1140899658203125  +0.037216901779174805
 48 | Html2Text    : 0.1279299259185791  +0.051056861877441406
 49 | Lynx         : 0.13344478607177734 +0.05657172203063965
 50 | 
 51 | 
 52 | URL: de.wikipedia.org-wiki-Python_(Programmiersprache)
 53 | Lynx         : 0.15608739852905273 --> fastest
 54 | Inscriptis   : 0.2505784034729004  +0.09449100494384766
 55 | BeautifulSoup: 0.3396627902984619  +0.18357539176940918
 56 | Html2Text    : 0.407498836517334   +0.25141143798828125
 57 | 
 58 | 
 59 | URL: de.wikipedia.org-wiki-Chur
 60 | Lynx         : 0.19526290893554688 --> fastest
 61 | Inscriptis   : 0.4372870922088623  +0.24202418327331543
 62 | BeautifulSoup: 0.5105750560760498  +0.31531214714050293
 63 | Html2Text    : 0.7925112247467041  +0.5972483158111572
 64 | 
 65 | 
 66 | URL: jr-central.co.jp
 67 | Inscriptis   : 0.030536651611328125 --> fastest
 68 | BeautifulSoup: 0.04150390625        +0.010967254638671875
 69 | Html2Text    : 0.05070781707763672  +0.020171165466308594
 70 | Lynx         : 0.1379244327545166   +0.10738778114318848
 71 | 
 72 | 
 73 | URL: www.aljazeera.net-portal
 74 | Lynx         : 0.18790936470031738 --> fastest
 75 | Inscriptis   : 0.3582143783569336  +0.1703050136566162
 76 | BeautifulSoup: 0.5611743927001953  +0.37326502799987793
 77 | Html2Text    : 0.6482110023498535  +0.46030163764953613
 78 | 
 79 | 
 80 | URL: www.aljazeera.net-news-humanrights-2015-12-14-D8A3D988D8A8D8A7D985D8A7-D98AD8ACD8AFD8AF-D8A7D984D8AA
 81 | Inscriptis   : 0.13330984115600586 --> fastest
 82 | Lynx         : 0.14847993850708008 +0.015170097351074219
 83 | BeautifulSoup: 0.17941498756408691 +0.046105146408081055
 84 | Html2Text    : 0.242262601852417   +0.10895276069641113
 85 | 
 86 | 
 87 | URL: www.fhgr.ch
 88 | Lynx         : 0.20734667778015137 --> fastest
 89 | Inscriptis   : 0.5514888763427734  +0.34414219856262207
 90 | BeautifulSoup: 0.7790236473083496  +0.5716769695281982
 91 | Html2Text    : 0.9708971977233887  +0.7635505199432373
 92 | 
 93 | 
 94 | URL: www.diepresse.com
 95 | Lynx         : 0.18340134620666504 --> fastest
 96 | Inscriptis   : 0.2943253517150879  +0.11092400550842285
 97 | BeautifulSoup: 0.48204803466796875 +0.2986466884613037
 98 | Html2Text    : 0.5474369525909424  +0.36403560638427734
 99 | 
100 | 
101 | URL: derstandard.at
102 | Lynx         : 0.17057490348815918 --> fastest
103 | Inscriptis   : 0.3920929431915283  +0.22151803970336914
104 | BeautifulSoup: 0.4781017303466797  +0.3075268268585205
105 | Html2Text    : 0.5499060153961182  +0.379331111907959
106 | 
107 | 
108 | URL: krone.at
109 | Lynx         : 0.18678593635559082 --> fastest
110 | Inscriptis   : 0.41831398010253906 +0.23152804374694824
111 | BeautifulSoup: 0.6808819770812988  +0.494096040725708
112 | Html2Text    : 0.794529914855957   +0.6077439785003662
113 | 
114 | 


--------------------------------------------------------------------------------
/benchmarking/b:
--------------------------------------------------------------------------------
1 | justext is not available. Please install it in order to compare with justext.
2 | 
3 | URL: www.watson.de
4 | 


--------------------------------------------------------------------------------
/benchmarking/speed_comparisons.txt:
--------------------------------------------------------------------------------
  1 | 
  2 | URL: www.watson.de
  3 | inscriptis   : 0.0886073112487793  --> fastest
  4 | lynx         : 0.09243917465209961 +0.0038318634033203125
  5 | html2text    : 0.27269411087036133 +0.18408679962158203
  6 | beautifulsoup: 0.3715205192565918  +0.2829132080078125
  7 | 
  8 | 
  9 | URL: www.watson.ch-Digital20&20Games-Android-134350872-Der-Monster-Akku-in-diesem-Smartphone-hC3A4lt-bis-
 10 | inscriptis   : 0.031877756118774414 --> fastest
 11 | lynx         : 0.06591463088989258  +0.034036874771118164
 12 | html2text    : 0.09615325927734375  +0.06427550315856934
 13 | beautifulsoup: 0.10839462280273438  +0.07651686668395996
 14 | 
 15 | 
 16 | URL: www.heise.de
 17 | inscriptis   : 0.0771639347076416  --> fastest
 18 | lynx         : 0.0936579704284668  +0.016494035720825195
 19 | html2text    : 0.2419900894165039  +0.1648261547088623
 20 | beautifulsoup: 0.29470372200012207 +0.21753978729248047
 21 | 
 22 | 
 23 | URL: www.heise.de-newsticker-meldung-Fairphone-2-im-Test-Das-erste-modulare-Smartphone-3043417.html
 24 | inscriptis   : 0.036151885986328125 --> fastest
 25 | lynx         : 0.0704348087310791   +0.03428292274475098
 26 | html2text    : 0.10545611381530762  +0.06930422782897949
 27 | beautifulsoup: 0.12367486953735352  +0.08752298355102539
 28 | 
 29 | 
 30 | URL: www.nzz.de
 31 | lynx         : 0.10388016700744629 --> fastest
 32 | inscriptis   : 0.11366724967956543 +0.00978708267211914
 33 | html2text    : 0.34471607208251953 +0.24083590507507324
 34 | beautifulsoup: 0.37203025817871094 +0.26815009117126465
 35 | 
 36 | 
 37 | URL: www.nzz.ch-mobilitaet-auto-mobil-bekenntnis-zum-stromauto-ld.3630
 38 | inscriptis   : 0.05420851707458496 --> fastest
 39 | lynx         : 0.08396458625793457 +0.02975606918334961
 40 | html2text    : 0.15306854248046875 +0.09886002540588379
 41 | beautifulsoup: 0.16551637649536133 +0.11130785942077637
 42 | 
 43 | 
 44 | URL: de.wikipedia.org-wiki-Wikipedia-Hauptseite
 45 | inscriptis   : 0.029024839401245117 --> fastest
 46 | lynx         : 0.0713193416595459   +0.04229450225830078
 47 | beautifulsoup: 0.08946847915649414  +0.06044363975524902
 48 | html2text    : 0.09077596664428711  +0.06175112724304199
 49 | 
 50 | 
 51 | URL: de.wikipedia.org-wiki-Python_(Programmiersprache)
 52 | inscriptis   : 0.08830070495605469 --> fastest
 53 | lynx         : 0.09342122077941895 +0.005120515823364258
 54 | html2text    : 0.30716776847839355 +0.21886706352233887
 55 | beautifulsoup: 0.3195374011993408  +0.23123669624328613
 56 | 
 57 | 
 58 | URL: de.wikipedia.org-wiki-Chur
 59 | lynx         : 0.110748291015625   --> fastest
 60 | inscriptis   : 0.16320323944091797 +0.05245494842529297
 61 | html2text    : 0.4872932434082031  +0.3765449523925781
 62 | beautifulsoup: 0.4883759021759033  +0.3776276111602783
 63 | 
 64 | 
 65 | URL: jr-central.co.jp
 66 | inscriptis   : 0.012284517288208008 --> fastest
 67 | html2text    : 0.03157520294189453  +0.019290685653686523
 68 | beautifulsoup: 0.04013681411743164  +0.027852296829223633
 69 | lynx         : 0.06790828704833984  +0.055623769760131836
 70 | 
 71 | 
 72 | URL: www.aljazeera.net-portal
 73 | lynx         : 0.11873912811279297 --> fastest
 74 | inscriptis   : 0.13616037368774414 +0.017421245574951172
 75 | html2text    : 0.35196900367736816 +0.2332298755645752
 76 | beautifulsoup: 0.5011019706726074  +0.38236284255981445
 77 | 
 78 | 
 79 | URL: www.aljazeera.net-news-humanrights-2015-12-14-D8A3D988D8A8D8A7D985D8A7-D98AD8ACD8AFD8AF-D8A7D984D8AA
 80 | inscriptis   : 0.04958152770996094 --> fastest
 81 | lynx         : 0.08647871017456055 +0.03689718246459961
 82 | html2text    : 0.1424856185913086  +0.09290409088134766
 83 | beautifulsoup: 0.21869587898254395 +0.169114351272583
 84 | 
 85 | 
 86 | URL: www.htwchur.ch
 87 | inscriptis   : 0.04151415824890137 --> fastest
 88 | lynx         : 0.07280635833740234 +0.03129220008850098
 89 | html2text    : 0.11662626266479492 +0.07511210441589355
 90 | beautifulsoup: 0.1333613395690918  +0.09184718132019043
 91 | 
 92 | 
 93 | URL: www.diepresse.com
 94 | lynx         : 0.10844087600708008 --> fastest
 95 | inscriptis   : 0.11291694641113281 +0.004476070404052734
 96 | html2text    : 0.3410661220550537  +0.23262524604797363
 97 | beautifulsoup: 0.42446470260620117 +0.3160238265991211
 98 | 
 99 | 
100 | URL: derstandard.at
101 | lynx         : 0.10470342636108398 --> fastest
102 | inscriptis   : 0.14974093437194824 +0.04503750801086426
103 | html2text    : 0.4319000244140625  +0.3271965980529785
104 | beautifulsoup: 0.4459238052368164  +0.3412203788757324
105 | 
106 | 
107 | URL: krone.at
108 | lynx         : 0.11936330795288086 --> fastest
109 | inscriptis   : 0.18073749542236328 +0.06137418746948242
110 | html2text    : 0.571204662322998   +0.4518413543701172
111 | beautifulsoup: 0.6350071430206299  +0.515643835067749
112 | 
113 | 
114 | 


--------------------------------------------------------------------------------
/benchmarking/url_list.txt:
--------------------------------------------------------------------------------
 1 | https://www.watson.de
 2 | https://www.watson.ch/Digital%20&%20Games/Android/134350872-Der-Monster-Akku-in-diesem-Smartphone-h%C3%A4lt-bis-15-Tage
 3 | https://www.heise.de
 4 | https://www.heise.de/newsticker/meldung/Fairphone-2-im-Test-Das-erste-modulare-Smartphone-3043417.html
 5 | http://www.nzz.de
 6 | https://www.nzz.ch/mobilitaet/auto-mobil/bekenntnis-zum-stromauto-ld.3630
 7 | https://de.wikipedia.org/wiki/Wikipedia:Hauptseite
 8 | https://de.wikipedia.org/wiki/Python_(Programmiersprache)
 9 | https://de.wikipedia.org/wiki/Chur
10 | http://jr-central.co.jp
11 | http://www.aljazeera.net/portal
12 | http://www.aljazeera.net/news/humanrights/2015/12/14/%D8%A3%D9%88%D8%A8%D8%A7%D9%85%D8%A7-%D9%8A%D8%AC%D8%AF%D8%AF-%D8%A7%D9%84%D8%AA%D8%B2%D8%A7%D9%85%D9%87-%D8%A8%D8%A5%D8%BA%D9%84%D8%A7%D9%82-%D8%BA%D9%88%D8%A7%D9%86%D8%AA%D8%A7%D9%86%D8%A7%D9%85%D9%88
13 | https://www.fhgr.ch
14 | https://www.diepresse.com
15 | https://derstandard.at
16 | https://krone.at
17 | https://stackoverflow.com/questions/328356/extracting-text-from-html-file-using-python/46921881
18 | https://www.chur.ch/churinzahlen
19 | 


--------------------------------------------------------------------------------
/docker-compose.yml:
--------------------------------------------------------------------------------
 1 | version: '2'
 2 | 
 3 | services:
 4 |   inscriptis:
 5 |     build:
 6 |       context: .
 7 |       dockerfile: Dockerfile
 8 |     ports:
 9 |       - 5000:5000
10 |     volumes:
11 |       - /etc/localtime:/etc/localtime:ro
12 |     environment:
13 |       - TZ=Europe/Berlin
14 |     restart: always
15 | 


--------------------------------------------------------------------------------
/docs/Makefile:
--------------------------------------------------------------------------------
 1 | # Minimal makefile for Sphinx documentation
 2 | #
 3 | 
 4 | # You can set these variables from the command line.
 5 | SPHINXOPTS    =
 6 | SPHINXBUILD   = sphinx-build
 7 | SPHINXPROJ    = inscriptis
 8 | SOURCEDIR     = .
 9 | BUILDDIR      = _build
10 | 
11 | # Put it first so that "make" without argument is like "make help".
12 | help:
13 | 	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
14 | 
15 | .PHONY: help Makefile
16 | 
17 | # Catch-all target: route all unknown targets to Sphinx using the new
18 | # "make mode" option.  $(O) is meant as a shortcut for $(SPHINXOPTS).
19 | %: Makefile
20 | 	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)


--------------------------------------------------------------------------------
/docs/README.rst:
--------------------------------------------------------------------------------
1 | ../README.rst


--------------------------------------------------------------------------------
/docs/benchmarking.rst:
--------------------------------------------------------------------------------
 1 | ====================================
 2 | Testing, benchmarking and evaluation
 3 | ====================================
 4 | 
 5 | Unit tests
 6 | ==========
 7 | In addition to the standard unit tests that are located in the project's `tests` directory, Inscriptis also contains
 8 | test cases that solely focus on the html to text conversion and are located in the `tests/html` directory. 
 9 | These tests consist of two files:
10 | 
11 |  1. `test-name.html` and
12 |  2. `test-name.txt`
13 | 
14 | The `.txt` file contains the reference text output for the given html file.
15 | 
16 | Since Inscriptis 2.0 there may also be a third file named `test-name.json` in the `tests/html` directory which contains a JSON dictionary with the keys
17 | 
18 |  1. `annotation_rules` containing the annotation rules for extracting metadata from the corresponding html file, and
19 |  2. `result` which stores the surface forms of the extracted metadata.
20 | 
21 | 
22 | Example::
23 | 
24 | 	{"annotation_rules": {
25 | 	    "h1": ["heading"],
26 | 	    "b": ["emphasis"]
27 | 	 },
28 | 	 "result": [
29 | 		["heading", "The first"],
30 | 		["heading", "The second"],
31 | 		["heading", "Subheading"]
32 | 	 ]
33 | 	}
34 | 
35 | 
36 | Text conversion output comparison and benchmarking
37 | ==================================================
38 | The inscriptis project contains a benchmarking script that can compare different HTML to text conversion approaches.
39 | The script will run the different approaches on a list of URLs, `url_list.txt`, and save the text output into a time stamped folder in `benchmarking/benchmarking_results` for manual comparison.
40 | Additionally the processing speed of every approach per URL is measured and saved in a text file called `speed_comparisons.txt` in the respective time stamped folder.
41 | 
42 | To run the benchmarking script execute `run_benchmarking.py` from within the folder `benchmarking`.
43 | In `def pipeline()` select which HTML -> text algorithms are executed by modifying::
44 | 
45 |    run_lynx = True
46 |    run_justext = True
47 |    run_html2text = True
48 |    run_beautifulsoup = True
49 |    run_inscriptis = True
50 | 
51 | In `url_list.txt` the URLs to be parsed can be specified by adding them to the file, one per line with no additional formatting. URLs need to be complete (including http:// or https://)
52 | e.g.::
53 | 
54 |    http://www.informationscience.ch
55 |    https://en.wikipedia.org/wiki/Information_science
56 |    ...
57 | 
58 | 


--------------------------------------------------------------------------------
/docs/contributing.md:
--------------------------------------------------------------------------------
1 | ../CONTRIBUTING.md


--------------------------------------------------------------------------------
/docs/images/stackoverflow-code-annotation.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/weblyzard/inscriptis/2ef7e3bdc428816a34e8f6e6280888c7d64eec26/docs/images/stackoverflow-code-annotation.png


--------------------------------------------------------------------------------
/docs/images/wikipedia-chur-entry-annotation.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/weblyzard/inscriptis/2ef7e3bdc428816a34e8f6e6280888c7d64eec26/docs/images/wikipedia-chur-entry-annotation.png


--------------------------------------------------------------------------------
/docs/images/wikipedia-chur-table-annotation.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/weblyzard/inscriptis/2ef7e3bdc428816a34e8f6e6280888c7d64eec26/docs/images/wikipedia-chur-table-annotation.png


--------------------------------------------------------------------------------
/docs/images/xda-posts-annotation.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/weblyzard/inscriptis/2ef7e3bdc428816a34e8f6e6280888c7d64eec26/docs/images/xda-posts-annotation.png


--------------------------------------------------------------------------------
/docs/index.rst:
--------------------------------------------------------------------------------
 1 | .. inscriptis documentation master file, created by
 2 |    sphinx-quickstart on Sat Dec 14 06:42:31 2019.
 3 |    You can adapt this file completely to your liking, but it should at least
 4 |    contain the root `toctree` directive.
 5 | 
 6 | .. include:: README.rst
 7 | 
 8 | Documentation
 9 | =============
10 | 
11 | Contents:
12 | 
13 | .. toctree::
14 |    :maxdepth: 2
15 | 
16 |    Documentation <README>
17 |    benchmarking
18 |    contributing
19 |    inscriptis-module-documentation
20 | 
21 | 
22 | 
23 | Indices and tables
24 | ==================
25 | 
26 | * :ref:`genindex`
27 | * :ref:`modindex`
28 | * :ref:`search`
29 | 
30 | 


--------------------------------------------------------------------------------
/docs/inscriptis-module-documentation.rst:
--------------------------------------------------------------------------------
 1 | ===============================
 2 | Inscriptis module documentation
 3 | ===============================
 4 | 
 5 | .. automodule:: inscriptis
 6 |    :members:
 7 | 
 8 | Inscriptis model
 9 | ================
10 | 
11 | Inscriptis HTML engine
12 | ----------------------
13 | .. automodule:: inscriptis.html_engine
14 |    :members:
15 | 
16 | Inscriptis HTML properties
17 | --------------------------
18 | .. automodule:: inscriptis.html_properties
19 |    :members:
20 | 
21 | Inscriptis CSS model
22 | --------------------
23 | .. automodule:: inscriptis.model.css
24 |    :members:
25 | 
26 | Inscriptis canvas model
27 | -----------------------
28 | .. automodule:: inscriptis.model.canvas
29 |    :members:
30 | 
31 | .. automodule:: inscriptis.model.canvas.block
32 |    :members:
33 | 
34 | .. automodule:: inscriptis.model.canvas.prefix
35 |    :members:
36 | 
37 | 
38 | 
39 | Inscriptis table model
40 | ----------------------
41 | .. automodule:: inscriptis.model.table
42 |    :members:
43 | 
44 | 
45 | .. _annotations:
46 | 
47 | Inscriptis annotations
48 | ======================
49 | 
50 | .. automodule:: inscriptis.annotation
51 |    :members:
52 | 
53 | 
54 | Annotation processors
55 | ---------------------
56 | 
57 | .. automodule:: inscriptis.annotation.output
58 |    :members:
59 | 


--------------------------------------------------------------------------------
/docs/paper/Makefile:
--------------------------------------------------------------------------------
# Runs the openjournals/paperdraft container with the current directory
# mounted at /data and JOURNAL=joss.
#
# `all` does not create a file named "all", so it is declared phony.
.PHONY: all
all:
	# `$$` passes a literal `$` to the shell. A single `$(id -u)` would be
	# expanded by make as an (undefined) make variable and yield an empty
	# string, i.e. `--user :`.
	docker run --rm \
	    --volume "$$(pwd)":/data \
	    --user "$$(id -u):$$(id -g)" \
	    --env JOURNAL=joss \
	    openjournals/paperdraft
7 | 


--------------------------------------------------------------------------------
/docs/paper/images/annotations.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/weblyzard/inscriptis/2ef7e3bdc428816a34e8f6e6280888c7d64eec26/docs/paper/images/annotations.png


--------------------------------------------------------------------------------
/docs/paper/images/inscriptis-vs-lynx.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/weblyzard/inscriptis/2ef7e3bdc428816a34e8f6e6280888c7d64eec26/docs/paper/images/inscriptis-vs-lynx.png


--------------------------------------------------------------------------------
/docs/paper/images/inscriptis-vs-lynx.xcf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/weblyzard/inscriptis/2ef7e3bdc428816a34e8f6e6280888c7d64eec26/docs/paper/images/inscriptis-vs-lynx.xcf


--------------------------------------------------------------------------------
/docs/paper/images/raw/inscriptis.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/weblyzard/inscriptis/2ef7e3bdc428816a34e8f6e6280888c7d64eec26/docs/paper/images/raw/inscriptis.png


--------------------------------------------------------------------------------
/docs/paper/images/raw/lynx.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/weblyzard/inscriptis/2ef7e3bdc428816a34e8f6e6280888c7d64eec26/docs/paper/images/raw/lynx.png


--------------------------------------------------------------------------------
/docs/requirements.txt:
--------------------------------------------------------------------------------
1 | lxml
2 | requests
3 | inscriptis
4 | myst_parser
5 | 


--------------------------------------------------------------------------------
/examples/annotation/annotation-profile.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "h1": ["heading"],
 3 |     "h2": ["heading"],
 4 |     "h3": ["heading"],
 5 |     "h4": ["heading"],
 6 |     "h5": ["heading"],
 7 |     "b": ["emphasis"],
 8 |     "div#class=toc": ["table-of-contents"],
 9 |     "#class=FactBox": ["fact-box"],
10 |     "#class=shortdescription": ["description"],
11 |     "table": ["table"],
12 |     "tr": ["row"],
13 |     "td": ["cell"]
14 | }
15 | 


--------------------------------------------------------------------------------
/examples/annotation/stackoverflow.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "h1": ["heading"],
 3 |     "h2": ["heading"],
 4 |     "h3": ["heading"],
 5 |     "b": ["emphasis"],
 6 |     "code": ["code"],
 7 |     "#itemprop=dateCreated": ["creation-date"],
 8 |     "#class=lang-py": ["code"],
 9 |     "#class=user-details": ["user"],
10 |     "#class=reputation-score": ["reputation"],
11 |     "#class=comment-user": ["comment-user"],
12 |     "#class=comment-date": ["comment-date"],
13 |     "#class=comment-copy": ["comment-comment"]
14 | }
15 | 


--------------------------------------------------------------------------------
/examples/annotation/table-annotation-profile.json:
--------------------------------------------------------------------------------
1 | {
2 |     "table": ["table"],
3 |     "th": ["table-heading"],
4 |     "tr": ["table-row"],
5 |     "td": ["table-cell"],
6 |     "b": ["emphasis"]
7 | }
8 | 


--------------------------------------------------------------------------------
/examples/annotation/unittest.json:
--------------------------------------------------------------------------------
1 | {
2 |  "h1": ["heading"],
3 |  "h2": ["heading"],
4 |  "h3": ["heading"],
5 |  "b": ["emphasis"],
6 |  "table": ["table"]
7 | }
8 | 


--------------------------------------------------------------------------------
/examples/annotation/wikipedia-entities-and-citations.json:
--------------------------------------------------------------------------------
1 | {
2 |    "a#title": ["entity"],
3 |    "a#class=new": ["missing entity"],
4 |    "#class=reference": ["citation"]
5 | }
6 | 


--------------------------------------------------------------------------------
/examples/annotation/wikipedia.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "h1": ["heading"],
 3 |     "h2": ["heading"],
 4 |     "h3": ["subheading"],
 5 |     "h4": ["subheading"],
 6 |     "h5": ["subheading"],
 7 |     "i": ["emphasis"],
 8 |     "b": ["bold"],
 9 |     "table": ["table"],
10 |     "th": ["tableheading"],
11 |     "a": ["link"]
12 | }
13 | 


--------------------------------------------------------------------------------
/examples/annotation/xda-developers.json:
--------------------------------------------------------------------------------
1 | {
2 |     "article#class=message-body": ["article"],
3 |     "li#class=u-concealed": ["time"],
4 |     "#itemprop=name": ["user-name"],
5 |     "#itemprop=jobTitle": ["user-title"]
6 | }
7 | 


--------------------------------------------------------------------------------
/examples/custom-html-handling.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | 
 3 | """
 4 | Custom HTML tag handling example.
 5 | 
 6 | Add a custom HTML handler for the bold <b> tag which encloses
 7 | bold text with "**".
 8 | 
 9 | Example:
10 |     "Welcome to <b>Chur</b>" is rendered as "Welcome to **Chur**".
11 | """
12 | from typing import Dict
13 | 
14 | from inscriptis import ParserConfig
15 | from inscriptis.html_engine import Inscriptis
16 | from inscriptis.model.html_document_state import HtmlDocumentState
17 | from inscriptis.model.tag import CustomHtmlTagHandlerMapping
18 | from lxml.html import fromstring
19 | 
20 | 
def my_handle_start_b(state: HtmlDocumentState, _: Dict) -> None:
    """Write the ``**`` marker when an opening <b> tag is encountered.

    Args:
        state: the document state; the marker is written to the last entry
            of ``state.tags``.
        _: the tag's attributes (unused).
    """
    last_tag = state.tags[-1]
    last_tag.write("**")
24 | 
25 | 
def my_handle_end_b(state: HtmlDocumentState) -> None:
    """Write the ``**`` marker when a closing </b> tag is encountered.

    Args:
        state: the document state; the marker is written to the last entry
            of ``state.tags``.
    """
    last_tag = state.tags[-1]
    last_tag.write("**")
29 | 
30 | 
31 | MY_MAPPING = CustomHtmlTagHandlerMapping(
32 |     start_tag_mapping={"b": my_handle_start_b},
33 |     end_tag_mapping={"b": my_handle_end_b},
34 | )
35 | 
36 | 
37 | HTML = "Welcome to <b>Chur</b>"
38 | 
39 | html_tree = fromstring(HTML)
40 | inscriptis = Inscriptis(
41 |     html_tree, ParserConfig(custom_html_tag_handler_mapping=MY_MAPPING)
42 | )
43 | print(inscriptis.get_text())
44 | 


--------------------------------------------------------------------------------
/img/nested-table-firefox.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/weblyzard/inscriptis/2ef7e3bdc428816a34e8f6e6280888c7d64eec26/img/nested-table-firefox.png


--------------------------------------------------------------------------------
/img/wikipedia-chur-firefox.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/weblyzard/inscriptis/2ef7e3bdc428816a34e8f6e6280888c7d64eec26/img/wikipedia-chur-firefox.png


--------------------------------------------------------------------------------
/img/wikipedia-python-example.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/weblyzard/inscriptis/2ef7e3bdc428816a34e8f6e6280888c7d64eec26/img/wikipedia-python-example.png


--------------------------------------------------------------------------------
/publish.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | # Publishing sequence:
 4 | # ====================
 5 | # 1. create pypi package
 6 | # 2. publish docker container
 7 | # 3. create github release (which runs the helm scripts)
 8 | 
 9 | # publish the latest package to pypi
10 | # sources:
11 | # - https://packaging.python.org/guides/distributing-packages-using-setuptools/#packaging-your-project
12 | # - https://packaging.python.org/guides/making-a-pypi-friendly-readme/
13 | 
14 | VERSION=$(grep -oP '^version = "\K[^"]+' pyproject.toml)
15 | IMAGE_NAME=inscriptis
16 | 
17 | case "$1" in
18 | 	python)
19 | 		# cleanup dist
20 | 		rm -rf ./dist
21 | 
22 | 		# build and publish packages
23 | 		poetry publish --build
24 | 		;;
25 | 	docker)
26 | 		echo "Publishing ${IMAGE_NAME} in version ${VERSION}"
27 | 		docker login ghcr.io -u AlbertWeichselbraun --password-stdin < ../github-token.txt
28 | 		docker build -t ${IMAGE_NAME}:${VERSION} .
29 | 
30 | 		# Step 2: Tag
31 | 		docker tag ${IMAGE_NAME}:${VERSION} ghcr.io/weblyzard/${IMAGE_NAME}:${VERSION}
32 | 		docker tag ${IMAGE_NAME}:${VERSION} ghcr.io/weblyzard/${IMAGE_NAME}:latest
33 | 
34 | 		# Step 3: Publish
35 | 		docker push ghcr.io/weblyzard/${IMAGE_NAME}:${VERSION}
36 | 		;;
37 | esac
38 | 


--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
 1 | [tool.poetry]
 2 | name = "inscriptis"
 3 | version = "2.6.0"
 4 | authors = ["Albert Weichselbraun <albert.weichselbraun@fhgr.ch>", "Fabian Odoni <fabian.odoni@fhgr.ch>"]
 5 | description = "inscriptis - HTML to text converter."
 6 | keywords = ["HTML", "converter", "text"]
 7 | classifiers = [
 8 |         'Development Status :: 5 - Production/Stable',
 9 |         'Intended Audience :: Developers',
10 |         'License :: OSI Approved :: Apache Software License',
11 |         'Topic :: Text Processing',
12 |         'Topic :: Text Processing :: Markup :: HTML',
13 |         'Topic :: Utilities',
14 |         'Programming Language :: Python :: 3',
15 |         'Programming Language :: Python :: 3.9',
16 |         'Programming Language :: Python :: 3.10',
17 |         'Programming Language :: Python :: 3.11',
18 |         'Programming Language :: Python :: 3.12',
19 |         'Programming Language :: Python :: 3.13',
20 |     ]
21 | homepage = "https://github.com/weblyzard/inscriptis"
22 | repository = "https://github.com/weblyzard/inscriptis"
23 | documentation = "https://inscriptis.readthedocs.io/en"
24 | license = "Apache-2.0"
25 | readme = "README.rst"
26 | 
27 | packages = [
28 |    {include = "inscriptis", from="src"},
29 | ]
30 | 
31 | 
32 | [tool.poetry.scripts]
33 | inscript = "inscriptis.cli.inscript:cli"
34 | inscriptis-api = "inscriptis.service.web:start"
35 | 
36 | 
37 | [tool.poetry.extras]
38 | web-service = ["fastapi", "uvicorn"]
39 | 
40 | 
41 | [tool.poetry.dependencies]
42 | python = "^3.9 || ^3.10 || ^3.11 || ^3.12 || ^3.13"
43 | requests = ">=2.32.2"
44 | lxml = ">=4.9.3"
45 | 
46 | # optional dependencies
47 | fastapi = { version = "^0.115.11", optional = true }
48 | uvicorn = { version = "^0.34.0", optional = true }
49 | 
50 | [tool.poetry.group.dev.dependencies]
51 | pytest = "^8.3.5"
52 | 
53 | 
54 | [build-system]
55 | requires = ["poetry-core"]
56 | build-backend = "poetry.core.masonry.api"
57 | 
58 | 
59 | # code formatting with black
60 | [tool.black]
61 | line-length = 88
62 | target-version = ["py39", "py310", "py311", "py312", "py313"]
63 | extend-exclude = '\.html$|\.json$|\.txt$|/a$|/b$'
64 | include = '''
65 |   ^/src/|^/tests/|^/benchmarking/|^/examples/
66 | '''
67 | 


--------------------------------------------------------------------------------
/src/inscriptis/__init__.py:
--------------------------------------------------------------------------------
  1 | r"""Parse HTML content and converts it into a text representation.
  2 | 
  3 | Inscriptis provides support for
  4 | 
  5 |  - nested HTML tables
  6 |  - basic Cascading Style Sheets
  7 |  - annotations
  8 | 
  9 | The following example provides the text representation of
 10 | `<https://www.fhgr.ch>`_.
 11 | 
 12 | .. code::
 13 | 
 14 |    import urllib.request
 15 |    from inscriptis import get_text
 16 | 
 17 |    url = 'https://www.fhgr.ch'
 18 |    html = urllib.request.urlopen(url).read().decode('utf-8')
 19 | 
 20 |    text = get_text(html)
 21 | 
 22 |    print(text)
 23 | 
 24 | Use the method :meth:`~inscriptis.get_annotated_text` to obtain text and
 25 | annotations. The method requires annotation rules as described in annotations_.
 26 | 
 27 | .. code::
 28 | 
 29 |    import urllib.request
 30 |    from inscriptis import get_annotated_text, ParserConfig
 31 | 
 32 |    url = "https://www.fhgr.ch"
 33 |    html = urllib.request.urlopen(url).read().decode('utf-8')
 34 | 
 35 |    # annotation rules specify the HTML elements and attributes to annotate.
 36 |    rules = {'h1': ['heading'],
 37 |             'h2': ['heading'],
 38 |             '#class=FactBox': ['fact-box'],
 39 |             'i': ['emphasis']}
 40 | 
 41 |    output = get_annotated_text(html, ParserConfig(annotation_rules=rules))
 42 |    print("Text:", output['text'])
 43 |    print("Annotations:", output['label'])
 44 | 
 45 | The method returns a dictionary with two keys:
 46 | 
 47 |  1. `text` which contains the page's plain text and
 48 |  2. `label` with the annotations in JSONL format that is used by annotators
 49 |      such as `doccano <https://doccano.herokuapp.com/>`_.
 50 | 
 51 | Annotations in the `label` field are returned as a list of triples with
 52 | `start index`, `end index` and `label` as indicated below:
 53 | 
 54 | .. code-block:: json
 55 | 
 56 |    {"text": "Chur\n\nChur is the capital and largest town of the Swiss canton
 57 |              of the Grisons and lies in the Grisonian Rhine Valley.",
 58 |     "label": [[0, 4, "heading"], [6, 10, "emphasis"]]}
 59 | 
 60 | """
 61 | 
 62 | import re
 63 | from typing import Dict, Optional, Any
 64 | from inscriptis.model.config import ParserConfig
 65 | 
 66 | from lxml.etree import ParserError
 67 | from lxml.html import fromstring, HtmlElement
 68 | 
 69 | from inscriptis.html_engine import Inscriptis
 70 | 
 71 | RE_STRIP_XML_DECLARATION = re.compile(r"^<\?xml [^>]+?\?>")
 72 | 
 73 | 
 74 | def _get_html_tree(html_content: str) -> Optional[HtmlElement]:
 75 |     """Obtain the HTML parse tree for the given HTML content.
 76 | 
 77 |     Args:
 78 |         html_content: The content to parse.
 79 | 
 80 |     Returns:
 81 |         The corresponding HTML parse tree.
 82 |     """
 83 |     html_content = html_content.strip()
 84 |     if not html_content:
 85 |         return None
 86 | 
 87 |     # strip XML declaration, if necessary
 88 |     if html_content.startswith("<?xml "):
 89 |         html_content = RE_STRIP_XML_DECLARATION.sub("", html_content, count=1)
 90 | 
 91 |     try:
 92 |         return fromstring(html_content)
 93 |     except ParserError:
 94 |         return fromstring("<pre>" + html_content + "</pre>")
 95 | 
 96 | 
 97 | def get_text(html_content: str, config: ParserConfig = None) -> str:
 98 |     """Provide a text representation of the given HTML content.
 99 | 
100 |     Args:
101 |       html_content (str): The HTML content to convert.
102 |       config: An optional ParserConfig object.
103 | 
104 |     Returns:
105 |       The text representation of the HTML content.
106 |     """
107 |     html_tree = _get_html_tree(html_content)
108 |     return Inscriptis(html_tree, config).get_text() if html_tree is not None else ""
109 | 
110 | 
def get_annotated_text(
    html_content: str, config: ParserConfig = None
) -> Dict[str, Any]:
    """Convert HTML to text and return it together with its annotations.

    Args:
        html_content: The HTML content to convert.
        config: An optional ParserConfig object that is passed on to
            :class:`Inscriptis`.

    Notes:
        - the text is stored under the key 'text'.
        - annotations are provided under the key 'label' as a list of
          ``(start, end, metadata)`` tuples, one per annotation.
        - an empty dictionary is returned if the content is empty.

    Examples:
        {"text": "EU rejects German call to boycott British lamb.",
         "label": [ [0, 2, "strong"], ... ]}
        {"text": "Peter Blackburn",
         "label": [ [0, 15, "heading"] ]}

    Returns:
        A dictionary of text (key: 'text') and annotations (key: 'label'),
        or ``{}`` for empty content.
    """
    tree = _get_html_tree(html_content)
    if tree is None:
        return {}

    converter = Inscriptis(tree, config)
    # the text is obtained first, the annotations afterwards
    result = {"text": converter.get_text()}
    triples = []
    for annotation in converter.get_annotations():
        triples.append((annotation.start, annotation.end, annotation.metadata))
    result["label"] = triples
    return result
138 | 


--------------------------------------------------------------------------------
/src/inscriptis/annotation/__init__.py:
--------------------------------------------------------------------------------
 1 | """The model used for saving annotations."""
 2 | 
 3 | from typing import List
 4 | from typing import NamedTuple
 5 | 
 6 | from inscriptis.html_properties import HorizontalAlignment
 7 | 
 8 | 
 9 | class Annotation(NamedTuple):
10 |     """An Inscriptis annotation which provides metadata on the extracted text.
11 | 
12 |     The :attr:`start` and :attr:`end` indices indicate the span of the text
13 |     to which the metadata refers, and the attribute :attr:`metadata` contains
14 |     the tuple of tags describing this span.
15 | 
16 |     Example::
17 | 
18 |         Annotation(0, 10, ('heading', ))
19 | 
20 |     The annotation above indicates that the text span between the 1st (index 0)
21 |     and 11th (index 10) character of the extracted text contains a *heading*.
22 |     """
23 | 
24 |     start: int
25 |     """the annotation's start index within the text output."""
26 |     end: int
27 |     """the annotation's end index within the text output."""
28 |     metadata: str
29 |     """the tag to be attached to the annotation."""
30 | 
31 | 
def horizontal_shift(
    annotations: List[Annotation],
    content_width: int,
    line_width: int,
    align: HorizontalAlignment,
    shift: int = 0,
) -> List[Annotation]:
    r"""Move annotations horizontally according to a line's alignment.

    The start and end index of every annotation is increased by the same
    offset, which depends on how the content is aligned within the line:

    - left: ``shift``
    - right: ``shift + line_width - content_width``
    - any other alignment: ``shift + (line_width - content_width) // 2``

    Args:
        annotations: a list of Annotations.
        content_width: the width of the actual content
        line_width: the width of the line in which the content is placed.
        align: the horizontal alignment (left, right, center) to assume for
               the adjustment
        shift: an optional additional shift

    Returns:
        A new list of :class:`Annotation`\s with the adjusted start and end
        positions; the metadata is carried over unchanged.
    """
    if align == HorizontalAlignment.left:
        offset = shift
    else:
        free_space = line_width - content_width
        if align == HorizontalAlignment.right:
            offset = shift + free_space
        else:
            # neither left nor right: split the free space (floor division)
            offset = shift + free_space // 2

    shifted = []
    for annotation in annotations:
        shifted.append(
            Annotation(
                annotation.start + offset,
                annotation.end + offset,
                annotation.metadata,
            )
        )
    return shifted
66 | 


--------------------------------------------------------------------------------
/src/inscriptis/annotation/output/__init__.py:
--------------------------------------------------------------------------------
 1 | r""":class:`AnnotationProcessor`\s transform annotations to an output format.
 2 | 
 3 | All AnnotationProcessors implement the :class:`AnnotationProcessor` interface
 4 | by overriding the class's :meth:`AnnotationProcessor.__call__` method.
 5 | 
 6 | .. note::
 7 |     1. The AnnotationExtractor class must be put into a package with the
 8 |        extractor's name (e.g., :mod:`inscriptis.annotation.output.*package*`)
 9 |        and be named :class:`*PackageExtractor*` (see the examples below).
10 |     2. The overwritten :meth:`__call__` method may either extend the original
11 |        dictionary which contains the extracted text and annotations (e.g.,
12 |        :class:`~inscriptis.annotation.output.surface.SurfaceExtractor`) or
13 |        may replace it with a custom output (e.g.,
14 |        :class:`~inscriptis.annotation.output.html.HtmlExtractor` and
15 |        :class:`~inscriptis.annotation.output.xml.XmlExtractor`).
16 | 
17 | Currently, Inscriptis supports the following built-in AnnotationProcessors:
18 | 
19 |  1. :class:`~inscriptis.annotation.output.html.HtmlExtractor` provides an
20 |     annotated HTML output format.
21 |  2. :class:`~inscriptis.annotation.output.xml.XmlExtractor` yields an output
22 |     which marks annotations with XML tags.
23 |  3. :class:`~inscriptis.annotation.output.surface.SurfaceExtractor` adds the
24 |     key `surface` to the result dictionary which contains the surface forms
25 |     of the extracted annotations.
26 | 
27 | """
28 | 
29 | from typing import Dict, Any
30 | 
31 | 
class AnnotationProcessor:
    """An AnnotationProcessor is called for formatting annotations.

    This base class only defines the interface: subclasses override
    :meth:`__call__`, the implementation here always raises
    :class:`NotImplementedError`.
    """

    def __call__(self, annotated_text: Dict[str, Any]) -> Any:
        """Format the given text and annotations.

        Args:
            annotated_text: a dictionary that contains the converted text and
                            all annotations that have been found. The values
                            are not all strings (the annotations are a list),
                            hence ``Dict[str, Any]``.

        Returns:
            An output representation that has been changed according to the
            AnnotationProcessor's design.

        Raises:
            NotImplementedError: always, unless overridden by a subclass.
        """
        raise NotImplementedError
47 | 


--------------------------------------------------------------------------------
/src/inscriptis/annotation/output/html.py:
--------------------------------------------------------------------------------
 1 | """HTML Annotation Processor."""
 2 | 
 3 | from collections import defaultdict
 4 | from itertools import cycle
 5 | from typing import Dict, Any, List
 6 | 
 7 | from inscriptis.annotation.output import AnnotationProcessor
 8 | 
 9 | COLOR_SCHEMA = ("#D8115980", "#8F2D5680", "#21838080", "#FBB13C80", "#73D2DE80")
10 | 
11 | 
12 | class HtmlExtractor(AnnotationProcessor):
13 |     """Provides an HTML version of the extracted text.
14 | 
15 |     The generated HTML colors annotations based on the COLOR_SCHEMA
16 |     constant.
17 |     """
18 | 
19 |     verbatim = True
20 | 
21 |     def __call__(self, annotated_text: Dict[str, Any]) -> str:
22 |         tag_dict = defaultdict(list)
23 | 
24 |         for start, end, label in reversed(annotated_text["label"]):
25 |             tag_dict[start].append(
26 |                 f'<span class="{label}-label">{label}</span><span class="{label}">'
27 |             )
28 |             tag_dict[end].insert(0, "</span>")
29 | 
30 |         tagged_content = [
31 |             "<html><head><style>",
32 |             self._get_css(annotated_text["label"]),
33 |             "</style></head><body><pre>",
34 |         ]
35 | 
36 |         text = annotated_text["text"]
37 |         current_idx = 0
38 |         for idx, tags in sorted(tag_dict.items()):
39 |             tagged_content.append(text[current_idx:idx].replace("\n", "</pre>\n<pre>"))
40 |             current_idx = idx
41 |             tagged_content.extend(tags)
42 |         tagged_content.append(text[current_idx:].replace("\n", "</pre>\n</pre>"))
43 |         return "".join(tagged_content) + "</pre></body></html>"
44 | 
45 |     @staticmethod
46 |     def _get_label_colors(labels: List[str]) -> Dict[str, str]:
47 |         """Compute the mapping between annotation labels and colors.
48 | 
49 |         The used color schema is available in the global variable COLOR_SCHEMA.
50 | 
51 |         Args:
52 |             labels: a list of the annotations classes (e.g., heading, etc.)
53 |                     that need to be color-coded.
54 |         Returns:
55 |             A mapping between the available labels and the corresponding color
56 |             from the COLOR_SCHEMA.
57 |         """
58 |         return dict(zip({a[2] for a in sorted(labels)}, cycle(COLOR_SCHEMA)))
59 | 
60 |     def _get_css(self, labels: List[str]) -> str:
61 |         """Compute the CSS to be included into the HTML output.
62 | 
63 |         Args:
64 |             labels: a list of the annotations classes (e.g., heading, etc.)
65 |                     that need to be color-coded.
66 | 
67 |         Returns:
68 |             A string containing the CSS to be embedded into the HTML output.
69 | 
70 |         """
71 |         css = []
72 |         for label, color in sorted(self._get_label_colors(labels).items()):
73 |             css.append(
74 |                 "pre{{"
75 |                 "  position: relative;\n"
76 |                 "}}\n"
77 |                 ".{label} {{\n"
78 |                 "  background-color: {color};\n"
79 |                 "  border-radius: 0.4em;\n"
80 |                 "}}\n"
81 |                 ".{label}-label {{\n"
82 |                 "  top: -1.0em;\n"
83 |                 '  content: "{label}";\n'
84 |                 "  position: absolute;\n"
85 |                 "  background-color: {color};\n"
86 |                 "  font-size: 75%; }}\n".format(label=label, color=color)
87 |             )
88 |         return "\n".join(css)
89 | 


--------------------------------------------------------------------------------
/src/inscriptis/annotation/output/surface.py:
--------------------------------------------------------------------------------
 1 | """Surface Form Annotation Processor."""
 2 | from typing import Dict, Any
 3 | 
 4 | from inscriptis.annotation.output import AnnotationProcessor
 5 | 
 6 | 
 7 | class SurfaceExtractor(AnnotationProcessor):
 8 |     """Extracts the surface form of all annotated labels."""
 9 | 
10 |     verbatim = False
11 | 
12 |     def __call__(self, annotated_text: Dict[str, Any]) -> Dict[str, Any]:
13 |         """
14 |         Add information on the surface forms to the annotated_text dictionary.
15 | 
16 |         Args:
17 |             annotated_text: a dictionary containing the plain text and the
18 |                             extracted annotations.
19 | 
20 |         Returns:
21 |             An extended dictionary which contains the extracted surface-forms
22 |             of the annotations under the key 'surface'.
23 |         """
24 |         surface_forms = [
25 |             (label, annotated_text["text"][start:end])
26 |             for start, end, label in annotated_text["label"]
27 |         ]
28 |         annotated_text["surface"] = surface_forms
29 |         return annotated_text
30 | 


--------------------------------------------------------------------------------
/src/inscriptis/annotation/output/xml.py:
--------------------------------------------------------------------------------
 1 | """XML Annotation processor."""
 2 | 
 3 | from collections import defaultdict
 4 | from typing import Dict, Any
 5 | 
 6 | from inscriptis.annotation.output import AnnotationProcessor
 7 | 
 8 | 
 9 | class XmlExtractor(AnnotationProcessor):
10 |     """Provide the converted text with XML-style annotations."""
11 | 
12 |     verbatim = True
13 | 
14 |     def __call__(self, annotated_text: Dict[str, Any], root_element="content"):
15 |         tag_dict = defaultdict(list)
16 |         for start, end, tag in reversed(annotated_text["label"]):
17 |             tag_dict[start].append(f"<{tag}>")
18 |             tag_dict[end].insert(0, f"</{tag}>")
19 | 
20 |         current_idx = 0
21 |         text = annotated_text["text"]
22 |         tagged_content = ['<?xml version="1.0" encoding="UTF-8" ?>\n', "<content>\n"]
23 |         for idx, tags in sorted(tag_dict.items()):
24 |             tagged_content.append(text[current_idx:idx])
25 |             current_idx = idx
26 |             tagged_content.extend(tags)
27 | 
28 |         tagged_content.append(text[current_idx:])
29 |         tagged_content.append("\n</content>")
30 |         return "".join(tagged_content)
31 | 


--------------------------------------------------------------------------------
/src/inscriptis/annotation/parser.py:
--------------------------------------------------------------------------------
  1 | """Parse annotation configuration files.
  2 | 
  3 | Annotation configuration files contain a dictionary that maps tags and
  4 | attributes to the corresponding annotation.
  5 | 
  6 |   - tags are referenced by their name
  7 |   - attributes by a `#` (e.g., `#class`) and an optional selector (e.g.,
  8 |     `#class=short-description`)
  9 | 
 10 | Example::
 11 | 
 12 |     {
 13 |         "h1": ["heading"],
 14 |         "b": ["emphasis"],
 15 |         "div#class=toc": ["table-of-contents"],
 16 |         "#class=short-description": ["description"]
 17 |     }
 18 | """
 19 | from collections import defaultdict
 20 | from copy import copy
 21 | from typing import Dict, Tuple, List
 22 | 
 23 | from inscriptis.model.html_element import HtmlElement, DEFAULT_HTML_ELEMENT
 24 | 
 25 | 
 26 | class ApplyAnnotation:
 27 |     """Apply an Annotation to the given attribute.
 28 | 
 29 |     Arguments:
 30 |         annotations: a tuple of annotations to be applied to the attribute.
 31 |         attr: the name of the attribute.
 32 |         match_tag: only apply annotations to attributes that belong to the
 33 |                    given match_tag.
 34 |         match_value: only apply annotations to attribute with the given
 35 |                      match_value.
 36 |     """
 37 | 
 38 |     __slots__ = ("annotations", "match_tag", "match_value", "attr", "matcher")
 39 | 
 40 |     def __init__(
 41 |         self,
 42 |         annotations: tuple,
 43 |         attr: str,
 44 |         match_tag: str = None,
 45 |         match_value: str = None,
 46 |     ):
 47 |         self.annotations = tuple(annotations)
 48 |         self.attr = attr
 49 |         self.match_tag = match_tag
 50 |         self.match_value = match_value
 51 | 
 52 |     def apply(self, attr_value: str, html_element: HtmlElement):
 53 |         """Apply the annotation to HtmlElements with matching tags."""
 54 |         if (self.match_tag and self.match_tag != html_element.tag) or (
 55 |             self.match_value and self.match_value not in attr_value.split()
 56 |         ):
 57 |             return
 58 | 
 59 |         html_element.annotation += self.annotations
 60 | 
 61 |     def __str__(self):
 62 |         return "<ApplyAnnotation: {tag}#{attr}={value}".format(
 63 |             tag=self.match_tag or "{any}",
 64 |             attr=self.attr or "{any}",
 65 |             value=self.match_value or "{any}",
 66 |         )
 67 | 
 68 |     __repr__ = __str__
 69 | 
 70 | 
 71 | class AnnotationModel:
 72 |     """Adapt the CSS profile and CSS attributes for annotation support.
 73 | 
 74 |     Attributes:
 75 |         css: the refined CSS class which contains annotations for HtmlElements
 76 |              which should be annotated.
 77 |         css_attr: information on CSS attributes that shall be annotated.
 78 |     """
 79 | 
 80 |     def __init__(self, css_profile, model: dict):
 81 |         tags, self.css_attr = self._parse(model)
 82 |         for tag, annotations in tags.items():
 83 |             if tag not in css_profile:
 84 |                 css_profile[tag] = copy(DEFAULT_HTML_ELEMENT)
 85 |             css_profile[tag].annotation += tuple(annotations)
 86 |         self.css = css_profile
 87 | 
 88 |     @staticmethod
 89 |     def _parse(model: dict) -> Tuple[Dict, List]:
 90 |         """Compute the AnnotationModel from a model dictionary.
 91 | 
 92 |         Returns:
 93 |             the AnnotationModel matching the input dictionary.
 94 |         """
 95 |         tags = defaultdict(list)
 96 |         attrs = []
 97 |         for key, annotations in model.items():
 98 |             if "#" in key:
 99 |                 tag, attr = key.split("#")
100 |                 if "=" in attr:
101 |                     attr, value = attr.split("=")
102 |                 else:
103 |                     value = None
104 |                 attrs.append(ApplyAnnotation(annotations, attr, tag, value))
105 |             else:
106 |                 tags[key].extend(annotations)
107 |         return tags, attrs
108 | 


--------------------------------------------------------------------------------
/src/inscriptis/cli/__init__.py:
--------------------------------------------------------------------------------
1 | """Inscriptis command line interface clients."""
2 | 


--------------------------------------------------------------------------------
/src/inscriptis/css_profiles.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | # coding: utf-8
 3 | """Standard CSS profiles shipped with inscriptis.
 4 | 
 5 | - `strict`: this profile corresponds to the defaults used by Firefox
 6 | - `relaxed`: this profile is more suited for text analytics, since it ensures
 7 |              that whitespaces are inserted between span and div elements
 8 |              preventing cases where two words stick together.
 9 | """
10 | 
11 | from inscriptis.html_properties import Display, WhiteSpace
12 | from inscriptis.model.html_element import HtmlElement
13 | 
14 | # Profile that corresponds to the defaults used by Firefox (see module docstring).
15 | STRICT_CSS_PROFILE = {
16 |     "body": HtmlElement(display=Display.inline, whitespace=WhiteSpace.normal),
17 |     # elements that are not rendered at all (Display.none)
18 |     "head": HtmlElement(display=Display.none),
19 |     "link": HtmlElement(display=Display.none),
20 |     "meta": HtmlElement(display=Display.none),
21 |     "script": HtmlElement(display=Display.none),
22 |     "title": HtmlElement(display=Display.none),
23 |     "style": HtmlElement(display=Display.none),
24 |     # block elements with a margin before and after the element
25 |     "p": HtmlElement(display=Display.block, margin_before=1, margin_after=1),
26 |     "figure": HtmlElement(display=Display.block, margin_before=1, margin_after=1),
27 |     "h1": HtmlElement(display=Display.block, margin_before=1, margin_after=1),
28 |     "h2": HtmlElement(display=Display.block, margin_before=1, margin_after=1),
29 |     "h3": HtmlElement(display=Display.block, margin_before=1, margin_after=1),
30 |     "h4": HtmlElement(display=Display.block, margin_before=1, margin_after=1),
31 |     "h5": HtmlElement(display=Display.block, margin_before=1, margin_after=1),
32 |     "h6": HtmlElement(display=Display.block, margin_before=1, margin_after=1),
33 |     # lists: block elements without margins, but with a padding_inline of 4
34 |     "ul": HtmlElement(
35 |         display=Display.block, margin_before=0, margin_after=0, padding_inline=4
36 |     ),
37 |     "ol": HtmlElement(
38 |         display=Display.block, margin_before=0, margin_after=0, padding_inline=4
39 |     ),
40 |     # block elements that only set the display property
41 |     "li": HtmlElement(display=Display.block),
42 |     "address": HtmlElement(display=Display.block),
43 |     "article": HtmlElement(display=Display.block),
44 |     "aside": HtmlElement(display=Display.block),
45 |     "div": HtmlElement(display=Display.block),
46 |     "footer": HtmlElement(display=Display.block),
47 |     "header": HtmlElement(display=Display.block),
48 |     "hgroup": HtmlElement(display=Display.block),
49 |     "layer": HtmlElement(display=Display.block),
50 |     "main": HtmlElement(display=Display.block),
51 |     "nav": HtmlElement(display=Display.block),
52 |     "figcaption": HtmlElement(display=Display.block),
53 |     "blockquote": HtmlElement(display=Display.block),
54 |     # inline quotes get a double quote as prefix and suffix
55 |     "q": HtmlElement(prefix='"', suffix='"'),
56 |     # Handling of <pre>
57 |     "pre": HtmlElement(display=Display.block, whitespace=WhiteSpace.pre),
58 |     "xmp": HtmlElement(display=Display.block, whitespace=WhiteSpace.pre),
59 |     "listing": HtmlElement(display=Display.block, whitespace=WhiteSpace.pre),
60 |     "plaintext": HtmlElement(display=Display.block, whitespace=WhiteSpace.pre),
61 | }
62 | 
63 | # The relaxed profile is a shallow copy of the strict one: apart from the
64 | # replaced `div` and the added `span` entry, both dictionaries refer to the
65 | # very same HtmlElement objects.
66 | RELAXED_CSS_PROFILE = STRICT_CSS_PROFILE.copy()
67 | RELAXED_CSS_PROFILE["div"] = HtmlElement(display=Display.block, padding_inline=2)
68 | RELAXED_CSS_PROFILE["span"] = HtmlElement(
69 |     display=Display.inline, prefix=" ", suffix=" ", limit_whitespace_affixes=True
70 | )
71 | 
72 | 
73 | # the profiles shipped with inscriptis, addressable by their name
74 | CSS_PROFILES = {"strict": STRICT_CSS_PROFILE, "relaxed": RELAXED_CSS_PROFILE}
75 | 


--------------------------------------------------------------------------------
/src/inscriptis/html_engine.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | # coding:utf-8
  3 | """The HTML Engine is responsible for converting HTML to text."""
  4 | from typing import List, Dict, Callable
  5 | 
  6 | import lxml.html
  7 | from lxml.etree import Comment
  8 | 
  9 | from inscriptis.annotation import Annotation
 10 | from inscriptis.model.canvas import Canvas
 11 | from inscriptis.model.config import ParserConfig
 12 | from inscriptis.model.html_document_state import HtmlDocumentState
 13 | from inscriptis.model.tag.a_tag import a_start_handler, a_end_handler
 14 | from inscriptis.model.tag.br_tag import br_start_handler
 15 | from inscriptis.model.tag.img_tag import img_start_handler
 16 | from inscriptis.model.tag.list_tag import (
 17 |     ul_start_handler,
 18 |     ol_start_handler,
 19 |     li_start_handler,
 20 |     ul_end_handler,
 21 |     ol_end_handler,
 22 | )
 23 | from inscriptis.model.tag.table_tag import (
 24 |     table_start_handler,
 25 |     tr_start_handler,
 26 |     td_start_handler,
 27 |     table_end_handler,
 28 |     td_end_handler,
 29 | )
 30 | 
 31 | 
 32 | class Inscriptis:
 33 |     """Translate an lxml HTML tree to the corresponding text representation.
 34 | 
 35 |     Args:
 36 |       html_tree: the lxml HTML tree to convert.
 37 |       config: an optional ParserConfig configuration object.
 38 | 
 39 |     Attributes:
 40 |       start_tag_handler_dict: maps tag names to the handler that is called
 41 |                               with the document state and the tag's
 42 |                               attributes when the tag is opened.
 43 |       end_tag_handler_dict: maps tag names to the handler that is called with
 44 |                             the document state before the tag is closed.
 45 |       canvas: the Canvas obtained from parsing the HTML tree.
 46 | 
 47 |     Example::
 48 | 
 49 |       from lxml.html import fromstring
 50 |       from inscriptis.html_engine import Inscriptis
 51 | 
 52 |       html_content = "<html><body><h1>Test</h1></body></html>"
 53 | 
 54 |       # create an HTML tree from the HTML content.
 55 |       html_tree = fromstring(html_content)
 56 | 
 57 |       # transform the HTML tree to text.
 58 |       parser = Inscriptis(html_tree)
 59 |       text = parser.get_text()
 60 |     """
 61 | 
 62 |     def __init__(
 63 |         self, html_tree: lxml.html.HtmlElement, config: ParserConfig = None
 64 |     ) -> None:
 65 |         # use the default configuration, if no config object is provided
 66 |         config = config or ParserConfig()
 67 | 
 68 |         # setup start and end tag call tables; handlers that are disabled by
 69 |         # the configuration are registered as None and skipped while parsing.
 70 |         self.start_tag_handler_dict: Dict[
 71 |             str, Callable[[HtmlDocumentState, Dict], None]
 72 |         ] = {
 73 |             "table": table_start_handler,
 74 |             "tr": tr_start_handler,
 75 |             "td": td_start_handler,
 76 |             "th": td_start_handler,
 77 |             "ul": ul_start_handler,
 78 |             "ol": ol_start_handler,
 79 |             "li": li_start_handler,
 80 |             "br": br_start_handler,
 81 |             "a": a_start_handler if config.parse_a() else None,
 82 |             "img": img_start_handler if config.display_images else None,
 83 |         }
 84 |         self.end_tag_handler_dict: Dict[str, Callable[[HtmlDocumentState], None]] = {
 85 |             "table": table_end_handler,
 86 |             "ul": ul_end_handler,
 87 |             "ol": ol_end_handler,
 88 |             "td": td_end_handler,
 89 |             "th": td_end_handler,
 90 |             "a": a_end_handler if config.parse_a() else None,
 91 |         }
 92 | 
 93 |         # custom handlers replace the default handlers registered above
 94 |         if config.custom_html_tag_handler_mapping:
 95 |             self.start_tag_handler_dict.update(
 96 |                 config.custom_html_tag_handler_mapping.start_tag_mapping
 97 |             )
 98 |             self.end_tag_handler_dict.update(
 99 |                 config.custom_html_tag_handler_mapping.end_tag_mapping
100 |             )
101 | 
102 |         # parse the HTML tree
103 |         self.canvas = self._parse_html_tree(HtmlDocumentState(config), html_tree)
104 | 
105 |     def _parse_html_tree(self, state: HtmlDocumentState, tree) -> Canvas:
106 |         """Parse the HTML tree.
107 | 
108 |         The method calls itself recursively for all children of the tree.
109 | 
110 |         Args:
111 |             state: the document state that is updated while parsing.
112 |             tree: the HTML tree to parse.
113 | 
114 |         Returns:
115 |             The canvas of the given document state.
116 |         """
117 |         # nodes with a string tag are HTML elements
118 |         if isinstance(tree.tag, str):
119 |             state.apply_starttag_layout(tree.tag, tree.attrib)
120 | 
121 |             if handler := self.start_tag_handler_dict.get(tree.tag):
122 |                 handler(state, tree.attrib)
123 |             cur = state.tags[-1]
124 |             cur.canvas.open_tag(cur)
125 | 
126 |             state.tags[-1].write(tree.text)
127 | 
128 |             for node in tree:
129 |                 self._parse_html_tree(state, node)
130 | 
131 |             # handle the endtag
132 |             if handler := self.end_tag_handler_dict.get(tree.tag):
133 |                 handler(state)
134 |             prev = state.tags.pop()
135 |             prev.canvas.close_tag(prev)
136 | 
137 |             # write the tail text to the element's container
138 |             state.tags[-1].write(tree.tail)
139 | 
140 |         # comments are skipped, but the text following them is kept
141 |         elif tree.tag is Comment and tree.tail:
142 |             state.tags[-1].canvas.write(state.tags[-1], tree.tail)
143 | 
144 |         return state.canvas
145 | 
146 |     def get_text(self) -> str:
147 |         """Return the text extracted from the HTML page."""
148 |         return self.canvas.get_text()
149 | 
150 |     def get_annotations(self) -> List[Annotation]:
151 |         """Return the annotations extracted from the HTML page."""
152 |         return self.canvas.annotations
153 | 


--------------------------------------------------------------------------------
/src/inscriptis/html_properties.py:
--------------------------------------------------------------------------------
 1 | r"""Provide properties used for rendering HTML pages.
 2 | 
 3 | Supported attributes::
 4 |  1. :class:`Display` properties.
 5 |  2. :class:`WhiteSpace` properties.
 6 |  3. :class:`HorizontalAlignment` properties.
 7 |  4. :class:`VerticalAlignment` properties.
 8 | """
 9 | 
10 | from enum import Enum
11 | 
12 | 
13 | class Display(Enum):
14 |     """Specify whether content will be rendered as inline, block or none.
15 | 
16 |     .. note::
17 |         A display value of `none` indicates that the content should not be
18 |         rendered at all.
19 |     """
20 | 
21 |     inline = 1
22 |     block = 2
23 |     none = 3
24 | 
25 | 
26 | class WhiteSpace(Enum):
27 |     """Specify the HTML element's whitespace handling.
28 | 
29 |     Inscriptis supports the `normal` and `pre` handling strategies outlined
30 |     in the `Cascading Style Sheets <https://www.w3.org/TR/CSS1/>`_ specification.
31 |     """
32 | 
33 |     normal = 1
34 |     """Collapse multiple whitespaces into a single one."""
35 |     pre = 3
36 |     """Preserve sequences of whitespaces."""
37 | 
38 | 
39 | class HorizontalAlignment(Enum):
40 |     """Specify the content's horizontal alignment (left, right or center)."""
41 | 
42 |     left = "<"  # NOTE(review): presumably str.format alignment flags - confirm
43 |     """Left alignment of the block's content."""
44 |     right = ">"
45 |     """Right alignment of the block's content."""
46 |     center = "^"
47 |     """Center the block's content."""
48 | 
49 | 
50 | class VerticalAlignment(Enum):
51 |     """Specify the content's vertical alignment (top, middle or bottom)."""
52 | 
53 |     top = 1
54 |     """Align all content at the top."""
55 |     middle = 2
56 |     """Align all content in the middle."""
57 |     bottom = 3
58 |     """Align all content at the bottom."""
59 | 


--------------------------------------------------------------------------------
/src/inscriptis/metadata.py:
--------------------------------------------------------------------------------
 1 | """Inscriptis metadata information."""
 2 | 
 3 | import importlib.metadata as metadata
 4 | 
 5 | PACKAGE = "inscriptis"
 6 | 
 7 | __author__ = "Albert Weichselbraun, Fabian Odoni"
 8 | __author_email__ = "albert.weichselbraun@fhgr.ch, fabian.odoni@fhgr.ch"
 9 | __copyright__ = (
10 |     f"{metadata.metadata(PACKAGE)['Name']} "
11 |     + f"{metadata.metadata(PACKAGE)['Version']} © 2016-2025 {__author__}"
12 | )
13 | __license__ = metadata.metadata(PACKAGE)["License"]
14 | __version__ = metadata.metadata(PACKAGE)["Version"]
15 | 


--------------------------------------------------------------------------------
/src/inscriptis/model/__init__.py:
--------------------------------------------------------------------------------
1 | """The model used for HTML rendering.
2 | 
3 | - :mod:`inscriptis.model.canvas`: classes required for rendering parts of
4 |     the HTML page.
5 | - :mod:`inscriptis.model.css`: classes required for the CSS support.
6 | - :mod:`inscriptis.model.table`: support for rendering HTML tables.
7 | """
8 | 


--------------------------------------------------------------------------------
/src/inscriptis/model/attribute.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | # encoding: utf-8
 3 | 
 4 | """HTML attribute handling."""
 5 | from copy import copy
 6 | from typing import Dict, List
 7 | 
 8 | from inscriptis.annotation.parser import ApplyAnnotation
 9 | from inscriptis.model.css import CssParse
10 | from inscriptis.model.html_element import HtmlElement
11 | 
12 | DEFAULT_ATTRIBUTE_MAP = {  # HTML attribute name -> CssParse handler function
13 |     "style": CssParse.attr_style,
14 |     "align": CssParse.attr_horizontal_align,
15 |     "valign": CssParse.attr_vertical_align,
16 | }
17 | 
18 | 
19 | def merge_function(func1, func2):
20 |     """Merge two functions with the same arguments into a single one.
21 | 
22 |     This function is used for cascading functions that operate on HtmlElements
23 |     and attributes.
24 | 
25 |     Args:
26 |         func1: the first function
27 |         func2: the second function
28 |     """
29 | 
30 |     def merged(*args):
31 |         func1(*args)
32 |         func2(*args)
33 | 
34 |     return merged
35 | 
36 | 
37 | class Attribute:
38 |     """Handle HTML attributes such as `align`, and `valign`.
39 | 
40 |     This class handles HTML attributes by mapping them to the corresponding
41 |     functions in the :class:`~inscriptis.model.css.CssParse` class.
42 | 
43 |     Attributes:
44 |         attribute_mapping: a mapping of attributes to the corresponding handler
45 |                            functions.
46 |     """
47 | 
48 |     def __init__(self):
49 |         self.attribute_mapping = DEFAULT_ATTRIBUTE_MAP
50 | 
51 |     def apply_attributes(
52 |         self, attributes: Dict[str, str], html_element: HtmlElement
53 |     ) -> HtmlElement:
54 |         """Apply the attributes to the given HTML element.
55 | 
56 |         Args:
57 |             attributes: the list of attributes
58 |             html_element: the HTML element for which the attributes are parsed
59 |         """
60 |         for attr_name, attr_value in attributes.items():
61 |             if attr_name in self.attribute_mapping:
62 |                 self.attribute_mapping[attr_name](attr_value, html_element)
63 |         return html_element
64 | 
65 |     def merge_attribute_map(self, annotations: List[ApplyAnnotation] = None) -> None:
66 |         attributes = copy(self.attribute_mapping)
67 |         for a in annotations:
68 |             attributes[a.attr] = (
69 |                 a.apply
70 |                 if a.attr not in attributes
71 |                 else merge_function(attributes[a.attr], a.apply)
72 |             )
73 |         self.attribute_mapping = attributes
74 | 


--------------------------------------------------------------------------------
/src/inscriptis/model/canvas/block.py:
--------------------------------------------------------------------------------
  1 | """Representation of a text block within the HTML canvas."""
  2 | from __future__ import annotations
  3 | 
  4 | from html import unescape
  5 | from typing import TYPE_CHECKING
  6 | 
  7 | from inscriptis.html_properties import WhiteSpace
  8 | 
  9 | if TYPE_CHECKING:
 10 |     from inscriptis.model.canvas import Prefix
 11 | 
 12 | 
 13 | class Block:
 14 |     """The current block of text.
 15 | 
 16 |     A block usually refers to one line of output text.
 17 | 
 18 |     .. note::
 19 |         If pre-formatted content is merged with a block, it may also contain
 20 |         multiple lines.
 21 | 
 22 |     Args:
 23 |         idx: the current block's start index.
 24 |         prefix: prefix used within the current block.
 25 |     """
 26 | 
 27 |     __slots__ = ("idx", "prefix", "_content", "collapsable_whitespace")
 28 | 
 29 |     def __init__(self, idx: int, prefix: Prefix):
 30 |         # idx is advanced by the length of every text merged into the block
 31 |         self.idx = idx
 32 |         self.prefix = prefix
 33 |         self._content = ""
 34 |         # True while the next whitespace must be dropped, i.e., at the start
 35 |         # of the block and directly after a whitespace has been written.
 36 |         self.collapsable_whitespace = True
 37 | 
 38 |     def merge(self, text: str, whitespace: WhiteSpace) -> None:
 39 |         """Merge the given text with the current block.
 40 | 
 41 |         Args:
 42 |             text: the text to merge.
 43 |             whitespace: whitespace handling.
 44 |         """
 45 |         if whitespace == WhiteSpace.pre:
 46 |             self.merge_pre_text(text)
 47 |         else:
 48 |             self.merge_normal_text(text)
 49 | 
 50 |     def merge_normal_text(self, text: str) -> None:
 51 |         """Merge the given text with the current block.
 52 | 
 53 |         Args:
 54 |             text: the text to merge
 55 | 
 56 |         Note:
 57 |             If the previous text ended with a whitespace and text starts with one, both
 58 |              will automatically collapse into a single whitespace.
 59 |         """
 60 |         normalized_text = []
 61 | 
 62 |         # replace every run of whitespace characters with a single space
 63 |         for ch in text:
 64 |             if not ch.isspace():
 65 |                 normalized_text.append(ch)
 66 |                 self.collapsable_whitespace = False
 67 |             elif not self.collapsable_whitespace:
 68 |                 normalized_text.append(" ")
 69 |                 self.collapsable_whitespace = True
 70 | 
 71 |         if normalized_text:
 72 |             # only the first text written to the block receives the prefix
 73 |             text = (
 74 |                 "".join((self.prefix.first, *normalized_text))
 75 |                 if not self._content
 76 |                 else "".join(normalized_text)
 77 |             )
 78 |             text = unescape(text)
 79 |             self._content += text
 80 |             self.idx += len(text)
 81 | 
 82 |     def merge_pre_text(self, text: str) -> None:
 83 |         """Merge the given pre-formatted text with the current block.
 84 | 
 85 |         Args:
 86 |             text: the text to merge
 87 |         """
 88 |         # newlines within the text are followed by the prefix for further lines
 89 |         text = "".join((self.prefix.first, text.replace("\n", "\n" + self.prefix.rest)))
 90 |         text = unescape(text)
 91 |         self._content += text
 92 |         self.idx += len(text)
 93 |         self.collapsable_whitespace = False
 94 | 
 95 |     def is_empty(self) -> bool:
 96 |         """Return whether the block's content is empty."""
 97 |         return len(self.content) == 0
 98 | 
 99 |     @property
100 |     def content(self) -> str:
101 |         """Return the block's content without a collapsible trailing space.
102 | 
103 |         Note:
104 |             Reading the property removes such a trailing space from the block
105 |             and decrements idx accordingly.
106 |         """
107 |         if not self.collapsable_whitespace:
108 |             return self._content
109 | 
110 |         if self._content.endswith(" "):
111 |             self._content = self._content[:-1]
112 |             self.idx -= 1
113 |         return self._content
114 | 
115 |     def new_block(self) -> "Block":
116 |         """Return a new Block based on the current one."""
117 |         # the shared prefix may be used again by the new block
118 |         self.prefix.consumed = False
119 |         # NOTE(review): the offset of one presumably accounts for the newline
120 |         # that separates two blocks - confirm against the canvas.
121 |         return Block(idx=self.idx + 1, prefix=self.prefix)
122 | 


--------------------------------------------------------------------------------
/src/inscriptis/model/canvas/prefix.py:
--------------------------------------------------------------------------------
 1 | """Manage the horizontal prefix (left-indentation, bullets) of canvas lines."""
 2 | 
 3 | from contextlib import suppress
 4 | 
 5 | 
 6 | class Prefix:
 7 |     """Class Prefix manages paddings and bullets that prefix an HTML block.
 8 | 
 9 |     Attributes:
10 |         current_padding: the number of characters used for the current
11 |                          left-indentation.
12 |         paddings: the list of paddings for the current and all previous tags.
13 |         bullets: the list of bullets in the current and all previous tags.
14 |         consumed: whether the current bullet has already been consumed.
15 |     """
16 | 
17 |     __slots__ = ("current_padding", "paddings", "bullets", "consumed")
18 | 
19 |     def __init__(self):
20 |         self.current_padding = 0
21 |         self.paddings = []
22 |         self.bullets = []
23 |         self.consumed = False
24 | 
25 |     def register_prefix(self, padding_inline: int, bullet: str) -> None:
26 |         """Register the given prefix.
27 | 
28 |         Args:
29 |             padding_inline: the number of characters used for padding_inline
30 |             bullet: an optional bullet.
31 |         """
32 |         self.current_padding += padding_inline
33 |         self.paddings.append(padding_inline)
34 |         self.bullets.append(bullet if bullet else "")
35 | 
36 |     def remove_last_prefix(self) -> None:
37 |         """Remove the last prefix from the list."""
38 |         with suppress(IndexError):
39 |             self.current_padding -= self.paddings.pop()
40 |             del self.bullets[-1]
41 | 
42 |     def pop_next_bullet(self) -> str:
43 |         """Pop the next bullet to use, if any bullet is available."""
44 |         next_bullet_idx = (
45 |             next((-idx for idx, val in enumerate(reversed(self.bullets)) if val), 1) - 1
46 |         )
47 | 
48 |         if not next_bullet_idx:
49 |             return ""
50 | 
51 |         bullet = self.bullets[next_bullet_idx]
52 |         self.bullets[next_bullet_idx] = ""
53 |         return bullet
54 | 
55 |     @property
56 |     def first(self) -> str:
57 |         """Return the prefix used at the beginning of a tag.
58 | 
59 |         Note::
60 |             A new block needs to be prefixed by the current padding and bullet.
61 |             Once this has happened (i.e., :attr:`consumed` is set to `True`) no
62 |             further prefixes should be used for a line.
63 |         """
64 |         if self.consumed:
65 |             return ""
66 | 
67 |         self.consumed = True
68 |         bullet = self.pop_next_bullet()
69 |         return " " * (self.current_padding - len(bullet)) + bullet
70 | 
71 |     @property
72 |     def unconsumed_bullet(self) -> str:
73 |         """Yield any yet unconsumed bullet.
74 | 
75 |         Note::
76 |             This function yields the previous element's bullets, if they have
77 |             not been consumed yet.
78 |         """
79 |         if self.consumed:
80 |             return ""
81 | 
82 |         bullet = self.pop_next_bullet()
83 |         if not bullet:
84 |             return ""
85 | 
86 |         padding = self.current_padding - self.paddings[-1]
87 |         return " " * (padding - len(bullet)) + bullet
88 | 
89 |     @property
90 |     def rest(self) -> str:
91 |         """Return the prefix used for new lines within a block.
92 | 
93 |         This prefix is used for pre-text that contains newlines. The lines
94 |         need to be prefixed with the right padding to preserver the
95 |         indentation.
96 |         """
97 |         return " " * self.current_padding
98 | 


--------------------------------------------------------------------------------
/src/inscriptis/model/config.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | """Provide configuration objects for the Inscriptis HTML to text converter."""
 3 | from __future__ import annotations
 4 | 
 5 | from copy import deepcopy
 6 | from typing import Dict, List
 7 | 
 8 | from inscriptis.annotation.parser import AnnotationModel
 9 | from inscriptis.css_profiles import CSS_PROFILES
10 | from inscriptis.model.attribute import Attribute
11 | from inscriptis.model.html_element import HtmlElement
12 | from inscriptis.model.tag import CustomHtmlTagHandlerMapping
13 | 
14 | DEFAULT_CSS_PROFILE_NAME = "relaxed"
15 | 
16 | 
class ParserConfig:
    """Bundle the rendering options and CSS definitions used for parsing."""

    def __init__(
        self,
        css: Dict[str, HtmlElement] = None,
        display_images: bool = False,
        deduplicate_captions: bool = False,
        display_links: bool = False,
        display_anchors: bool = False,
        annotation_rules: Dict[str, List[str]] = None,
        table_cell_separator: str = "  ",
        custom_html_tag_handler_mapping: CustomHtmlTagHandlerMapping = None,
    ):
        """Create a ParserConfig configuration.

        Args:
            css: an optional custom CSS definition; the default CSS profile
                is used if none is provided.
            display_images: whether to include image titles/alt texts.
            deduplicate_captions: whether to deduplicate captions such as image
                titles (many newspapers include images and video previews with
                identical titles).
            display_links: whether to display link targets
                           (e.g. `[Python](https://www.python.org)`).
            display_anchors: whether to display anchors (e.g. `[here](#here)`).
            annotation_rules: an optional dictionary of annotation rules which
                              specify the tags and attributes to annotate.
            table_cell_separator: separator to use between table cells.
            custom_html_tag_handler_mapping: an optional CustomHtmlTagHandler
        """
        # css and attribute handling
        self.css = css or CSS_PROFILES[DEFAULT_CSS_PROFILE_NAME]
        self.attribute_handler = Attribute()

        # rendering switches
        self.display_images = display_images
        self.deduplicate_captions = deduplicate_captions
        self.display_links = display_links
        self.display_anchors = display_anchors
        self.table_cell_separator = table_cell_separator
        self.custom_html_tag_handler_mapping = custom_html_tag_handler_mapping

        if not annotation_rules:
            return

        # work on a deep copy so that neither the original css nor its
        # members are modified.
        model = AnnotationModel(deepcopy(self.css), annotation_rules)
        # css with annotation support
        self.css = model.css
        # attribute handler with annotation support
        self.attribute_handler.merge_attribute_map(model.css_attr)

    def parse_a(self) -> bool:
        """Indicate whether the text output should contain links or anchors.

        Returns:
            Whether we need to parse <a> tags.
        """
        links = self.display_links
        return links if links else self.display_anchors
72 | 


--------------------------------------------------------------------------------
/src/inscriptis/model/css.py:
--------------------------------------------------------------------------------
  1 | """Implement basic CSS support for inscriptis.
  2 | 
  3 | - The :class:`~inscriptis.model.html_element.HtmlElement` class
  4 |   encapsulates all CSS properties of a single HTML element.
  5 | - :class:`CssParse` parses CSS specifications and translates them into the
  6 |   corresponding HtmlElements used by Inscriptis for rendering HTML pages.
  7 | """
  8 | from contextlib import suppress
  9 | from re import compile as re_compile
 10 | 
 11 | from inscriptis.html_properties import (
 12 |     Display,
 13 |     WhiteSpace,
 14 |     HorizontalAlignment,
 15 |     VerticalAlignment,
 16 | )
 17 | from inscriptis.model.html_element import HtmlElement
 18 | 
 19 | 
 20 | class CssParse:
 21 |     """Parse CSS specifications and applies them to HtmlElements.
 22 | 
 23 |     The attribute `display: none`, for instance, is translated to
 24 |     :attr:`HtmlElement.display=Display.none`.
 25 |     """
 26 | 
 27 |     # used to separate value and unit from each other
 28 |     RE_UNIT = re_compile(r"(-?[0-9.]+)(\w+)")
 29 | 
 30 |     @staticmethod
 31 |     def attr_style(style_attribute: str, html_element: HtmlElement):
 32 |         """Apply the provided style attributes to the given HtmlElement.
 33 | 
 34 |         Args:
 35 |           style_attribute: The attribute value of the given style sheet.
 36 |                            Example: display: none
 37 |           html_element: The HtmlElement to which the given style is applied.
 38 |         """
 39 |         for style_directive in style_attribute.lower().split(";"):
 40 |             if ":" not in style_directive:
 41 |                 continue
 42 |             key, value = (s.strip() for s in style_directive.split(":", 1))
 43 | 
 44 |             try:
 45 |                 apply_style = getattr(
 46 |                     CssParse, "attr_" + key.replace("-webkit-", "").replace("-", "_")
 47 |                 )
 48 |                 apply_style(value, html_element)
 49 |             except AttributeError:
 50 |                 pass
 51 | 
 52 |     @staticmethod
 53 |     def _get_em(length: str) -> int:
 54 |         """Convert length specifications into em.
 55 | 
 56 |         This function takes a length specification (e.g., 2em, 2px, etc.) and
 57 |         transforms it into em.
 58 | 
 59 |         Args:
 60 |           length: the length specification.
 61 | 
 62 |         Returns:
 63 |             the length in em.
 64 |         """
 65 |         _m = CssParse.RE_UNIT.search(length)
 66 |         value = float(_m.group(1))
 67 |         unit = _m.group(2)
 68 | 
 69 |         if unit not in ("em", "qem", "rem"):
 70 |             return int(round(value / 8))
 71 |         return int(round(value))
 72 | 
 73 |     # ------------------------------------------------------------------------
 74 |     # css styles
 75 |     # ------------------------------------------------------------------------
 76 | 
 77 |     @staticmethod
 78 |     def attr_display(value: str, html_element: HtmlElement):
 79 |         """Apply the given display value."""
 80 |         if html_element.display == Display.none:
 81 |             return
 82 | 
 83 |         if value == "block":
 84 |             html_element.display = Display.block
 85 |         elif value == "none":
 86 |             html_element.display = Display.none
 87 |         else:
 88 |             html_element.display = Display.inline
 89 | 
 90 |     @staticmethod
 91 |     def attr_white_space(value: str, html_element: HtmlElement):
 92 |         """Apply the given white-space value."""
 93 |         if value in ("normal", "nowrap"):
 94 |             html_element.whitespace = WhiteSpace.normal
 95 |         elif value in ("pre", "pre-line", "pre-wrap"):
 96 |             html_element.whitespace = WhiteSpace.pre
 97 | 
 98 |     @staticmethod
 99 |     def attr_margin_top(value: str, html_element: HtmlElement):
100 |         """Apply the given top margin."""
101 |         with suppress(ValueError):
102 |             html_element.margin_before = CssParse._get_em(value)
103 | 
104 |     @staticmethod
105 |     def attr_margin_bottom(value: str, html_element: HtmlElement):
106 |         """Apply the provided bottom margin."""
107 |         with suppress(ValueError):
108 |             html_element.margin_after = CssParse._get_em(value)
109 | 
110 |     @staticmethod
111 |     def attr_padding_left(value: str, html_element: HtmlElement):
112 |         """Apply the given left padding_inline."""
113 |         with suppress(ValueError):
114 |             html_element.padding_inline = CssParse._get_em(value)
115 | 
116 |     @staticmethod
117 |     def attr_horizontal_align(value: str, html_element: HtmlElement):
118 |         """Apply the provided horizontal alignment."""
119 |         with suppress(KeyError):
120 |             html_element.align = HorizontalAlignment[value]
121 | 
122 |     @staticmethod
123 |     def attr_vertical_align(value: str, html_element: HtmlElement):
124 |         """Apply the given vertical alignment."""
125 |         with suppress(KeyError):
126 |             html_element.valign = VerticalAlignment[value]
127 | 
128 |     # register aliases
129 |     attr_margin_before = attr_margin_top
130 |     attr_margin_after = attr_margin_bottom
131 |     attr_padding_start = attr_padding_left
132 | 


--------------------------------------------------------------------------------
/src/inscriptis/model/html_document_state.py:
--------------------------------------------------------------------------------
 1 | """Represents the state of an HTML document.
 2 | 
 3 | The provided `HtmlDocumentState` class contains and exposes all fields required for
 4 | representing the current state of the HTML to text conversion.
 5 | """
 6 | 
 7 | from inscriptis import ParserConfig
 8 | from inscriptis.model.canvas import Canvas
 9 | from inscriptis.model.html_element import DEFAULT_HTML_ELEMENT
10 | 
11 | 
class HtmlDocumentState:
    """Hold the state of the HTML document while it is converted to text."""

    def __init__(self, config: ParserConfig):
        # configuration and the helpers derived from it
        self.config = config
        self.css = config.css
        self.apply_attributes = config.attribute_handler.apply_attributes

        # the stack of open tags starts with the body element, which
        # writes to the document's canvas
        self.canvas = Canvas()
        body_element = self.css["body"].set_canvas(self.canvas)
        self.tags = [body_element]

        self.current_table = []
        self.li_counter = []
        self.last_caption = None

        # used if display_links is enabled
        self.link_target = ""

    def apply_starttag_layout(self, tag, attrs):
        """Compute the layout of the tag.

        Derive the :class:`HtmlElement` for the given start tag by

        1. copying the element registered for the tag in :attr:`css`
           (or the default element for unknown tags),
        2. applying the attributes with :meth:`~Attribute.apply_attributes`,
        3. refining the result based on the enclosing open tag, and
        4. adding the `HtmlElement` to the list of open tags.

        Args:
          tag: the HTML start tag to process.
          attrs: a dictionary of HTML attributes and their respective values.
        """
        refine = self.tags[-1].get_refined_html_element
        template = self.css.get(tag, DEFAULT_HTML_ELEMENT)
        element = template.__copy__().set_tag(tag)
        styled = self.apply_attributes(attrs, html_element=element)
        self.tags.append(refine(styled))
53 | 


--------------------------------------------------------------------------------
/src/inscriptis/model/tag/__init__.py:
--------------------------------------------------------------------------------
 1 | """HTML Tag handlers and classes for designing custom HTML tag handlers."""
 2 | from __future__ import annotations
 3 | 
 4 | from typing import Dict, Callable, NamedTuple
 5 | from typing import TYPE_CHECKING
 6 | 
 7 | if TYPE_CHECKING:
 8 |     from inscriptis.model.html_document_state import HtmlDocumentState
 9 | 
10 | 
class CustomHtmlTagHandlerMapping(NamedTuple):
    """Refine the standard HTML Tag handling with the provided mapping.

    Attributes:
        start_tag_mapping: a dictionary of custom start tag handlers. Each
            handler is called with the HtmlDocumentState and a dictionary
            (presumably the tag's attributes; keys are presumably tag
            names - confirm against the caller).
        end_tag_mapping: a dictionary of custom end tag handlers. Each
            handler is called with the HtmlDocumentState only.
    """

    start_tag_mapping: Dict[str, Callable[[HtmlDocumentState, Dict], None]]
    end_tag_mapping: Dict[str, Callable[[HtmlDocumentState], None]]
21 | 


--------------------------------------------------------------------------------
/src/inscriptis/model/tag/a_tag.py:
--------------------------------------------------------------------------------
 1 | """Handle the <a> tag."""
 2 | from typing import Dict
 3 | 
 4 | from inscriptis.model.html_document_state import HtmlDocumentState
 5 | 
 6 | 
 7 | def a_start_handler(state: HtmlDocumentState, attrs: Dict) -> None:
 8 |     """Handle the <a> tag."""
 9 |     state.link_target = ""
10 |     if state.config.display_links:
11 |         state.link_target = attrs.get("href", "")
12 |     if state.config.display_anchors:
13 |         state.link_target = state.link_target or attrs.get("name", "")
14 | 
15 |     if state.link_target:
16 |         state.tags[-1].write("[")
17 | 
18 | 
def a_end_handler(state: HtmlDocumentState) -> None:
    """Handle the </a> tag.

    Closes the link opened by the start handler by writing the link target
    in parentheses, if a link target has been set.
    """
    target = state.link_target
    if not target:
        return
    state.tags[-1].write(f"]({target})")
23 | 


--------------------------------------------------------------------------------
/src/inscriptis/model/tag/br_tag.py:
--------------------------------------------------------------------------------
 1 | """Handle the <br> tag."""
 2 | from typing import Dict
 3 | 
 4 | from inscriptis.model.html_document_state import HtmlDocumentState
 5 | 
 6 | 
 7 | def br_start_handler(state: HtmlDocumentState, _: Dict) -> None:
 8 |     """Handle the <br> tag."""
 9 |     state.tags[-1].canvas.write_newline()
10 | 


--------------------------------------------------------------------------------
/src/inscriptis/model/tag/img_tag.py:
--------------------------------------------------------------------------------
 1 | """Handle the <img> tag."""
 2 | from typing import Dict
 3 | 
 4 | from inscriptis.model.html_document_state import HtmlDocumentState
 5 | 
 6 | 
 7 | def img_start_handler(state: HtmlDocumentState, attrs: Dict) -> None:
 8 |     """Handle the <img> tag."""
 9 |     image_text = attrs.get("alt", "") or attrs.get("title", "")
10 |     if image_text and not (
11 |         state.config.deduplicate_captions and image_text == state.last_caption
12 |     ):
13 |         state.tags[-1].write(f"[{image_text}]")
14 |         state.last_caption = image_text
15 | 


--------------------------------------------------------------------------------
/src/inscriptis/model/tag/list_tag.py:
--------------------------------------------------------------------------------
 1 | """Handle the <li>, <ol>, <ul> tags."""
 2 | from typing import Dict
 3 | 
 4 | from inscriptis.model.html_document_state import HtmlDocumentState
 5 | 
 6 | UL_COUNTER = ("* ", "+ ", "o ", "- ")
 7 | UL_COUNTER_LEN = len(UL_COUNTER)
 8 | 
 9 | 
10 | def get_bullet(state: HtmlDocumentState) -> str:
11 |     """Return the bullet that correspond to the given index."""
12 |     return UL_COUNTER[len(state.li_counter) % UL_COUNTER_LEN]
13 | 
14 | 
def li_start_handler(state: HtmlDocumentState, _: Dict) -> None:
    """Handle the <li> tag.

    Uses the innermost entry of ``state.li_counter`` as bullet (``"* "`` if
    no list is open). An integer entry is rendered as ``"<n>. "`` and
    incremented for the next item.
    """
    counters = state.li_counter
    current = counters[-1] if counters else "* "

    if isinstance(current, int):
        # ordered list: advance the counter for the following item
        counters[-1] = current + 1
        label = f"{current}. "
    else:
        label = current

    open_tag = state.tags[-1]
    open_tag.list_bullet = label
    open_tag.write("")
25 | 
26 | 
def ul_start_handler(state: HtmlDocumentState, _: Dict) -> None:
    """Handle the <ul> tag by registering the bullet for the new list."""
    next_bullet = get_bullet(state)
    state.li_counter.append(next_bullet)
30 | 
31 | 
def ul_end_handler(state: HtmlDocumentState) -> None:
    """Handle the </ul> tag by discarding the innermost list's bullet."""
    open_lists = state.li_counter
    open_lists.pop()
35 | 
36 | 
def ol_start_handler(state: HtmlDocumentState, _: Dict) -> None:
    """Handle the <ol> tag by starting a new item counter at one."""
    first_item_number = 1
    state.li_counter.append(first_item_number)
40 | 
41 | 
def ol_end_handler(state: HtmlDocumentState) -> None:
    """Handle the </ol> tag by discarding the innermost list's counter."""
    open_lists = state.li_counter
    open_lists.pop()
45 | 


--------------------------------------------------------------------------------
/src/inscriptis/model/tag/table_tag.py:
--------------------------------------------------------------------------------
 1 | """Handle the <table>, <tr> and <td> tags."""
 2 | from typing import Dict
 3 | 
 4 | from inscriptis.annotation import Annotation
 5 | from inscriptis.model.canvas import Canvas
 6 | from inscriptis.model.html_document_state import HtmlDocumentState
 7 | from inscriptis.model.table import Table, TableCell
 8 | 
 9 | 
def td_start_handler(state: HtmlDocumentState, _: Dict) -> None:
    """Handle the <td> tag.

    Within a table, a new :class:`TableCell` that takes over the current
    tag's alignment becomes the tag's canvas and is added to the innermost
    table. Outside of a table the tag is ignored.
    """
    if not state.current_table:
        return

    # open td tag
    open_tag = state.tags[-1]
    cell = TableCell(align=open_tag.align, valign=open_tag.valign)
    open_tag.canvas = cell
    state.current_table[-1].add_cell(cell)
17 | 
18 | 
def tr_start_handler(state: HtmlDocumentState, _: Dict) -> None:
    """Handle the <tr> tag by adding a row to the innermost open table."""
    open_tables = state.current_table
    if not open_tables:
        return
    open_tables[-1].add_row()
23 | 
24 | 
def table_start_handler(state: HtmlDocumentState, _: Dict) -> None:
    """Handle the <table> tag.

    Assigns a fresh canvas to the table tag and pushes a new :class:`Table`
    onto the stack of open tables.
    """
    table_tag = state.tags[-1]
    table_tag.set_canvas(Canvas())

    new_table = Table(
        left_margin_len=state.tags[-1].canvas.left_margin,
        cell_separator=state.config.table_cell_separator,
    )
    state.current_table.append(new_table)
34 | 
35 | 
def td_end_handler(state: HtmlDocumentState) -> None:
    """Handle the </td> tag by closing the current tag on its canvas.

    Outside of a table the tag is ignored.
    """
    if not state.current_table:
        return
    closing_tag = state.tags[-1]
    closing_tag.canvas.close_tag(closing_tag)
40 | 
41 | 
def table_end_handler(state: HtmlDocumentState) -> None:
    """Handle the </table> tag.

    Closes the innermost open table and renders it into the canvas of the
    tag that encloses the table:

    1. text written to the table tag outside of any cell is written first,
       followed by a newline,
    2. the table's text is written verbatim,
    3. the table tag's own annotations and the annotations collected within
       the table are transferred to the enclosing canvas.

    A closing tag without an open table is ignored.
    """
    if not state.current_table:
        # No table is open, so there is nothing to pop or render; the
        # unconditional pop would otherwise raise an IndexError.
        return

    td_end_handler(state)
    table = state.current_table.pop()

    # table tag: state.tags[-1]
    # last tag before the table: state.tags[-2]
    table_tag = state.tags[-1]
    parent_tag = state.tags[-2]

    out_of_table_text = table_tag.canvas.get_text().strip()
    if out_of_table_text:
        parent_tag.write(out_of_table_text)
        parent_tag.canvas.write_newline()

    # the table's text starts at the current position of the parent canvas
    start_idx = parent_tag.canvas.current_block.idx
    parent_tag.write_verbatim_text(table.get_text())
    parent_tag.canvas.flush_inline()

    # transfer annotations from the current tag
    if table_tag.annotation:
        end_idx = parent_tag.canvas.current_block.idx
        for a in table_tag.annotation:
            parent_tag.canvas.annotations.append(Annotation(start_idx, end_idx, a))

    # transfer in-table annotations
    parent_tag.canvas.annotations.extend(
        table.get_annotations(start_idx, parent_tag.canvas.left_margin)
    )
69 | 


--------------------------------------------------------------------------------
/src/inscriptis/service/__init__.py:
--------------------------------------------------------------------------------
1 | """The Inscriptis Web service."""
2 | 


--------------------------------------------------------------------------------
/src/inscriptis/service/web.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | # coding:utf-8
 3 | """Inscriptis Web Service."""
 4 | 
 5 | from fastapi import FastAPI, Request
 6 | from fastapi.responses import PlainTextResponse
 7 | 
 8 | from inscriptis import get_text
 9 | from inscriptis.css_profiles import RELAXED_CSS_PROFILE
10 | from inscriptis.metadata import __version__
11 | from inscriptis.model.config import ParserConfig
12 | 
# Web application object; the route decorators below register on it and
# `start()` passes it to uvicorn.
app = FastAPI()
# Parser configuration used by the `/get_text` endpoint for every request.
CONFIG = ParserConfig(
    css=RELAXED_CSS_PROFILE,
    display_images=True,
    deduplicate_captions=True,
    display_links=False,
)
20 | 
21 | 
@app.get("/")
def index():
    """Return a short plain-text status message for the service's base URL."""
    status_message = "Inscriptis text to HTML Web service."
    return PlainTextResponse(status_message)
26 | 
27 | 
def _get_request_encoding(content_type, default: str = "UTF-8") -> str:
    """Return the charset declared in a Content-Type header value.

    Args:
        content_type: the raw Content-Type header value or `None`.
        default: the encoding to use if the header is missing, declares no
            charset, or declares a charset unknown to Python.

    Returns:
        The name of an encoding that can be passed to `bytes.decode`.
    """
    import codecs
    from email.message import Message

    if not content_type:
        return default

    # Let the standard library parse the header parameters, so that case,
    # whitespace, quoting and further parameters are handled.
    header = Message()
    header["Content-Type"] = content_type
    charset = header.get_content_charset()
    if not charset:
        return default

    try:
        codecs.lookup(charset)
    except LookupError:
        # unknown charset: fall back rather than failing the request
        return default
    return charset


@app.post("/get_text", response_class=PlainTextResponse)
async def get_text_call(request: Request):
    """Return the text representation of the given HTML content.

    The request body is decoded with the charset declared in the request's
    Content-Type header (UTF-8 if none or an unknown one is declared);
    undecodable bytes are ignored.
    """
    encoding = _get_request_encoding(request.headers.get("Content-type"))
    html_content = await request.body()
    return get_text(html_content.decode(encoding, errors="ignore"), CONFIG)
38 | 
39 | 
@app.get("/version", response_class=PlainTextResponse)
def get_version_call():
    """Return the version of the inscriptis package in use."""
    current_version = __version__
    return current_version
44 | 
45 | 
def start():
    """Start the webservice.

    Prints a start-up message and serves the application with uvicorn on
    127.0.0.1, port 5000.
    """
    # imported lazily so that uvicorn is only required when serving
    import uvicorn

    banner = ("Starting Web service based on Inscriptis", __version__)
    print(*banner)
    uvicorn.run(app, host="127.0.0.1", port=5000)
52 | 
53 | 
54 | if __name__ == "__main__":
55 |     start()
56 | 


--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/weblyzard/inscriptis/2ef7e3bdc428816a34e8f6e6280888c7d64eec26/tests/__init__.py


--------------------------------------------------------------------------------
/tests/data/annotation-profile-unittest.json:
--------------------------------------------------------------------------------
1 | {
2 |  "h1": ["heading"],
3 |  "h2": ["heading"],
4 |  "h3": ["heading"],
5 |  "b": ["emphasis"],
6 |  "table": ["table"]
7 | }
8 | 


--------------------------------------------------------------------------------
/tests/html/advanced-prefix-test.html:
--------------------------------------------------------------------------------
 1 | <ol>
 2 |     <li>first</li>
 3 |     <li>
 4 |         <ul>
 5 |             <li><pre>y=0
 6 | for x in range(3,10):
 7 |    print(x)
 8 |    y += x
 9 | print(y)</pre>
10 |             </li>
11 |             <li><pre>print("Hallo")
12 | print("Echo")
13 | print("123")
14 |             </pre></li>
15 |             <li>
16 |         </ul>
17 |     <li>third</li>
18 | </ol>
19 | 
20 | 


--------------------------------------------------------------------------------
/tests/html/advanced-prefix-test.txt:
--------------------------------------------------------------------------------
 1 |  1. first
 2 |  2. 
 3 |       + y=0
 4 |         for x in range(3,10):
 5 |            print(x)
 6 |            y += x
 7 |         print(y)
 8 |       + print("Hallo")
 9 |         print("Echo")
10 |         print("123")
11 |                     
12 |       + 
13 |  3. third
14 | 


--------------------------------------------------------------------------------
/tests/html/br-in-table.html:
--------------------------------------------------------------------------------
 1 | <table>
 2 |   <tr><th>First<br />a special line</th>
 3 |       <th>Second</th>
 4 |       <th>Third</th>
 5 |   </tr>
 6 |   <tr><td>a</td>
 7 |       <td>b</td>
 8 |       <td>c</td>
 9 |   </tr>
10 | </table>
11 | 


--------------------------------------------------------------------------------
/tests/html/br-in-table.txt:
--------------------------------------------------------------------------------
1 | First           Second  Third
2 | a special line               
3 | a               b       c
4 | 


--------------------------------------------------------------------------------
/tests/html/br-in-table2.html:
--------------------------------------------------------------------------------
 1 | <table>
 2 |   <tr><th>First<br />a special line</th>
 3 |       <th>Second</th>
 4 |       <th>Third</th>
 5 |   </tr>
 6 |   <tr><td>a</td>
 7 |       <td>b</td>
 8 |       <td>c</td>
 9 |   </tr>
10 |   <tr><td>a2</td>
11 |       <td></td>
12 |       <td>c2</td>
13 |   </tr>
14 |   <tr><td></td>
15 |       <td><ul><li>first</li><li>second</li><li>third</li></ul></td>
16 |       <td>c3</td>
17 |   </tr>
18 |   <tr><td>last1</td>
19 |       <td>last2</td>
20 |       <td>last3</td>
21 |   </tr>
22 | </table>
23 | 


--------------------------------------------------------------------------------
/tests/html/br-li.html:
--------------------------------------------------------------------------------
1 | List
2 | <ul>
3 |  <li>first line <br>
4 |      second line
5 |  <li>third line
6 | </ul>
7 | 


--------------------------------------------------------------------------------
/tests/html/br-li.txt:
--------------------------------------------------------------------------------
1 | List
2 |   * first line
3 |     second line
4 |   * third line
5 | 


--------------------------------------------------------------------------------
/tests/html/br.html:
--------------------------------------------------------------------------------
1 | First line <br>
2 | second line
3 | 


--------------------------------------------------------------------------------
/tests/html/br.txt:
--------------------------------------------------------------------------------
1 | First line
2 | second line
3 | 


--------------------------------------------------------------------------------
/tests/html/direct-enumeration.html:
--------------------------------------------------------------------------------
 1 | <ol>
 2 |   <li>First
 3 |   <li>Second
 4 |   <ol>
 5 |     <li>Sec, First
 6 |     <li>Sec, Second
 7 |     <ul>
 8 |       <li>item
 9 |       <li>item2
10 |     </ul>
11 |   </ol>
12 |   <li>Third
13 | </ol>
14 | 


--------------------------------------------------------------------------------
/tests/html/direct-enumeration.txt:
--------------------------------------------------------------------------------
1 |  1. First
2 |  2. Second
3 |      1. Sec, First
4 |      2. Sec, Second
5 |           o item
6 |           o item2
7 |  3. Third
8 | 


--------------------------------------------------------------------------------
/tests/html/empty-table.html:
--------------------------------------------------------------------------------
1 | <table>
2 |   1
3 | </table>
4 | 


--------------------------------------------------------------------------------
/tests/html/empty-table.txt:
--------------------------------------------------------------------------------
1 | 1
2 | 


--------------------------------------------------------------------------------
/tests/html/enumerations.html:
--------------------------------------------------------------------------------
 1 | Hallo
 2 | <ol>
 3 |   <li>First
 4 |   <li>Second
 5 |   <ol>
 6 |     <li>Second, First
 7 |     <li>Second, Second
 8 |     <ul>
 9 |       <li>item
10 |       <li>item2
11 |     </ul>
12 |   </ol>
13 |   <li>Third
14 | </ol>
15 | 


--------------------------------------------------------------------------------
/tests/html/enumerations.txt:
--------------------------------------------------------------------------------
1 | Hallo
2 |  1. First
3 |  2. Second
4 |      1. Second, First
5 |      2. Second, Second
6 |           o item
7 |           o item2
8 |  3. Third
9 | 


--------------------------------------------------------------------------------
/tests/html/html-comment-ofuscation.html:
--------------------------------------------------------------------------------
1 | <html><body><span class="price-detailed__unit-price"><span>$<!--o-->90.<!--o-->74</span></span></body></html>
2 | 


--------------------------------------------------------------------------------
/tests/html/html-comment-ofuscation.txt:
--------------------------------------------------------------------------------
1 | $90.74
2 | 


--------------------------------------------------------------------------------
/tests/html/invalid-table.html:
--------------------------------------------------------------------------------
 1 | <table>
 2 |   <th><td>First</td>
 3 |       <td>Second</td>
 4 |       <td>Third</td>
 5 |   </th>
 6 |   <tr><td>any</td>
 7 |       <td>beta</td>
 8 |       <td>charly</td>
 9 |   </tr>
10 |   <tr><td>long time</td>
11 |       <td>short time</td>
12 |       <td>medium time</td>
13 |   </tr>
14 | </table>
15 | 


--------------------------------------------------------------------------------
/tests/html/invalid-table.txt:
--------------------------------------------------------------------------------
1 |            First       Second       Third
2 | any        beta        charly     
3 | long time  short time  medium time
4 | 


--------------------------------------------------------------------------------
/tests/html/invalid-table2.html:
--------------------------------------------------------------------------------
 1 | Good day
 2 | <tr>
 3 |   first <td>second<td> third
 4 | <br />
 5 | forth
 6 | <table border="2">
 7 | <td>alpha
 8 | <td>epsilon
 9 | <tr>beta<td>gamma
10 | </table>
11 | 


--------------------------------------------------------------------------------
/tests/html/invalid-table2.txt:
--------------------------------------------------------------------------------
1 | Good day first second third
2 | forth beta
3 | alpha  epsilon
4 | gamma
5 | 


--------------------------------------------------------------------------------
/tests/html/invalid-table3.html:
--------------------------------------------------------------------------------
 1 | Good day
 2 | <tr>
 3 |   first <td>second<td> third
 4 | <br />
 5 | forth
 6 | <table border="2">
 7 |     <td><td>alpha</td>oho</td>
 8 |     <td>epsilon
 9 | <tr>beta<td>gamma
10 | </table>
11 | 


--------------------------------------------------------------------------------
/tests/html/invalid-table3.txt:
--------------------------------------------------------------------------------
1 | Good day first second third
2 | forth oho beta
3 |        alpha  epsilon
4 | gamma


--------------------------------------------------------------------------------
/tests/html/invisible.html:
--------------------------------------------------------------------------------
1 | <title><ul>hallo</title>
2 | 
3 | <h1>Title</h1>
4 | <b style="display: none">noch mehr text</b>
5 | 


--------------------------------------------------------------------------------
/tests/html/invisible.txt:
--------------------------------------------------------------------------------
1 | Title
2 | 


--------------------------------------------------------------------------------
/tests/html/invisible2.html:
--------------------------------------------------------------------------------
1 | <h1>Leertest</h1>
2 | <b style="display:none">hallo<i style="display:block">echo</i></b>
3 | 


--------------------------------------------------------------------------------
/tests/html/invisible2.txt:
--------------------------------------------------------------------------------
1 | Leertest
2 | 


--------------------------------------------------------------------------------
/tests/html/invisible3.html:
--------------------------------------------------------------------------------
1 | <!--An invisible element cannot be made visible by a style -->
2 | <script style="display: inline"><b>Hallo</b></script>
3 | 


--------------------------------------------------------------------------------
/tests/html/invisible3.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/weblyzard/inscriptis/2ef7e3bdc428816a34e8f6e6280888c7d64eec26/tests/html/invisible3.txt


--------------------------------------------------------------------------------
/tests/html/nested-list.html:
--------------------------------------------------------------------------------
 1 | <html>
 2 |     <body>
 3 |         First
 4 |         <ul>
 5 |             <li>
 6 |             <li>
 7 |             <li>1
 8 |             <li>2
 9 |             <li>3
10 |             <li>
11 |             <li>
12 |         </ul>
13 | 
14 |         Second
15 |         <ul>
16 |             <li>
17 |             <li>
18 |             <li>
19 |                 <ul>
20 |                     <li>1
21 |                     <li>
22 |                         <ul>
23 |                             <li>a
24 |                             <li>b
25 |                             <li>c
26 |                         </ul>
27 |                     </li>
28 |                     <li>3
29 |                 </ul>
30 |             <li>
31 |             <li>
32 |             <li>
33 |         <ul>
34 | 
35 |     </body>
36 | </html>
37 | 


--------------------------------------------------------------------------------
/tests/html/nested-list.txt:
--------------------------------------------------------------------------------
 1 | First
 2 |   * 
 3 |   * 
 4 |   * 1
 5 |   * 2
 6 |   * 3
 7 |   * 
 8 |   * 
 9 | Second
10 |   * 
11 |   * 
12 |   * 
13 |       + 1
14 |       + 
15 |           o a
16 |           o b
17 |           o c
18 |       + 3
19 |   * 
20 |   * 
21 |   * 
22 | 


--------------------------------------------------------------------------------
/tests/html/nested-table-alignment-css.html:
--------------------------------------------------------------------------------
 1 | <table>
 2 |     <tr><td><b>column with nested table</b></td>
 3 |       <td>column 2</td>
 4 |       <td>column 3</td>
 5 |       <td>column 4</td>
 6 |   </tr>
 7 |   <tr><td style="text-align: right">
 8 |     <table>
 9 |       <tr><td>nested</td><td>table</td>
10 |       <tr><td>11</td><td>12</td>
11 |       <tr><td>21</td><td>22</td>
12 |       <tr><td>31</td><td>32</td>
13 |     </table>
14 |       </td> 
15 |       <td style="vertical-align: top; text-align: left">Tom</td>
16 |       <td style="vertical-align: middle; text-align: center">Joe</td>
17 |       <td style="vertical-align: bottom; text-align: right">Sue</td>
18 |   </tr>
19 |   <tr>
20 |     <td>last</td>
21 |     <td>line</td>
22 |  </tr>      
23 | </table>
24 | 


--------------------------------------------------------------------------------
/tests/html/nested-table-alignment-css.txt:
--------------------------------------------------------------------------------
1 | column with nested table  column 2  column 3  column 4
2 | nested  table             Tom                         
3 | 11      12                                            
4 | 21      22                          Joe               
5 | 31      32                                            
6 |                                               Sue     
7 | last                      line


--------------------------------------------------------------------------------
/tests/html/nested-table-alignment.html:
--------------------------------------------------------------------------------
 1 | <table>
 2 |     <tr><td><b>column with nested table</b></td>
 3 |       <td>column 2</td>
 4 |       <td>column 3</td>
 5 |       <td>column 4</td>
 6 |   </tr>
 7 |   <tr><td align="right">
 8 |     <table>
 9 |       <tr><td>nested</td><td>table</td>
10 |       <tr><td>11</td><td>12</td>
11 |       <tr><td>21</td><td>22</td>
12 |       <tr><td>31</td><td>32</td>
13 |     </table>
14 |       </td> 
15 |       <td valign="top" align="left">Tom</td>
16 |       <td valign="middle" align="center">Joe</td>
17 |       <td valign="bottom" align="right">Sue</td>
18 |   </tr>
19 |   <tr>
20 |     <td>last</td>
21 |     <td>line</td>
22 |  </tr>      
23 | </table>
24 | 


--------------------------------------------------------------------------------
/tests/html/nested-table-alignment.txt:
--------------------------------------------------------------------------------
1 | column with nested table  column 2  column 3  column 4
2 |            nested  table  Tom                         
3 |            11      12                                 
4 |            21      22                 Joe             
5 |            31      32                                 
6 |                                                    Sue
7 | last                      line


--------------------------------------------------------------------------------
/tests/html/nested-table.html:
--------------------------------------------------------------------------------
 1 | <table border="1">
 2 |   <tr><td>column with nested table</td>
 3 |       <td>column 2</td>
 4 |       <td>column 3</td>
 5 |   </tr>
 6 |   <tr><td>
 7 |     <table>
 8 |       <tr><td>nested</td><td>table</td>
 9 |       <tr><td>1</td><td>2</td>
10 |     </table>
11 |       </td> 
12 |       <td><b>Tom</b></td>
13 |       <td>Joe</td>
14 |   </tr>
15 |   <tr>
16 |     <td>last</td>
17 |     <td>line</td>
18 |  </tr>      
19 | </table>
20 | 


--------------------------------------------------------------------------------
/tests/html/nested-table.txt:
--------------------------------------------------------------------------------
1 | column with nested table  column 2  column 3
2 | nested  table                               
3 | 1       2                 Tom       Joe     
4 |                                             
5 | last                      line


--------------------------------------------------------------------------------
/tests/html/p-br.html:
--------------------------------------------------------------------------------
 1 | L<p><br/>
 2 | B
 3 | </p>
 4 | Line
 5 | <div>
 6 | <p>Another line<br />
 7 | Third line</p>
 8 | </div>
 9 | Forth line
10 | 


--------------------------------------------------------------------------------
/tests/html/p-br.txt:
--------------------------------------------------------------------------------
 1 | L
 2 | 
 3 | 
 4 | B
 5 | 
 6 | Line
 7 | 
 8 | Another line
 9 | Third line
10 | 
11 | Forth line
12 | 


--------------------------------------------------------------------------------
/tests/html/pre.html:
--------------------------------------------------------------------------------
 1 | <h1>Pre elements</h1>
 2 | 
 3 | <pre>
 4 | b = 1
 5 | for a in range(10):
 6 |    print(a)
 7 |    b *= a
 8 |    print(b)
 9 | </pre>
10 | 
11 | <h1>A pre block within an enumeration</h1>
12 | 
13 | <ul>
14 | 	<li>Hallo</li>
15 | 	<li><pre>b = 1
16 | for a in range(10):
17 |    print(a)
18 |    b *= a
19 |    print(b)
20 | </pre></li>
21 | 	<li>Echo</li>
22 | </li>
23 | 


--------------------------------------------------------------------------------
/tests/html/pre.txt:
--------------------------------------------------------------------------------
 1 | Pre elements
 2 | 
 3 | 
 4 | b = 1
 5 | for a in range(10):
 6 |    print(a)
 7 |    b *= a
 8 |    print(b)
 9 | 
10 | 
11 | A pre block within an enumeration
12 | 
13 |   * Hallo
14 |   * b = 1
15 |     for a in range(10):
16 |        print(a)
17 |        b *= a
18 |        print(b)
19 |     
20 |   * Echo
21 | 


--------------------------------------------------------------------------------
/tests/html/real-world/naturgruen-team.html:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/weblyzard/inscriptis/2ef7e3bdc428816a34e8f6e6280888c7d64eec26/tests/html/real-world/naturgruen-team.html


--------------------------------------------------------------------------------
/tests/html/stackoverflow-list-snippet.html:
--------------------------------------------------------------------------------
 1 |     <li id="comment-89402163" class="comment js-comment " data-comment-id="89402163" data-comment-owner-id="2996578" data-comment-score="0">
 2 |         <div class="js-comment-actions comment-actions">
 3 |             <div class="comment-score js-comment-edit-hide">
 4 |             </div>
 5 |         </div>
 6 |         <div class="comment-text  js-comment-text-and-form">
 7 |             <div class="comment-body js-comment-edit-hide">
 8 |                 
 9 |                 <span class="comment-copy">I obtain &quot;NameError: name &#39;NamedTuple&#39; is not defined&quot;</span>
10 |                 
11 | &ndash;&nbsp;<a href="/users/2996578/nbedou"
12 |                        title="873 reputation"
13 |                        class="comment-user">nbedou</a>
14 |                 <span class="comment-date" dir="ltr"><span title="2018-07-06 12:45:21Z, License: CC BY-SA 4.0" class="relativetime-clean">Jul 6 '18 at 12:45</span></span>
15 |             </div>
16 |         </div>
17 |     </li>
18 |     <li id="comment-92177840" class="comment js-comment " data-comment-id="92177840" data-comment-owner-id="1190965" data-comment-score="0">
19 |         <div class="js-comment-actions comment-actions">
20 |             <div class="comment-score js-comment-edit-hide">
21 |             </div>
22 |         </div>
23 |         <div class="comment-text  js-comment-text-and-form">
24 |             <div class="comment-body js-comment-edit-hide">
25 |                 
26 |                 <span class="comment-copy">@nbedou <a href="https://docs.python.org/3/library/typing.html#typing.NamedTuple" rel="nofollow noreferrer">docs.python.org/3/library/typing.html#typing.NamedTuple</a></span>
27 |                 
28 | &ndash;&nbsp;<a href="/users/1190965/nodakai"
29 |                        title="6,911 reputation"
30 |                        class="comment-user">nodakai</a>
31 |                 <span class="comment-date" dir="ltr"><span title="2018-10-03 07:44:13Z, License: CC BY-SA 4.0" class="relativetime-clean">Oct 3 '18 at 7:44</span></span>
32 |             </div>
33 |         </div>
34 |     </li>
35 | 


--------------------------------------------------------------------------------
/tests/html/stackoverflow-list-snippet.txt:
--------------------------------------------------------------------------------
1 | * I obtain "NameError: name 'NamedTuple' is not defined" – nbedou Jul 6 '18 at 12:45
2 | * @nbedou docs.python.org/3/library/typing.html#typing.NamedTuple – nodakai Oct 3 '18 at 7:44
3 | 


--------------------------------------------------------------------------------
/tests/html/subsequent-headings.html:
--------------------------------------------------------------------------------
 1 | <html>
 2 |     <head>
 3 |         <title>Test the spacing between subsequent headings</title>
 4 |     <body>
 5 |         <h1>The first</h1>
 6 | 
 7 |         And text, concerning the first heading.
 8 | 
 9 |         <h1>The second</h1>
10 |         Text concerning the second heading.
11 | 
12 |         <h2>Subheading</h2>
13 |         Sub1
14 | 
15 |         <h3>This is a subsubtopic</h3>
16 | 
17 |         <h2>Another subheading</h2>
18 |         Sub2
19 | 
20 |         <h1>The third</h1>
21 |         The third and final heading.
22 |     </body>
23 | </html>
24 | 
25 | 


--------------------------------------------------------------------------------
/tests/html/subsequent-headings.json:
--------------------------------------------------------------------------------
 1 | {"annotation_rules": {
 2 |     "h1": ["heading"],
 3 |     "h2": ["heading"],
 4 |     "h3": ["heading"],
 5 |     "b": ["emphasis"],
 6 |     "table": ["table"],
 7 |     "th": ["table-heading"],
 8 |     "td": ["table-cell"]
 9 |  },
10 |  "result": [
11 |   ["heading", "The first\n\n"],
12 |   ["heading", "\nThe second\n\n"],
13 |   ["heading", "\nSubheading\n\n"],
14 |   ["heading", "\nThis is a subsubtopic\n\n"],
15 |   ["heading", "Another subheading\n\n"],
16 |   ["heading", "\nThe third\n\n"]
17 |  ]
18 | }
19 | 


--------------------------------------------------------------------------------
/tests/html/subsequent-headings.txt:
--------------------------------------------------------------------------------
 1 | The first
 2 | 
 3 | And text, concerning the first heading.
 4 | 
 5 | The second
 6 | 
 7 | Text concerning the second heading.
 8 | 
 9 | Subheading
10 | 
11 | Sub1
12 | 
13 | This is a subsubtopic
14 | 
15 | Another subheading
16 | 
17 | Sub2
18 | 
19 | The third
20 | 
21 | The third and final heading.
22 | 


--------------------------------------------------------------------------------
/tests/html/table-alignment.html:
--------------------------------------------------------------------------------
1 | <html>
2 |   <table>
3 |      <tr><th>Titel</th><th>Beschreibung</th><th>Länge</th></tr>
4 |      <tr><td>1</td><td>2</td><td>3</td></tr>
5 |      <tr><td>Der Prinz von Ägypten</td><td>Basierend auf dem Buch Exodus</td><td>99 min</td></tr>
6 |      <tr><td>Leo Lausemaus        </td><td>Der Läusebub</td><td>99 min</td></tr>
7 |   </table>
8 | </html>
9 | 


--------------------------------------------------------------------------------
/tests/html/table-alignment.txt:
--------------------------------------------------------------------------------
1 | Titel                  Beschreibung                   Länge 
2 | 1                      2                              3     
3 | Der Prinz von Ägypten  Basierend auf dem Buch Exodus  99 min
4 | Leo Lausemaus          Der Läusebub                   99 min
5 | 


--------------------------------------------------------------------------------
/tests/html/table-empty-row.html:
--------------------------------------------------------------------------------
 1 | <table>
 2 |     <tr><td>Hallo
 3 |         <td>Echo
 4 |     </tr>
 5 |     <tr>Leer</tr>
 6 |     <tr><td>(1)
 7 |         <td>(2)
 8 |     </tr>
 9 | </table>
10 | 


--------------------------------------------------------------------------------
/tests/html/table-empty-row.txt:
--------------------------------------------------------------------------------
1 | Leer
2 | Hallo  Echo
3 | 
4 | (1)    (2) 
5 | 
6 | 


--------------------------------------------------------------------------------
/tests/html/table-in-table.html:
--------------------------------------------------------------------------------
 1 | <h1>Single</h1>
 2 | 
 3 | <h2>First</h2>
 4 | <table border=1><tr><td>red</td><td></td><td>green</td></tr>
 5 |                    <tr><td>   </td><td>blue</td><td></td></tr>
 6 | 		   <tr><td>red</td><td></td><td>green</td></tr></table>
 7 | 
 8 | <h2>Second</h2>
 9 | <table border=1><tr><td></td><td>blue</td><td></td></tr>
10 |     <tr><td><b>red?</b></td><td></td><td>green</td></tr>
11 | 		   <tr><td></td><td>blue</td><td></td></tr></table>
12 | 
13 | <h1>Nested</h1>
14 | <table border=1>
15 | <tr>
16 |     <td><table><tr><td>red</td><td></td><td><i>green.</i></td></tr>
17 |                    <tr><td>   </td><td>blue</td><td></td></tr>
18 |                    <tr><td>red</td><td></td><td>green</td></tr></table></td>
19 |     <td><table><tr><td></td><td>blue</td><td></td></tr>
20 |                    <tr><td>red</td><td></td><td>green</td></tr>
21 | 		   <tr><td></td><td>blue</td><td></td></tr></table></td>
22 | 	<td><table><tr><td></td><td>blue</td><td></td></tr>
23 |                    <tr><td>red</td><td></td><td>green</td></tr>
24 |                    <tr><td></td><td>blue</td><td></td></tr></table></td>
25 | </tr>
26 | <tr>
27 |     <td><table><tr><td></td><td><i>blue.</i></td><td></td></tr>
28 |                    <tr><td>red</td><td></td><td>green</td></tr>
29 | 		   <tr><td></td><td>blue</td><td></td></tr></table></td>
30 | 	<td><table><tr><td>red</td><td></td><td>green</td></tr>
31 |                    <tr><td>   </td><td>blue</td><td></td></tr>
32 |                    <tr><td>red</td><td></td><td><b>green!</b></td></tr></table></td>
33 | 	<td><table><tr><td></td><td>blue</td><td></td></tr>
34 |                    <tr><td>red</td><td></td><td>green</td></tr>
35 | 		   <tr><td></td><td>blue</td><td></td></tr></table></td>
36 | </tr>
37 | <tr>
38 | 	<td><table><tr><td>red</td><td></td><td>green</td></tr>
39 |                    <tr><td>   </td><td>blue</td><td></td></tr>
40 | 		   <tr><td>red</td><td></td><td>green</td></tr></table></td>
41 | 	<td><table><tr><td></td><td>blue</td><td></td></tr>
42 |                    <tr><td>red</td><td></td><td>green</td></tr>
43 | 		   <tr><td></td><td>blue</td><td></td></tr></table></td>
44 | 	<td><table><tr><td></td><td>blue</td><td></td></tr>
45 |                    <tr><td>red</td><td></td><td>green</td></tr>
46 |                    <tr><td></td><td><b>blue!</b></td><td></td></tr></table></td>
47 | </tr>
48 | </table>
49 | 


--------------------------------------------------------------------------------
/tests/html/table-in-table.json:
--------------------------------------------------------------------------------
 1 | {"annotation_rules": {
 2 |     "h1": ["heading"],
 3 |     "h2": ["heading"],
 4 |     "h3": ["heading"],
 5 |     "table#border": ["table"],
 6 |     "b": ["bold"],
 7 |     "i": ["italic"]
 8 |  },
 9 |  "result": [
10 |      ["heading", "Single\n\n"], 
11 |      ["heading", "First\n\n"], 
12 |      ["table", "red        green\n     blue       \nred        green\n\n"], 
13 |      ["heading", "\nSecond\n\n"], 
14 |      ["table", "      blue       \nred?        green\n      blue       \n\n"], 
15 |      ["bold", "red?"], 
16 |      ["heading", "\nNested\n\n"], 
17 |      ["table", "red        green.       blue               blue        \n     blue          red        green   red        green \nred        green        blue               blue        \n                                                       \n     blue.         red        green        blue        \nred         green       blue          red        green \n     blue          red        green!       blue        \n                                                       \nred        green        blue               blue        \n     blue          red        green   red         green\nred        green        blue               blue!       \n                                                       \n"], 
18 |      ["italic", "green."], 
19 |      ["italic", "blue."], 
20 |      ["bold", "green!"], 
21 |      ["bold", "blue!"]
22 |  ]
23 | }
24 | 
25 | 
26 | 


--------------------------------------------------------------------------------
/tests/html/table-in-table.txt:
--------------------------------------------------------------------------------
 1 | Single
 2 | 
 3 | First
 4 | 
 5 | red        green
 6 |      blue       
 7 | red        green
 8 | 
 9 | 
10 | Second
11 | 
12 |       blue       
13 | red?        green
14 |       blue       
15 | 
16 | 
17 | Nested
18 | 
19 | red        green.       blue               blue        
20 |      blue          red        green   red        green 
21 | red        green        blue               blue        
22 |                                                        
23 |      blue.         red        green        blue        
24 | red         green       blue          red        green 
25 |      blue          red        green!       blue        
26 |                                                        
27 | red        green        blue               blue        
28 |      blue          red        green   red         green
29 | red        green        blue               blue!


--------------------------------------------------------------------------------
/tests/html/table-itemize.html:
--------------------------------------------------------------------------------
1 | <table>
2 | <tr>
3 |   <td width="134" valign="top" class="nav"><ul id="navi"><li class="act1"><a href="aktuell/aktuell.html" onfocus="blurLink(this);" class="act1">aktuell</a></li><li class="pas1"><a href="aktuell/projekte.html" onfocus="blurLink(this);" class="pas1">projekte</a></li><li class="pas1"><a href="aktuell/zu-verkaufen.html" onfocus="blurLink(this);" class="pas1">zu verkaufen</a></li><li class="pas1"><a href="aktuell/offene-stelle.html" onfocus="blurLink(this);" class="pas1">offene stelle</a></li></ul></td>
4 |                                         <td width="743" valign="top" class="con"><!--TYPO3SEARCH_begin--><div class="cnb"><a id="c49"></a><div class="csc-textpic csc-textpic-center csc-textpic-below csc-textpic-equalheight"><div class="csc-textpic-text"><p>An der Gewerbeausstellung vom 1.-3.September sind wir nicht persönlich anwesend. 
5 |                                         </p>
6 | </tr>
7 | </table>
8 | 


--------------------------------------------------------------------------------
/tests/html/table-itemize.txt:
--------------------------------------------------------------------------------
1 |   * aktuell        An der Gewerbeausstellung vom 1.-3.September sind wir nicht persönlich anwesend.
2 |   * projekte                                                                                       
3 |   * zu verkaufen                                                                                   
4 |   * offene stelle
5 | 


--------------------------------------------------------------------------------
/tests/html/table-pre.html:
--------------------------------------------------------------------------------
 1 | <h1>Pre elements that have been nested in a table.</h1>
 2 | 
 3 | 
 4 | <table>
 5 | 	<tr>
 6 | 		<th>Python</th>
 7 | 		<th>Java</th>
 8 | 	</tr>
 9 | 	<tr><td>
10 | <pre>
11 | b = 1
12 | for a in range(10):
13 |    print(a)
14 |    b *= a
15 |    print(b)
16 | </pre>
17 | 	</td>
18 | 	<td>
19 | <pre>
20 | int b = 1;
21 | for (int a=0; a&lt;10; a++) {
22 |    System.out.println(a);
23 |    b = b * a;
24 |    System.out.println(b);
25 | }
26 | </pre>
27 | 	</td></tr>
28 | 	<tr><td>
29 | 		3.8
30 | 	</td>
31 | 	<td>
32 | 		14
33 | 	</td>
34 | 	</tr>
35 | </table>
36 | 
37 | 


--------------------------------------------------------------------------------
/tests/html/table-pre.txt:
--------------------------------------------------------------------------------
 1 | Pre elements that have been nested in a table.
 2 | 
 3 | Python               Java                      
 4 |                                                
 5 | b = 1                int b = 1;                
 6 | for a in range(10):  for (int a=0; a<10; a++) {
 7 |    print(a)             System.out.println(a); 
 8 |    b *= a               b = b * a;             
 9 |    print(b)             System.out.println(b); 
10 |                      }                         
11 |                                                
12 | 3.8                  14
13 | 


--------------------------------------------------------------------------------
/tests/html/table.html:
--------------------------------------------------------------------------------
 1 | <table>
 2 |   <tr><th>First</th>
 3 |       <th>Second</th>
 4 |       <th>Third</th>
 5 |   </tr>
 6 |   <tr><td>a</td>
 7 |       <td><b>b</b></td>
 8 |       <td>c</td>
 9 |   </tr>
10 | </table>
11 | 


--------------------------------------------------------------------------------
/tests/html/table.json:
--------------------------------------------------------------------------------
 1 | {"annotation_rules": {
 2 |     "h1": ["heading"],
 3 |     "h2": ["heading"],
 4 |     "h3": ["heading"],
 5 |     "b": ["emphasis"],
 6 |     "table": ["table"],
 7 |     "th": ["table-heading"],
 8 |     "td": ["table-cell"]
 9 |  },
10 |  "result": [
11 |   ["table", "First  Second  Third\na      b       c    \n"],
12 |   ["table-heading", "First"], 
13 |   ["table-heading", "Second"], 
14 |   ["table-heading", "Third"], 
15 |   ["table-cell", "a"], 
16 |   ["emphasis", "b"], 
17 |   ["table-cell", "b"], 
18 |   ["table-cell", "c"]
19 |  ]
20 | }
21 | 


--------------------------------------------------------------------------------
/tests/html/table.txt:
--------------------------------------------------------------------------------
1 | First  Second  Third
2 | a      b       c
3 | 


--------------------------------------------------------------------------------
/tests/html/td-only-table.html:
--------------------------------------------------------------------------------
1 | <table>
2 |   <td>1</td>
3 |   <td>2</td>
4 |   <td>3</td>
5 | </table>
6 | 


--------------------------------------------------------------------------------
/tests/html/td-only-table.txt:
--------------------------------------------------------------------------------
1 | 1  2  3
2 | 


--------------------------------------------------------------------------------
/tests/html/test.html:
--------------------------------------------------------------------------------
  1 | <html>
  2 | <head>
  3 |   <meta charset="utf-8" />
  4 | </meta>
  5 | 
  6 | <body>
  7 | 
  8 | <!-- This document collects HTML elements which should be given a special
  9 |      consideration during rendering
 10 |  -->
 11 | 
 12 |  <h2>Test Cases</h2>
 13 | 
 14 |  Thomas <ul><li><div>Anton</div>Maria</ul>
 15 | 
 16 |  Thomas <ul><li>  <div>Anton</div>Maria</ul>
 17 | 
 18 |  Thomas <ul><li> a  <div>Anton</div>Maria</ul>
 19 | 
 20 |  <h2>Other examples</h2>
 21 | 
 22 | <!-- enumerations and display:block -->
 23 | The first enumeration
 24 | <ul>
 25 |  <li><p>first line</p>second line
 26 |  <li>third line
 27 | </ul>
 28 | 
 29 | 
 30 | The second enumeration
 31 | <ul>
 32 |  <li>first
 33 |  <li><p>second</p>
 34 |  <li>third
 35 |  <li>forth
 36 | </ul>
 37 | 
 38 | The third enumeration
 39 | <ul>
 40 |   <li>first line<p>second line</p>third line
 41 |   <li>last line
 42 | </ul>
 43 | 
 44 | The forth enumeration (div rather than p)
 45 | <ul>
 46 |   <li>first line<div>second line</div>third line
 47 |   <li>last line
 48 | </ul>
 49 | 
 50 | 
 51 | Spaces between enumerated items?
 52 | <ul>
 53 |   <li>first line<br><br><br>
 54 |   <li>second line
 55 |   <li><p>third line</p>
 56 |   <li>last line
 57 | </ul>
 58 | 
 59 | Normal enumeration
 60 | <ul>
 61 |  <li>first
 62 |  <li>second
 63 |  <li>third
 64 | </ul>
 65 | <ol>
 66 |  <li>first
 67 |  <li>second
 68 |  <li>first
 69 |  <li>second
 70 |  <li>first
 71 |  <li>second
 72 |  <li>first
 73 |  <li>second
 74 |  <li>first
 75 |  <li>second
 76 |  <li>first
 77 |  <li>second
 78 | </ol>
 79 | 
 80 | 
 81 | <!-- use of quotes -->
 82 | Amen, amen ich sage euch - <q>Ehre sei Gott</q> in der Höhe!
 83 | 
 84 | <hr />
 85 | 
 86 | <!-- margins in elements -->
 87 | Davor...
 88 | Inline <b style="-webkit-margin-before: 1em; -webkit-margin-after:1em; -webkit-padding-start: 40px;">alles drunter?</b> Weiter geht's?
 89 | 
 90 | <!-- does a block element requires a new line, even if we are at the beginning
 91 |      of a line? -->
 92 | 
 93 | <h3>Block elements</h3>
 94 | This is the first line <br />
 95 | <div>Block Element - is there a space to the previous line?</div>
 96 | 
 97 | <h3>Whitespaces</h3>
 98 | White  space <b> space</b>
 99 | und <pre>  mehr  ...</pre>.
100 | 
101 | <h3>Divs</h3>
102 | Thomas<div>Anton</div>Maria
103 | 
104 | <h3>One versus two divs</h3>
105 | One
106 | <div>Anna</div>
107 | Div.
108 | 
109 | <hr>
110 | 
111 | Two
112 | <div></div><div>Anna</div>
113 | Div.
114 | 
115 | <hr>
116 | 
117 | Empty
118 | <div />
119 | Div.
120 | 
121 | 
122 | </body>
123 | </html>
124 | 


--------------------------------------------------------------------------------
/tests/html/tr-only-table.html:
--------------------------------------------------------------------------------
1 | <table>
2 |   <tr>1</tr>
3 |   <tr>2</tr>
4 |   <tr>3</tr>
5 | </table>
6 | 


--------------------------------------------------------------------------------
/tests/html/tr-only-table.txt:
--------------------------------------------------------------------------------
1 | 1 2 3
2 | 
3 | 
4 | 


--------------------------------------------------------------------------------
/tests/html/whitespace.html:
--------------------------------------------------------------------------------
 1 | <html>
 2 | 	<body>
 3 | 		<b style="white-space:pre">
 4 |    Das
 5 |    ist
 6 |    interessant
 7 | 		</b>
 8 | 	</body>
 9 | </html>
10 | 


--------------------------------------------------------------------------------
/tests/html/whitespace.txt:
--------------------------------------------------------------------------------
1 | 
2 |    Das
3 |    ist
4 |    interessant
5 | 


--------------------------------------------------------------------------------
/tests/html/wikipedia-code.html:
--------------------------------------------------------------------------------
 1 | <p>Pythons Schlüsselwort <code>lambda</code> könnte manche Anhänger der funktionalen Programmierung fehlleiten. Solche <code>lambda</code>-Blöcke in Python können nur Ausdrücke enthalten, aber keine Anweisungen. Damit werden solche Anweisungen generell nicht verwendet, um eine Funktion zurückzugeben. Die übliche Vorgehensweise ist stattdessen, den Namen einer lokalen Funktion zurückzugeben. Das folgende Beispiel zeigt dies anhand einer einfachen Funktion nach den Ideen von <a href="/wiki/Haskell_Brooks_Curry" title="Haskell Brooks Curry">Haskell Brooks Curry</a>:
 2 | </p>
 3 | <div class="mw-highlight mw-highlight-lang-python mw-content-ltr" dir="ltr"><pre><span></span><span class="k">def</span> <span class="nf">add_and_print_maker</span><span class="p">(</span><span class="n">x</span><span class="p">):</span>
 4 |     <span class="k">def</span> <span class="nf">temp</span><span class="p">(</span><span class="n">y</span><span class="p">):</span>
 5 |         <span class="nb">print</span><span class="p">(</span><span class="s2">&quot;</span><span class="si">{}</span><span class="s2"> + </span><span class="si">{}</span><span class="s2"> = </span><span class="si">{}</span><span class="s2">&quot;</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="n">x</span><span class="p">,</span> <span class="n">y</span><span class="p">,</span> <span class="n">x</span> <span class="o">+</span> <span class="n">y</span><span class="p">))</span>
 6 | 
 7 |     <span class="k">return</span> <span class="n">temp</span>
 8 | </pre></div>
 9 | <p>Damit ist auch <a href="/wiki/Currying" title="Currying">Currying</a> auf einfache Art möglich, um generische Funktionsobjekte auf problemspezifische herunterzubrechen. Hier ein einfaches Beispiel:
10 | </p>
11 | <div class="mw-highlight mw-highlight-lang-python mw-content-ltr" dir="ltr"><pre><span></span><span class="k">def</span> <span class="nf">curry</span><span class="p">(</span><span class="n">func</span><span class="p">,</span> <span class="n">known_argument</span><span class="p">):</span>
12 |     <span class="k">return</span> <span class="k">lambda</span> <span class="n">unknown_argument</span><span class="p">:</span> <span class="n">func</span><span class="p">(</span><span class="n">unknown_argument</span><span class="p">,</span> <span class="n">known_argument</span><span class="p">)</span>
13 | </pre></div>
14 | <p>Wird die <code>curry</code>-Funktion aufgerufen, erwartet diese eine Funktion mit zwei notwendigen Parametern sowie die Parameterbelegung für den zweiten Parameter dieser Funktion. Der Rückgabewert von <code>curry</code> ist eine Funktion, die das Gleiche tut wie <code>func</code>, aber nur noch einen Parameter benötigt.
15 | 


--------------------------------------------------------------------------------
/tests/html/wikipedia-code.txt:
--------------------------------------------------------------------------------
 1 | Pythons Schlüsselwort lambda könnte manche Anhänger der funktionalen Programmierung fehlleiten. Solche lambda-Blöcke in Python können nur Ausdrücke enthalten, aber keine Anweisungen. Damit werden solche Anweisungen generell nicht verwendet, um eine Funktion zurückzugeben. Die übliche Vorgehensweise ist stattdessen, den Namen einer lokalen Funktion zurückzugeben. Das folgende Beispiel zeigt dies anhand einer einfachen Funktion nach den Ideen von Haskell Brooks Curry:
 2 | 
 3 | def add_and_print_maker(x):
 4 |     def temp(y):
 5 |         print("{} + {} = {}".format(x, y, x + y))
 6 | 
 7 |     return temp
 8 | 
 9 | 
10 | Damit ist auch Currying auf einfache Art möglich, um generische Funktionsobjekte auf problemspezifische herunterzubrechen. Hier ein einfaches Beispiel:
11 | 
12 | def curry(func, known_argument):
13 |     return lambda unknown_argument: func(unknown_argument, known_argument)
14 | 
15 | 
16 | Wird die curry-Funktion aufgerufen, erwartet diese eine Funktion mit zwei notwendigen Parametern sowie die Parameterbelegung für den zweiten Parameter dieser Funktion. Der Rückgabewert von curry ist eine Funktion, die das Gleiche tut wie func, aber nur noch einen Parameter benötigt.
17 | 


--------------------------------------------------------------------------------
/tests/html/wikipedia-consequtive-links-and-umlauts.html:
--------------------------------------------------------------------------------
 1 | <p><a href="/wiki/Araschgen" title="Araschgen">Araschgen</a>&#160;&#124;
 2 | Chur City&#160;&#124;
 3 | <a href="/wiki/Dreib%C3%BCndenquartier" title="Dreibündenquartier">Dreibündenquartier</a>&#160;&#124;
 4 | <a href="/wiki/F%C3%BCrstenwald_(Quartier)" title="Fürstenwald (Quartier)">Fürstenwald</a>&#160;&#124;
 5 | <a href="/wiki/Giacomettiquartier" title="Giacomettiquartier">Giacomettiquartier</a>&#160;&#124;
 6 | <a href="/wiki/Kornquader" title="Kornquader">Kornquader</a>&#160;&#124;
 7 | <a href="/wiki/Lacunaquartier" title="Lacunaquartier">Lacunaquartier</a>&#160;&#124;
 8 | <a href="/wiki/Masans" title="Masans">Masans</a>&#160;&#124;
 9 | <a href="/wiki/Niederlachen-Untere_Au" title="Niederlachen-Untere Au">Niederlachen-Untere Au</a>&#160;&#124;
10 | <a href="/wiki/Rheinquartier" title="Rheinquartier">Rheinquartier</a>&#160;&#124;
11 | <a href="/wiki/Rossboden" title="Rossboden">Rossboden</a>&#160;&#124;
12 | <a href="/wiki/Sand_(Quartier)" title="Sand (Quartier)">Sand</a>&#160;&#124;
13 | <a href="/wiki/Sommerau_(Quartier)" title="Sommerau (Quartier)">Sommerau</a>&#160;&#124;
14 | <a href="/wiki/Tittwiesen" title="Tittwiesen">Tittwiesen</a>&#160;&#124;
15 | <a href="/wiki/Wiesental_(Chur)" title="Wiesental (Chur)">Wiesental</a>
16 | </p>
17 | 


--------------------------------------------------------------------------------
/tests/html/wikipedia-consequtive-links-and-umlauts.txt:
--------------------------------------------------------------------------------
1 | Araschgen | Chur City | Dreibündenquartier | Fürstenwald | Giacomettiquartier | Kornquader | Lacunaquartier | Masans | Niederlachen-Untere Au | Rheinquartier | Rossboden | Sand | Sommerau | Tittwiesen | Wiesental
2 | 


--------------------------------------------------------------------------------
/tests/html/wikipedia-consequtive-tables.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "annotation_rules": {
 3 |         "h1": ["heading"],
 4 |          "h2": ["heading"],
 5 |          "h3": ["subheading"],
 6 |          "h4": ["subheading"],
 7 |          "h5": ["subheading"],
 8 |          "i": ["emphasis"],
 9 |          "b": ["bold"],
10 |          "th": ["tableheading"],
11 |          "a": ["link"]
12 |     },
13 |     "result": [
14 |         ["bold", "Monatliche Durchschnittstemperaturen und -niederschl\u00e4ge f\u00fcr Chur 1981\u20132010"], 
15 |         ["link", "Temperatur"], 
16 |         ["bold", "\u00d8"], 
17 |         ["bold", "15,1"], 
18 |         ["bold", "\u00d8"], 
19 |         ["bold", "5,6"], 
20 |         ["bold", "\u00d8"], 
21 |         ["bold", "10"], 
22 |         ["link", "Niederschlag"], 
23 |         ["bold", "\u03a3"], 
24 |         ["bold", "848"], 
25 |         ["link", "Sonnenstunden"], 
26 |         ["bold", "\u00d8"], 
27 |         ["bold", "4,6"], 
28 |         ["link", "Regentage"], 
29 |         ["bold", "\u03a3"], 
30 |         ["bold", "104,6"]
31 |     ]
32 | }
33 | 


--------------------------------------------------------------------------------
/tests/html/wikipedia-enumeration-annotation.html:
--------------------------------------------------------------------------------
 1 | <div>
 2 | <h2>Inhaltsverzeichnis</h2>
 3 | </div>
 4 | Another <b>marker</b>.
 5 | <ul>
 6 |     <li class="toclevel-1 tocsection-1"><a href="#Name_und_Aussprache"><span class="tocnumber">1</span> <span class="toctext">Name und Aussprache - <b>marker2</b></span></a></li>
 7 |     <li class="toclevel-1 tocsection-2"><a href="#Geographie"><span class="tocnumber">2</span> <span class="toctext">Geographie - <b>marker3</b></span></a>
 8 | <ul>
 9 |     <li class="toclevel-2 tocsection-3"><a href="#Stadtquartiere"><span class="tocnumber">2.1</span> <span class="toctext">Stadtquartiere - <b>marker31</b></span></a></li>
10 | <li class="toclevel-2 tocsection-4"><a href="#Klima"><span class="tocnumber">2.2</span> <span class="toctext">Klima</span></a></li>
11 | </ul>
12 | </li>
13 | <li class="toclevel-1 tocsection-5"><a href="#Geschichte"><span class="tocnumber">3</span> <span class="toctext">Geschichte</span></a>
14 | <ul>
15 | <li class="toclevel-2 tocsection-6"><a href="#Vorr.C3.B6mische_Zeit"><span class="tocnumber">3.1</span> <span class="toctext">Vorrömische Zeit</span></a></li>
16 | <li class="toclevel-2 tocsection-7"><a href="#Antike"><span class="tocnumber">3.2</span> <span class="toctext">Antike</span></a></li>
17 | <li class="toclevel-2 tocsection-8"><a href="#Mittelalter"><span class="tocnumber">3.3</span> <span class="toctext">Mittelalter</span></a></li>
18 | <li class="toclevel-2 tocsection-9"><a href="#Wende_zur_Neuzeit"><span class="tocnumber">3.4</span> <span class="toctext">Wende zur Neuzeit</span></a></li>
19 | <li class="toclevel-2 tocsection-10"><a href="#Reformation_und_Dreissigj.C3.A4hriger_Krieg"><span class="tocnumber">3.5</span> <span class="toctext">Reformation und Dreissigjähriger Krieg</span></a></li>
20 | <li class="toclevel-2 tocsection-11"><a href="#19._Jahrhundert"><span class="tocnumber">3.6</span> <span class="toctext">19. Jahrhundert</span></a></li>
21 | <li class="toclevel-2 tocsection-12"><a href="#Moderne_und_Gegenwart"><span class="tocnumber">3.7</span> <span class="toctext">Moderne und Gegenwart</span></a></li>
22 | </ul>
23 | </li>
24 | <li class="toclevel-1 tocsection-13"><a href="#Bev.C3.B6lkerung"><span class="tocnumber">4</span> <span class="toctext">Bevölkerung</span></a>
25 | <ul>
26 | <li class="toclevel-2 tocsection-14"><a href="#Sprachen"><span class="tocnumber">4.1</span> <span class="toctext">Sprachen</span></a></li>
27 | <li class="toclevel-2 tocsection-15"><a href="#Religionen"><span class="tocnumber">4.2</span> <span class="toctext">Religionen</span></a></li>
28 | </ul>
29 | </li>
30 | <li class="toclevel-1 tocsection-16"><a href="#Wappen"><span class="tocnumber">5</span> <span class="toctext">Wappen</span></a></li>
31 | <li class="toclevel-1 tocsection-17"><a href="#Politik"><span class="tocnumber">6</span> <span class="toctext">Politik</span></a>
32 | <ul>
33 | <li class="toclevel-2 tocsection-18"><a href="#Stadtpr.C3.A4sidenten"><span class="tocnumber">6.1</span> <span class="toctext">Stadtpräsidenten</span></a></li>
34 | <li class="toclevel-2 tocsection-19"><a href="#Partnerst.C3.A4dte"><span class="tocnumber">6.2</span> <span class="toctext">Partnerstädte</span></a></li>
35 | </ul>
36 | </li>
37 | <li class="toclevel-1 tocsection-20"><a href="#Wirtschaft_und_Infrastruktur"><span class="tocnumber">7</span> <span class="toctext">Wirtschaft und Infrastruktur</span></a>
38 | <ul>
39 | <li class="toclevel-2 tocsection-21"><a href="#Wirtschaft"><span class="tocnumber">7.1</span> <span class="toctext">Wirtschaft</span></a></li>
40 | <li class="toclevel-2 tocsection-22"><a href="#Land-_und_Alpwirtschaft"><span class="tocnumber">7.2</span> <span class="toctext">Land- und Alpwirtschaft</span></a></li>
41 | <li class="toclevel-2 tocsection-23"><a href="#Verkehr"><span class="tocnumber">7.3</span> <span class="toctext">Verkehr</span></a></li>
42 | <li class="toclevel-2 tocsection-24"><a href="#Bildung"><span class="tocnumber">7.4</span> <span class="toctext">Bildung</span></a></li>
43 | <li class="toclevel-2 tocsection-25"><a href="#Medien"><span class="tocnumber">7.5</span> <span class="toctext">Medien</span></a></li>
44 | <li class="toclevel-2 tocsection-26"><a href="#Kultur"><span class="tocnumber">7.6</span> <span class="toctext">Kultur</span></a></li>
45 | <li class="toclevel-2 tocsection-27"><a href="#Justiz"><span class="tocnumber">7.7</span> <span class="toctext">Justiz</span></a></li>
46 | <li class="toclevel-2 tocsection-28"><a href="#Friedh.C3.B6fe"><span class="tocnumber">7.8</span> <span class="toctext">Friedhöfe</span></a></li>
47 | <li class="toclevel-2 tocsection-29"><a href="#Sportvereine"><span class="tocnumber">7.9</span> <span class="toctext">Sportvereine</span></a></li>
48 | </ul>
49 | </li>
50 | <li class="toclevel-1 tocsection-30"><a href="#Sehensw.C3.BCrdigkeiten_und_Tourismus"><span class="tocnumber">8</span> <span class="toctext">Sehenswürdigkeiten und Tourismus</span></a>
51 | <ul>
52 | <li class="toclevel-2 tocsection-31"><a href="#Tourismus"><span class="tocnumber">8.1</span> <span class="toctext">Tourismus</span></a></li>
53 | </ul>
54 | </li>
55 | <li class="toclevel-1 tocsection-32"><a href="#Besonderes"><span class="tocnumber">9</span> <span class="toctext">Besonderes</span></a></li>
56 | <li class="toclevel-1 tocsection-33"><a href="#Galerie"><span class="tocnumber">10</span> <span class="toctext">Galerie</span></a></li>
57 | <li class="toclevel-1 tocsection-34"><a href="#Pers.C3.B6nlichkeiten"><span class="tocnumber">11</span> <span class="toctext">Persönlichkeiten</span></a></li>
58 | <li class="toclevel-1 tocsection-35"><a href="#Siehe_auch"><span class="tocnumber">12</span> <span class="toctext">Siehe auch</span></a></li>
59 | <li class="toclevel-1 tocsection-36"><a href="#Literatur"><span class="tocnumber">13</span> <span class="toctext">Literatur</span></a></li>
60 | <li class="toclevel-1 tocsection-37"><a href="#Weblinks"><span class="tocnumber">14</span> <span class="toctext">Weblinks</span></a></li>
61 | <li class="toclevel-1 tocsection-38"><a href="#Einzelnachweise"><span class="tocnumber">15</span> <span class="toctext">Einzelnachweise</span></a></li>
62 | </ul>
63 | 
64 | <h2>End of enumeration</h2>
65 | 
66 | Closing remarks and an <b>emphasized text portion</b>.
67 | 


--------------------------------------------------------------------------------
/tests/html/wikipedia-enumeration-annotation.json:
--------------------------------------------------------------------------------
 1 | {"annotation_rules": {
 2 |     "h1": ["heading"],
 3 |     "h2": ["heading"],
 4 |     "h3": ["heading"],
 5 |     "b": ["emphasis"],
 6 |     "table": ["table"],
 7 |     "th": ["table-heading"],
 8 |     "td": ["table-cell"]
 9 |  },
10 |  "result": [
11 |   ["heading", "Inhaltsverzeichnis\n\n"],
12 |   ["emphasis", "marker"],
13 |   ["emphasis", "marker2"],
14 |   ["emphasis", "marker3"],
15 |   ["emphasis", "marker31"],
16 |   ["heading", "\nEnd of enumeration\n\n"],
17 |   ["emphasis", "emphasized text portion"]
18 |  ]
19 | }
20 | 


--------------------------------------------------------------------------------
/tests/html/wikipedia-enumeration-annotation.txt:
--------------------------------------------------------------------------------
 1 | Inhaltsverzeichnis
 2 | 
 3 | Another marker.
 4 |   * 1 Name und Aussprache - marker2
 5 |   * 2 Geographie - marker3
 6 |       + 2.1 Stadtquartiere - marker31
 7 |       + 2.2 Klima
 8 |   * 3 Geschichte
 9 |       + 3.1 Vorrömische Zeit
10 |       + 3.2 Antike
11 |       + 3.3 Mittelalter
12 |       + 3.4 Wende zur Neuzeit
13 |       + 3.5 Reformation und Dreissigjähriger Krieg
14 |       + 3.6 19. Jahrhundert
15 |       + 3.7 Moderne und Gegenwart
16 |   * 4 Bevölkerung
17 |       + 4.1 Sprachen
18 |       + 4.2 Religionen
19 |   * 5 Wappen
20 |   * 6 Politik
21 |       + 6.1 Stadtpräsidenten
22 |       + 6.2 Partnerstädte
23 |   * 7 Wirtschaft und Infrastruktur
24 |       + 7.1 Wirtschaft
25 |       + 7.2 Land- und Alpwirtschaft
26 |       + 7.3 Verkehr
27 |       + 7.4 Bildung
28 |       + 7.5 Medien
29 |       + 7.6 Kultur
30 |       + 7.7 Justiz
31 |       + 7.8 Friedhöfe
32 |       + 7.9 Sportvereine
33 |   * 8 Sehenswürdigkeiten und Tourismus
34 |       + 8.1 Tourismus
35 |   * 9 Besonderes
36 |   * 10 Galerie
37 |   * 11 Persönlichkeiten
38 |   * 12 Siehe auch
39 |   * 13 Literatur
40 |   * 14 Weblinks
41 |   * 15 Einzelnachweise
42 | 
43 | End of enumeration
44 | 
45 | Closing remarks and an emphasized text portion.
46 | 


--------------------------------------------------------------------------------
/tests/html/wikipedia-enumeration.html:
--------------------------------------------------------------------------------
 1 | <div>
 2 | Inhaltsverzeichnis</h2>
 3 | </div>
 4 | <ul>
 5 | <li class="toclevel-1 tocsection-1"><a href="#Name_und_Aussprache"><span class="tocnumber">1</span> <span class="toctext">Name und Aussprache</span></a></li>
 6 | <li class="toclevel-1 tocsection-2"><a href="#Geographie"><span class="tocnumber">2</span> <span class="toctext">Geographie</span></a>
 7 | <ul>
 8 | <li class="toclevel-2 tocsection-3"><a href="#Stadtquartiere"><span class="tocnumber">2.1</span> <span class="toctext">Stadtquartiere</span></a></li>
 9 | <li class="toclevel-2 tocsection-4"><a href="#Klima"><span class="tocnumber">2.2</span> <span class="toctext">Klima</span></a></li>
10 | </ul>
11 | </li>
12 | <li class="toclevel-1 tocsection-5"><a href="#Geschichte"><span class="tocnumber">3</span> <span class="toctext">Geschichte</span></a>
13 | <ul>
14 | <li class="toclevel-2 tocsection-6"><a href="#Vorr.C3.B6mische_Zeit"><span class="tocnumber">3.1</span> <span class="toctext">Vorrömische Zeit</span></a></li>
15 | <li class="toclevel-2 tocsection-7"><a href="#Antike"><span class="tocnumber">3.2</span> <span class="toctext">Antike</span></a></li>
16 | <li class="toclevel-2 tocsection-8"><a href="#Mittelalter"><span class="tocnumber">3.3</span> <span class="toctext">Mittelalter</span></a></li>
17 | <li class="toclevel-2 tocsection-9"><a href="#Wende_zur_Neuzeit"><span class="tocnumber">3.4</span> <span class="toctext">Wende zur Neuzeit</span></a></li>
18 | <li class="toclevel-2 tocsection-10"><a href="#Reformation_und_Dreissigj.C3.A4hriger_Krieg"><span class="tocnumber">3.5</span> <span class="toctext">Reformation und Dreissigjähriger Krieg</span></a></li>
19 | <li class="toclevel-2 tocsection-11"><a href="#19._Jahrhundert"><span class="tocnumber">3.6</span> <span class="toctext">19. Jahrhundert</span></a></li>
20 | <li class="toclevel-2 tocsection-12"><a href="#Moderne_und_Gegenwart"><span class="tocnumber">3.7</span> <span class="toctext">Moderne und Gegenwart</span></a></li>
21 | </ul>
22 | </li>
23 | <li class="toclevel-1 tocsection-13"><a href="#Bev.C3.B6lkerung"><span class="tocnumber">4</span> <span class="toctext">Bevölkerung</span></a>
24 | <ul>
25 | <li class="toclevel-2 tocsection-14"><a href="#Sprachen"><span class="tocnumber">4.1</span> <span class="toctext">Sprachen</span></a></li>
26 | <li class="toclevel-2 tocsection-15"><a href="#Religionen"><span class="tocnumber">4.2</span> <span class="toctext">Religionen</span></a></li>
27 | </ul>
28 | </li>
29 | <li class="toclevel-1 tocsection-16"><a href="#Wappen"><span class="tocnumber">5</span> <span class="toctext">Wappen</span></a></li>
30 | <li class="toclevel-1 tocsection-17"><a href="#Politik"><span class="tocnumber">6</span> <span class="toctext">Politik</span></a>
31 | <ul>
32 | <li class="toclevel-2 tocsection-18"><a href="#Stadtpr.C3.A4sidenten"><span class="tocnumber">6.1</span> <span class="toctext">Stadtpräsidenten</span></a></li>
33 | <li class="toclevel-2 tocsection-19"><a href="#Partnerst.C3.A4dte"><span class="tocnumber">6.2</span> <span class="toctext">Partnerstädte</span></a></li>
34 | </ul>
35 | </li>
36 | <li class="toclevel-1 tocsection-20"><a href="#Wirtschaft_und_Infrastruktur"><span class="tocnumber">7</span> <span class="toctext">Wirtschaft und Infrastruktur</span></a>
37 | <ul>
38 | <li class="toclevel-2 tocsection-21"><a href="#Wirtschaft"><span class="tocnumber">7.1</span> <span class="toctext">Wirtschaft</span></a></li>
39 | <li class="toclevel-2 tocsection-22"><a href="#Land-_und_Alpwirtschaft"><span class="tocnumber">7.2</span> <span class="toctext">Land- und Alpwirtschaft</span></a></li>
40 | <li class="toclevel-2 tocsection-23"><a href="#Verkehr"><span class="tocnumber">7.3</span> <span class="toctext">Verkehr</span></a></li>
41 | <li class="toclevel-2 tocsection-24"><a href="#Bildung"><span class="tocnumber">7.4</span> <span class="toctext">Bildung</span></a></li>
42 | <li class="toclevel-2 tocsection-25"><a href="#Medien"><span class="tocnumber">7.5</span> <span class="toctext">Medien</span></a></li>
43 | <li class="toclevel-2 tocsection-26"><a href="#Kultur"><span class="tocnumber">7.6</span> <span class="toctext">Kultur</span></a></li>
44 | <li class="toclevel-2 tocsection-27"><a href="#Justiz"><span class="tocnumber">7.7</span> <span class="toctext">Justiz</span></a></li>
45 | <li class="toclevel-2 tocsection-28"><a href="#Friedh.C3.B6fe"><span class="tocnumber">7.8</span> <span class="toctext">Friedhöfe</span></a></li>
46 | <li class="toclevel-2 tocsection-29"><a href="#Sportvereine"><span class="tocnumber">7.9</span> <span class="toctext">Sportvereine</span></a></li>
47 | </ul>
48 | </li>
49 | <li class="toclevel-1 tocsection-30"><a href="#Sehensw.C3.BCrdigkeiten_und_Tourismus"><span class="tocnumber">8</span> <span class="toctext">Sehenswürdigkeiten und Tourismus</span></a>
50 | <ul>
51 | <li class="toclevel-2 tocsection-31"><a href="#Tourismus"><span class="tocnumber">8.1</span> <span class="toctext">Tourismus</span></a></li>
52 | </ul>
53 | </li>
54 | <li class="toclevel-1 tocsection-32"><a href="#Besonderes"><span class="tocnumber">9</span> <span class="toctext">Besonderes</span></a></li>
55 | <li class="toclevel-1 tocsection-33"><a href="#Galerie"><span class="tocnumber">10</span> <span class="toctext">Galerie</span></a></li>
56 | <li class="toclevel-1 tocsection-34"><a href="#Pers.C3.B6nlichkeiten"><span class="tocnumber">11</span> <span class="toctext">Persönlichkeiten</span></a></li>
57 | <li class="toclevel-1 tocsection-35"><a href="#Siehe_auch"><span class="tocnumber">12</span> <span class="toctext">Siehe auch</span></a></li>
58 | <li class="toclevel-1 tocsection-36"><a href="#Literatur"><span class="tocnumber">13</span> <span class="toctext">Literatur</span></a></li>
59 | <li class="toclevel-1 tocsection-37"><a href="#Weblinks"><span class="tocnumber">14</span> <span class="toctext">Weblinks</span></a></li>
60 | <li class="toclevel-1 tocsection-38"><a href="#Einzelnachweise"><span class="tocnumber">15</span> <span class="toctext">Einzelnachweise</span></a></li>
61 | </ul>
62 | 


--------------------------------------------------------------------------------
/tests/html/wikipedia-enumeration.txt:
--------------------------------------------------------------------------------
 1 | Inhaltsverzeichnis
 2 |   * 1 Name und Aussprache
 3 |   * 2 Geographie
 4 |       + 2.1 Stadtquartiere
 5 |       + 2.2 Klima
 6 |   * 3 Geschichte
 7 |       + 3.1 Vorrömische Zeit
 8 |       + 3.2 Antike
 9 |       + 3.3 Mittelalter
10 |       + 3.4 Wende zur Neuzeit
11 |       + 3.5 Reformation und Dreissigjähriger Krieg
12 |       + 3.6 19. Jahrhundert
13 |       + 3.7 Moderne und Gegenwart
14 |   * 4 Bevölkerung
15 |       + 4.1 Sprachen
16 |       + 4.2 Religionen
17 |   * 5 Wappen
18 |   * 6 Politik
19 |       + 6.1 Stadtpräsidenten
20 |       + 6.2 Partnerstädte
21 |   * 7 Wirtschaft und Infrastruktur
22 |       + 7.1 Wirtschaft
23 |       + 7.2 Land- und Alpwirtschaft
24 |       + 7.3 Verkehr
25 |       + 7.4 Bildung
26 |       + 7.5 Medien
27 |       + 7.6 Kultur
28 |       + 7.7 Justiz
29 |       + 7.8 Friedhöfe
30 |       + 7.9 Sportvereine
31 |   * 8 Sehenswürdigkeiten und Tourismus
32 |       + 8.1 Tourismus
33 |   * 9 Besonderes
34 |   * 10 Galerie
35 |   * 11 Persönlichkeiten
36 |   * 12 Siehe auch
37 |   * 13 Literatur
38 |   * 14 Weblinks
39 |   * 15 Einzelnachweise
40 | 


--------------------------------------------------------------------------------
/tests/html/wikipedia-equation.html:
--------------------------------------------------------------------------------
 1 | <html>
 2 |     <body>
 3 | <div><pre><span></span><span class="kt">int</span> <span class="nf">factorial</span><span class="p">(</span><span class="kt">int</span> <span class="n">x</span><span class="p">)</span> <span class="p">{</span>
 4 |     <span class="k">if</span> <span class="p">(</span><span class="n">x</span> <span class="o">&lt;=</span> <span class="mi">1</span><span class="p">)</span>
 5 |             <span class="k">return</span> <span class="mi">1</span><span class="p">;</span>
 6 | 
 7 |                 <span class="k">return</span> <span class="n">x</span> <span class="o">*</span> <span class="n">factorial</span><span class="p">(</span><span class="n">x</span> <span class="o">-</span> <span class="mi">1</span><span class="p">);</span>
 8 |                 <span class="p">}</span>
 9 | </pre></div>
10 |     </body></html>
11 | 


--------------------------------------------------------------------------------
/tests/html/wikipedia-equation.txt:
--------------------------------------------------------------------------------
1 | int factorial(int x) {
2 |     if (x <= 1)
3 |             return 1;
4 | 
5 |                 return x * factorial(x - 1);
6 |                 }
7 | 
8 | 


--------------------------------------------------------------------------------
/tests/html/wikipedia-table-bordercase-verticial-alignmnet.html:
--------------------------------------------------------------------------------
 1 | <table class="toptextcells">
 2 | <tbody><tr>
 3 | <td>
 4 | <ul><li><a href="/wiki/Araschgen" title="Araschgen">Araschgen</a></li>
 5 | <li><a href="/wiki/F%C3%BCrstenwald_(Quartier)" title="Fürstenwald (Quartier)">Fürstenwald</a></li>
 6 | <li><a href="/wiki/Masans" title="Masans">Masans</a></li>
 7 | <li><a href="/wiki/Niederlachen-Untere_Au" title="Niederlachen-Untere Au">Niederlachen-Untere Au</a></li>
 8 | <li><a href="/wiki/Lacunaquartier" title="Lacunaquartier">Lacuna</a></li>
 9 | <li><a href="/wiki/Giacomettiquartier" title="Giacomettiquartier">Giacomettiquartier</a></li>
10 | <li><a href="/wiki/Chur_West" title="Chur West">Chur West</a></li>
11 | <li><a href="/wiki/Dreib%C3%BCndenquartier" title="Dreibündenquartier">Dreibünden</a></li></ul>
12 | </td>
13 | <td>
14 | </td>
15 | <td>
16 | <ul><li>Haldenstein</li>
17 | <li>Maladers</li>
18 | <li><a href="/wiki/Sand_(Quartier)" title="Sand (Quartier)">Sand</a></li>
19 | <li><a href="/wiki/Kornquader" title="Kornquader">Kornquader</a></li>
20 | <li><a href="/wiki/Rheinquartier" title="Rheinquartier">Rheinquartier</a></li>
21 | <li><a href="/wiki/Rossboden" title="Rossboden">Rossboden</a></li>
22 | <li>Plankis/<a href="/wiki/Sommerau_(Quartier)" title="Sommerau (Quartier)">Sommerau</a></li>
23 | <li><a href="/wiki/Wiesental_(Chur)" title="Wiesental (Chur)">Wiesental</a></li>
24 | <li><a href="/wiki/Tittwiesen" title="Tittwiesen">Tittwiesen</a></li>
25 | <li>Lürlibad<sup id="cite_ref-8" class="reference"><a href="#cite_note-8">&#91;8&#93;</a></sup></li></ul>
26 | </td>
27 | <td>
28 | </td></tr></tbody></table>
29 | 


--------------------------------------------------------------------------------
/tests/html/wikipedia-table-bordercase-verticial-alignmnet.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "annotation_rules": {
 3 |         "h1": ["heading"],
 4 |          "h2": ["heading"],
 5 |          "h3": ["subheading"],
 6 |          "h4": ["subheading"],
 7 |          "h5": ["subheading"],
 8 |          "i": ["emphasis"],
 9 |          "b": ["bold"],
10 |          "th": ["tableheading"],
11 |          "a": ["link"]
12 |     },
13 |     "result": [
14 |         ["link", "  * Araschgen"], 
15 |         ["link", "  * F\u00fcrstenwald"], 
16 |         ["link", "  * Masans"], 
17 |         ["link", "  * Niederlachen-Untere Au"], 
18 |         ["link", "  * Lacuna"], 
19 |         ["link", "  * Giacomettiquartier"], 
20 |         ["link", "  * Chur West"], 
21 |         ["link", "  * Dreib\u00fcnden"], 
22 |         ["link", "  * Sand"], 
23 |         ["link", "  * Kornquader"], 
24 |         ["link", "  * Rheinquartier"], 
25 |         ["link", "  * Rossboden"], 
26 |         ["link", "Sommerau"], 
27 |         ["link", "  * Wiesental"], 
28 |         ["link", "  * Tittwiesen"], 
29 |         ["link", "[8]"]
30 |     ]
31 | }
32 | 


--------------------------------------------------------------------------------
/tests/html/wikipedia-table-bordercase1.html:
--------------------------------------------------------------------------------
 1 | <div role="navigation">
 2 | Dieser Artikel behandelt den Bündner Hauptort. Für andere Bedeutungen siehe <a href="/wiki/Chur_(Begriffskl%C3%A4rung)" class="mw-disambig" title="Chur (Begriffsklärung)">Chur (Begriffsklärung)</a>.</div>
 3 | </div></div>
 4 | <table class="wikitable float-right" style="line-height: 140%; border-spacing: 0; background-color:white;">
 5 | <tbody><tr>
 6 | <th colspan="2" class="hintergrundfarbe5" style="text-align:center; font-size: 1.4em;">Chur
 7 | </th></tr>
 8 | <tr>
 9 | <td colspan="2" align="center"><div class="center"><div class="floatnone"><a href="/wiki/Datei:Chur_wappen.svg" class="image" title="Wappen von Chur"><img alt="Wappen von Chur" src="//upload.wikimedia.org/wikipedia/commons/thumb/7/7f/Chur_wappen.svg/120px-Chur_wappen.svg.png" decoding="async" width="120" height="132" srcset="//upload.wikimedia.org/wikipedia/commons/thumb/7/7f/Chur_wappen.svg/180px-Chur_wappen.svg.png 1.5x, //upload.wikimedia.org/wikipedia/commons/thumb/7/7f/Chur_wappen.svg/240px-Chur_wappen.svg.png 2x" data-file-width="438" data-file-height="480" /></a></div></div>
10 | </td></tr>
11 | <tr>
12 | <td style="border-bottom: 1px solid #eee; border-right: 1px solid #eee;"><a href="/wiki/Staat" title="Staat">Staat</a>:
13 | </td>
14 | <td style="border-bottom: 1px solid #eee;"><span style="display:none;">Schweiz</span><a href="/wiki/Schweiz" title="Schweiz"><img alt="Schweiz" src="//upload.wikimedia.org/wikipedia/commons/thumb/f/f3/Flag_of_Switzerland.svg/20px-Flag_of_Switzerland.svg.png" decoding="async" width="20" height="20" srcset="//upload.wikimedia.org/wikipedia/commons/thumb/f/f3/Flag_of_Switzerland.svg/30px-Flag_of_Switzerland.svg.png 1.5x, //upload.wikimedia.org/wikipedia/commons/thumb/f/f3/Flag_of_Switzerland.svg/40px-Flag_of_Switzerland.svg.png 2x" data-file-width="512" data-file-height="512" /></a>&#160;<a href="/wiki/Schweiz" title="Schweiz">Schweiz</a>
15 | </td></tr>
16 | <tr>
17 | <td style="border-bottom: 1px solid #eee; border-right: 1px solid #eee;"><a href="/wiki/Kanton_(Schweiz)" title="Kanton (Schweiz)">Kanton</a>:
18 | </td>
19 | <td style="border-bottom: 1px solid #eee;"><span style="display:none;">Kanton Graubünden</span><a href="/wiki/Datei:Wappen_Graub%C3%BCnden_matt.svg" class="image" title="Kanton Graubünden"><img alt="Kanton Graubünden" src="//upload.wikimedia.org/wikipedia/commons/thumb/1/19/Wappen_Graub%C3%BCnden_matt.svg/20px-Wappen_Graub%C3%BCnden_matt.svg.png" decoding="async" width="20" height="24" class="noviewer" srcset="//upload.wikimedia.org/wikipedia/commons/thumb/1/19/Wappen_Graub%C3%BCnden_matt.svg/30px-Wappen_Graub%C3%BCnden_matt.svg.png 1.5x, //upload.wikimedia.org/wikipedia/commons/thumb/1/19/Wappen_Graub%C3%BCnden_matt.svg/40px-Wappen_Graub%C3%BCnden_matt.svg.png 2x" data-file-width="512" data-file-height="622" /></a> <a href="/wiki/Kanton_Graub%C3%BCnden" title="Kanton Graubünden">Graubünden</a> (GR)
20 | </td></tr>
21 | </table>
22 | 


--------------------------------------------------------------------------------
/tests/html/wikipedia-table-bordercase1.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "annotation_rules": {
 3 |         "h1": ["heading"],
 4 |          "h2": ["heading"],
 5 |          "h3": ["subheading"],
 6 |          "h4": ["subheading"],
 7 |          "h5": ["subheading"],
 8 |          "i": ["emphasis"],
 9 |          "b": ["bold"],
10 |          "th": ["tableheading"],
11 |          "a": ["link"]
12 |     },
13 |     "result": [
14 |         ["link", "Chur (Begriffskl\u00e4rung)"],
15 |         ["tableheading", "Chur "],
16 |         ["link", "Staat"],
17 |         ["link", "Schweiz"],
18 |         ["link", "Kanton"],
19 |         ["link", "Graub\u00fcnden"]
20 |     ]
21 | }
22 | 


--------------------------------------------------------------------------------
/tests/html/wikipedia-table.html:
--------------------------------------------------------------------------------
 1 | <h1>Ehre sei Gott in der Höhe!</h1>
 2 | und Friede den Menschen, die guten Willens sind.
 3 | 
 4 | <h2><span class="mw-headline" id="Bev.C3.B6lkerung">Bev&ouml;lkerung</span><span class="mw-editsection"><span class="mw-editsection-bracket">[</span><a href="/w/index.php?title=Chur&amp;action=edit&amp;section=13" title="Abschnitt bearbeiten: Bevölkerung">Bearbeiten</a><span class="mw-editsection-bracket">]</span></span></h2>
 5 | <table class="wikitable centered" style="text-align:center" width="500">
 6 | <tr class="hintergrundfarbe5">
 7 | <th colspan="10">Bev&ouml;lkerungsentwicklung<sup id="cite_ref-Einwohnerzahlen_6-0" class="reference"><a href="#cite_note-Einwohnerzahlen-6">[6]</a></sup></th>
 8 | </tr>
 9 | <tr>
10 | <th style="text-align:left;">Jahr</th>
11 | <th>1500</th>
12 | <th>1860</th>
13 | <th>1900</th>
14 | <th>1950</th>
15 | <th>1970</th>
16 | <th>2000</th>
17 | <th>2005</th>
18 | <th>2011</th>
19 | <th>2012</th>
20 | </tr>
21 | <tr>
22 | <th style="text-align:left;">Einwohner</th>
23 | <td>ca. 1500</td>
24 | <td>3990</td>
25 | <td>11'532</td>
26 | <td>19'382</td>
27 | <td>31'193</td>
28 | <td>32'989</td>
29 | <td>32'409</td>
30 | <td>36'690</td>
31 | <td>37'036</td>
32 | </tr>
33 | </table>
34 | 


--------------------------------------------------------------------------------
/tests/html/wikipedia-table.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "annotation_rules": {
 3 |         "h1": ["heading"],
 4 |         "h2": ["heading"],
 5 |         "h3": ["subheading"],
 6 |         "h4": ["subheading"],
 7 |         "h5": ["subheading"],
 8 |         "i": ["emphasis"],
 9 |         "b": ["bold"],
10 |         "table": ["table"],
11 |         "th": ["tableheading"],
12 |         "a": ["link"]
13 |     },
14 |     "result": [
15 |         ["heading", "Ehre sei Gott in der H\u00f6he!\n\n"], 
16 |         ["link", "Bearbeiten"], 
17 |         ["heading", "\nBev\u00f6lkerung[Bearbeiten]\n\n"], 
18 |         ["table", "Bev\u00f6lkerungsentwicklung[6]\nJahr                        1500      1860  1900    1950    1970    2000    2005    2011    2012  \nEinwohner                   ca. 1500  3990  11'532  19'382  31'193  32'989  32'409  36'690  37'036\n"], 
19 |         ["link", "[6]"], 
20 |         ["tableheading", "Bev\u00f6lkerungsentwicklung[6]"], 
21 |         ["tableheading", "Jahr"], 
22 |         ["tableheading", "1500"], 
23 |         ["tableheading", "1860"], 
24 |         ["tableheading", "1900"], 
25 |         ["tableheading", "1950"], 
26 |         ["tableheading", "1970"], 
27 |         ["tableheading", "2000"], 
28 |         ["tableheading", "2005"], 
29 |         ["tableheading", "2011"], 
30 |         ["tableheading", "2012"], 
31 |         ["tableheading", "Einwohner"]
32 |     ]
33 | }
34 | 


--------------------------------------------------------------------------------
/tests/html/wikipedia-table.txt:
--------------------------------------------------------------------------------
1 | Ehre sei Gott in der Höhe!
2 | 
3 | und Friede den Menschen, die guten Willens sind.
4 | 
5 | Bevölkerung[Bearbeiten]
6 | 
7 | Bevölkerungsentwicklung[6]
8 | Jahr                        1500      1860  1900    1950    1970    2000    2005    2011    2012  
9 | Einwohner                   ca. 1500  3990  11'532  19'382  31'193  32'989  32'409  36'690  37'036


--------------------------------------------------------------------------------
/tests/test_annotation.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # encoding: utf-8
 3 | 
 4 | """
 5 | Tests the horizontal shifting and realignment of annotations
 6 | (left, right and center alignment, with and without an additional shift).
 7 | """
 8 | 
 9 | from inscriptis.annotation import Annotation, horizontal_shift
10 | from inscriptis.html_properties import HorizontalAlignment
11 | 
12 | 
13 | def test_horizontal_shift():
14 |     a = [Annotation(0, 4, "test")]
15 | 
16 |     # no shift
17 |     assert horizontal_shift(
18 |         a, content_width=5, line_width=10, align=HorizontalAlignment.left, shift=0
19 |     ).pop() == Annotation(0, 4, "test")
20 | 
21 |     # shift
22 |     assert horizontal_shift(
23 |         a, content_width=5, line_width=10, align=HorizontalAlignment.left, shift=3
24 |     ).pop() == Annotation(3, 7, "test")
25 | 
26 |     # realignment to the right
27 |     assert horizontal_shift(
28 |         a,
29 |         content_width=len("test"),
30 |         line_width=10,
31 |         align=HorizontalAlignment.right,
32 |         shift=0,
33 |     ).pop() == Annotation(6, 10, "test")
34 |     assert "{:>10}".format("test")[6:10] == "test"
35 | 
36 |     # shift + realignment to the right
37 |     assert horizontal_shift(
38 |         a,
39 |         content_width=len("test"),
40 |         line_width=10,
41 |         align=HorizontalAlignment.right,
42 |         shift=3,
43 |     ).pop() == Annotation(9, 13, "test")
44 | 
45 |     # realignment to the center
46 |     assert horizontal_shift(
47 |         a,
48 |         content_width=len("test"),
49 |         line_width=10,
50 |         align=HorizontalAlignment.center,
51 |         shift=0,
52 |     ).pop() == Annotation(3, 7, "test")
53 |     assert "{:^10}".format("test")[3:7] == "test"
54 | 
55 |     assert horizontal_shift(
56 |         a,
57 |         content_width=len("test"),
58 |         line_width=11,
59 |         align=HorizontalAlignment.center,
60 |         shift=0,
61 |     ).pop() == Annotation(3, 7, "test")
62 |     assert "{:^11}".format("test")[3:7] == "test"
63 | 
64 |     # realignment + shift
65 |     assert horizontal_shift(
66 |         a,
67 |         content_width=len("test"),
68 |         line_width=11,
69 |         align=HorizontalAlignment.center,
70 |         shift=7,
71 |     ).pop() == Annotation(10, 14, "test")
72 | 


--------------------------------------------------------------------------------
/tests/test_annotation_engine.py:
--------------------------------------------------------------------------------
 1 | # test the annotation handling
 2 | 
 3 | import pytest
 4 | 
 5 | from inscriptis.annotation import Annotation
 6 | from inscriptis.html_engine import Inscriptis
 7 | from inscriptis.model.config import ParserConfig
 8 | from lxml.html import fromstring
 9 | 
10 | 
11 | def test_get_annotation():
12 |     """Test get_annotations from the Inscriptis class."""
13 |     html = "<b>Chur</b> is a City in <b>Switzerland</b>"
14 |     rules = {"b": ["bold"]}
15 | 
16 |     inscriptis = Inscriptis(fromstring(html), ParserConfig(annotation_rules=rules))
17 | 
18 |     assert inscriptis.get_text() == "Chur is a City in Switzerland"
19 |     assert inscriptis.get_annotations() == [
20 |         Annotation(start=0, end=4, metadata="bold"),
21 |         Annotation(start=18, end=29, metadata="bold"),
22 |     ]
23 | 


--------------------------------------------------------------------------------
/tests/test_annotation_output_processor.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | """
 4 | Test the annotation output formatter.
 5 | """
 6 | 
 7 | import pytest
 8 | 
 9 | from inscriptis.annotation.output import AnnotationProcessor
10 | from inscriptis.annotation.output.html import HtmlExtractor
11 | from inscriptis.annotation.output.surface import SurfaceExtractor
12 | from inscriptis.annotation.output.xml import XmlExtractor
13 | 
14 | EXAMPLE_OUTPUT = {
15 |     "text": "Chur\n\nChur is the capital and largest town of "
16 |     "the Swiss canton of the Grisons and lies in the "
17 |     "Grisonian Rhine Valley.",
18 |     "label": [[0, 4, "h1"], [0, 4, "heading"], [6, 10, "emphasis"]],
19 | }
20 | 
21 | 
22 | def test_abstract_class():
23 |     processor = AnnotationProcessor()
24 | 
25 |     with pytest.raises(NotImplementedError):
26 |         result = processor(EXAMPLE_OUTPUT)
27 | 
28 | 
29 | def test_surface_annotator():
30 |     processor = SurfaceExtractor()
31 |     result = processor(EXAMPLE_OUTPUT)
32 | 
33 |     # the old keys haven't been changed
34 |     assert "text" in result
35 |     assert "label" in result
36 | 
37 |     # and we have additional information on surface forms :)
38 |     assert result["surface"] == [
39 |         ("h1", "Chur"),
40 |         ("heading", "Chur"),
41 |         ("emphasis", "Chur"),
42 |     ]
43 | 
44 | 
45 | def test_xml_annotator():
46 |     processor = XmlExtractor()
47 |     result = processor(EXAMPLE_OUTPUT)
48 | 
49 |     # the annotations are serialized as (nested) XML tags around the text
50 |     assert result == (
51 |         '<?xml version="1.0" encoding="UTF-8" ?>\n<content>\n'
52 |         "<heading><h1>Chur</h1></heading>\n\n<emphasis>"
53 |         "Chur</emphasis> is the capital and largest town "
54 |         "of the Swiss canton of the Grisons and lies in "
55 |         "the Grisonian Rhine Valley.\n</content>"
56 |     )
57 | 
58 | 
59 | def test_html_annotator():
60 |     processor = HtmlExtractor()
61 |     result = processor(EXAMPLE_OUTPUT)
62 | 
63 |     assert result.startswith("<html><head><style>")
64 |     assert result.split("</style>")[1] == (
65 |         "</head>"
66 |         '<body><pre><span class="heading-label">heading'
67 |         '</span><span class="heading">'
68 |         '<span class="h1-label">h1</span><span class="h1">'
69 |         "Chur</span></span></pre>\n"
70 |         "<pre></pre>\n"
71 |         '<pre><span class="emphasis-label">emphasis</span>'
72 |         '<span class="emphasis">Chur</span> is the capital '
73 |         "and largest town of the Swiss canton of the "
74 |         "Grisons and lies in the Grisonian Rhine Valley."
75 |         "</pre></body></html>"
76 |     )
77 | 
78 | 
79 | def test_trailing_tag_annotation():
80 |     processor = XmlExtractor()
81 |     result = processor({"text": "Ehre sei Gott!", "label": [[9, 14, "emphasis"]]})
82 | 
83 |     assert result == (
84 |         '<?xml version="1.0" encoding="UTF-8" ?>\n<content>\n'
85 |         "Ehre sei <emphasis>Gott!</emphasis>\n</content>"
86 |     )
87 | 


--------------------------------------------------------------------------------
/tests/test_annotation_output_xml.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | """
 4 | Test the annotation XmlExtractor.
 5 | """
 6 | from lxml.html import fromstring
 7 | 
 8 | from inscriptis import Inscriptis, ParserConfig
 9 | from inscriptis.annotation.output.xml import XmlExtractor
10 | 
11 | 
12 | def test_tag_error_issue_93():
13 |     """
14 |     Test for the correct tag order in the XmlOutput as described in Issue #93.
15 |     """
16 |     html_issue_93 = """<html>
17 |        <body>
18 |          <div class="a">
19 |             <span class="b">Item1</span>
20 |             <span class="b">Item2</span>
21 |             <span class="b">Item3</span>
22 |             <span class="b">Item4</span>
23 |          </div>
24 |        </body>
25 |     </html>"""
26 | 
27 |     expected_output_issue_93 = (
28 |         """<?xml version="1.0" encoding="UTF-8" ?>\n<content>\n"""
29 |         "<outer><inner>  Item1 </inner><inner>Item2 </inner><inner>Item3 </inner>"
30 |         "<inner>Item4</inner></outer>\n</content>"
31 |     )
32 |     rules = {"div#class=a": ["outer"], "span#class=b": ["inner"]}
33 | 
34 |     inscriptis = Inscriptis(
35 |         fromstring(html_issue_93), ParserConfig(annotation_rules=rules)
36 |     )
37 |     annotated_html = {
38 |         "text": inscriptis.get_text(),
39 |         "label": inscriptis.get_annotations(),
40 |     }
41 |     result = XmlExtractor()(annotated_html)
42 |     assert result == expected_output_issue_93
43 | 
44 | 
45 | def test_tag_folding_issue_93_extended():
46 |     html_issue_93 = """<html>
47 |        <body>
48 |          <div class="a">
49 |          Some Test to add :)
50 |             <span class="b">Item<b>1</b></span>
51 |             <span class="b">Item2</span>
52 |             <span class="b"><b>Item3</b></span>
53 |             <span class="b"><b>It</b>e<b>m4</b></span>
54 |          </div>
55 |        </body>
56 |     </html>"""
57 | 
58 |     expected_output_issue_93 = (
59 |         """<?xml version="1.0" encoding="UTF-8" ?>\n"""
60 |         """<content>\n"""
61 |         """<outer>  Some Test to add :) <inner>Item <bold>1</bold></inner> <inner>Item2 </inner>"""
62 |         """<inner><bold>Item3</bold></inner> <inner><bold>It</bold> e <bold>m4</bold></inner></outer>\n"""
63 |         """</content>"""
64 |     )
65 |     rules = {"div#class=a": ["outer"], "span#class=b": ["inner"], "b": ["bold"]}
66 | 
67 |     inscriptis = Inscriptis(
68 |         fromstring(html_issue_93), ParserConfig(annotation_rules=rules)
69 |     )
70 |     annotated_html = {
71 |         "text": inscriptis.get_text(),
72 |         "label": inscriptis.get_annotations(),
73 |     }
74 |     result = XmlExtractor()(annotated_html)
75 |     assert result == expected_output_issue_93
76 | 


--------------------------------------------------------------------------------
/tests/test_annotation_rule_parsing.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # encoding: utf-8
 3 | 
 4 | """
 5 | Tests the parsing of annotation rules and their application to HTML
 6 | tags and attributes.
 7 | """
 8 | 
 9 | from copy import deepcopy
10 | 
11 | from inscriptis.css_profiles import CSS_PROFILES
12 | from inscriptis.annotation.parser import AnnotationModel, ApplyAnnotation
13 | from inscriptis.model.attribute import Attribute
14 | from inscriptis.model.html_element import HtmlElement
15 | 
16 | 
17 | def test_parse():
18 |     """
19 |     basic rule parsing.
20 |     """
21 |     rules = {"table#border=1": ["table"], "hr": ["horizontal-line"]}
22 |     tags, attrs = AnnotationModel._parse(rules)
23 | 
24 |     assert tags == {"hr": ["horizontal-line"]}
25 | 
26 |     apply_annotation = attrs[0]
27 |     assert apply_annotation.match_tag == "table"
28 |     assert apply_annotation.match_value == "1"
29 |     assert apply_annotation.attr == "border"
30 | 
31 |     e = HtmlElement(tag="table")
32 |     apply_annotation.apply("1", e)
33 |     assert e.annotation == ("table",)
34 | 
35 | 
36 | def test_apply_annotation():
37 |     """
38 |     rule application.
39 |     """
40 |     rules = {
41 |         "table#border=1": ["table"],
42 |         "hr": ["horizontal-line"],
43 |         "#color=red": ["red"],
44 |         "#bgcolor": ["bgcolor"],
45 |     }
46 | 
47 |     css = deepcopy(CSS_PROFILES["strict"])
48 |     annotation_model = AnnotationModel(css, rules)
49 |     assert annotation_model.css["hr"].annotation == ("horizontal-line",)
50 | 
51 |     attribute_handler = Attribute()
52 |     attribute_handler.merge_attribute_map(annotation_model.css_attr)
53 |     assert "table#border=1" in str(attribute_handler.attribute_mapping["border"])
54 |     assert "{any}#color=red" in str(attribute_handler.attribute_mapping["color"])
55 |     assert "{any}#bgcolor={any}" in str(attribute_handler.attribute_mapping["bgcolor"])
56 | 
57 | 
58 | def test_merged_attribute():
59 |     """
60 |     test multiple rules per attribute
61 |     """
62 |     rules = {"#color=white": ["white"], "#color=yellow": ["yellow"]}
63 |     css = deepcopy(CSS_PROFILES["strict"])
64 |     annotation_model = AnnotationModel(css, rules)
65 | 
66 |     attribute_handler = Attribute()
67 |     attribute_handler.merge_attribute_map(annotation_model.css_attr)
68 | 
69 |     e = HtmlElement()
70 |     attribute_handler.attribute_mapping["color"]("green", e)
71 |     assert e.annotation == ()
72 |     attribute_handler.attribute_mapping["color"]("yellow", e)
73 |     assert e.annotation == ("yellow",)
74 |     attribute_handler.attribute_mapping["color"]("white", e)
75 |     assert e.annotation == ("yellow", "white")
76 | 


--------------------------------------------------------------------------------
/tests/test_block.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Test cases for the Block class.
 3 | """
 4 | 
 5 | from inscriptis.model.canvas.block import Block
 6 | from inscriptis.model.canvas.prefix import Prefix
 7 | 
 8 | 
 9 | def test_merge_normal_text_collapsable_whitespaces():
10 |     """
11 |     test cases where the block has collapsable whitespaces
12 |     """
13 |     b = Block(0, Prefix())
14 |     b.merge_normal_text("Hallo")
15 |     assert b._content == "Hallo"
16 |     assert not b.collapsable_whitespace
17 | 
18 |     b = Block(0, Prefix())
19 |     b.merge_normal_text(" Hallo ")
20 |     assert b._content == "Hallo "
21 |     assert b.collapsable_whitespace
22 | 
23 |     b = Block(0, Prefix())
24 |     b.merge_normal_text("")
25 |     assert b._content == ""
26 |     assert b.collapsable_whitespace
27 | 
28 |     b.merge_normal_text(" ")
29 |     assert b._content == ""
30 |     assert b.collapsable_whitespace
31 | 
32 |     b.merge_normal_text("  ")
33 |     assert b._content == ""
34 |     assert b.collapsable_whitespace
35 | 
36 | 
37 | def test_merge_normal_non_collapsable_whitespaces():
38 |     b = Block(0, Prefix())
39 |     b.collapsable_whitespace = False
40 |     b.merge_normal_text("Hallo")
41 |     assert b._content == "Hallo"
42 |     assert not b.collapsable_whitespace
43 | 
44 |     b = Block(0, Prefix())
45 |     b.collapsable_whitespace = False
46 |     b.merge_normal_text(" Hallo ")
47 |     assert b._content == " Hallo "
48 |     assert b.collapsable_whitespace
49 | 
50 |     b = Block(0, Prefix())
51 |     b.collapsable_whitespace = False
52 |     b.merge_normal_text("")
53 |     assert b._content == ""
54 |     assert not b.collapsable_whitespace
55 | 
56 |     b = Block(0, Prefix())
57 |     b.collapsable_whitespace = False
58 |     b.merge_normal_text(" ")
59 |     assert b._content == " "
60 |     assert b.collapsable_whitespace
61 | 
62 |     b = Block(0, Prefix())
63 |     b.collapsable_whitespace = False
64 |     b.merge_normal_text("  ")
65 |     assert b._content == " "
66 |     assert b.collapsable_whitespace
67 | 


--------------------------------------------------------------------------------
/tests/test_broken_table_handling.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # encoding: utf-8
 3 | 
 4 | """
 5 | Tests the handling of tables that do not properly close all column tags.
 6 | """
 7 | 
 8 | from inscriptis import get_text
 9 | from inscriptis.css_profiles import CSS_PROFILES
10 | from inscriptis.model.config import ParserConfig
11 | 
12 | config = ParserConfig(css=CSS_PROFILES["strict"])
13 | 
14 | 
15 | def test_forgotten_td_close_tag():
16 |     # one line (i.e., missing </td> before the next <td> and the next </tr>)
17 |     html = "<body>hallo<table>" "<tr><td>1<td>2</tr>" "</table>echo</body>"
18 |     print(html)
19 |     # assert get_text(html, config) == u'hallo\n1  2\necho'
20 | 
21 |     # two lines (i.e., missing </td> before the <tr> and before the </table>)
22 |     html = "<body>hallo<table>" "<tr><td>1<td>2" "<tr><td>3<td>4" "</table>echo</body>"
23 |     print(html)
24 |     assert get_text(html, config) == "hallo\n1  2\n3  4\n\necho"
25 | 


--------------------------------------------------------------------------------
/tests/test_cli.py:
--------------------------------------------------------------------------------
  1 | """
  2 | Tests the Inscriptis CLI client.
  3 | """
  4 | 
  5 | from io import StringIO
  6 | from pathlib import Path
  7 | from json import loads
  8 | from unittest.mock import Mock, mock_open, patch, call
  9 | 
 10 | import pytest
 11 | 
 12 | from inscriptis.cli.inscript import cli
 13 | 
 14 | INPUT_DATA = """<html><body>Hello <b>World</b>!</body></html>"""
 15 | 
 16 | 
 17 | def test_cli_read_from_stdin(monkeypatch, capsys):
 18 |     """Test converting HTML from standard input with the command line client."""
 19 |     # Use monkeypatch to replace the command line arguments and standard input
 20 |     monkeypatch.setattr("sys.argv", ["inscript"])
 21 |     monkeypatch.setattr("sys.stdin", StringIO(INPUT_DATA))
 22 |     cli()
 23 | 
 24 |     # Capture the printed output
 25 |     captured = capsys.readouterr()
 26 |     assert captured.out.strip() == "Hello World!"
 27 | 
 28 | 
 29 | def test_cli_read_from_stdin_write_to_file(monkeypatch, capsys):
 30 |     """Test converting HTML from standard input with the command line client and
 31 |     writing it to a file."""
 32 |     # Use monkeypatch to replace the command line arguments and standard input
 33 |     monkeypatch.setattr("sys.argv", ["inscript", "--output", "test.txt"])
 34 |     monkeypatch.setattr("sys.stdin", StringIO(INPUT_DATA))
 35 |     with patch("pathlib.Path.open", create=True) as mock_file:
 36 |         cli()
 37 | 
 38 |     # Capture the printed output
 39 |     captured = capsys.readouterr()
 40 |     assert captured.out.strip() == ""
 41 |     # Verify the text written to the mock output file
 42 |     assert call().__enter__().write("Hello World!") in mock_file.mock_calls
 43 | 
 44 | 
 45 | def test_cli_read_from_file(monkeypatch, capsys):
 46 |     """Test converting HTML from a file with the command line client."""
 47 |     # Use monkeypatch to replace the command line arguments and the file access
 48 |     monkeypatch.setattr("sys.argv", ["inscript", "test.html"])
 49 |     monkeypatch.setattr("pathlib.Path.is_file", lambda _: True)
 50 |     monkeypatch.setattr("pathlib.Path.open", mock_open(read_data=INPUT_DATA))
 51 |     cli()
 52 | 
 53 |     # Capture the printed output
 54 |     captured = capsys.readouterr()
 55 |     assert captured.out.strip() == "Hello World!"
 56 | 
 57 | 
 58 | def test_cli_read_from_url(monkeypatch, capsys):
 59 |     """Test converting HTML from a URL with the command line client."""
 60 |     # Use monkeypatch to replace the command line arguments
 61 |     monkeypatch.setattr("sys.argv", ["inscript", "https://www.fhgr.ch/test.html"])
 62 | 
 63 |     mock_request = Mock()
 64 |     mock_request.content = INPUT_DATA.encode("utf8")
 65 |     mock_request.encoding = "utf-8"
 66 |     monkeypatch.setattr("requests.get", lambda url, timeout=0: mock_request)
 67 |     cli()
 68 | 
 69 |     # Capture the printed output
 70 |     captured = capsys.readouterr()
 71 |     assert captured.out.strip() == "Hello World!"
 72 | 
 73 | 
 74 | def test_cli_annotations(monkeypatch, capsys):
 75 |     """Test annotation handling in the command line client."""
 76 |     # Prepare input data for the test
 77 |     annotation_rule_path = (
 78 |         Path(__file__).parent / "data" / "annotation-profile-unittest.json"
 79 |     )
 80 | 
 81 |     # Use monkeypatch to replace the command line arguments
 82 |     monkeypatch.setattr(
 83 |         "sys.argv", ["inscript", "-p", "surface", "-r", str(annotation_rule_path)]
 84 |     )
 85 |     monkeypatch.setattr("sys.stdin", StringIO(INPUT_DATA))
 86 |     cli()
 87 | 
 88 |     # Capture the printed json data and convert it to an object
 89 |     captured = loads(capsys.readouterr().out.strip())
 90 |     assert captured["text"].strip() == "Hello World!"
 91 |     assert captured["label"] == [[6, 11, "emphasis"]]
 92 |     assert captured["surface"] == [["emphasis", "World"]]
 93 | 
 94 | 
 95 | def test_help(monkeypatch, capsys):
 96 |     monkeypatch.setattr("sys.argv", ["inscript", "--version"])
 97 | 
 98 |     # the cli should exit with exit code 0
 99 |     with pytest.raises(SystemExit) as exit_info:
100 |         cli()
101 |     assert exit_info.value.code == 0
102 | 
103 |     captured = capsys.readouterr().out
104 |     assert captured.startswith("Inscript HTML to text conversion")
105 |     assert "Inscript comes with ABSOLUTELY NO WARRANTY." in captured
106 | 
107 | 
108 | def test_missing_input_file(monkeypatch, capsys):
109 |     monkeypatch.setattr("sys.argv", ["inscript", "test.html"])
110 |     with pytest.raises(SystemExit) as exit_info:
111 |         cli()
112 | 
113 |     captured = capsys.readouterr()
114 |     assert exit_info.value.code == -1
115 |     assert captured.out.strip().startswith("ERROR: Cannot open input file")
116 | 
117 | 
118 | def test_missing_annotation_file(monkeypatch, capsys):
119 |     monkeypatch.setattr("sys.argv", ["inscript", "--annotation-rules", "rules.json"])
120 |     monkeypatch.setattr("sys.stdin", StringIO(INPUT_DATA))
121 |     with pytest.raises(SystemExit) as exit_info:
122 |         cli()
123 | 
124 |     captured = capsys.readouterr()
125 |     assert exit_info.value.code == -1
126 |     assert captured.out.strip().startswith("ERROR: Cannot open annotation rule file")
127 | 


--------------------------------------------------------------------------------
/tests/test_custom_html_tag_handling.py:
--------------------------------------------------------------------------------
 1 | """Test the custom HTML tag handling."""
 2 | 
 3 | from lxml.html import fromstring
 4 | 
 5 | from inscriptis import Inscriptis, ParserConfig
 6 | from inscriptis.model.html_document_state import HtmlDocumentState
 7 | from inscriptis.model.tag import CustomHtmlTagHandlerMapping
 8 | 
 9 | 
10 | def test_custom_html_handler():
11 |     def my_handle_start_b(state: HtmlDocumentState, _):
12 |         """Handle the opening <b> tag."""
13 |         state.tags[-1].write("**")
14 | 
15 |     def my_handle_end_b(state: HtmlDocumentState):
16 |         """Handle the closing </b> tag."""
17 |         state.tags[-1].write("**")
18 | 
19 |     custom_mapping = CustomHtmlTagHandlerMapping(
20 |         start_tag_mapping={"b": my_handle_start_b},
21 |         end_tag_mapping={"b": my_handle_end_b},
22 |     )
23 | 
24 |     html_tree = fromstring("Welcome to <b>Chur</b>")
25 |     inscriptis = Inscriptis(
26 |         html_tree, ParserConfig(custom_html_tag_handler_mapping=custom_mapping)
27 |     )
28 | 
29 |     # custom HTML Handler
30 |     assert inscriptis.get_text().strip() == "Welcome to **Chur**"
31 |     # standard HTML handler
32 |     assert Inscriptis(html_tree).get_text().strip() == "Welcome to Chur"
33 | 


--------------------------------------------------------------------------------
/tests/test_double_a.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | """ ensures that two successive <a>text</a> contain
 4 |     a space between each other, if there is a linebreak
 5 |     or space between the tags.
 6 | """
 7 | 
 8 | from inscriptis import get_text
 9 | 
10 | 
11 | def test_successive_a():
12 |     html = (
13 |         '<html><body><a href="first">first</a>'
14 |         '<a href="second">second</a></body></html>'
15 |     )
16 |     assert get_text(html) == "firstsecond"
17 | 
18 |     html = (
19 |         '<html><body><a href="first">first</a>\n'
20 |         '<a href="second">second</a></body></html>'
21 |     )
22 |     assert get_text(html) == "first second"
23 | 


--------------------------------------------------------------------------------
/tests/test_empty_string.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | """ ensures that empty, whitespace-only and corrupt
 4 |     HTML input is converted without raising an
 5 |     exception.
 6 | """
 7 | 
 8 | from inscriptis import get_text
 9 | 
10 | 
11 | def test_empty_and_corrupt():
12 |     assert get_text("test").strip() == "test"
13 |     assert get_text("  ") == ""
14 |     assert get_text("") == ""
15 |     # test for the behaviour of older and recent lxml versions.
16 |     assert get_text("<<<").strip() in ("<<<", "<<", "")
17 | 


--------------------------------------------------------------------------------
/tests/test_engine.py:
--------------------------------------------------------------------------------
 1 | # test borderline cases
 2 | 
 3 | from inscriptis import get_text, get_annotated_text
 4 | 
 5 | 
 6 | def test_text_from_empty_content():
 7 |     assert get_text("") == ""
 8 | 
 9 | 
10 | def test_annotations_from_empty_content():
11 |     assert get_annotated_text("") == {}
12 | 


--------------------------------------------------------------------------------
/tests/test_html_conversion_options.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | """
 4 | Tests different HTML to text conversion options.
 5 | """
 6 | 
 7 | from inscriptis import get_text
 8 | from inscriptis.model.config import ParserConfig
 9 | 
10 | 
11 | def test_display_links():
12 |     html = """<html>
13 |                  <body>
14 |                    <a href="first">first</a>
15 |                    <a href="second">second</a>
16 |                    <a name="third">third</a>
17 |                  </body>
18 |                 </html>
19 |             """
20 |     config = ParserConfig(display_links=True)
21 |     assert get_text(html, config).strip() == "[first](first) [second](second) third"
22 | 
23 | 
24 | def test_display_anchors():
25 |     html = """<html>
26 |                  <body>
27 |                    <a name="first">first</a>
28 |                    <a href="second">second</a>
29 |                  </body>
30 |                 </html>
31 |             """
32 |     config = ParserConfig(display_anchors=True)
33 |     assert get_text(html, config).strip() == "[first](first) second"
34 | 
35 | 
36 | def test_display_links_and_anchors():
37 |     html = """<html>
38 |                  <body>
39 |                    <a href="first">first</a>
40 |                    <a href="second">second</a>
41 |                    <a name="third">third</a>
42 |                  </body>
43 |                 </html>
44 |             """
45 |     config = ParserConfig(display_links=True, display_anchors=True)
46 |     assert (
47 |         get_text(html, config).strip()
48 |         == "[first](first) [second](second) [third](third)"
49 |     )
50 | 
51 | 
52 | def test_display_images():
53 |     html = """<html>
54 |                  <body>
55 |                    <img src="test1" alt="Ein Test Bild" title="Hallo" />
56 |                    <img src="test2" alt="Ein Test Bild" title="Juhu" />
57 |                    <img src="test3" alt="Ein zweites Bild" title="Echo" />
58 |                  </body>
59 |                 </html>
60 |             """
61 |     config = ParserConfig(display_images=True)
62 |     assert (
63 |         get_text(html, config).strip()
64 |         == "[Ein Test Bild] [Ein Test Bild] [Ein zweites Bild]"
65 |     )
66 | 
67 | 
68 | def test_display_images_deduplicated():
69 |     html = """<html>
70 |                  <body>
71 |                    <img src="test1" alt="Ein Test Bild" title="Hallo" />
72 |                    <img src="test2" alt="Ein Test Bild" title="Juhu" />
73 |                    <img src="test3" alt="Ein zweites Bild" title="Echo" />
74 |                  </body>
75 |                 </html>
76 |             """
77 |     config = ParserConfig(display_images=True, deduplicate_captions=True)
78 |     assert get_text(html, config).strip() == "[Ein Test Bild] [Ein zweites Bild]"
79 | 


--------------------------------------------------------------------------------
/tests/test_html_snippets.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | """
 4 | Test HTML snippets in the project's HTML directory. The corresponding .txt file
 5 | contains the reference conversion.
 6 | """
 7 | from os.path import dirname, join
 8 | from glob import glob
 9 | 
10 | from inscriptis import get_text
11 | from inscriptis.css_profiles import CSS_PROFILES
12 | from inscriptis.model.config import ParserConfig
13 | 
14 | TESTCASE_PATTERN = join(dirname(__file__), "html/*.txt")
15 | 
16 | 
17 | def test_html_snippets(filter_str=""):
18 |     for testcase_txt in glob(TESTCASE_PATTERN):
19 |         if filter_str not in testcase_txt:
20 |             continue
21 | 
22 |         with open(testcase_txt) as f:
23 |             reference_txt = f.read().rstrip()
24 | 
25 |         with open(testcase_txt.replace(".txt", ".html")) as f:
26 |             print(f.name)
27 |             html = "<html><body>{}</body></html>".format(f.read())
28 | 
29 |         converted_txt = get_text(
30 |             html, ParserConfig(css=CSS_PROFILES["strict"])
31 |         ).rstrip()
32 | 
33 |         if converted_txt != reference_txt:
34 |             print(
35 |                 "File:{}\nHTML:\n{}\n\nReference:\n{}\n\nConverted:\n{}".format(
36 |                     testcase_txt, html, reference_txt, converted_txt
37 |                 )
38 |             )
39 |             print("HTML file:", testcase_txt.replace(".txt", ".html"))
40 |             print("Visualize differences with `vimdiff reference.txt " "converted.txt`")
41 |             open("reference.txt", "w").write(reference_txt)
42 |             open("converted.txt", "w").write(converted_txt)
43 | 
44 |         assert converted_txt == reference_txt
45 | 
46 | 
47 | if __name__ == "__main__":
48 |     from sys import argv
49 | 
50 |     filter_str = argv[1] if len(argv) > 1 else ""
51 |     test_html_snippets(filter_str)
52 | 


--------------------------------------------------------------------------------
/tests/test_html_snippets_annotations.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | """
 4 | This test case verifies that annotations are correctly computed.
 5 | """
 6 | import os
 7 | from json import load
 8 | from glob import glob
 9 | from typing import List
10 | 
11 | from inscriptis import get_annotated_text
12 | from inscriptis.css_profiles import CSS_PROFILES
13 | from inscriptis.model.config import ParserConfig
14 | 
15 | TESTCASE_PATTERN = os.path.join(os.path.dirname(__file__), "html/*.json")
16 | 
17 | 
18 | def assert_equal_ignoring_whitespace(
19 |     reference: List[str], converted: List[str]
20 | ) -> bool:
21 |     for (ref_tag, ref_str), (conv_tag, conv_str) in zip(reference, converted):
22 |         assert ref_tag == conv_tag
23 |         assert "".join(ref_str.split()) == "".join(conv_str.split())
24 | 
25 | 
26 | def test_html_annotations(filter_str=""):
27 |     for annotation_file in glob(TESTCASE_PATTERN):
28 |         if filter_str not in annotation_file:
29 |             continue
30 | 
31 |         with open(annotation_file) as f:
32 |             reference = load(f)
33 | 
34 |         with open(annotation_file.replace(".json", ".html")) as f:
35 |             print(f.name)
36 |             html = "<html><body>{}</body></html>".format(f.read())
37 | 
38 |         for indentation_strategy in ("strict", "relaxed"):
39 |             result = get_annotated_text(
40 |                 html,
41 |                 ParserConfig(
42 |                     css=CSS_PROFILES[indentation_strategy],
43 |                     annotation_rules=reference["annotation_rules"],
44 |                 ),
45 |             )
46 | 
47 |             converted = [[a[2], result["text"][a[0] : a[1]]] for a in result["label"]]
48 | 
49 |             if reference["result"] != converted:
50 |                 print("Reference:")
51 |                 print(reference["result"])
52 |                 print(
53 |                     "\nConverted (indentation strategy: {})".format(
54 |                         indentation_strategy
55 |                     )
56 |                 )
57 |                 print(converted)
58 | 
59 |             if indentation_strategy == "strict":
60 |                 assert reference["result"] == converted
61 |             else:
62 |                 assert_equal_ignoring_whitespace(reference["result"], converted)
63 | 
64 | 
65 | if __name__ == "__main__":
66 |     from sys import argv
67 | 
68 |     filter_str = argv[1] if len(argv) > 1 else ""
69 |     test_html_annotations(filter_str)
70 | 


--------------------------------------------------------------------------------
/tests/test_invalid_float_specification.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # encoding: utf-8
 3 | 
 4 | """
 5 | Tests handling of invalid length specifications.
 6 | (https://github.com/weblyzard/inscriptis/issues/63)
 7 | """
 8 | 
 9 | from inscriptis import get_text
10 | 
11 | 
12 | def test_invalid_length_specification_handling():
13 |     html = """<p style="margin:0;padding:0;margin: 0cm; margin-bottom: ..0001pt; -ms-word-wrap: break-word;"><span style="font-size: 10.0pt; font-family: \'Arial\',sans-serif; color: black;">"""
14 |     print(get_text(html))  # smoke test: no assertion; passes unless get_text raises
15 | 


--------------------------------------------------------------------------------
/tests/test_limit_whitespace_affixes.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | """
 4 | Tests different HTML to text conversion options.
 5 | """
 6 | 
 7 | from copy import copy
 8 | from inscriptis import get_text
 9 | from inscriptis.css_profiles import RELAXED_CSS_PROFILE
10 | from inscriptis.html_properties import Display, WhiteSpace
11 | from inscriptis.model.config import ParserConfig
12 | from inscriptis.model.html_element import HtmlElement
13 | 
14 | 
15 | def test_html_element_refinement():
16 |     new = HtmlElement(
17 |         "span",
18 |         display=Display.inline,
19 |         prefix=" ",
20 |         suffix=" ",
21 |         limit_whitespace_affixes=True,
22 |     )
23 |     pre = HtmlElement("pre", display=Display.block, whitespace=WhiteSpace.pre)
24 |     code = HtmlElement("code")
25 | 
26 |     # refinement with pre and whitespaces
27 |     refined = pre.get_refined_html_element(copy(new))
28 |     assert refined.prefix == ""
29 |     assert refined.suffix == ""
30 | 
31 |     # refinement with code and whitespaces
32 |     refined = code.get_refined_html_element(copy(new))
33 |     assert refined.prefix == " "
34 |     assert refined.suffix == " "
35 | 
36 |     # refinement with pre and non-whitespaces
37 |     new.prefix = " 1. "
38 |     new.suffix = "<"
39 |     refined = pre.get_refined_html_element(copy(new))
40 |     assert refined.prefix == " 1. "
41 |     assert refined.suffix == "<"
42 | 
43 |     # refinement with code and non-whitespaces
44 |     refined = code.get_refined_html_element(copy(new))
45 |     assert refined.prefix == " 1. "
46 |     assert refined.suffix == "<"
47 | 
48 | 
49 | def test_limit_whitespace_affixes():
50 |     html = """<html>
51 |                  <body>
52 |                    hallo<span>echo</span>
53 |                    <pre>
54 | def <span>hallo</span>():
55 |    print("echo")
56 |                    </pre>
57 |                  </body>
58 |                 </html>
59 |             """
60 |     config = ParserConfig(css=RELAXED_CSS_PROFILE)
61 |     assert (
62 |         get_text(html, config).strip() == "hallo echo\n\n"
63 |         "def hallo():\n"
64 |         '   print("echo")'
65 |     )
66 | 


--------------------------------------------------------------------------------
/tests/test_list_div.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # encoding: utf-8
 3 | 
 4 | """ ensures that <div> elements are rendered as separate
 5 |     lines, both at the top level and when nested within
 6 |     list items.
 7 | """
 8 | 
 9 | from inscriptis import get_text
10 | from inscriptis.css_profiles import CSS_PROFILES
11 | from inscriptis.model.config import ParserConfig
12 | 
13 | config = ParserConfig(css=CSS_PROFILES["strict"])
14 | 
15 | 
16 | def test_divs():
17 |     html = "<body>Thomas<div>Anton</div>Maria</body>"
18 |     assert get_text(html, config) == "Thomas\nAnton\nMaria"
19 | 
20 |     html = "<body>Thomas<div>Anna <b>läuft</b> weit weg.</div>"
21 |     assert get_text(html, config) == "Thomas\nAnna läuft weit weg."
22 | 
23 |     html = "<body>Thomas <ul><li><div>Anton</div>Maria</ul></body>"
24 |     assert get_text(html, config) == "Thomas\n  * Anton\n    Maria"
25 | 
26 |     html = "<body>Thomas <ul><li>  <div>Anton</div>Maria</ul></body>"
27 |     assert get_text(html, config) == "Thomas\n  * Anton\n    Maria"
28 | 
29 |     html = "<body>Thomas <ul><li> a  <div>Anton</div>Maria</ul></body>"
30 |     assert get_text(html, config) == "Thomas\n  * a\n    Anton\n    Maria"
31 | 


--------------------------------------------------------------------------------
/tests/test_margin_before_at_start.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | """ ensures that the top margin of an element at the very
 4 |     start of a document does not yield leading empty lines,
 5 |     while a leading <br> is preserved.
 6 | """
 7 | 
 8 | from inscriptis import get_text
 9 | 
10 | 
11 | def test_content():
12 |     html = "<html><body>first</body></html>"
13 |     assert get_text(html) == "first"
14 | 
15 | 
16 | def test_margin_before():
17 |     html = "<html><body><p>first</p></body></html>"
18 |     assert get_text(html) == "first\n"
19 | 
20 |     html = "<html><body>first<p>" "second</p></body></html>"
21 |     assert get_text(html) == "first\n\nsecond\n"
22 | 
23 | 
24 | def test_br():
25 |     html = "<html><body><br>" "first</p></body></html>"
26 |     assert get_text(html) == "\nfirst"
27 | 


--------------------------------------------------------------------------------
/tests/test_margin_handling.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # encoding: utf-8
 3 | 
 4 | """
 5 | Tests the handling of (nested) top and bottom margins.
 6 | """
 7 | 
 8 | from inscriptis import get_text
 9 | from inscriptis.css_profiles import CSS_PROFILES
10 | from inscriptis.model.config import ParserConfig
11 | 
12 | config = ParserConfig(css=CSS_PROFILES["strict"])
13 | 
14 | 
15 | def test_margin_handling():
16 |     html = """<body>Hallo
17 |                      <div style="margin-top: 1em; margin-bottom: 1em">Echo
18 |                          <div style="margin-top: 2em">Mecho</div>
19 |                      </div>
20 |                      sei Gott
21 |                </body>"""
22 |     assert get_text(html, config) == "Hallo\n\nEcho\n\n\nMecho\n\nsei Gott"
23 | 
24 |     html = """<body>Hallo
25 |                      <div style="margin-top: 1em; margin-bottom: 1em">Echo</div>
26 |                          <div style="margin-top: 2em">Mecho</div>
27 |                      sei Gott
28 |                </body>"""
29 |     assert get_text(html, config) == "Hallo\n\nEcho\n\n\nMecho\nsei Gott"
30 | 
31 |     html = """<body>Hallo
32 |                      <div style="margin-top: 1em; margin-bottom: 1em">
33 |                          <div style="margin-top: 2em">Ehre</div>
34 |                     </div>
35 |                     sei Gott
36 |                </body>"""
37 |     assert get_text(html, config) == "Hallo\n\n\nEhre\n\nsei Gott"
38 | 


--------------------------------------------------------------------------------
/tests/test_metadata.py:
--------------------------------------------------------------------------------
 1 | from inscriptis.metadata import (
 2 |     __author__,
 3 |     __author_email__,
 4 |     __copyright__,
 5 |     __license__,
 6 |     __version__,
 7 | )
 8 | 
 9 | 
10 | def test_metadata():
11 |     """Test inscriptis package metadata."""
12 |     assert "Albert Weichselbraun" in __author__
13 |     assert "Fabian Odoni" in __author__
14 | 
15 |     assert "Albert Weichselbraun" in __copyright__
16 |     assert "Fabian Odoni" in __copyright__
17 | 
18 |     assert "@" in __author_email__
19 |     assert __license__ == "Apache-2.0"
20 |     assert __version__[0].isnumeric()
21 |     assert "." in __version__
22 | 


--------------------------------------------------------------------------------
/tests/test_model_html_element_canvas.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # encoding: utf-8
 3 | 
 4 | """
 5 | Tests the formatting of HtmlElements (margins, bullets, padding and affixes) on a Canvas.
 6 | """
 7 | 
 8 | from inscriptis.model.canvas import Canvas
 9 | from inscriptis.model.html_element import HtmlElement
10 | from inscriptis.html_properties import Display
11 | 
12 | 
13 | def _get_text(html_element):
14 |     """
15 |     Returns
16 |         the text formatted based on the current HTML element.
17 |     """
18 |     c = Canvas()
19 |     html_element.canvas = c
20 | 
21 |     HtmlElement().set_canvas(c).write("first")
22 | 
23 |     c.open_tag(html_element)
24 |     html_element.write("Ehre sei Gott!")
25 |     c.close_tag(html_element)
26 | 
27 |     HtmlElement().set_canvas(c).write("last")
28 |     c.flush_inline()
29 |     return "\n".join(c.blocks)
30 | 
31 | 
32 | def test_formatting():
33 |     # standard line
34 | 
35 |     h = HtmlElement()
36 |     assert _get_text(h) == "firstEhre sei Gott!last"
37 | 
38 |     h.display = Display.block
39 |     h.margin_before = 1
40 |     h.margin_after = 2
41 |     print(h)
42 |     print(_get_text(h))
43 |     assert _get_text(h) == "first\n\nEhre sei Gott!\n\n\nlast"
44 | 
45 |     # list bullet without padding_inline
46 |     h.list_bullet = "* "
47 |     assert _get_text(h) == "first\n\n* Ehre sei Gott!\n\n\nlast"
48 | 
49 |     # add a padding_inline
50 |     h.padding_inline = 3
51 |     assert _get_text(h) == "first\n\n * Ehre sei Gott!\n\n\nlast"
52 | 
53 |     # and prefixes + suffixes
54 |     h.prefix = ">>"
55 |     h.suffix = "<<"
56 |     assert _get_text(h) == "first\n\n * >>Ehre sei Gott!<<\n\n\nlast"
57 | 


--------------------------------------------------------------------------------
/tests/test_model_prefix.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # encoding: utf-8
 3 | 
 4 | """
 5 | Tests the Prefix class that handles indentation and list bullets.
 6 | """
 7 | 
 8 | from inscriptis.model.canvas import Prefix
 9 | 
10 | 
11 | def test_simple_prefix():
12 |     p = Prefix()
13 | 
14 |     p.register_prefix(5, "1. ")
15 | 
16 |     # first use
17 |     assert p.first == "  1. "
18 | 
19 |     # the prefix has been consumed
20 |     assert p.first == ""
21 | 
22 |     # prefix used to indent lines separated with newlines
23 |     assert p.rest == "     "
24 | 
25 | 
26 | def test_combined_prefix():
27 |     p = Prefix()
28 | 
29 |     p.register_prefix(5, "1. ")
30 |     p.register_prefix(2, "")
31 | 
32 |     assert p.first == "    1. "
33 |     assert p.first == ""
34 | 
35 |     p.remove_last_prefix()
36 |     assert p.first == ""
37 | 
38 |     p.remove_last_prefix()
39 |     # final consumption - no prefix
40 |     assert p.first == ""
41 | 
42 |     # ensure that there are no interactions between different runs with
43 |     # bullets
44 |     p.consumed = False
45 |     p.register_prefix(5, "2. ")
46 |     p.register_prefix(2, "- ")
47 | 
48 |     assert p.first == "     - "
49 |     assert p.first == ""
50 |     assert p.rest == "       "
51 | 
52 |     p.consumed = False
53 |     p.remove_last_prefix()
54 |     assert p.first == "  2. "
55 |     assert p.rest == "     "
56 | 


--------------------------------------------------------------------------------
/tests/test_parse_css.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # encoding: utf-8
 3 | 
 4 | """
 5 | Tests HtmlElement and the parsing of CSS style definitions
 6 | """
 7 | 
 8 | from copy import copy
 9 | from inscriptis.css_profiles import CSS_PROFILES
10 | from inscriptis.html_properties import (
11 |     Display,
12 |     WhiteSpace,
13 |     VerticalAlignment,
14 |     HorizontalAlignment,
15 | )
16 | from inscriptis.model.css import CssParse
17 | from inscriptis.model.html_element import HtmlElement
18 | 
19 | 
20 | def test_css_parsing():
21 |     html_element = copy(CSS_PROFILES["strict"]["div"])
22 |     CssParse.attr_style("padding_left: 8px; display: block", html_element)
23 |     assert html_element.padding_inline == 1
24 |     assert html_element.display == Display.block
25 | 
26 |     CssParse.attr_style("margin_before: 8em; display: inline", html_element)
27 |     assert html_element.margin_before == 8
28 |     assert html_element.display == Display.inline
29 | 
30 | 
31 | def test_html_element_str():
32 |     """
33 |     Tests the string representation of an HtmlElement.
34 |     """
35 |     html_element = HtmlElement(
36 |         "div", "", "", Display.inline, 0, 0, 0, "", WhiteSpace.pre
37 |     )
38 |     assert str(html_element) == (
39 |         "<div prefix=, suffix=, "
40 |         "display=Display.inline, margin_before=0, "
41 |         "margin_after=0, padding_inline=0, "
42 |         "list_bullet=, "
43 |         "whitespace=WhiteSpace.pre, "
44 |         "align=HorizontalAlignment.left, "
45 |         "valign=VerticalAlignment.middle, "
46 |         "annotation=()>"
47 |     )
48 | 
49 | 
50 | def test_parse_vertical_align():
51 |     html_element = HtmlElement()
52 |     CssParse.attr_vertical_align("top", html_element)
53 |     assert html_element.valign == VerticalAlignment.top
54 | 
55 |     # invalid value
56 |     CssParse.attr_vertical_align("unknown", html_element)
57 |     assert html_element.valign == VerticalAlignment.top
58 | 
59 | 
60 | def test_parse_horizontal_align():
61 |     html_element = HtmlElement()
62 |     CssParse.attr_horizontal_align("center", html_element)
63 |     assert html_element.align == HorizontalAlignment.center
64 | 
65 |     # invalid value
66 |     CssParse.attr_horizontal_align("unknown", html_element)
67 |     assert html_element.align == HorizontalAlignment.center
68 | 


--------------------------------------------------------------------------------
/tests/test_strip_xml_header.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | """ ensures that xml declaration headers are correctly stripped"""
 4 | 
 5 | from inscriptis import get_text
 6 | 
 7 | 
 8 | def test_successive_a():
 9 |     html = '<?xml version="1.0" encoding="UTF-8" ?> Hallo?>'
10 |     assert get_text(html).strip() == "Hallo?>"
11 | 


--------------------------------------------------------------------------------
/tests/test_style_parsing.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | """
 4 | Tests inscriptis' parsing of CSS style definitions.
 5 | """
 6 | 
 7 | from inscriptis.model.css import CssParse
 8 | from inscriptis.model.html_element import HtmlElement
 9 | 
10 | 
11 | def test_style_unit_parsing():
12 |     html_element = HtmlElement()
13 |     CssParse.attr_style(
14 |         "margin-top:2.666666667em;margin-bottom: 2.666666667em", html_element
15 |     )
16 |     assert html_element.margin_before == 3
17 |     assert html_element.margin_after == 3
18 | 


--------------------------------------------------------------------------------
/tests/test_table_cell.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # encoding: utf-8
 3 | 
 4 | """
 5 | Tests the Table formatting with different parameters such as width and
 6 | alignment
 7 | """
 8 | 
 9 | from inscriptis.model.table import TableCell
10 | from inscriptis.html_properties import HorizontalAlignment, VerticalAlignment
11 | 
12 | 
13 | def test_height():
14 |     cell = TableCell(HorizontalAlignment.left, VerticalAlignment.top)
15 | 
16 |     cell.blocks = ["hallo"]
17 |     cell.normalize_blocks()
18 |     assert cell.height == len("\n".join(cell.blocks).split("\n"))
19 | 
20 |     cell.blocks = ["hallo", "echo"]
21 |     cell.normalize_blocks()
22 |     assert cell.height == 2
23 | 
24 |     cell.blocks = ["hallo\necho"]
25 |     cell.normalize_blocks()
26 |     assert cell.height == 2
27 | 
28 |     cell.blocks = ["hallo\necho", "Ehre sei Gott", "Jump\n&\nRun!\n\n\n"]
29 |     cell.normalize_blocks()
30 |     assert cell.height == 9
31 |     assert cell.height == len("\n".join(cell.blocks).split("\n"))
32 | 
33 | 
34 | def test_width():
35 |     cell = TableCell(HorizontalAlignment.left, VerticalAlignment.top)
36 | 
37 |     cell.blocks = ["hallo"]
38 |     cell.normalize_blocks()
39 |     assert cell.width == len(cell.blocks[0])
40 | 
41 |     cell.blocks = ["hallo\necho", "Ehre sei Gott", "Jump\n&\nRun!\n\n\n"]
42 |     cell.normalize_blocks()
43 |     assert cell.width == len("Ehre sei Gott")
44 | 
45 |     # fixed set width
46 |     cell.width = 95
47 |     cell.normalize_blocks()
48 |     assert cell.width == 95
49 | 


--------------------------------------------------------------------------------
/tests/test_table_cell_formatting.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # encoding: utf-8
 3 | 
 4 | """
 5 | Tests the Table formatting with different parameters such as width and
 6 | alignment
 7 | """
 8 | 
 9 | from inscriptis.model.table import TableCell
10 | from inscriptis.html_properties import HorizontalAlignment, VerticalAlignment
11 | 
12 | 
13 | def test_horizontal_cell_formatting():
14 |     cell = TableCell(align=HorizontalAlignment.left, valign=VerticalAlignment.top)
15 |     # left alignment
16 |     cell.blocks = ["Ehre sei Gott!"]
17 |     cell.width = 16
18 |     assert cell.blocks == ["Ehre sei Gott!  "]
19 | 
20 |     # right alignment
21 |     cell.align = HorizontalAlignment.right
22 |     cell.blocks = ["Ehre sei Gott!"]
23 |     cell.width = 16
24 |     assert cell.blocks == ["  Ehre sei Gott!"]
25 | 
26 | 
27 | def test_vertical_cell_formatting():
28 |     cell = TableCell(align=HorizontalAlignment.left, valign=VerticalAlignment.top)
29 | 
30 |     # default top alignment
31 |     cell.blocks = ["Ehre sei Gott!"]
32 |     cell.width = 16
33 |     cell.height = 4
34 |     assert cell.blocks == ["Ehre sei Gott!  ", "", "", ""]
35 | 
36 |     # bottom alignment
37 |     cell.blocks = ["Ehre sei Gott!"]
38 |     cell.valign = VerticalAlignment.bottom
39 |     cell.width = 16
40 |     cell.height = 4
41 |     assert cell.blocks == ["", "", "", "Ehre sei Gott!  "]
42 | 
43 |     # middle alignment
44 |     cell.blocks = ["Ehre sei Gott!"]
45 |     cell.valign = VerticalAlignment.middle
46 |     cell.width = 16
47 |     cell.height = 4
48 |     assert cell.blocks == ["", "Ehre sei Gott!  ", "", ""]
49 | 


--------------------------------------------------------------------------------
/tests/test_table_row.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # encoding: utf-8
 3 | 
 4 | """
 5 | Test borderline cases for table rows
 6 | """
 7 | 
 8 | from inscriptis import get_text
 9 | from inscriptis.model.config import ParserConfig
10 | from inscriptis.model.table import TableRow
11 | 
12 | 
13 | def test_empty_row():
14 |     tr = TableRow(cell_separator="   ")
15 | 
16 |     assert tr.width == 0
17 |     assert tr.get_text() == ""
18 | 
19 | 
20 | def test_table_cell_separator():
21 |     html = "<html><body><table><tr><td>Hallo<br>Eins</td><td>Echo<br>Zwei</td></tr></table></html>"
22 | 
23 |     config = ParserConfig()
24 |     assert get_text(html, config) == "Hallo  Echo\nEins   Zwei\n"
25 | 
26 |     config = ParserConfig(table_cell_separator="\t")
27 |     assert get_text(html, config) == "Hallo\tEcho\nEins \tZwei\n"
28 | 


--------------------------------------------------------------------------------
/tests/test_web_service.py:
--------------------------------------------------------------------------------
 1 | import pytest
 2 | from fastapi.testclient import TestClient
 3 | from inscriptis.service.web import app
 4 | from inscriptis.metadata import __version__
 5 | 
 6 | 
 7 | @pytest.fixture
 8 | def client():
 9 |     return TestClient(app)
10 | 
11 | 
12 | def test_index(client):
13 |     response = client.get("/")
14 |     assert response.status_code == 200
15 |     assert response.text == "Inscriptis text to HTML Web service."
16 | 
17 | 
18 | def test_get_text_call_with_content_type(client):
19 |     html_content = "<html><body>Österliche Freuden!</body></html>"
20 |     response = client.post(
21 |         "/get_text",
22 |         content=html_content,
23 |         headers={"Content-type": "text/html; charset=UTF-8"},
24 |     )
25 |     assert response.status_code == 200
26 |     assert response.text == "Österliche Freuden!"
27 | 
28 | 
29 | def test_get_text_call_without_content_type(client):
30 |     html_content = "<html><body>Hello World!</body></html>"
31 |     response = client.post(
32 |         "/get_text",
33 |         content=html_content,
34 |         headers={"Content-type": "text/html"},
35 |     )
36 |     assert response.status_code == 200
37 |     assert response.text == "Hello World!"
38 | 
39 | 
40 | def test_get_version_call(client):
41 |     response = client.get("/version")
42 |     assert response.status_code == 200
43 |     assert response.text == __version__
44 | 


--------------------------------------------------------------------------------
/tests/test_white_space_handling.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # encoding: utf-8
 3 | 
 4 | """
 5 | Tests different white-space handling.
 6 | """
 7 | 
 8 | from inscriptis import get_text
 9 | from inscriptis.css_profiles import CSS_PROFILES
10 | from inscriptis.model.config import ParserConfig
11 | 
12 | config = ParserConfig(css=CSS_PROFILES["strict"])
13 | 
14 | 
15 | def test_white_space():
16 |     html = '<body><span style="white-space: normal"><i>1</i>2\n3</span>' "</body>"
17 |     assert get_text(html, config) == "12 3"
18 | 
19 |     html = '<body><span style="white-space: nowrap"><i>1</i>2\n3</span>' "</body>"
20 |     assert get_text(html, config) == "12 3"
21 | 
22 |     html = '<body><span style="white-space: pre"><i>1</i>2\n3</span>' "</body>"
23 |     assert get_text(html, config) == "12\n3"
24 | 
25 |     html = '<body><span style="white-space: pre-line"><i>1</i>2\n3</span>' "</body>"
26 |     assert get_text(html, config) == "12\n3"
27 | 
28 |     html = '<body><span style="white-space: pre-wrap"><i>1</i>2\n3</span>' "</body>"
29 |     assert get_text(html, config) == "12\n3"
30 | 
31 | 
32 | def test_borderline_cases():
33 |     """
34 |     testing of borderline cases based on the behavior found in Firefox and
35 |     Google Chrome.
36 |     """
37 |     # change of whitespace handling between terms; no whitespace
38 |     # between the terms
39 |     html = '<body>Hallo<span style="white-space: pre">echo</span> versus'
40 |     assert get_text(html, config) == "Halloecho versus"
41 | 
42 |     # change of whitespace handling between terms; one whitespace
43 |     # between the terms; option 1
44 |     html = '<body>Hallo<span style="white-space: pre"> echo</span> versus'
45 |     assert get_text(html, config) == "Hallo echo versus"
46 | 
47 |     # change of whitespace handling between terms; one whitespace
48 |     # between the terms; option 2
49 |     html = '<body>Hallo <span style="white-space: pre">echo</span> versus'
50 |     assert get_text(html, config) == "Hallo echo versus"
51 | 
52 |     # change of whitespace handling between terms; two whitespaces
53 |     # between the terms
54 |     html = '<body>Hallo <span style="white-space: pre"> echo</span> versus'
55 |     assert get_text(html, config) == "Hallo  echo versus"
56 | 
57 |     # change of whitespace handling between terms; multiple whitespaces
58 |     # between the terms
59 |     html = '<body>Hallo   <span style="white-space: pre"> echo</span> versus'
60 |     assert get_text(html, config) == "Hallo  echo versus"
61 | 
62 |     # change of whitespace handling between terms; multiple whitespaces
63 |     # between the terms
64 |     html = '<body>Hallo   <span style="white-space: pre">   echo</span> versus'
65 |     assert get_text(html, config) == "Hallo    echo versus"
66 | 
67 | 
68 | def test_tail():
69 |     """
70 |     ensure that the tail elements are formatted based on the container element.
71 |     """
72 |     html = '<body>Hi<span style="white-space: pre"> 1   3 </span>' " versus 1   3"
73 |     assert get_text(html, config) == "Hi 1   3  versus 1 3"
74 | 


--------------------------------------------------------------------------------
/tox.ini:
--------------------------------------------------------------------------------
 1 | [tox]
 2 | envlist = pytest, pyroma, flake8
 3 | 
 4 | # standard unit tests
 5 | [testenv:pytest]
 6 | deps = pytest ~= 7.4.4
 7 |        pytest-cov ~= 4.1.0
 8 |        fastapi ~= 0.109.2
 9 |        httpx ~= 0.26.0
10 | commands = pytest --cov-config=.coveragerc --cov=inscriptis ./tests
11 | 
12 | # python packaging best practices
13 | [testenv:pyroma]
14 | deps = pyroma
15 | commands = pyroma .
16 | 
17 | [testenv:flake8]
18 | deps = flake8 ~= 7.0.0
19 |        dlint ~= 0.14.1
20 |        flake8-bandit ~= 4.1.1
21 |        flake8-blind-except ~= 0.2.1
22 |        flake8-bugbear ~= 24.2.6
23 |        flake8-builtins ~= 2.2.0
24 |        flake8-cognitive-complexity ~= 0.1.0
25 |        flake8-colors ~= 0.1.9
26 |        flake8-comprehensions ~= 3.14.0
27 |        flake8-docstrings ~= 1.7.0
28 |        flake8-eradicate ~= 1.5.0
29 |        flake8-encodings ~= 0.5.1
30 |        flake8-expression-complexity ~= 0.0.11
31 |        flake8-logging-format ~= 0.9.0
32 |        flake8-mutable ~= 1.2.0
33 |        flake8-pie ~= 0.16.0
34 |        flake8-pytest ~= 1.4
35 |        flake8-raise ~= 0.0.5
36 |        flake8-simplify ~= 0.21.0
37 |        flake8-string-format ~= 0.3.0
38 |        flake8-tuple ~= 0.4.1
39 |        flake8-use-pathlib ~= 0.3.0
40 |        flake8-warnings ~= 0.4.1
41 |        pep8-naming ~= 0.13.3
42 | 
43 | # S104 - bind to all IPs is okay in the case of the Web service, since it is
44 | #        aimed for use with docker.
45 | # S410 - do not cleanup XML data prior to processing
46 | # W503 - replaced with W504
47 | # D102 - missing docstring in public method
48 | # D105 - missing docstring in magic method (e.g., __str__)
49 | # D107 - missing docstring in __init__
50 | # E203, E704 black
51 | commands = flake8 --exclude=".tox, setup.py, tests, venv, docs, benchmarking, build" \
52 |               --show-source \
53 |               --max-line-length=88 \
54 |               --ignore="DUO107, W503, D107, D105, D102, S104, S410, E203, E704" \
55 |               --max-cognitive-complexity=13
56 | 
57 | #              --ignore="S104, S410, W503, D107, D105, D102" \
58 | #	      --enable-extensions=G \
59 | #	      --max-cognitive-complexity=13
60 | 


--------------------------------------------------------------------------------