├── doc2dict
│ ├── doc2dict
│ │ ├── pdf
│ │ │ ├── __init__.py
│ │ │ ├── mapping.py
│ │ │ ├── __pycache__
│ │ │ │ ├── utils.cpython-313.pyc
│ │ │ │ ├── mapping.cpython-313.pyc
│ │ │ │ ├── __init__.cpython-313.pyc
│ │ │ │ ├── pdf2dict.cpython-313.pyc
│ │ │ │ ├── pdf_utils.cpython-313.pyc
│ │ │ │ └── convert_pdf_to_instructions.cpython-313.pyc
│ │ │ ├── pdf2dict.py
│ │ │ ├── convert_pdf_to_instructions.py
│ │ │ ├── pdf_utils.py
│ │ │ └── utils.py
│ │ ├── txt
│ │ │ ├── __init__.py
│ │ │ ├── __pycache__
│ │ │ │ ├── parser.cpython-313.pyc
│ │ │ │ └── __init__.cpython-313.pyc
│ │ │ ├── convert_txt_to_instructions.py
│ │ │ └── txt2dict.py
│ │ ├── xml
│ │ │ ├── __init__.py
│ │ │ ├── __pycache__
│ │ │ │ ├── mapping.cpython-311.pyc
│ │ │ │ ├── parser.cpython-311.pyc
│ │ │ │ ├── parser.cpython-313.pyc
│ │ │ │ ├── __init__.cpython-311.pyc
│ │ │ │ ├── __init__.cpython-313.pyc
│ │ │ │ └── mapping_dicts.cpython-311.pyc
│ │ │ └── parser.py
│ │ ├── html
│ │ │ ├── __init__.py
│ │ │ ├── __pycache__
│ │ │ │ ├── __init__.cpython-313.pyc
│ │ │ │ ├── html2dict.cpython-313.pyc
│ │ │ │ ├── mapping.cpython-313.pyc
│ │ │ │ ├── visualize_dict.cpython-313.pyc
│ │ │ │ ├── visualize_instructions.cpython-313.pyc
│ │ │ │ ├── convert_html_to_instructions.cpython-313.pyc
│ │ │ │ └── convert_instructions_to_dict.cpython-313.pyc
│ │ │ ├── mapping.py
│ │ │ ├── html2dict.py
│ │ │ ├── visualize_instructions.py
│ │ │ ├── visualize_dict.py
│ │ │ └── convert_html_to_instructions.py
│ │ ├── utils
│ │ │ ├── __init__.py
│ │ │ ├── __pycache__
│ │ │ │ ├── utils.cpython-313.pyc
│ │ │ │ ├── __init__.cpython-313.pyc
│ │ │ │ └── storage.cpython-313.pyc
│ │ │ ├── strings.py
│ │ │ ├── utils.py
│ │ │ └── format_dict.py
│ │ ├── __pycache__
│ │ │ ├── utils.cpython-313.pyc
│ │ │ ├── __init__.cpython-313.pyc
│ │ │ ├── dict2dict.cpython-313.pyc
│ │ │ └── mapping.cpython-313.pyc
│ │ ├── __init__.py
│ │ ├── dict2dict.py
│ │ ├── convert_instructions_to_dict.py
│ │ └── mapping.py
│ ├── doc2dict.egg-info
│ │ ├── dependency_links.txt
│ │ ├── top_level.txt
│ │ ├── requires.txt
│ │ ├── PKG-INFO
│ │ └── SOURCES.txt
│ ├── setup.py
│ └── docs
│   ├── docs
│   │ ├── index.md
│   │ ├── parsing
│   │ │ ├── todo.md
│   │ │ ├── pdf.md
│   │ │ └── html.md
│   │ └── whitepaper.md
│   └── mkdocs.yml
├── .gitignore
├── contributors.md
├── .github
│ └── workflows
│   ├── deploy-docs.yml
│   └── build_wheels.yml
├── LICENSE
├── readme.md
└── example_output
  └── html
    ├── unnest.txt
    └── levels.txt
/doc2dict/doc2dict/pdf/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/doc2dict/doc2dict/txt/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/doc2dict/doc2dict/xml/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/doc2dict/doc2dict/html/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/doc2dict/doc2dict/utils/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | *__pycache__/*
2 | data/*
3 | *.pyc
4 |
--------------------------------------------------------------------------------
/doc2dict/doc2dict.egg-info/dependency_links.txt:
--------------------------------------------------------------------------------
1 |
2 |
--------------------------------------------------------------------------------
/doc2dict/doc2dict.egg-info/top_level.txt:
--------------------------------------------------------------------------------
1 | doc2dict
2 |
--------------------------------------------------------------------------------
/doc2dict/doc2dict.egg-info/requires.txt:
--------------------------------------------------------------------------------
1 | selectolax
2 | xmltodict
3 |
--------------------------------------------------------------------------------
/contributors.md:
--------------------------------------------------------------------------------
1 | * John Friedman
2 | * Benedetto Leto
3 | * Rian Dolphin
4 | * Gal Skarishevsky
--------------------------------------------------------------------------------
/doc2dict/doc2dict/pdf/mapping.py:
--------------------------------------------------------------------------------
1 | pdf_base_mapping_dict = {
2 | 'rules': {'use_font_size_only_for_level':True}
3 | }
--------------------------------------------------------------------------------
/doc2dict/doc2dict.egg-info/PKG-INFO:
--------------------------------------------------------------------------------
1 | Metadata-Version: 2.4
2 | Name: doc2dict
3 | Version: 0.2.6
4 | Requires-Python: >=3.8
5 | Requires-Dist: selectolax
6 | Requires-Dist: xmltodict
7 | Dynamic: requires-dist
8 | Dynamic: requires-python
9 |
--------------------------------------------------------------------------------
/doc2dict/setup.py:
--------------------------------------------------------------------------------
1 | from setuptools import setup, find_packages
2 |
3 | setup(
4 | name="doc2dict",
5 | version="0.4.9",
6 | packages=find_packages(),
7 | install_requires=['selectolax','xmltodict','pypdfium2'
8 | ]
9 | )
--------------------------------------------------------------------------------
/doc2dict/doc2dict/html/mapping.py:
--------------------------------------------------------------------------------
1 | # This will be heavily reworked in the future.
2 |
3 | dict_10k_html = {
4 | ('part',r'^part\s*([ivx]+)$') : 0,
5 | ('signatures',r'^signatures?\.*$') : 0,
6 | ('item',r'^item\s*(\d+)\.?([a-z])?') : 1,
7 | }
--------------------------------------------------------------------------------
/doc2dict/docs/docs/index.md:
--------------------------------------------------------------------------------
1 | # Welcome to doc2dict
2 |
3 | doc2dict is a package to quickly parse documents in `pdf`, `html`, `xml`, and `txt` formats. It supports the [datamule](https://github.com/john-friedman/datamule-python) project.
4 |
5 | ???+ warning "Package is in early development"
--------------------------------------------------------------------------------
/doc2dict/docs/docs/parsing/todo.md:
--------------------------------------------------------------------------------
1 | # TODO
2 | * Separate instructions into their own directory
3 | * add rules option to relax header restrictions, e.g. useful for form 10-D https://www.sec.gov/Archives/edgar/data/1766367/000188852425005427/dma19b10_10d-202503.htm
4 | * modify base pdf parsing mapping dict to use underline - then test with APP NTC form.
--------------------------------------------------------------------------------
/doc2dict/doc2dict.egg-info/SOURCES.txt:
--------------------------------------------------------------------------------
1 | setup.py
2 | doc2dict/__init__.py
3 | doc2dict/dict2dict.py
4 | doc2dict/mapping.py
5 | doc2dict.egg-info/PKG-INFO
6 | doc2dict.egg-info/SOURCES.txt
7 | doc2dict.egg-info/dependency_links.txt
8 | doc2dict.egg-info/requires.txt
9 | doc2dict.egg-info/top_level.txt
10 | doc2dict/txt/__init__.py
11 | doc2dict/txt/parser.py
12 | doc2dict/xml/__init__.py
13 | doc2dict/xml/parser.py
--------------------------------------------------------------------------------
/doc2dict/doc2dict/html/html2dict.py:
--------------------------------------------------------------------------------
1 | from .convert_html_to_instructions import convert_html_to_instructions
2 | from ..convert_instructions_to_dict import convert_instructions_to_dict
3 | from selectolax.parser import HTMLParser
4 | def html2dict(content,mapping_dict=None):
5 | parser = HTMLParser(content)
6 |
7 | body = parser.body
8 | instructions = convert_html_to_instructions(body)
9 | dct = convert_instructions_to_dict(instructions, mapping_dict)
10 | return dct
--------------------------------------------------------------------------------
/doc2dict/doc2dict/pdf/pdf2dict.py:
--------------------------------------------------------------------------------
1 | from .convert_pdf_to_instructions import convert_pdf_to_instructions
2 | from ..convert_instructions_to_dict import convert_instructions_to_dict
3 | from .mapping import pdf_base_mapping_dict
4 | def pdf2dict(content,mapping_dict=None):
5 | instructions = convert_pdf_to_instructions(content)
6 | if mapping_dict is None:
7 | mapping_dict=pdf_base_mapping_dict
8 | dct = convert_instructions_to_dict(instructions, mapping_dict)
9 | return dct
--------------------------------------------------------------------------------
/doc2dict/doc2dict/__init__.py:
--------------------------------------------------------------------------------
1 | from .xml.parser import xml2dict
2 | from .txt.txt2dict import txt2dict
3 | from .dict2dict import dict2dict
4 |
5 | from .html.convert_html_to_instructions import convert_html_to_instructions
6 | from .convert_instructions_to_dict import convert_instructions_to_dict
7 | from .html.visualize_instructions import visualize_instructions
8 | from .html.visualize_dict import visualize_dict
9 | from .html.html2dict import html2dict
10 |
11 | from .pdf.pdf2dict import pdf2dict
12 |
13 | from .utils.utils import get_title
14 | from .utils.format_dict import unnest_dict, flatten_dict
--------------------------------------------------------------------------------
/doc2dict/docs/mkdocs.yml:
--------------------------------------------------------------------------------
1 | site_name: doc2dict
2 |
3 | theme:
4 | name: material
5 | palette:
6 | primary: indigo
7 | accent: indigo
8 | features:
9 | - navigation.instant
10 | - navigation.tracking
11 | - navigation.expand
12 | - content.code.copy
13 |
14 | nav:
15 | - Home: index.md
16 | - doc2dict:
17 | - html: parsing/html.md
18 | - pdf: parsing/pdf.md
19 | - White Paper: whitepaper.md
20 |
21 | markdown_extensions:
22 | - pymdownx.superfences
23 | - pymdownx.highlight
24 | - def_list
25 | - admonition
26 | - pymdownx.details
27 | - toc:
28 | permalink: true
29 |
30 | repo_url: https://github.com/john-friedman/doc2dict
31 | repo_name: john-friedman/doc2dict
--------------------------------------------------------------------------------
/doc2dict/doc2dict/utils/strings.py:
--------------------------------------------------------------------------------
1 | import re
2 | def check_string_style(text):
3 | if not text or not text.strip():
4 | return {}
5 |
6 | styles = {}
7 |
8 | if text.isupper():
9 | styles['all_caps'] = True
10 | else:
11 | # Stop words that can be lowercase in proper case
12 | stop_words = r'\b(and|or|of|the|in|on|at|to|for|with|by|a|an)\b'
13 |
14 | # Replace stop words with placeholder, check if remaining words are proper case
15 | text_no_stops = re.sub(stop_words, 'STOP', text, flags=re.IGNORECASE)
16 |
17 | # Check if all non-stop words start with capital and have at least one capital
18 | if re.match(r'^[A-Z][a-zA-Z]*(\s+(STOP|[A-Z][a-zA-Z]*))*$', text_no_stops) and re.search(r'[A-Z]', text):
19 | styles['proper_case'] = True
20 |
21 | return styles
--------------------------------------------------------------------------------
/.github/workflows/deploy-docs.yml:
--------------------------------------------------------------------------------
1 | name: Deploy MkDocs
2 |
3 | on:
4 | push:
5 | branches:
6 | - main
7 | workflow_dispatch: # Allows manual triggering
8 |
9 | permissions:
10 | contents: write
11 |
12 | jobs:
13 | deploy:
14 | runs-on: ubuntu-latest
15 | steps:
16 | - uses: actions/checkout@v3
17 |
18 | - name: Set up Python
19 | uses: actions/setup-python@v4
20 | with:
21 | python-version: '3.x'
22 |
23 | - name: Install dependencies
24 | run: |
25 | python -m pip install --upgrade pip
26 | pip install mkdocs mkdocs-material pymdown-extensions
27 |
28 | - name: Build and deploy MkDocs
29 | run: |
30 | # Navigate to the docs directory
31 | cd doc2dict/docs
32 |
33 | # Build the site
34 | mkdocs build
35 |
36 | # Deploy to GitHub Pages
37 | mkdocs gh-deploy --force
--------------------------------------------------------------------------------
/.github/workflows/build_wheels.yml:
--------------------------------------------------------------------------------
1 | name: Build and Upload to PyPI
2 |
3 | on:
4 | push:
5 | tags:
6 | - 'v*'
7 |
8 | jobs:
9 | deploy:
10 | runs-on: ubuntu-latest
11 | steps:
12 | - uses: actions/checkout@v4
13 |
14 | - name: Set up Python
15 | uses: actions/setup-python@v4
16 | with:
17 | python-version: '3.11'
18 |
19 | - name: Install dependencies
20 | working-directory: ./doc2dict # Added this
21 | run: |
22 | python -m pip install --upgrade pip
23 | pip install setuptools wheel twine
24 |
25 | - name: Build package
26 | working-directory: ./doc2dict # Added this
27 | run: |
28 | python setup.py sdist bdist_wheel
29 |
30 | - name: Upload to PyPI
31 | env:
32 | TWINE_USERNAME: __token__
33 | TWINE_PASSWORD: ${{ secrets.PYPI_API_TOKEN }}
34 | working-directory: ./doc2dict # Added this
35 | run: |
36 | twine upload dist/*
--------------------------------------------------------------------------------
/doc2dict/doc2dict/xml/parser.py:
--------------------------------------------------------------------------------
1 | import xmltodict
2 | from ..mapping import JSONTransformer
3 |
4 | def remove_namespace_and_none(path, key, value):
5 | # Skip this key-value pair if value is None
6 | if value is None:
7 | return None # Return None to exclude this key-value pair
8 |
9 | # Remove xmlns attribute altogether
10 | if key == '@xmlns':
11 | return None
12 |
13 | # Remove namespace from keys
14 | if ':' in key:
15 | # Keep only the part after the last colon
16 | return key.split(':')[-1], value
17 |
18 | return key, value
19 |
20 | def xml2dict(content, mapping_dict=None):
21 | data = xmltodict.parse(
22 | content,
23 | postprocessor=remove_namespace_and_none,
24 | process_namespaces=True, # Handle namespaces
25 | namespaces={}
26 | )
27 |
28 | if mapping_dict is None:
29 | return data
30 |
31 | transformer = JSONTransformer(mapping_dict)
32 | transformed_data = transformer.transform(data)
33 | return transformed_data
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2025 John Friedman
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/doc2dict/doc2dict/txt/convert_txt_to_instructions.py:
--------------------------------------------------------------------------------
1 |
2 | # need to remember how html 2 instructions treats empty lines
3 | # may need to rejig to standardize
4 |
5 | from ..utils.strings import check_string_style
6 |
7 | TAB_SIZE = 4
8 |
9 | def get_left_indent(line):
10 | count = 0
11 | for c in line:
12 | if c == '\t':
13 | count += TAB_SIZE
14 | elif c.isspace() and c not in '\r\n\f\v':
15 | count += 1
16 | else:
17 | break
18 | return count
19 |
20 | def convert_txt_to_instructions(content):
21 | lines = content.split('\n')
22 | instructions_list = []
23 |
24 | for line in lines:
25 | instructions = []
26 | if len(line) != 0:
27 | instruction = {'text':line}
28 | left_indent = get_left_indent(line)
29 | if left_indent != 0:
30 | instruction['left-indent'] = str(left_indent)
31 |
32 | # style
33 | styles = check_string_style(line)
34 | instruction.update(styles)
35 |
36 | instructions.append(instruction)
37 | instructions_list.append(instructions)
38 | else:
39 | instructions_list.append([])
40 |
41 | return instructions_list
--------------------------------------------------------------------------------
/doc2dict/doc2dict/pdf/convert_pdf_to_instructions.py:
--------------------------------------------------------------------------------
1 | import pypdfium2 as pdfium
2 | from .pdf_utils import get_text, get_font_name, get_font, get_font_size
3 | from .utils import get_font_attributes, assign_line, standardize_font_size
4 |
5 |
6 | def convert_pdf_to_instructions(content):
7 |
8 | # Open the PDF
9 | pdf = pdfium.PdfDocument(content)
10 |
11 | instructions_stream = []
12 | # Extract text and font info from each page
13 | for page_index in range(len(pdf)):
14 | page = pdf[page_index]
15 | text_page = page.get_textpage()
16 | page_width = page.get_width()
17 |
18 |
19 | # Get page objects
20 | for obj in page.get_objects():
21 | text = get_text(text_page, obj)
22 | font = get_font(obj)
23 | font_name = get_font_name(font)
24 | font_attributes = get_font_attributes(font_name) # mild duplication
25 |
26 | font_size = get_font_size(obj)
27 |
28 |
29 |
30 | # left bottom right top
31 | coords_tuple = obj.get_pos()
32 |
33 | # let's not add items if the font size is missing
34 | if font_size is None:
35 | continue
36 | else:
37 | instruction = {'text': text} | {'coords': coords_tuple, 'font-size': font_size, 'font-name': font_name} | font_attributes
38 | instructions_stream.append(instruction)
39 |
40 |
41 | # Clean up resources
42 | pdf.close()
43 |
44 | #instructions_stream = standardize_font_size(instructions_stream)
45 | instructions_list = assign_line(instructions_stream)
46 |
47 |
48 | return instructions_list
--------------------------------------------------------------------------------
/doc2dict/docs/docs/parsing/pdf.md:
--------------------------------------------------------------------------------
1 | # PDF
2 |
3 | ???+ warning "Very Early Stage"
4 | This code is in a very early stage.
5 |
6 | ## Quickstart
7 | ```
8 | from doc2dict import pdf2dict
9 | 
10 | # Load your pdf file
9 | with open('apple_10k_2024.pdf','rb') as f:
10 | content = f.read()
11 |
12 | # Convert to dictionary
13 | dct = pdf2dict(content,mapping_dict=None)
14 | ```
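15 | 
16 | If no `mapping_dict` is passed, `pdf2dict` falls back to the package's `pdf_base_mapping_dict`, which currently holds a single rule. A minimal sketch, continuing the quickstart above, of passing it explicitly (equivalent to the default):
17 | ```
18 | from doc2dict import pdf2dict
19 | from doc2dict.pdf.mapping import pdf_base_mapping_dict
20 | 
21 | # the base mapping dict is {'rules': {'use_font_size_only_for_level': True}}
22 | dct = pdf2dict(content, mapping_dict=pdf_base_mapping_dict)
23 | ```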
15 |
16 |
17 | ## Benchmarks
18 | * About 200 pages per second single threaded.
19 |
20 | ???+ warning "multithreading"
21 | pdf2dict can't be run multithreaded due to the limitations of pypdfium2
22 |
23 |
24 | ## Compatibility
25 | Requires PDFs with an underlying text layer, so scanned documents are not supported yet.
26 |
27 | `convert_scan_to_instructions` would be fairly straightforward to implement. Font-size can be inferred from bounding boxes, as can line alignment. Rotation probably won't be an issue for decent scans like the ones submitted to the SEC.
28 |
29 | The issue is performance.
30 |
31 | The point of `doc2dict` is mostly that it's fast. Local OCR such as pytesseract would put a hard cap of 10 pages per second.
32 |
33 | This is too slow to be useful for my use-case. Here's a benchmark.
34 |
35 | **Convert all 2024 Annual Reports to Shareholders to dict form**
36 | 2,000 filings a year * ~50 pages each = ~100,000 pages. At 200 pages per second, the PDF parser takes about 500 seconds, or roughly 8 minutes.
37 | 
38 | Whereas a scan parser capped at 10 pages per second would take at least 10,000 seconds - roughly 3 hours.
39 | 
40 | I think the solution will be to write a scan parser that takes bounding boxes (or whatever minimum features are required) as input. Users can then use their preferred OCR - e.g. local, Google, AWS, etc. - for the slow part.
41 |
42 | ## TODO
43 | * think about tables
44 | * get center
45 | * get other old attributes like indent
46 |
47 | ## Issues
48 | * Adobe PDF encodings return weird characters.
49 |
--------------------------------------------------------------------------------
/doc2dict/doc2dict/utils/utils.py:
--------------------------------------------------------------------------------
1 | import re
2 |
3 | def get_title(dct, title=None, title_regex=None, title_class=None):
4 | results = []
5 |
6 | # Ensure exactly one of title or title_regex is specified
7 | if (title is None and title_regex is None) or (title is not None and title_regex is not None):
8 | raise ValueError("Exactly one of 'title' or 'title_regex' must be specified")
9 |
10 | title_class = title_class.lower() if title_class else None
11 |
12 | if title_regex:
13 | title_pattern = re.compile(title_regex, re.IGNORECASE)
14 | else:
15 | title_lower = title.lower()
16 |
17 | def search(node, parent_id=None):
18 | if isinstance(node, dict):
19 | node_title = node.get('title', '')
20 | node_class = node.get('class', '').lower()
21 | node_standardized_title = node.get('standardized_title', '')
22 |
23 | # Check title match based on which parameter was provided
24 | if title_regex:
25 | title_match = (title_pattern.match(node_title) or
26 | title_pattern.match(node_standardized_title))
27 | else:
28 | title_match = (node_title.lower() == title_lower or
29 | node_standardized_title.lower() == title_lower)
30 |
31 | if title_match and (title_class is None or node_class == title_class):
32 | results.append((parent_id, node))
33 |
34 | contents = node.get('contents', {})
35 | for key, value in contents.items():
36 | search(value, key)
37 |
38 | if 'document' in dct:
39 | for doc_id, doc_node in dct['document'].items():
40 | search(doc_node, doc_id)
41 |
42 | return results
43 |
--------------------------------------------------------------------------------
/doc2dict/doc2dict/dict2dict.py:
--------------------------------------------------------------------------------
1 | def dict2dict(data):
2 | result = {}
3 |
4 | def process_item(item):
5 | # If item is a string, return it directly
6 | if isinstance(item, str):
7 | return item.strip()
8 |
9 | # If item is not a dict, return string version
10 | if not isinstance(item, dict):
11 | return str(item).strip()
12 |
13 | # Base case: if there's no further content, return the item itself
14 | if 'content' not in item:
15 | return item
16 |
17 | # If there's a text key, use it as the dict key, otherwise use the type
18 | key = item.get('text', item.get('type', ''))
19 |
20 | # Process the content
21 | if isinstance(item['content'], list):
22 | # Check if content contains dictionaries with type/text
23 | if any(isinstance(x, dict) and ('type' in x or 'text' in x) for x in item['content']):
24 | nested_result = {}
25 | for content_item in item['content']:
26 | if isinstance(content_item, dict):
27 | nested_key = content_item.get('text', content_item.get('type', ''))
28 | nested_result[nested_key] = process_item(content_item)
29 | return nested_result
30 | # If content items are simple values (strings/numbers), join with newlines and strip
31 | else:
32 | return '\n'.join(str(x) for x in item['content']).strip()
33 | else:
34 | return str(item['content']).strip()
35 |
36 | # Handle case where data itself might be a string
37 | if isinstance(data, str):
38 | return data.strip()
39 |
40 | # Handle case where content is a list directly
41 | if isinstance(data.get('content', []), list):
42 | for item in data['content']:
43 | if isinstance(item, dict):
44 | key = item.get('text', item.get('type', ''))
45 | result[key] = process_item(item)
46 | else:
47 | # If we have a string in content, use it as both key and value
48 | result[str(item).strip()] = str(item).strip()
49 |
50 | return result
--------------------------------------------------------------------------------
/doc2dict/doc2dict/pdf/pdf_utils.py:
--------------------------------------------------------------------------------
1 | import pypdfium2 as pdfium
2 | import pypdfium2.raw as pdfium_c
3 | from ctypes import c_ushort, c_ulong, POINTER, c_float, c_void_p, c_size_t, c_uint8, c_int
4 |
5 |
6 | def get_text(text_page,obj):
7 | text_len = pdfium_c.FPDFTextObj_GetText(
8 | obj.raw, # FPDF_PAGEOBJECT
9 | text_page.raw, # FPDF_TEXTPAGE
10 | None, # POINTER(FPDF_WCHAR) - NULL to get the length
11 | c_ulong(0) # c_ulong - specify 0 to get the required buffer size
12 | )
13 |
14 | # Create buffer for the text
15 | buffer = pdfium_c.create_string_buffer(text_len * 2) # UTF-16LE encoding
16 | text_ptr = pdfium_c.cast(buffer, pdfium_c.POINTER(pdfium_c.c_ushort))
17 |
18 | # Second call to actually get the text
19 | chars_copied = pdfium_c.FPDFTextObj_GetText(
20 | obj.raw, # FPDF_PAGEOBJECT
21 | text_page.raw, # FPDF_TEXTPAGE
22 | text_ptr, # POINTER(FPDF_WCHAR) - pointer to our buffer
23 | c_ulong(text_len) # c_ulong - the buffer size
24 | )
25 |
26 | # Convert UTF-16LE to string
27 | # Only convert the number of characters actually copied
28 | text = buffer.raw[:chars_copied*2].decode('utf-16le', errors='ignore')
29 |
30 | # remove buffer
31 | text = text.strip('\x00')
32 | return text
33 |
34 |
35 | def get_font_size(obj):
36 | # Create a c_float to receive the font size value
37 | font_size = c_float(0.0)
38 |
39 | # Call the PDFium function to get the font size
40 | result = pdfium_c.FPDFTextObj_GetFontSize(
41 | obj.raw, # FPDF_PAGEOBJECT
42 | pdfium_c.byref(font_size) # POINTER(c_float)
43 | )
44 |
45 | # Check if the function call was successful
46 | if result:
47 | matrix = obj.get_matrix().get()
48 | # Apply the transformation matrix to the font size
49 | mean_scale = (matrix[0] + matrix[3]) / 2
50 |
51 | return round(font_size.value * mean_scale,2)
52 | else:
53 | return None
54 |
55 |
56 | def get_font(obj):
57 | font = pdfium_c.FPDFTextObj_GetFont(obj.raw)
58 | return font
59 |
60 | def get_font_name(font):
61 | # Get font name
62 | name_len = pdfium_c.FPDFFont_GetBaseFontName(font, None, 0)
63 | name_buffer = pdfium_c.create_string_buffer(name_len)
64 | pdfium_c.FPDFFont_GetBaseFontName(font, name_buffer, name_len)
65 | font_name = name_buffer.value.decode('utf-8', errors='ignore')
66 |
67 |
68 | return font_name
69 |
--------------------------------------------------------------------------------
/doc2dict/docs/docs/parsing/html.md:
--------------------------------------------------------------------------------
1 | # HTML
2 |
3 | ## Quickstart
4 | ```
5 | from doc2dict import html2dict
6 | 
7 | # Load your html file
6 | with open('apple_10k_2024.htm','r') as f:
7 | content = f.read()
8 |
9 | # Convert to dictionary
10 | dct = html2dict(content,mapping_dict=None)
11 | ```
12 |
13 | ### Example
14 | ```
15 | ...
16 | "37": {
17 | "title": "PART I",
18 | "standardized_title": "parti",
19 | "class": "part",
20 | "contents": {
21 | "38": {
22 | "title": "ITEM 1. BUSINESS",
23 | "standardized_title": "item1",
24 | "class": "item",
25 | "contents": {
26 | "39": {
27 | "title": "GENERAL",
28 | "standardized_title": "",
29 | "class": "predicted header",
30 | "contents": {
31 | "40": {
32 | "title": "Embracing Our Future",
33 | ...
34 | "292": {
35 | "table": [
36 | [
37 | "Name",
38 | "Age",
39 | "Position with the Company"
40 | ],
41 | [
42 | "Satya Nadella",
43 | "56",
44 | "Chairman and Chief Executive Officer"
45 | ],
46 | ...
47 | ```
48 |
49 |
50 |
51 | ## Tweaking the engine for your use case
52 |
53 | ???+ note "I will make this section better soon"
54 | I just want to get the basic docs out!
55 |
56 | ### Debugging
57 | ```
58 | from doc2dict import html2dict, convert_html_to_instructions, visualize_instructions, visualize_dict
59 | from selectolax.parser import HTMLParser
60 | 
61 | # load your html file
62 | with open('tesla10k.htm','r') as f:
63 |     content = f.read()
64 | 
65 | # parse the body with selectolax first
66 | body = HTMLParser(content).body
67 | 
68 | # convert html to a series of instructions
69 | instructions = convert_html_to_instructions(body)
70 | 
71 | # visualize the conversion
72 | visualize_instructions(instructions)
73 | 
74 | # convert instructions to dictionary
75 | dct = html2dict(content,mapping_dict=None)
76 | 
77 | # visualize dictionary
78 | visualize_dict(dct)
75 | ```
76 |
77 | ### Writing your own mapping dictionaries
78 |
79 | ???+ warning "Experimental"
80 | If you write a mapping dict, and I change something so it stops working - please [email me](mailto:johnfriedman@datamule.xyz).
81 |
82 | Mapping dicts currently work by specifying the class of the section header (e.g. `part`), a regex for the section header (e.g. `r'^part\s*([ivx]+)$'`) - where the capture group `([ivx]+)` and the class `part` determine the `standardized_title` - and the level, where `0` is the root.
83 |
84 | In this example, `items` will always be nested under `parts`.
85 | ```
86 | dict_10k_html = {
87 | ('part',r'^part\s*([ivx]+)$') : 0,
88 | ('signatures',r'^signatures?\.*$') : 0,
89 | ('item',r'^item\s*(\d+)\.?([a-z])?') : 1,
90 | }
91 | ```
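92 | 
93 | You can then pass the mapping dict to `html2dict` - a minimal sketch, continuing the quickstart above:
94 | 
95 | ```
96 | dct = html2dict(content, mapping_dict=dict_10k_html)
97 | ```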
92 |
--------------------------------------------------------------------------------
/doc2dict/doc2dict/txt/txt2dict.py:
--------------------------------------------------------------------------------
1 | from .convert_txt_to_instructions import convert_txt_to_instructions
2 | from ..convert_instructions_to_dict import convert_instructions_to_dict
3 |
4 |
5 | # FIX THIS # TODO TODO
6 | def combine_text_wraparound(instructions_list):
7 | """Used for e.g. text files where the next line is meant to be part of the same paragraph, but the next next line is a new paragraph"""
8 |
9 | # merge instructions
10 | new_instructions_list = []
11 | current_instructions = []
12 |
13 | for line_num in range(len(instructions_list) - 1):
14 | instructions = instructions_list[line_num]
15 | # Add wraparound attribute to each instruction
16 | for instruction in instructions:
17 | instruction['wraparound'] = True
18 |
19 | # Only add space if this is NOT the first line of the paragraph
20 | if current_instructions and 'text' in instructions[0]:
21 | instructions[0]['text'] = ' ' + instructions[0]['text']
22 |
23 | # Extend current_instructions with this line's instructions
24 | current_instructions.extend(instructions)
25 |
26 | if instructions_list[line_num + 1] == []: # Next line is empty
27 | if current_instructions: # Only append if not empty
28 | new_instructions_list.append(current_instructions)
29 | current_instructions = [] # Reset for new paragraph
30 |
31 | # Handle the last line
32 | if instructions_list: # Check if list is not empty
33 | last_instructions = instructions_list[-1]
34 |
35 | # Only add space if this is NOT the first line of the paragraph
36 | if current_instructions and 'text' in last_instructions[0]:
37 | last_instructions[0]['text'] = ' ' + last_instructions[0]['text']
38 |
39 | current_instructions.extend(last_instructions)
40 | if current_instructions: # Only append if not empty
41 | new_instructions_list.append(current_instructions)
42 |
43 | return new_instructions_list
44 |
45 |
46 | def txt2dict(content,mapping_dict=None,encoding='utf-8'):
47 | content = content.decode(encoding=encoding)
48 | instructions_list = convert_txt_to_instructions(content=content)
49 |
50 | # we need to add a filter here, ideally via mapping
51 | # should use whether ends with '.' to merge. into blocks
52 | # probably add default and if detected for the pdf use case
53 |
54 | instructions_list = combine_text_wraparound(instructions_list=instructions_list)
55 |
56 | # handle dash headers e.g. [{'text': 'Item 2. Properties', 'wraparound': True}, {'text': ' -------------------', 'wraparound': True}]
57 | # duct tape solution TODO fix
58 | for instructions in instructions_list:
59 | if 'text' in instructions[-1]:
60 | if set(instructions[-1]['text'].replace(' ','')) == {'-'}:
61 | # add bold to all instructions
62 | [item.update({'bold': True}) or item for item in instructions]
63 | instructions.pop()
64 |
65 | instructions_list = [item for item in instructions_list if item !=[]]
66 |
67 | dct = convert_instructions_to_dict(instructions_list=instructions_list,mapping_dict=mapping_dict)
68 | return dct
69 |
--------------------------------------------------------------------------------
/readme.md:
--------------------------------------------------------------------------------
1 | # doc2dict
2 |
3 | Convert HTML, XML, and PDFs into dictionaries.
4 |
5 | * [Documentation](https://john-friedman.github.io/doc2dict/)
6 |
7 | Note that `doc2dict` is in an early stage. The goal is to create a fast, generalized, algorithmic parser that can be easily tweaked depending on the document.
8 |
9 | `doc2dict` supports the [datamule](https://github.com/john-friedman/datamule-python) project.
10 |
11 | ## Parsers
12 |
13 | 1. HTML Parser
14 | 2. PDF Parser - very early stage, currently only supports some pdf types.
15 | 3. XML Parser - please use Martin Blech's excellent xmltodict. doc2dict's xml2dict is currently a mess.
16 |
17 | ## Installation
18 |
19 | ```bash
20 | pip install doc2dict
21 | ```
22 |
23 | ## HTML
24 |
25 | ### Examples
26 |
27 | Parsed HTML in Dictionary Form:
28 | [example](example_output/html/dict.json)
29 |
30 | Dictionary Form converted to HTML for easy visualization:
31 | [example](example_output/html/document_visualization.html)
32 |
33 | ### Quickstart
34 |
35 | ```python
36 | from doc2dict import html2dict, visualize_dict
37 |
38 | # Load your html file
39 | with open('apple_10k_2024.html','r') as f:
40 | content = f.read()
41 |
42 | # Parse
43 | dct = html2dict(content,mapping_dict=None)
44 |
45 | # Visualize Parsing
46 | visualize_dict(dct)
47 | ```
48 |
49 | ### Mapping Dicts
50 |
51 | Mapping dictionaries are rules that you pass into the parser to tweak its functionality.
52 |
53 | The mapping dict below tells the parser that "item" headers should be nested under "part" headers.
54 |
55 | ```python
56 | tenk_mapping_dict = {
57 | ('part',r'^part\s*([ivx]+)$') : 0,
58 | ('signatures',r'^signatures?\.*$') : 0,
59 | ('item',r'^item\s*(\d+)') : 1,
60 | }
61 | ```
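62 | 
63 | Pass it in when parsing - a minimal continuation of the quickstart above:
64 | 
65 | ```python
66 | dct = html2dict(content, mapping_dict=tenk_mapping_dict)
67 | ```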
62 |
63 | ### Debugging
64 |
65 | ```python
66 | from doc2dict import *
67 | from selectolax.parser import HTMLParser
68 |
69 | # Load your html file
70 | with open('apple_10k_2024.htm','r') as f:
71 | content = f.read()
72 |
73 |
74 | body = HTMLParser(content).body
75 |
76 | # convert html to a series of instructions
77 | instructions = convert_html_to_instructions(body)
78 |
79 | # visualize the conversion
80 | visualize_instructions(instructions)
81 |
82 | # convert instructions to dictionary
83 | dct = html2dict(content,mapping_dict=None)
84 |
85 | # visualize dictionary
86 | visualize_dict(dct)
87 | ```
88 |
89 | ### Benchmarks
90 |
91 | Based on my personal (potato) laptop:
92 | * About 500 pages per second single threaded.
93 | * Parses the 57 page Apple 10-K in 160 milliseconds.
94 |
95 | ## PDF
96 |
97 | The pdf parser is in a very early stage. It does not always handle encoding issues and the resulting hierarchies can be quite odd.
98 |
99 | I've released this because it may be useful to you, and as a proof of concept that fast pdf to dictionary parsing is possible. I plan to develop this further when presented with an interesting use case.
100 |
101 | ### Quickstart
102 |
103 | ```python
104 | from doc2dict import pdf2dict, visualize_dict
105 |
106 | # Load your pdf file
107 | with open('apple_10k_2024.pdf','rb') as f:
108 | content = f.read()
109 |
110 | # Parse
111 | dct = pdf2dict(content,mapping_dict=None)
112 |
113 | # Visualize Parsing
114 | visualize_dict(dct)
115 | ```
116 |
117 | ### Benchmarks
118 |
119 | * About 200 pages per second single threaded.
120 |
121 | ### Other Functions
122 | - `flatten_dict(dct, format='markdown')` or `flatten_dict(dct, format='text')` - returns the dict as a single markdown or plain-text string
123 | - `unnest_dict(dct)` - returns the dict as a flat list of `(id, type, content, level)` tuples (see the example below)
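124 | 
125 | For example - a minimal sketch of flattening a parsed filing (same file as the quickstart above):
126 | 
127 | ```python
128 | from doc2dict import html2dict, flatten_dict, unnest_dict
129 | 
130 | with open('apple_10k_2024.htm','r') as f:
131 |     content = f.read()
132 | 
133 | dct = html2dict(content, mapping_dict=None)
134 | 
135 | markdown_text = flatten_dict(dct, format='markdown')  # single markdown string
136 | rows = unnest_dict(dct)                                # list of (id, type, content, level) tuples
137 | ```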
124 |
125 | # TODO
126 | - generalize instructions to dict
127 | - add github workflow to run parser on examples after each push.
--------------------------------------------------------------------------------
/doc2dict/doc2dict/utils/format_dict.py:
--------------------------------------------------------------------------------
1 | import re
2 |
3 | def _clean_cell_content(cell_content):
4 |
5 | text = str(cell_content)
6 |
7 | # Replace non-breaking space
8 | text = text.replace('\u00a0', '')
9 |
10 | # Replace tabs with spaces
11 | text = text.replace('\t', ' ')
12 |
13 | # Replace multiple newlines with single spaces
14 | text = text.replace('\n\n', ' ')
15 | text = text.replace('\n', ' ')
16 |
17 | # Replace multiple spaces with single spaces
18 | text = re.sub(r'\s+', ' ', text)
19 |
20 | # Strip leading/trailing whitespace
21 | text = text.strip()
22 |
23 | return text
24 |
25 | def _format_table(table_data):
26 | if not table_data:
27 | return []
28 |
29 | # Clean all cell content first
30 | cleaned_data = []
31 | for row in table_data:
32 | cleaned_row = [_clean_cell_content(cell) for cell in row]
33 | cleaned_data.append(cleaned_row)
34 |
35 | # Calculate column widths using cleaned data
36 | col_widths = []
37 | for row in cleaned_data:
38 | for i, cell in enumerate(row):
39 | cell_len = len(cell)
40 | if i >= len(col_widths):
41 | col_widths.append(cell_len)
42 | else:
43 | col_widths[i] = max(col_widths[i], cell_len)
44 |
45 | formatted_rows = []
46 | formatted_rows.append('') # Empty line before table
47 |
48 | for i, row in enumerate(cleaned_data):
49 | padded_cells = [cell.ljust(col_widths[j]) for j, cell in enumerate(row)]
50 | formatted_rows.append('| ' + ' | '.join(padded_cells) + ' |')
51 |
52 | # Add separator after first row (header)
53 | if i == 0:
54 | separator = '|' + '|'.join('-' * (w + 2) for w in col_widths) + '|'
55 | formatted_rows.append(separator)
56 |
57 | formatted_rows.append('') # Empty line after table
58 | return formatted_rows
59 |
60 |
61 | def _format_title(text, level):
62 | # Ensure level is at least 1 for proper markdown heading
63 | markdown_level = max(1, min(level + 1, 6))
64 | return "#" * markdown_level + " " + text
65 |
66 | def unnest_dict(dct):
67 | result = []
68 |
69 | def process_content(content, current_id=None, level=0):
70 | if not isinstance(content, dict):
71 | return
72 |
73 | # Process title, text, textsmall, and table directly
74 | for key in ['title', 'text', 'textsmall', 'table']:
75 | if key in content:
76 | # skip introduction filler
77 | if current_id == -1:
78 | pass
79 | else:
80 | result.append((current_id, key, content[key], level))
81 |
82 | # Process contents recursively in numeric order
83 | contents = content.get('contents', {})
84 | if contents:
85 | for key in contents.keys():
86 | process_content(contents[key], key, level + 1)
87 |
88 | # Start processing from document
89 | if 'document' in dct:
90 | document = dct['document']
91 | for key in document.keys():
92 | process_content(document[key], key, 0)
93 | else:
94 | # If no document key, process the entire dictionary
95 | process_content(dct, level=0)
96 |
97 | return result
98 |
99 | def flatten_dict(dct=None, format='markdown',tuples_list=None):
100 | if tuples_list is None:
101 | tuples_list = unnest_dict(dct)
102 | results = []
103 | if format == 'markdown':
104 | for tuple in tuples_list:
105 | tuple_type = tuple[1]
106 | content = tuple[2]
107 | level = tuple[3]
108 | if tuple_type == 'table':
109 | results.extend(_format_table(content))
110 | elif tuple_type == 'text':
111 | results.append(content)
112 | elif tuple_type == 'textsmall':
113 | results.append(f'{content}')
114 | elif tuple_type == 'title':
115 | results.append(_format_title(content,level))
116 |
117 | return '\n'.join(results)
118 | elif format == 'text':
119 | for tuple in tuples_list:
120 | tuple_type = tuple[1]
121 | content = tuple[2]
122 | level = tuple[3]
123 |
124 | # reuse markdown format
125 | if tuple_type == 'table':
126 | results.extend(_format_table(content))
127 | elif tuple_type == 'text':
128 | results.append(content)
129 | elif tuple_type == 'textsmall':
130 | results.append(content)
131 | elif tuple_type == 'title':
132 | results.append('')
133 | results.append(content)
134 | results.append('')
135 |
136 | return '\n'.join(results)
137 | else:
138 | raise ValueError(f'Format not found: {format}')
--------------------------------------------------------------------------------
/doc2dict/doc2dict/pdf/utils.py:
--------------------------------------------------------------------------------
1 |
2 | # TODO, modify for e.g. BOLD AND ITALIC or IT etc name variations
3 | def get_font_attributes(font_name):
4 | dct = {}
5 | attribute = font_name.split('-')
6 | if len(attribute) > 1:
7 | key = attribute[-1].lower()
8 | dct[key] = True
9 | return dct
10 |
11 | def get_font_size(coords_tuple):
12 | left = coords_tuple[0]
13 | bottom = coords_tuple[1]
14 | right = coords_tuple[2]
15 | top = coords_tuple[3]
16 | height = top - bottom
17 | font_size = height / 2
18 | return font_size * 4 # Multiplying just because why not?
19 |
20 | # TODO REMOVE. we do need to find how to get actual font size
21 | def standardize_font_size(instructions_stream):
22 | """
23 | Standardize font sizes in the instructions stream by merging font sizes that are close to each other.
24 |
25 | Args:
26 | instructions_stream (list): List of dictionaries containing text elements with font-size information
27 |
28 | Returns:
29 | list: The instructions stream with standardized font sizes
30 | """
31 | if not instructions_stream:
32 | return []
33 |
34 | # First, extract all unique font sizes
35 | font_sizes = []
36 | for item in instructions_stream:
37 | if 'font-size' in item:
38 | font_sizes.append(item['font-size'])
39 |
40 | # If no font sizes found, return original stream
41 | if not font_sizes:
42 | return instructions_stream
43 |
44 | # Sort font sizes
45 | font_sizes = sorted(set(font_sizes))
46 |
47 | # Group similar font sizes
48 | standardized_sizes = []
49 | current_group = [font_sizes[0]]
50 |
51 | for i in range(1, len(font_sizes)):
52 | # Calculate relative difference between consecutive font sizes
53 | current_size = font_sizes[i]
54 | prev_size = font_sizes[i-1]
55 | relative_diff = abs(current_size - prev_size) / max(current_size, prev_size)
56 |
57 | # If the difference is less than a threshold (e.g., 5%), group them
58 | if relative_diff < 0.05:
59 | current_group.append(current_size)
60 | else:
61 | # Calculate average for the current group
62 | avg_size = sum(current_group) / len(current_group)
63 | standardized_sizes.append((current_group, avg_size))
64 | current_group = [current_size]
65 |
66 | # Add the last group
67 | if current_group:
68 | avg_size = sum(current_group) / len(current_group)
69 | standardized_sizes.append((current_group, avg_size))
70 |
71 | # Create a mapping from original sizes to standardized sizes
72 | size_mapping = {}
73 | for group, avg_size in standardized_sizes:
74 | for size in group:
75 | size_mapping[size] = avg_size
76 |
77 | # Apply the mapping to the instructions stream
78 | for item in instructions_stream:
79 | if 'font-size' in item and item['font-size'] in size_mapping:
80 | item['font-size'] = size_mapping[item['font-size']]
81 |
82 | return instructions_stream
83 |
84 | def assign_line(instructions_stream):
85 | """
86 | Assign line numbers to text elements that are positioned on the same line.
87 | Only compares with the next neighbor in the list.
88 | """
89 |
90 | # Initialize with first element
91 | current_line = 0
92 | instructions_list = []
93 | instructions = [instructions_stream[0]]
94 |
95 | # Process remaining elements
96 | for i in range(len(instructions_stream) - 1):
97 | current = instructions_stream[i]
98 | next_item = instructions_stream[i + 1]
99 |
100 | # Extract y-coordinates (bottom of text)
101 | current_y = current['coords'][1] # bottom y of current
102 | next_y = next_item['coords'][1] # bottom y of next
103 |
104 | # Get font sizes for tolerance calculation
105 | current_font_size = current['font-size']
106 | next_font_size = next_item['font-size']
107 |
108 | # Calculate tolerance based on larger font size
109 | tolerance = max(current_font_size, next_font_size) * 0.5
110 |
111 | # Check if next item is on the same line
112 | if abs(current_y - next_y) <= tolerance:
113 | # if font-name and font-size are the same, then we can merge them. We can do this, because font name contains bold/italic
114 | if current['font-name'] == next_item['font-name'] and current['font-size'] == next_item['font-size']:
115 | # Merge the two items
116 | current['text'] += next_item['text']
117 | current['coords'] = (
118 | min(current['coords'][0], next_item['coords'][0]), # left
119 | min(current['coords'][1], next_item['coords'][1]), # bottom
120 | max(current['coords'][2], next_item['coords'][2]), # right
121 | max(current['coords'][3], next_item['coords'][3]) # top
122 | )
123 | else:
124 | instructions.append(next_item)
125 | else:
126 | instructions_list.append(instructions)
127 | instructions = [next_item]
128 |
129 |     # append the final line group so the last line isn't dropped
130 |     if instructions:
131 |         instructions_list.append(instructions)
132 | 
133 |     return instructions_list
130 |
131 | # so these need to be modified to look at all the dicts.
132 | def get_left_indent(coords_tuple):
133 | return
134 |
135 | def get_is_centered(coords_tuple):
136 | return
--------------------------------------------------------------------------------
/doc2dict/docs/docs/whitepaper.md:
--------------------------------------------------------------------------------
1 | # High Speed Document Algorithmic Parsing
2 |
3 | ## Abstract
4 | Parsing human-readable documents into machine-readable form is difficult due to under-the-hood variation. This is my attempt at a fast, robust, generalized approach that can be easily modified to account for variation across documents.
5 |
6 |
9 |
10 | ???+ note "Caveats"
11 | This is not meant to be a perfect parsing approach. It's meant to be a "good enough" approach that is fast enough to parse the entire SEC corpus on a personal laptop. This is also in an early stage - things will change.
12 |
13 | ???+ note "Terminology"
14 | I don't know the right words to use. If you do, please [email me](mailto:johnfriedman@datamule.xyz) and/or bully me into correcting the terminology.
15 |
16 |
17 | ## General
18 |
19 | ### Approach
20 | 1. Convert messy document into a simple list of instructions.
21 | 2. Convert the list of instructions into dictionary using a set of rules that can be easily tailored for the document.
22 |
23 | The idea here is to turn a complex problem that is hard to solve into a simple problem that is easy to solve.
24 | * Nested HTML is hard to understand -> the same HTML in list form is easy
25 | * Raw PDFs are hard to understand -> the same PDF in list form is easy
26 |
27 | We can then convert from the (flat) list form into a nested dictionary using simple rules like "bigger headers have higher nesting", as well as specifying where certain headers go - "item 1a risk factors should be nested under part i".
28 | 
29 | This also makes the parsing easier to modify for less technical users. A grad student in economics is unlikely to be able to modify the walk through an HTML document to properly account for style inheritance, but can likely modify rules such as "ignore italics for header selection".
30 |
31 | #### Examples
32 | Instructions List:
33 | ```
34 | [{'text': 'PART I', 'text-style': 'all_caps', 'left-indent': 8.0, 'font-size': 13.33, 'text-center': True, 'bold': True}]
35 | [{'text': 'ITEM 1. BUSINESS', 'text-style': 'all_caps', 'left-indent': 8.0, 'font-size': 15.995999999999999, 'text-center': True, 'bold': True}]
36 | [{'text': 'GENERAL', 'text-style': 'all_caps', 'left-indent': 8.0, 'font-size': 13.33, 'text-center': True, 'underline': True}]
37 | [{'text': 'Embracing Our Future', 'left-indent': 8.0, 'font-size': 13.33, 'bold': True}]...
38 | ```
39 |
40 | Dictionary
41 | ```
42 | "37": {
43 | "title": "PART I",
44 | "standardized_title": "parti",
45 | "class": "part",
46 | "contents": {
47 | "38": {
48 | "title": "ITEM 1. BUSINESS",
49 | "standardized_title": "item1",
50 | "class": "item",
51 | "contents": {
52 | "39": {
53 | "title": "GENERAL",
54 | "standardized_title": "",
55 | "class": "predicted header",
56 | "contents": {
57 | "40": {
58 | "title": "Embracing Our Future",...
59 | ```
60 |
61 |
62 |
63 | ### Mapping Dictionaries
64 | I call the set of rules used to convert the list of instructions into a dictionary a "mapping dict". The idea is that a less technical user who would have trouble tweaking the engine can easily modify a list of rules that tweak the output.
65 |
66 | #### Example
67 | ```
68 | dict_10k_html = {
69 | ('part',r'^part\s*([ivx]+)$') : 0,
70 | ('signatures',r'^signatures?\.*$') : 0,
71 | ('item',r'^item\s*(\d+)\.?([a-z])?') : 1,
72 | }
73 | ```
74 |
75 | The above mapping dict tells the parser to assign class 'part' to headers matching the first pattern, and hierarchy '0', i.e. the root level. It then uses the capture group `([ivx]+)` and the class to determine the standardized_title.
76 |
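To make this concrete, here is a minimal sketch of how a single mapping dict entry is applied to a header's text. The function name is illustrative, but the matching and standardized_title construction mirror what the parser does in `convert_instructions_to_dict.py`:

```
import re

dict_10k_html = {
    ('part', r'^part\s*([ivx]+)$'): 0,
    ('signatures', r'^signatures?\.*$'): 0,
    ('item', r'^item\s*(\d+)\.?([a-z])?'): 1,
}

def classify_header(text, mapping_dict):
    # Try each (class, regex) -> hierarchy entry against the lowercased header text
    for (header_class, regex), hierarchy in mapping_dict.items():
        match = re.match(regex, text.lower().strip())
        if match:
            # Join the capture groups onto the class to build the standardized title
            groups = ''.join(g for g in match.groups() if g is not None)
            return hierarchy, header_class, f'{header_class}{groups}'
    return None

print(classify_header('ITEM 1. BUSINESS', dict_10k_html))  # (1, 'item', 'item1')
print(classify_header('PART I', dict_10k_html))            # (0, 'part', 'parti')
```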
77 | ## HTML
78 |
79 | The basic html approach has already been implemented. Ballpark speed is about 500 pages per second on my two-year-old personal laptop.
80 |
81 | ### Approach
82 |
83 | 1. Iterate through the html file, keeping track of the attributes that apply to each text node (with special handling for tables), to create the instructions list. Text nodes that visually appear on the same line are output as instructions in the same group.
84 | 2. For the instructions list, determine which instructions are likely to be headers (see the sketch below). If an instruction is a header, determine its hierarchy with the aid of a mapping dict if present.
85 |
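As a rough sketch of the header check in step 2: an instruction is treated as a likely header if it carries header-like styling, or if its font is noticeably larger than the document's most common font size. The attribute names below are a subset of the ones the parser actually tracks; the 20% threshold matches the current implementation.

```
LIKELY_HEADER_ATTRIBUTES = ['bold', 'italic', 'underline', 'text-center', 'all_caps']

def looks_like_header(instruction, most_common_font_size):
    # Styling signal: any header-like attribute set on the instruction
    has_header_style = any(instruction.get(attr, False) for attr in LIKELY_HEADER_ATTRIBUTES)
    # Size signal: font more than 20% larger than the document's most common font size
    font_size = instruction.get('font-size')
    is_big_script = font_size is not None and font_size > 1.2 * most_common_font_size
    return has_header_style or is_big_script

print(looks_like_header({'text': 'PART I', 'font-size': 13.33, 'bold': True}, 13.33))  # True
```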
86 | ### Tables
87 |
88 | 1. Construct a matrix with each cell representing a cell in the table
89 | 2. If a cell spans multiple rows or columns, duplicate the cell in the matrix
90 | 3. Remove rows and columns that are considered empty - e.g. contain only whitespace characters
91 | 4. Remove rows and columns that contain no unique information - e.g. if a column is a subset of another column, remove it (a toy sketch of steps 3-4 follows below)
92 |
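A toy version of steps 3-4 might look like the following - it assumes cells are plain strings, whereas the real parser keeps richer cell dicts (text, links, images):

```
def clean_table(matrix):
    # 1. Drop rows whose cells are all empty/whitespace
    matrix = [row for row in matrix if any(cell.strip() for cell in row)]
    if not matrix:
        return matrix
    # 2. Keep a column only if it is non-empty and adds information
    #    beyond the columns already kept (subset columns are dropped)
    kept_cols, kept_values = [], []
    for col in zip(*matrix):
        values = {cell.strip() for cell in col if cell.strip()}
        if not values:
            continue  # empty column
        if any(values <= previous for previous in kept_values):
            continue  # no unique information
        kept_cols.append(col)
        kept_values.append(values)
    # Transpose back to rows
    return [list(row) for row in zip(*kept_cols)]

table = [
    ['Exhibit', 'Exhibit', 'Filing Date'],
    ['3.1',     '3.1',     '12/1/2016'],
    ['',        '',        ''],
]
print(clean_table(table))  # [['Exhibit', 'Filing Date'], ['3.1', '12/1/2016']]
```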
93 | TODO
94 |
95 | * Currently removes unmatched-parenthesis columns; in the future these will be merged
96 | * Currently does not handle indents - many tables can be split into multiple tables using information from indents
97 |
98 | ???+ note "Goal"
99 | The goal here is not to perfectly parse tables. We can get close, but often the information for html tables is above the table in a separate block.
100 |
101 | ### Visualization
102 |
103 | Visualization is important at both the instructions_list stage and the final dict stage. It lets users quickly check whether the parser is working as expected and what to tweak.
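
For example, the two helpers in this repo write a color-coded HTML file and open it in the default browser. A usage sketch (the import paths are assumed from the repository layout):

```
from doc2dict.html.visualize_instructions import visualize_instructions
from doc2dict.html.visualize_dict import visualize_dict

# instructions_list comes from stage 1, parsed_dict from stage 2
visualize_instructions(instructions_list)   # writes instructions_visualization.html
path = visualize_dict(parsed_dict, filename='document_visualization.html', open_browser=True)
```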
--------------------------------------------------------------------------------
/doc2dict/doc2dict/html/visualize_instructions.py:
--------------------------------------------------------------------------------
1 | import webbrowser
2 | import os
3 |
4 | def format_dct_style(line):
5 | text = line.get('text', '')
6 | href = line.get('href', '')
7 |
8 | style_properties = []
9 | # might have issues here in the future
10 | if 'bold' in line:
11 | style_properties.append('font-weight: bold')
12 | if 'italic' in line:
13 | style_properties.append('font-style: italic')
14 | if 'underline' in line:
15 | style_properties.append('text-decoration: underline')
16 | if 'font-size' in line:
17 | font_size = line['font-size']
18 | if font_size:
19 | style_properties.append(f'font-size: {font_size}')
20 | if 'left-indent' in line:
21 | left_indent = line['left-indent']
22 | if left_indent:
23 | style_properties.append(f'margin-left: {left_indent}px')
24 |
25 | return style_properties, text, href
26 |
27 | def format_table(table):
28 | table_html = "<table>"
29 | for idx, row in enumerate(table):
30 | table_html += "<tr>"
31 | for cell in row:
32 | if 'image' in cell:
33 | image_data = cell['image']
34 | src = image_data.get('src', '')
35 | alt = image_data.get('alt', '')
36 | cell_content = f"<img src='{src}' alt='{alt}'>"
37 | else:
38 | cell_text = cell.get('text', '')
39 | cell_href = cell.get('href', '')
40 | 
41 | if cell_href:
42 | cell_content = f"<a href='{cell_href}'>{cell_text}</a>"
43 | else:
44 | cell_content = cell_text
45 | 
46 | if idx == 0:
47 | table_html += f"<th>{cell_content}</th>"
48 | else:
49 | table_html += f"<td>{cell_content}</td>"
50 | table_html += "</tr>"
51 | 
52 | table_html += "</table>"
53 | return table_html
54 |
55 | def visualize_instructions(instructions_list):
56 | # Simplified color scheme
57 | single_line_color = '#E8EAF6' # Light indigo - clean, professional
58 | multi_first_color = '#DCEDC8' # Light sage green - clear starting point
59 | multi_rest_color = '#F9FBE7' # Very pale yellow-green - subtle continuation
60 |
61 | table_uncleaned_color = '#FFECB3' # Warm amber - intuitive "needs attention"
62 | table_cleaned_color = '#B2DFDB' # Teal - fresh and clean feeling
63 |
64 | html_content = """
65 |
66 |
67 |
126 |
127 | """
128 |
129 | for instructions in instructions_list:
130 | if len(instructions) == 1:
131 | if 'table' in instructions[0]:
132 | table_html = format_table(instructions[0]['table'])
133 | html_content += "<div>"
134 | if instructions[0].get('cleaned', False):
135 | html_content += f"<div style='background-color: {table_cleaned_color}'>{table_html}</div>"
136 | else:
137 | html_content += f"<div style='background-color: {table_uncleaned_color}'>{table_html}</div>"
138 | html_content += "</div>"
139 | continue
140 | elif 'image' in instructions[0]:
141 | image_data = instructions[0]['image']
142 | src = image_data.get('src', '')
143 | alt = image_data.get('alt', '')
144 |
145 | html_content += "<div>"
146 | html_content += f"<img src='{src}' alt='{alt}'>"
147 | html_content += "</div>"
148 | continue
149 |
150 | first_instruction = instructions[0]
151 | is_centered = first_instruction.get('text-center', False)
152 | div_style = ''
153 |
154 | if is_centered:
155 | div_style = 'text-align: center;'
156 |
157 | html_content += f"<div style='{div_style}'>"
158 | for idx, instruction in enumerate(instructions):
159 | if 'image' in instruction:
160 | # Handle image instructions
161 | image_data = instruction['image']
162 | src = image_data.get('src', '')
163 | alt = image_data.get('alt', '')
164 |
165 | if len(instructions) == 1:
166 | color = single_line_color
167 | elif idx == 0:
168 | color = multi_first_color
169 | else:
170 | color = multi_rest_color
171 |
172 | html_content += f"<span style='background-color: {color}'>"
173 | html_content += f"<img src='{src}' alt='{alt}'>"
174 | html_content += "</span>"
175 | else:
176 | # Handle text instructions
177 | style_properties, text, href = format_dct_style(instruction)
178 |
179 | if len(instructions) == 1:
180 | color = single_line_color
181 | elif idx == 0:
182 | color = multi_first_color
183 | else:
184 | color = multi_rest_color
185 |
186 | style_properties.append(f'background-color: {color}')
187 | style = '; '.join(style_properties)
188 |
189 | if href:
190 | span_content = f"<a href='{href}'>{text}</a>"
191 | else:
192 | span_content = text
193 |
194 | html_content += f"<span style='{style}'>{span_content}</span>"
195 |
196 | html_content += "</div>"
197 |
198 | html_content += """
199 | </body></html>
200 | """
201 |
202 | # Write HTML content to a temporary file
203 | with open('instructions_visualization.html', 'w', encoding='utf-8') as f:
204 | f.write(html_content)
205 |
206 | # Get the absolute path of the file
207 | file_path = os.path.abspath('instructions_visualization.html')
208 |
209 | # Open the HTML file in the default web browser
210 | webbrowser.open('file://' + file_path)
--------------------------------------------------------------------------------
/doc2dict/doc2dict/html/visualize_dict.py:
--------------------------------------------------------------------------------
1 | import webbrowser
2 | import os
3 |
4 | def visualize_dict(data_dict, filename='document_visualization.html', open_browser=True):
5 | """
6 | Convert nested dictionary to HTML visualization and open in browser
7 |
8 | Parameters:
9 | data_dict (dict): The nested dictionary to visualize
10 | filename (str): The name of the HTML file to create
11 | open_browser (bool): Whether to automatically open in browser
12 |
13 | Returns:
14 | str: The path to the created HTML file
15 | """
16 | html = []
17 |
18 | # Add HTML document opening tags and CSS
19 | html.append("""
20 |
21 |
22 |
23 |
24 |
25 | Document Visualization
26 |
104 |
105 |
106 | """)
107 |
108 | # Add metadata box
109 | if "metadata" in data_dict:
110 | html.append('')
116 |
117 | # Process the document structure
118 | if "document" in data_dict:
119 | html.append('<div>')
120 | process_document(data_dict["document"], html, 1)
121 | html.append('</div>')
122 |
123 | # Add HTML closing tags
124 | html.append("""
125 | </body>
126 | </html>
127 | """)
128 |
129 | html_content = ''.join(html)
130 |
131 | # Write HTML content to a file
132 | with open(filename, 'w', encoding='utf-8') as f:
133 | f.write(html_content)
134 |
135 | # Get the absolute path of the file
136 | file_path = os.path.abspath(filename)
137 |
138 | # Open the HTML file in the default web browser if requested
139 | if open_browser:
140 | webbrowser.open('file://' + file_path)
141 |
142 | return file_path
143 |
144 | def process_document(doc_dict, html, level):
145 | """Process document elements recursively"""
146 | # Sort keys to ensure numerical order for items like "1", "2", etc.
147 | try:
148 | sorted_keys = sorted(doc_dict.keys(), key=lambda x: (not x.lstrip('-').isdigit(), int(x) if x.lstrip('-').isdigit() else x))
149 | except:
150 | # Fallback if sorting fails
151 | sorted_keys = list(doc_dict.keys())
152 |
153 | for key in sorted_keys:
154 | value = doc_dict[key]
155 |
156 | if isinstance(value, dict):
157 | section_title = value.get("title", "")
158 |
159 | # Output the section title
160 | if section_title:
161 | heading_level = min(level, 6) # Limit to h6
162 | html.append(f'<h{heading_level}>{section_title}</h{heading_level}>')
163 |
164 | # Process the section content
165 | html.append('<div>')
166 |
167 | # Handle direct content fields
168 | for attr_key, attr_value in value.items():
169 | if attr_key not in ["title", "class", "contents", "standardized_title"]:
170 | process_content(attr_key, attr_value, html)
171 |
172 | # Process contents dictionary if it exists
173 | if "contents" in value and value["contents"]:
174 | process_document(value["contents"], html, level + 1)
175 |
176 | html.append('</div>')
177 | else:
178 | # Direct content
179 | process_content(key, value, html)
180 |
181 | def process_content(content_type, content, html):
182 | """Process specific content types"""
183 | if content_type == "text":
184 | # Preserve bullet points and other formatting
185 | html.append(f'<p>{content}</p>')
186 | elif content_type == "textsmall":
187 | html.append(f'<p><small>{content}</small></p>')
188 | elif content_type == "image":
189 | process_image(content, html)
190 | elif content_type == "table":
191 | process_table(content, html)
192 | else:
193 | pass
194 |
195 | def process_image(image_data, html):
196 | """Convert image data to HTML img tag"""
197 | src = image_data.get('src', '')
198 | alt = image_data.get('alt', 'Image')
199 |
200 | html.append('<div>')
201 | html.append(f'<img src="{src}" alt="{alt}">')
202 | html.append('</div>')
203 |
204 | def process_table_cell(cell):
205 | """Process a single table cell that may contain text or image data"""
206 | if isinstance(cell, dict):
207 | if 'image' in cell:
208 | # Cell contains an image
209 | image_data = cell['image']
210 | src = image_data.get('src', '')
211 | alt = image_data.get('alt', 'Image')
212 | return f'<img src="{src}" alt="{alt}">'
213 | elif 'text' in cell:
214 | # Cell contains structured text data
215 | return cell['text']
216 | else:
217 | # Cell is a dict but doesn't match expected structure
218 | return str(cell)
219 | else:
220 | # Cell is a string or other simple type
221 | return str(cell)
222 |
223 | def process_table(table_data, html):
224 | """Convert table data to HTML table"""
225 | html.append('<table>')
226 |
227 | # Check if first row should be treated as header
228 | has_header = False
229 | if len(table_data) > 1:
230 | # Heuristic: if first row contains mostly text content, treat as header
231 | first_row = table_data[0]
232 | text_cells = 0
233 | for cell in first_row:
234 | if isinstance(cell, str) and cell.strip():
235 | text_cells += 1
236 | elif isinstance(cell, dict) and cell.get('text', '').strip():
237 | text_cells += 1
238 |
239 | if text_cells >= len(first_row) / 2: # At least half the cells have text
240 | has_header = True
241 |
242 | for i, row in enumerate(table_data):
243 | html.append('<tr>')
244 | for cell in row:
245 | # Use th for header cells, otherwise td
246 | tag = 'th' if has_header and i == 0 else 'td'
247 | cell_content = process_table_cell(cell)
248 | html.append(f'<{tag}>{cell_content}</{tag}>')
249 | html.append('</tr>')
250 |
251 | html.append('</table>')
--------------------------------------------------------------------------------
/example_output/html/unnest.txt:
--------------------------------------------------------------------------------
1 | PART IV
2 | ITEM 15. EXHIBIT AND FINANCIAL STATEMENT SCHEDULES
3 | (a)Financial Statements and Schedules
4 | The financial statements are set forth under Part II, Item 8 of this Form 10-K, as indexed below. Financial statement schedules have been omitted since they either are not required, not applicable, or the information is otherwise included.
5 | Index to Financial Statements Page
6 | Income Statements 56
7 | Comprehensive Income Statements 57
8 | Balance Sheets 58
9 | Cash Flows Statements 59
10 | Stockholders’ Equity Statements 60
11 | Notes to Financial Statements 61
12 | Report of Independent Registered Public Accounting Firm 94
13 | (b)Exhibit Listing
14 | Incorporated by Reference Incorporated by Reference Incorporated by Reference Incorporated by Reference
15 | Exhibit
16 | Number Exhibit Description Filed
17 | Herewith Form Period
18 | Ending Exhibit Filing Date
19 | 3.1 Amended and Restated Articles of Incorporation of Microsoft Corporation 8-K 3.1 12/1/2016
20 | 3.2 Bylaws of Microsoft Corporation 8-K 3.2 7/3/2023
21 | 4.1 Indenture, dated as of May 18, 2009, between Microsoft Corporation and The Bank of New York Mellon Trust Company, N.A., as Trustee (“Base Indenture”) S-3ASR 4.1 10/29/2015
22 | 4.2 Form of First Supplemental Indenture for 2.95% Notes due 2014, 4.20% Notes due 2019, and 5.20% Notes due 2039, dated as of May 18, 2009, between Microsoft Corporation and The Bank of New York Mellon Trust Company, N.A., as Trustee, to the Base Indenture 8-K 4.2 5/15/2009
23 | 4.5 Form of Second Supplemental Indenture for 0.875% Notes due 2013, 1.625% Notes due 2015, 3.00% Notes due 2020, and 4.50% Notes due 2040, dated as of September 27, 2010, between Microsoft Corporation and The Bank of New York Mellon Trust Company, N.A., as Trustee, to the Indenture, dated as of May 18, 2009, between Microsoft Corporation and The Bank of New York Mellon Trust Company, N.A., as Trustee 8-K 4.2 9/27/2010
24 | 103
25 | PART IV
26 | Item 15
27 | Incorporated by Reference Incorporated by Reference Incorporated by Reference Incorporated by Reference
28 | Exhibit
29 | Number Exhibit Description Filed
30 | Herewith Form Period
31 | Ending Exhibit Filing Date
32 | 4.6 Third Supplemental Indenture for 2.500% Notes due 2016, 4.000% Notes due 2021, and 5.300% Notes due 2041, dated as of February 8, 2011, between Microsoft Corporation and The Bank of New York Mellon Trust Company, N.A., as Trustee, to the Indenture, dated as of May 18, 2009, between Microsoft Corporation and The Bank of New York Mellon Trust Company, N.A., as Trustee 8-K 4.2 2/8/2011
33 | 4.7 Fourth Supplemental Indenture for 0.875% Notes due 2017, 2.125% Notes due 2022, and 3.500% Notes due 2042, dated as of November 7, 2012, between Microsoft Corporation and The Bank of New York Mellon Trust Company, N.A., as Trustee, to the Indenture, dated as of May 18, 2009, between Microsoft Corporation and The Bank of New York Mellon Trust Company, N.A., as Trustee 8-K 4.1 11/7/2012
34 | 4.8 Fifth Supplemental Indenture for 2.625% Notes due 2033, dated as of May 2, 2013, between Microsoft Corporation and The Bank of New York Mellon Trust Company, N.A., as Trustee, to the Indenture, dated as of May 18, 2009, between Microsoft Corporation and The Bank of New York Mellon Trust Company, N.A., as Trustee 8-K 4.1 5/1/2013
35 | 4.9 Sixth Supplemental Indenture for 1.000% Notes due 2018, 2.375% Notes due 2023, and 3.750% Notes due 2043, dated as of May 2, 2013, between Microsoft Corporation and The Bank of New York Mellon Trust Company, N.A., as Trustee, to the Indenture, dated as of May 18, 2009, between Microsoft Corporation and The Bank of New York Mellon Trust Company, N.A., as Trustee 8-K 4.2 5/1/2013
36 | 4.10 Seventh Supplemental Indenture for 2.125% Notes due 2021 and 3.125% Notes due 2028, dated as of December 6, 2013, between Microsoft Corporation and The Bank of New York Mellon Trust Company, N.A., as Trustee, to the Indenture, dated as of May 18, 2009, between Microsoft Corporation and The Bank of New York Mellon Trust Company, N.A., as Trustee 8-K 4.1 12/6/2013
37 | 104
38 | PART IV
39 | Item 15
40 | Incorporated by Reference Incorporated by Reference Incorporated by Reference Incorporated by Reference
41 | Exhibit
42 | Number Exhibit Description Filed
43 | Herewith Form Period
44 | Ending Exhibit Filing Date
45 | 4.11 Eighth Supplemental Indenture for 1.625% Notes due 2018, 3.625% Notes due 2023, and 4.875% Notes due 2043, dated as of December 6, 2013, between Microsoft Corporation and The Bank of New York Mellon Trust Company, N.A., as Trustee, to the Indenture, dated as of May 18, 2009, between Microsoft Corporation and The Bank of New York Mellon Trust Company, N.A., as Trustee 8-K 4.2 12/6/2013
46 | 4.12 Ninth Supplemental Indenture for 1.850% Notes due 2020, 2.375% Notes due 2022, 2.700% Notes due 2025, 3.500% Notes due 2035, 3.750% Notes due 2045, and 4.000% Notes due 2055, dated as of February 12, 2015, between Microsoft Corporation and U.S. Bank National Association, as Trustee, to the Indenture, dated as of May 18, 2009, between Microsoft Corporation and The Bank of New York Mellon Trust Company, N.A., as trustee 8-K 4.1 2/12/2015
47 | 4.13 Tenth Supplemental Indenture for 1.300% Notes due 2018, 2.000% Notes due 2020, 2.650% Notes due 2022, 3.125% Notes due 2025, 4.200% Notes due 2035, 4.450% Notes due 2045, and 4.750% Notes due 2055, dated as of November 3, 2015, between Microsoft Corporation and U.S. Bank National Association, as Trustee, to the Indenture, dated as of May 18, 2009, between Microsoft Corporation and The Bank of New York Mellon Trust Company, N.A., as trustee 8-K 4.1 11/3/2015
48 | 4.14 Eleventh Supplemental Indenture for 1.100% Notes due 2019, 1.550% Notes due 2021, 2.000% Notes due 2023, 2.400% Notes due 2026, 3.450% Notes due 2036, 3.700% Notes due 2046, and 3.950% Notes due 2056, dated as of August 8, 2016, between Microsoft Corporation and U.S. Bank, National Association, as Trustee, to the Indenture, dated as of May 18, 2009, between Microsoft Corporation and The Bank of New York Mellon Trust Company, N.A., as trustee 8-K 4.1 8/5/2016
49 | 105
50 | PART IV
51 | Item 15
52 | Incorporated by Reference Incorporated by Reference Incorporated by Reference Incorporated by Reference
53 | Exhibit
54 | Number Exhibit Description Filed
55 | Herewith Form Period
56 | Ending Exhibit Filing Date
57 | 4.15 Twelfth Supplemental Indenture for 1.850% Notes due 2020, 2.400% Notes due 2022, 2.875% Notes due 2024, 3.300% Notes due 2027, 4.100% Notes due 2037, 4.250% Notes due 2047, and 4.500% Notes due 2057, dated as of February 6, 2017, between Microsoft Corporation and The Bank of New York Mellon Trust Company, N.A., as Trustee, to the Indenture, dated as of May 18, 2009, between Microsoft Corporation and The Bank of New York Mellon Trust Company, N.A., as trustee 8-K 4.1 2/3/2017
58 | 4.16 Thirteenth Supplemental Indenture for 2.525% Notes due 2050 and 2.675% Notes due 2060, dated as of June 1, 2020, between Microsoft Corporation and U.S. Bank National Association, as Trustee, to the Indenture, dated as of May 18, 2009, between Microsoft Corporation and The Bank of New York Mellon Trust Company, N.A., as trustee 8-K 4.1 6/1/2020
59 | 4.17 Fourteenth Supplemental Indenture for 2.921% Notes due 2052 and 3.041% Notes due 2062, dated as of March 17, 2021, between Microsoft Corporation and The Bank of New York Mellon Trust Company, N.A., as Trustee, to the Indenture, dated as of May 18, 2009, between Microsoft Corporation and The Bank of New York Mellon Trust Company, N.A., as trustee 8-K 4.1 3/17/2021
60 | 4.18 Fifteenth Supplemental Indenture, dated as of November 6, 2023, by and between Microsoft Corporation and The Bank of New York Mellon Trust Company, N.A., as Trustee 8-K 4.2 11/6/2023
61 | 4.19 Indenture, dated as of September 19, 2016, by and between Activision Blizzard, Inc. and Wells Fargo Bank, National Association, as Trustee, with respect to Activision Blizzard, Inc.’s 3.400% Senior Notes due 2026 8-K 4.9 11/6/2023
62 | 106
63 | PART IV
64 | Item 15
65 | Incorporated by Reference Incorporated by Reference Incorporated by Reference Incorporated by Reference
66 | Exhibit
67 | Number Exhibit Description Filed
68 | Herewith Form Period
69 | Ending Exhibit Filing Date
70 | 4.20 Base Indenture, dated as of May 26, 2017, by and between Activision Blizzard, Inc. and Wells Fargo Bank, National Association, as Trustee, with respect to Activision Blizzard, Inc.’s 3.400% Senior Notes due 2027, 1.350% Senior Notes due 2030, 4.500% Senior Notes due 2047 and 2.500% Senior Notes due 2050 8-K 4.10 11/6/2023
71 | 4.21 First Supplemental Indenture, dated as of May 26, 2017, by and between Activision Blizzard, Inc. and Wells Fargo Bank, National Association, as Trustee, with respect to Activision Blizzard, Inc.’s 3.400% Senior Notes due 2027 and 4.500% Senior Notes due 2047 8-K 4.11 11/6/2023
72 | 4.22 Second Supplemental Indenture, dated as of August 10, 2020, by and between Activision Blizzard, Inc. and Wells Fargo Bank, National Association, as Trustee, with respect to Activision Blizzard, Inc.’s 1.350% Senior Notes due 2030 and 2.500% Senior Notes due 2050 8-K 4.12 11/6/2023
73 | 4.23 First Supplemental Indenture, dated as of October 27, 2023, by and between Activision Blizzard, Inc. and Computershare Trust Company, N.A., with respect to Activision Blizzard, Inc.’s 3.400% Senior Notes due 2026 8-K 4.13 11/6/2023
74 | 4.24 Third Supplemental Indenture, dated as of October 27, 2023, by and between Activision Blizzard, Inc. and Computershare Trust Company, N.A., with respect to Activision Blizzard, Inc.’s 3.400% Senior Notes due 2027 and 4.500% Senior Notes due 2047 8-K 4.14 11/6/2023
75 | 4.25 Fourth Supplemental Indenture, dated as of October 27, 2023, by and between Activision Blizzard, Inc. and Computershare Trust Company, N.A., with respect to Activision Blizzard, Inc.’s 1.350% Senior Notes due 2030 and 2.500% Senior Notes due 2050 8-K 4.15 11/6/2023
76 | 4.26 Description of Securities X
77 | 10.1* Microsoft Corporation 2001 Stock Plan 10-Q 9/30/2016 10.1 10/20/2016
78 | 10.4* Microsoft Corporation Employee Stock Purchase Plan 10-K 6/30/2012 10.4 7/26/2012
79 | 107
80 | PART IV
81 | Item 15
82 | Incorporated by Reference Incorporated by Reference Incorporated by Reference Incorporated by Reference
83 | Exhibit
84 | Number Exhibit Description Filed
85 | Herewith Form Period
86 | Ending Exhibit Filing Date
87 | 10.5* Microsoft Corporation Deferred Compensation Plan X
88 | 10.6* Microsoft Corporation 2017 Stock Plan DEF14A Annex C 10/16/2017
89 | 10.7* Form of Stock Award Agreement Under the Microsoft Corporation 2017 Stock Plan 10-Q 3/31/2018 10.26 4/26/2018
90 | 10.8* Form of Performance Stock Award Agreement Under the Microsoft Corporation 2017 Stock Plan 10-Q 3/31/2018 10.27 4/26/2018
91 | 10.9 Amended and Restated Officers’ Indemnification Trust Agreement between Microsoft Corporation and The Bank of New York Mellon Trust Company, N.A., as trustee 10-Q 9/30/2016 10.12 10/20/2016
92 | 10.10 Assumption of Beneficiaries’ Representative Obligations Under Amended and Restated Officers’ Indemnification Trust Agreement 10-K 6/30/2020 10.25 7/30/2020
93 | 10.11 Form of Indemnification Agreement and Amended and Restated Directors’ Indemnification Trust Agreement between Microsoft Corporation and The Bank of New York Mellon Trust Company, N.A., as trustee 10-K 6/30/2019 10.13 8/1/2019
94 | 10.12 Assumption of Beneficiaries’ Representative Obligations Under Amended and Restated Directors’ Indemnification Trust Agreement 10-K 6/30/2020 10.26 7/30/2020
95 | 10.14* Microsoft Corporation Deferred Compensation Plan for Non-Employee Directors 10-Q 12/31/2017 10.14 1/31/2018
96 | 10.15* Microsoft Corporation Executive Incentive Plan 8-K 10.1 9/19/2018
97 | 10.19* Microsoft Corporation Executive Incentive Plan 10-Q 9/30/2016 10.17 10/20/2016
98 | 10.20* Form of Executive Incentive Plan (Executive Officer SAs) Stock Award Agreement under the Microsoft Corporation 2001 Stock Plan 10-Q 9/30/2016 10.18 10/20/2016
99 | 10.21* Form of Executive Incentive Plan Performance Stock Award Agreement under the Microsoft Corporation 2001 Stock Plan 10-Q 9/30/2016 10.25 10/20/2016
100 | 10.22* Senior Executive Severance Benefit Plan 10-Q 9/30/2016 10.22 10/20/2016
101 | 10.23* Offer Letter, dated February 3, 2014, between Microsoft Corporation and Satya Nadella 8-K 10.1 2/4/2014
102 | 108
103 | PART IV
104 | Item 15
105 | Incorporated by Reference Incorporated by Reference Incorporated by Reference Incorporated by Reference
106 | Exhibit
107 | Number Exhibit Description Filed
108 | Herewith Form Period
109 | Ending Exhibit Filing Date
110 | 10.24* Long-Term Performance Stock Award Agreement between Microsoft Corporation and Satya Nadella 10-Q 12/31/2014 10.24 1/26/2015
111 | 10.25* Offer Letter, dated October 25, 2020, between Microsoft Corporation and Christopher Young 10-Q 9/30/2021 10.27 10/26/2021
112 | 19.1 General Insider Trading Policy X
113 | 19.2 Restricted Trading Window Policy X
114 | 19.3 Insider Trading Compliance and Preclearance Policies for Section 16 Officers and Directors of Microsoft X
115 | 21 Subsidiaries of Registrant X
116 | 23.1 Consent of Independent Registered Public Accounting Firm X
117 | 31.1 Certification of Chief Executive Officer Pursuant to Section 302 of the Sarbanes-Oxley Act of 2002 X
118 | 31.2 Certification of Chief Financial Officer Pursuant to Section 302 of the Sarbanes-Oxley Act of 2002 X
119 | 32.1** Certification of Chief Executive Officer Pursuant to Section 906 of the Sarbanes-Oxley Act of 2002 X
120 | 32.2** Certification of Chief Financial Officer Pursuant to Section 906 of the Sarbanes-Oxley Act of 2002 X
121 | 97.1* Microsoft Corporation Executive Compensation Recovery Policy X
122 | 101.INS Inline XBRL Instance Document—the instance document does not appear in the Interactive Data File as its XBRL tags are embedded within the Inline XBRL document X
123 | 101.SCH Inline XBRL Taxonomy Extension Schema With Embedded Linkbase Documents X
124 | 104 Cover page formatted as Inline XBRL and contained in Exhibit 101 X
125 | * Indicates a management contract or compensatory plan or arrangement.
126 | ** Furnished, not filed.
127 | 109
128 | PART IV
129 | Item 16
130 | ITEM 16. FORM 10-K SUMMARY
131 | None.
132 | 110
--------------------------------------------------------------------------------
/doc2dict/doc2dict/convert_instructions_to_dict.py:
--------------------------------------------------------------------------------
1 | # TODO
2 | # rewrite this to set up modular stuff
3 | # e.g. preprocessing like wraparound
4 |
5 | import re
6 | from importlib.metadata import version
7 |
8 | __version__ = version("doc2dict")
9 |
10 | LIKELY_HEADER_ATTRIBUTES = ['bold', 'italic', 'underline', 'text-center', 'all_caps', 'fake_table','proper_case']
11 |
12 | def remove_empty_contents(obj):
13 | """Recursively remove empty contents dictionaries"""
14 | if isinstance(obj, dict):
15 | if 'contents' in obj and not obj['contents']:
16 | del obj['contents']
17 | else:
18 | for value in obj.values():
19 | remove_empty_contents(value)
20 |
21 | def create_level(level_num=-1, class_name='text', title='', attributes=None):
22 | """Factory function to create level dictionaries with all required fields"""
23 | return {
24 | 'level': level_num,
25 | 'class': class_name,
26 | 'standardized_title': title,
27 | 'attributes': attributes or {}
28 | }
29 |
30 |
31 | def split_header_instructions(instructions_list):
32 | """
33 | Splits instruction groups where the first instruction would be classified as a header.
34 |
35 | Args:
36 | instructions_list: List of instruction groups (each group is a list of instructions)
37 |
38 | Returns:
39 | New list of instruction groups with headers separated from their content
40 | """
41 |
42 |
43 | # First, detect big_script like in determine_levels
44 | text_instructions = [instr[0] for instr in instructions_list if 'text' in instr[0]]
45 | font_size_counts = {size: sum(1 for item in text_instructions if item.get('font-size') == size)
46 | for size in set(item.get('font-size') for item in text_instructions if item.get('font-size') is not None)}
47 |
48 | big_script = [False] * len(instructions_list)
49 | if font_size_counts:
50 | most_common_font_size, font_count = max(font_size_counts.items(), key=lambda x: x[1])
51 | if font_count > (0.5 * len(instructions_list)):
52 | # Check for big script (>20% larger than most common)
53 | for idx, instructions in enumerate(instructions_list):
54 | first = instructions[0]
55 | if 'text' in first and first.get('font-size') is not None:
56 | if first.get('font-size') > (1.2 * most_common_font_size):
57 | big_script[idx] = True
58 |
59 | # Now split instruction groups
60 | new_instructions_list = []
61 |
62 | for idx, instructions in enumerate(instructions_list):
63 | # Skip if only one instruction - nothing to split
64 | if len(instructions) <= 1:
65 | new_instructions_list.append(instructions)
66 | continue
67 |
68 | first_instruction = instructions[0]
69 |
70 | # Check if first instruction would be classified as a header
71 | is_header = False
72 | if 'text' in first_instruction:
73 | # Check for header attributes or big_script
74 | has_header_attrs = any(first_instruction.get(attr, False) for attr in LIKELY_HEADER_ATTRIBUTES)
75 | if has_header_attrs or big_script[idx]:
76 | is_header = True
77 |
78 | if is_header:
79 | # Split: first instruction becomes its own group, rest become another group
80 | new_instructions_list.append([first_instruction])
81 | if len(instructions) > 1: # Add remaining instructions as separate group
82 | new_instructions_list.append(instructions[1:])
83 | else:
84 | # Keep as is - no splitting needed
85 | new_instructions_list.append(instructions)
86 |
87 | return new_instructions_list
88 |
89 |
90 | # AI GENERATED CODE BC I WANT TO PUSH TO PROD #
91 | def determine_predicted_header_levels(levels):
92 | """
93 | Assigns hierarchy levels to predicted headers based on their attributes,
94 | maintaining consistency within each section defined by known headers.
95 |
96 | Args:
97 | levels: List of dictionaries containing level, class, and attributes
98 |
99 | Returns:
100 | List of tuples in the format (level, class)
101 | """
102 | # Find the base level for predicted headers
103 | predicted_headers = [l for l in levels if l['class'] == 'predicted header']
104 | if not predicted_headers:
105 | return [(level['level'], level['class'], level.get('standardized_title','')) for level in levels]
106 |
107 | base_level = min(h['level'] for h in predicted_headers)
108 |
109 | # Create a copy of levels that we'll modify
110 | updated_levels = levels.copy()
111 |
112 | # Track the last known header level
113 | current_section_level = -1
114 |
115 | # Dictionary to map attribute combinations to levels within the current section
116 | # Format: {attribute_key: assigned_level}
117 | attr_level_map = {}
118 |
119 | # Helper function to create a key from attributes dictionary
120 | def attr_to_key(attrs):
121 | if not attrs:
122 | return "no_attributes"
123 | # Sort keys to ensure consistent mapping regardless of order
124 | return "_".join(sorted([k for k, v in attrs.items() if v]))
125 |
126 | # Process each item
127 | for i, item in enumerate(updated_levels):
128 | # When we hit a known header, reset our attribute mapping
129 | if item['class'] != 'predicted header' and item['class'] not in ['text', 'textsmall']:
130 | if item['level'] <= current_section_level:
131 | # We've entered a new section at same or higher level, reset mappings
132 | attr_level_map = {}
133 | current_section_level = item['level']
134 | continue
135 |
136 | # Skip non-header items
137 | if item['class'] != 'predicted header':
138 | continue
139 |
140 | # Create a key for this item's attributes
141 | attr_key = attr_to_key(item.get('attributes', {}))
142 |
143 | # If we haven't seen this attribute combination in this section,
144 | # assign it the next available level
145 | if attr_key not in attr_level_map:
146 | attr_level_map[attr_key] = base_level + len(attr_level_map)
147 |
148 | # Assign the level based on the mapping
149 | item['level'] = attr_level_map[attr_key]
150 |
151 | # Return in the required format
152 | return [(level['level'], level['class'], level.get('standardized_title','')) for level in updated_levels]
153 | # AI GENERATED CODE BC I WANT TO PUSH TO PROD #
154 |
155 | def extract_cell_content(cell):
156 | """Helper function to extract content from table cells that may contain text or images"""
157 | if 'image' in cell:
158 | return cell # Return the full cell structure for images
159 | else:
160 | return cell.get("text", "") # Return text content or empty string
161 |
162 | def determine_levels(instructions_list, mapping_dict=None):
163 | if mapping_dict is None:
164 | predicted_header_level = 0
165 | #TODO bandaid fix
166 | elif 'rules' in mapping_dict:
167 | predicted_header_level = 0
168 | else:
169 | predicted_header_level = max(mapping_dict.values()) + 1
170 |
171 | # filter out tables, include both text and image instructions
172 | headers = []
173 | for instructions in instructions_list:
174 | first_instruction = instructions[0]
175 | if 'text' in first_instruction or 'image' in first_instruction:
176 | headers.append(first_instruction)
177 | else:
178 | headers.append({})
179 |
180 |
181 | # count font-size (only for text instructions)
182 | small_script = [False] * len(headers)
183 | big_script = [False] * len(headers)
184 | text_instructions = [instr[0] for instr in instructions_list if 'text' in instr[0]]
185 | font_size_counts = {size: sum(1 for item in text_instructions if item.get('font-size') == size) for size in set(item.get('font-size') for item in text_instructions if item.get('font-size') is not None)}
186 |
187 | # use only font size goes here
188 | if mapping_dict is not None:
189 | if 'rules' in mapping_dict:
190 | if 'use_font_size_only_for_level' in mapping_dict['rules']:
191 | # Filter headers first for this special case
192 | headers = [item if 'text' in item and any([item.get(attr, False) for attr in LIKELY_HEADER_ATTRIBUTES]) else {} for item in headers]
193 |
194 | most_common_font_size, font_count = max(font_size_counts.items(), key=lambda x: x[1])
195 |
196 | # Get all unique font sizes and sort them in descending order (largest font = level 0, next = level 1, etc.)
197 | unique_font_sizes = sorted(font_size_counts.keys(), reverse=True)
198 |
199 | # Create a mapping from font size to level (largest font = level 0, next = level 1, etc.)
200 | font_size_to_level = {size: idx for idx, size in enumerate(unique_font_sizes)}
201 |
202 | levels = []
203 | for idx, header in enumerate(headers):
204 | if 'text' in header and header.get('font-size') is not None:
205 | font_size = header.get('font-size')
206 |
207 | if font_size < most_common_font_size:
208 | # Assign small script for fonts smaller than most common
209 | level = (-2,'textsmall','')
210 | else:
211 | # Assign level based on font size hierarchy
212 | hierarchy_level = font_size_to_level[font_size]
213 | level = (hierarchy_level, 'predicted header','')
214 | else:
215 | # No font size information or not text, treat as regular text
216 | level = (-1, 'text','')
217 |
218 | levels.append(level)
219 |
220 | return levels
221 |
222 | # Detect font sizes first (before filtering headers)
223 | if font_size_counts != {}:
224 | most_common_font_size, font_count = max(font_size_counts.items(), key=lambda x: x[1])
225 | if font_count > (.5 * len(instructions_list)):
226 | # assume anything with less than this font size is small script
227 | small_script = [True if 'text' in item and item.get('font-size') is not None and item.get('font-size') < most_common_font_size else False for item in headers]
228 |
229 | # assume anything with more than 20% of the most common font size is big script
230 | big_script = [True if 'text' in item and item.get('font-size') is not None and item.get('font-size') > (1.2 * most_common_font_size) else False for item in headers]
231 |
232 | # NOW filter headers after font size detection (includes big_script in the filtering)
233 | headers = [item if 'text' in item and (any([item.get(attr, False) for attr in LIKELY_HEADER_ATTRIBUTES]) or big_script[idx]) else {} for idx, item in enumerate(headers)]
234 |
235 | levels = []
236 | for idx,header in enumerate(headers):
237 | level = None
238 | attributes = {attr: header.get(attr, False) for attr in LIKELY_HEADER_ATTRIBUTES if attr in header}
239 |
240 | if small_script[idx]:
241 | level = create_level(-2, 'textsmall')
242 | elif 'text' in header:
243 | if mapping_dict is not None:
244 | text = header['text'].lower()
245 | regex_tuples = [(item[0][1], item[0][0], item[1]) for item in mapping_dict.items()]
246 |
247 | for regex, header_class, hierarchy_level in regex_tuples:
248 | match = re.match(regex, text.strip())
249 | if match:
250 | # create a dictionary of attributes from LIKELY_HEADER_ATTRIBUTES
251 | match_groups = match.groups()
252 | if len(match_groups) > 0:
253 | string = ''.join([str(x) for x in match_groups if x is not None])
254 | standardized_title = f'{header_class}{string}'
255 | else:
256 | standardized_title = f'{header_class}'
257 | level = create_level(hierarchy_level, header_class, standardized_title, attributes)
258 | break
259 |
260 | if level is None:
261 | # Check for header attributes OR big_script
262 | if any([header.get(attr,False) for attr in LIKELY_HEADER_ATTRIBUTES]) or big_script[idx]:
263 | level = create_level(predicted_header_level, 'predicted header', '', attributes)
264 |
265 | if level is None:
266 | level = create_level(-1, 'text')
267 |
268 | levels.append(level)
269 |
270 | # NOW USE SEQUENCE AND ATTRIBUTES IN THE LEVELS TO DETERMINE HIERARCHY FOR PREDICTED HEADERS
271 | levels = determine_predicted_header_levels(levels)
272 | return levels
273 |
274 | def convert_instructions_to_dict(instructions_list, mapping_dict=None):
275 |
276 | # add filtering stage here
277 |
278 | # CHANGE: Split mixed header-content groups first
279 | instructions_list = split_header_instructions(instructions_list)
280 |
281 | # Get pre-calculated levels for each instruction
282 | levels = determine_levels(instructions_list, mapping_dict)
283 |
284 | # Initialize document structure
285 | document = {'contents': {}}
286 |
287 | # Create an introduction section
288 | introduction = {'title': 'introduction', 'class': 'introduction', 'contents': {}}
289 |
290 | # Add the introduction to the document
291 | document['contents'][-1] = introduction
292 |
293 | # Keep track of current position in hierarchy
294 | current_section = introduction
295 | current_path = [document, introduction] # Path from root to current section
296 | current_levels = [-1, 0] # Corresponding hierarchy levels
297 |
298 | # Process each instruction using pre-calculated levels
299 | for idx, instructions in enumerate(instructions_list):
300 | instruction = instructions[0]
301 | level, level_class, standardized_title = levels[idx]
302 |
303 | if level >= 0:
304 | # This is a section header
305 |
306 | # Pop hierarchy until finding appropriate parent
307 | while len(current_levels) > 1 and current_levels[-1] >= level:
308 | current_path.pop()
309 | current_levels.pop()
310 |
311 | # Extract title from the instruction (only text instructions can be headers)
312 | if 'text' in instruction:
313 | title = ''.join([instr['text'] for instr in instructions if 'text' in instr])
314 | else:
315 | title = '[Non-text header]' # Fallback, though this shouldn't happen
316 |
317 | # Create new section, in correct order
318 | new_section = {'title': title}
319 | if standardized_title: # Add right after title
320 | new_section['standardized_title'] = standardized_title
321 | new_section['class'] = level_class
322 | new_section['contents'] = {}
323 |
324 | # Add section to parent's contents with index as key
325 | parent = current_path[-1]
326 | parent['contents'][idx] = new_section
327 |
328 | # Update tracking
329 | current_path.append(new_section)
330 | current_levels.append(level)
331 | current_section = new_section
332 |
333 | # CHANGE: Removed mixed content handling here since groups are now pure
334 |
335 | # CHANGE: Simplified - only process regular content (no mixed groups anymore)
336 | if level in [-1, -2]:
337 | for instruction in instructions:
338 | if 'text' in instruction:
339 | if not current_section['contents'].get(idx):
340 | current_section['contents'][idx] = {level_class: ''}
341 | if level_class in current_section['contents'][idx]:
342 | current_section['contents'][idx][level_class] += instruction['text']
343 | else:
344 | current_section['contents'][idx][level_class] = instruction['text']
345 | elif 'image' in instruction:
346 | current_section['contents'][idx] = {'image': instruction['image']}
347 | elif 'table' in instruction:
348 | current_section['contents'][idx] = {'table': [[extract_cell_content(cell) for cell in row] for row in instruction['table']]}
349 |
350 | # Create final result with metadata
351 | result = {
352 | 'metadata': {
353 | 'parser': 'doc2dict',
354 | 'github': 'https://github.com/john-friedman/doc2dict',
355 | "version": __version__,
356 | },
357 | 'document': document['contents']
358 | }
359 |
360 | remove_empty_contents(result)
361 | return result
--------------------------------------------------------------------------------
/doc2dict/doc2dict/mapping.py:
--------------------------------------------------------------------------------
1 | import re
2 |
3 | def flatten_hierarchy(content, sep='\n'):
4 | result = []
5 |
6 | def process_node(node):
7 | if isinstance(node, str):
8 | if node.strip():
9 | result.append(node.strip())
10 | return
11 |
12 | if isinstance(node, list):
13 | for item in node:
14 | process_node(item)
15 | return
16 |
17 | if isinstance(node, dict):
18 | if node.get('text') and not node.get('content'):
19 | result.append(node['text'].strip())
20 |
21 | if node.get('content'):
22 | process_node(node['content'])
23 |
24 | for key, value in node.items():
25 | if key not in ('type', 'text', 'content'):
26 | process_node(value)
27 |
28 | process_node(content)
29 | return sep.join(result)
30 |
31 | class JSONTransformer:
32 | def __init__(self, mapping_dict):
33 | """Initialize transformer with mapping dictionary."""
34 | self.mapping_dict = mapping_dict
35 | self.id_to_text = {}
36 | self.used_matches = set()
37 |
38 | def _find_refs(self, data, search_key):
39 | """Find all references based on search key in the data."""
40 | matches = []
41 |
42 | if isinstance(data, dict):
43 | if search_key in data:
44 | matches.append(data)
45 | for value in data.values():
46 | matches.extend(self._find_refs(value, search_key))
47 | elif isinstance(data, list):
48 | for item in data:
49 | matches.extend(self._find_refs(item, search_key))
50 |
51 | return matches
52 |
53 | def _extract_ref_ids(self, ref_data, search_id):
54 | """Extract reference IDs from either dict or list data."""
55 | if isinstance(ref_data, dict):
56 | ref_id = ref_data.get(search_id)
57 | return [ref_id] if ref_id is not None else []
58 | elif isinstance(ref_data, list):
59 | ids = []
60 | for item in ref_data:
61 | if isinstance(item, dict):
62 | ref_id = item.get(search_id)
63 | if ref_id is not None:
64 | ids.append(ref_id)
65 | return ids
66 | return []
67 |
68 | def _find_content(self, data, match_identifier, match_content):
69 | """Find all content entries in the data that match the identifier and content pattern."""
70 | matches = []
71 |
72 | if isinstance(data, dict):
73 | if match_identifier in data and match_content in data:
74 | matches.append(data)
75 | for value in data.values():
76 | matches.extend(self._find_content(value, match_identifier, match_content))
77 | elif isinstance(data, list):
78 | for item in data:
79 | matches.extend(self._find_content(item, match_identifier, match_content))
80 |
81 | return matches
82 |
83 | def _build_mapping(self, data, transformation):
84 | """Build mapping between identifiers and their content."""
85 | match_rule = transformation['match']
86 | id_key = match_rule['identifier']
87 | content_key = match_rule['content']
88 |
89 | content_matches = self._find_content(data, id_key, content_key)
90 |
91 | for match in content_matches:
92 | if id_key in match and content_key in match:
93 | self.id_to_text[match[id_key]] = match[content_key]
94 | if match_rule.get('remove_after_use', False):
95 | self.used_matches.add(match[id_key])
96 |
97 | def _remove_used_content(self, data, match_rule):
98 | """Remove the used content entries based on match rule."""
99 | if isinstance(data, dict):
100 | id_key = match_rule['identifier']
101 |
102 | if id_key in data and data.get(id_key) in self.used_matches:
103 | return None
104 |
105 | result = {}
106 | for k, v in data.items():
107 | processed = self._remove_used_content(v, match_rule)
108 | if processed is not None:
109 | result[k] = processed
110 |
111 | return result if result else None
112 |
113 | elif isinstance(data, list):
114 | result = [item for item in data
115 | if (processed := self._remove_used_content(item, match_rule)) is not None]
116 | return result if result else None
117 |
118 | return data
119 |
120 | def _apply_standardization(self, data, transformation):
121 | """Apply standardization rules to transform text based on regex pattern."""
122 | if isinstance(data, dict):
123 | if data.get('type') == transformation['match']['type'] and 'text' in data:
124 | pattern = transformation['match']['text_pattern']
125 | match = re.match(pattern, data['text'])
126 | if match:
127 | value = match.group(1)
128 | output_field = transformation['output'].get('field', 'text')
129 | data[output_field] = transformation['output']['format'].format(value.lower())
130 |
131 | for value in data.values():
132 | if isinstance(value, (dict, list)):
133 | self._apply_standardization(value, transformation)
134 |
135 | elif isinstance(data, list):
136 | for item in data:
137 | if isinstance(item, (dict, list)):
138 | self._apply_standardization(item, transformation)
139 |
140 | def _apply_trim(self, data, transformation):
141 | if not isinstance(data, dict) or 'content' not in data:
142 | return data
143 |
144 | match_type = transformation['match']['type']
145 | expected = transformation['match'].get('expected')
146 | output_type = transformation['output']['type']
147 |
148 | matches = []
149 | def find_matches(content, current_path=[]):
150 | for i, item in enumerate(content):
151 | if isinstance(item, dict):
152 | if item.get('type') == match_type and 'text' in item:
153 | matches.append({
154 | 'path': current_path + [i],
155 | 'text': item['text']
156 | })
157 | if 'content' in item:
158 | find_matches(item['content'], current_path + [i, 'content'])
159 |
160 | find_matches(data['content'])
161 | if not matches:
162 | return data
163 |
164 | text_groups = {}
165 | for match in matches:
166 | text = match['text']
167 | if text not in text_groups:
168 | text_groups[text] = []
169 | text_groups[text].append(match['path'])
170 |
171 | result = {'type': output_type}
172 | for text, paths in text_groups.items():
173 | if len(paths) > expected:
174 | if expected == 0:
175 | result['content'] = [flatten_hierarchy(data['content'])]
176 | data['content'] = [result]
177 | else:
178 | split_path = paths[expected]
179 | split_idx = split_path[0]
180 | before_content = data['content'][:split_idx]
181 | result['content'] = [flatten_hierarchy(before_content)]
182 | data['content'] = data['content'][split_idx:]
183 | data['content'].insert(0, result)
184 | break
185 |
186 | return data
187 |
188 | def _apply_consecutive_merge(self, data, transformation):
189 | """Merge consecutive sections with same type and text."""
190 | if isinstance(data, dict):
191 | if 'content' in data and isinstance(data['content'], list):
192 | new_content = []
193 | current_section = None
194 |
195 | for item in data['content']:
196 | if (isinstance(item, dict) and
197 | item.get('type') in transformation['match']['types'] and
198 | 'text' in item):
199 | if (current_section and
200 | current_section['type'] == item['type'] and
201 | current_section['text'] == item['text']):
202 | current_section['content'].extend(item['content'])
203 | else:
204 | if current_section:
205 | new_content.append(current_section)
206 | current_section = item
207 | else:
208 | if current_section:
209 | new_content.append(current_section)
210 | current_section = None
211 | new_content.append(item)
212 |
213 | if current_section:
214 | new_content.append(current_section)
215 |
216 | data['content'] = new_content
217 |
218 | for value in data.values():
219 | if isinstance(value, (dict, list)):
220 | self._apply_consecutive_merge(value, transformation)
221 |
222 | elif isinstance(data, list):
223 | for item in data:
224 | if isinstance(item, (dict, list)):
225 | self._apply_consecutive_merge(item, transformation)
226 |
227 | def transform(self, data):
228 | """Transform the data according to the mapping dictionary."""
229 | result = data.copy()
230 |
231 | for transformation in self.mapping_dict['transformations']:
232 | if transformation.get('type') == 'standardize':
233 | self._apply_standardization(result, transformation)
234 | elif transformation.get('type') == 'merge_consecutive':
235 | self._apply_consecutive_merge(result, transformation)
236 | elif transformation.get('type') == 'trim':
237 | self._apply_trim(result, transformation)
238 | else:
239 | # Reference replacement logic
240 | self._build_mapping(result, transformation)
241 |
242 | search_key = transformation['search']['key']
243 | search_id = transformation['search']['identifier']
244 | output_key = transformation['output']['key']
245 |
246 | refs = self._find_refs(result, search_key)
247 |
248 | for ref in refs:
249 | ref_ids = self._extract_ref_ids(ref[search_key], search_id)
250 | if ref_ids:
251 | # Create a list of referenced content
252 | referenced_content = [
253 | self.id_to_text[ref_id]
254 | for ref_id in ref_ids
255 | if ref_id in self.id_to_text
256 | ]
257 | if referenced_content:
258 | ref[output_key] = referenced_content
259 | del ref[search_key]
260 |
261 | if transformation['match'].get('remove_after_use', False):
262 | result = self._remove_used_content(result, transformation['match'])
263 |
264 | return result
265 |
266 | class RuleProcessor:
267 | def __init__(self, rules_dict):
268 | self.rules = rules_dict
269 |
270 | def _apply_remove_rules(self, lines):
271 | if 'remove' not in self.rules:
272 | return lines
273 |
274 | result = lines.copy()
275 | for rule in self.rules['remove']:
276 | pattern = rule['pattern']
277 | result = [line for line in result if not re.match(pattern, line)]
278 |
279 | return result
280 |
281 | def _join_consecutive_strings(self, content_list):
282 | """Join consecutive strings in a content list."""
283 | if not content_list:
284 | return content_list
285 |
286 | result = []
287 | current_strings = []
288 |
289 | for item in content_list:
290 | if isinstance(item, str):
291 | current_strings.append(item)
292 | else:
293 | if current_strings:
294 | result.append(self.rules.get('join_text').join(current_strings))
295 | current_strings = []
296 | if isinstance(item, dict) and 'content' in item:
297 | item['content'] = self._join_consecutive_strings(item['content'])
298 | result.append(item)
299 |
300 | if current_strings:
301 | result.append(self.rules.get('join_text').join(current_strings))
302 |
303 | return result
304 |
305 | def _find_matching_end(self, lines, start_idx, end_pattern):
306 | """Find matching end pattern considering nesting."""
307 | pattern_name = None
308 | nesting_level = 1
309 |
310 | for i in range(start_idx + 1, len(lines)):
311 | line = lines[i]
312 |
313 | if pattern_name and re.match(pattern_name, line):
314 | nesting_level += 1
315 | elif re.match(end_pattern, line):
316 | nesting_level -= 1
317 | if nesting_level == 0:
318 | return i
319 |
320 | return len(lines) - 1
321 |
322 | def _process_block(self, lines, start_idx, rule, mappings):
323 | """Process a block of content, handling nested blocks."""
324 | content = []
325 | current_idx = start_idx + 1
326 | end_idx = None
327 |
328 | if rule.get('end'):
329 | end_idx = self._find_matching_end(lines, start_idx, rule['end'])
330 | else:
331 | for i in range(start_idx + 1, len(lines)):
332 | if any(re.match(r['pattern'], lines[i])
333 | for r in mappings if r.get('hierarchy') is not None):
334 | end_idx = i - 1
335 | break
336 | if end_idx is None:
337 | end_idx = len(lines) - 1
338 |
339 | while current_idx < end_idx:
340 | line = lines[current_idx]
341 | matched = False
342 |
343 | for nested_rule in mappings:
344 | if re.match(nested_rule['pattern'], line):
345 | nested_content, next_idx = self._process_block(
346 | lines, current_idx, nested_rule, mappings
347 | )
348 | if nested_content:
349 | content.append(nested_content)
350 | current_idx = next_idx + 1
351 | matched = True
352 | break
353 |
354 | if not matched:
355 | content.append(line)
356 | current_idx += 1
357 |
358 | if rule.get('keep_end', False) and end_idx < len(lines):
359 | content.append(lines[end_idx])
360 |
361 | return {
362 | 'type': rule['name'],
363 | 'content': content
364 | }, end_idx
365 |
366 | def _apply_mapping_rules(self, lines):
367 | if 'mappings' not in self.rules:
368 | return {'content': lines}
369 |
370 | result = {'content': []}
371 | hierarchy_stack = [result]
372 |
373 | mappings = sorted(
374 | self.rules['mappings'],
375 | key=lambda x: x.get('hierarchy', float('inf'))
376 | )
377 |
378 | i = 0
379 | while i < len(lines):
380 | line = lines[i]
381 | matched = False
382 |
383 | for rule in mappings:
384 | if re.match(rule['pattern'], line):
385 | if rule.get('hierarchy') is not None:
386 | new_section = {
387 | 'type': rule['name'],
388 | 'text': line,
389 | 'content': []
390 | }
391 |
392 | while len(hierarchy_stack) > rule['hierarchy'] + 1:
393 | hierarchy_stack.pop()
394 |
395 | parent = hierarchy_stack[-1]
396 | if isinstance(parent.get('content'), list):
397 | parent['content'].append(new_section)
398 |
399 | hierarchy_stack.append(new_section)
400 | i += 1
401 |
402 | else:
403 | block, end_idx = self._process_block(lines, i, rule, mappings)
404 | parent = hierarchy_stack[-1]
405 | if isinstance(parent.get('content'), list):
406 | parent['content'].append(block)
407 | i = end_idx + 1
408 |
409 | matched = True
410 | break
411 |
412 | if not matched:
413 | parent = hierarchy_stack[-1]
414 | if isinstance(parent.get('content'), list):
415 | parent['content'].append(line)
416 | i += 1
417 |
418 | if self.rules.get('join_text') is not None:
419 | result['content'] = self._join_consecutive_strings(result['content'])
420 |
421 | return result
422 |
423 | class DocumentProcessor:
424 | def __init__(self, config):
425 | self.rules = config.get('rules', {})
426 | self.transformations = config.get('transformations', [])
427 | self.rule_processor = RuleProcessor(self.rules)
428 | self.json_transformer = JSONTransformer({'transformations': self.transformations}) if self.transformations else None
429 |
430 | def process(self, lines):
431 | filtered_lines = self.rule_processor._apply_remove_rules(lines)
432 | structured_data = self.rule_processor._apply_mapping_rules(filtered_lines)
433 |
434 | if self.json_transformer:
435 | structured_data = self.json_transformer.transform(structured_data)
436 |
437 | return structured_data
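# A minimal sketch of the config shape these classes read, inferred from the rule handling
# above. The concrete rule names and patterns here are hypothetical, not a documented schema.
_example_config = {
    'rules': {
        'mappings': [
            # rules with a 'hierarchy' open nested sections when their pattern matches a line
            {'name': 'part', 'pattern': r'^part [ivx]+', 'hierarchy': 0},
            {'name': 'item', 'pattern': r'^item \d+', 'hierarchy': 1},
            # rules without 'hierarchy' are handled by _process_block, optionally with an
            # 'end' pattern and a 'keep_end' flag
            {'name': 'block', 'pattern': r'^<block>', 'end': r'^</block>', 'keep_end': False},
        ],
        'join_text': True,  # any non-None value enables joining of consecutive strings
    },
    'transformations': [],
}
# Usage would then be roughly:
#   processor = DocumentProcessor(_example_config)
#   structured = processor.process(list_of_lines)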
--------------------------------------------------------------------------------
/example_output/html/levels.txt:
--------------------------------------------------------------------------------
1 | (2, 'predicted header', '')
2 | (2, 'predicted header', '')
3 | (2, 'predicted header', '')
4 | (2, 'predicted header', '')
5 | (-1, 'text', '')
6 | (-1, 'text', '')
7 | (-2, 'textsmall', '')
8 | (2, 'predicted header', '')
9 | (-1, 'text', '')
10 | (-2, 'textsmall', '')
11 | (-2, 'textsmall', '')
12 | (-2, 'textsmall', '')
13 | (-1, 'text', '')
14 | (-1, 'text', '')
15 | (-1, 'text', '')
16 | (-1, 'text', '')
17 | (-1, 'text', '')
18 | (-1, 'text', '')
19 | (-1, 'text', '')
20 | (-1, 'text', '')
21 | (-1, 'text', '')
22 | (-1, 'text', '')
23 | (-1, 'text', '')
24 | (-1, 'text', '')
25 | (-1, 'text', '')
26 | (-2, 'textsmall', '')
27 | (-1, 'text', '')
28 | (2, 'predicted header', '')
29 | (2, 'predicted header', '')
30 | (2, 'predicted header', '')
31 | (2, 'predicted header', '')
32 | (-1, 'text', '')
33 | (-2, 'textsmall', '')
34 | (-2, 'textsmall', '')
35 | (-2, 'textsmall', '')
36 | (2, 'predicted header', '')
37 | (-1, 'text', '')
38 | (0, 'part', 'parti')
39 | (1, 'item', 'item1')
40 | (3, 'predicted header', '')
41 | (4, 'predicted header', '')
42 | (-1, 'text', '')
43 | (-1, 'text', '')
44 | (-1, 'text', '')
45 | (4, 'predicted header', '')
46 | (-1, 'text', '')
47 | (-1, 'text', '')
48 | (-1, 'text', '')
49 | (-2, 'textsmall', '')
50 | (-2, 'textsmall', '')
51 | (-2, 'textsmall', '')
52 | (4, 'predicted header', '')
53 | (-1, 'text', '')
54 | (-1, 'text', '')
55 | (-1, 'text', '')
56 | (-1, 'text', '')
57 | (5, 'predicted header', '')
58 | (-1, 'text', '')
59 | (-1, 'text', '')
60 | (-1, 'text', '')
61 | (5, 'predicted header', '')
62 | (-1, 'text', '')
63 | (-1, 'text', '')
64 | (-1, 'text', '')
65 | (-1, 'text', '')
66 | (-2, 'textsmall', '')
67 | (-2, 'textsmall', '')
68 | (-2, 'textsmall', '')
69 | (-1, 'text', '')
70 | (-1, 'text', '')
71 | (-1, 'text', '')
72 | (-1, 'text', '')
73 | (-1, 'text', '')
74 | (-1, 'text', '')
75 | (-1, 'text', '')
76 | (-1, 'text', '')
77 | (5, 'predicted header', '')
78 | (-1, 'text', '')
79 | (-1, 'text', '')
80 | (-1, 'text', '')
81 | (-2, 'textsmall', '')
82 | (-2, 'textsmall', '')
83 | (-2, 'textsmall', '')
84 | (-1, 'text', '')
85 | (5, 'predicted header', '')
86 | (-1, 'text', '')
87 | (-1, 'text', '')
88 | (-1, 'text', '')
89 | (-1, 'text', '')
90 | (-1, 'text', '')
91 | (-1, 'text', '')
92 | (-1, 'text', '')
93 | (-1, 'text', '')
94 | (4, 'predicted header', '')
95 | (5, 'predicted header', '')
96 | (-1, 'text', '')
97 | (-1, 'text', '')
98 | (-1, 'text', '')
99 | (-1, 'text', '')
100 | (-1, 'text', '')
101 | (-1, 'text', '')
102 | (-1, 'text', '')
103 | (-1, 'text', '')
104 | (-1, 'text', '')
105 | (-1, 'text', '')
106 | (-2, 'textsmall', '')
107 | (-2, 'textsmall', '')
108 | (-2, 'textsmall', '')
109 | (-1, 'text', '')
110 | (5, 'predicted header', '')
111 | (-1, 'text', '')
112 | (-1, 'text', '')
113 | (-1, 'text', '')
114 | (-1, 'text', '')
115 | (5, 'predicted header', '')
116 | (-1, 'text', '')
117 | (-1, 'text', '')
118 | (3, 'predicted header', '')
119 | (-1, 'text', '')
120 | (-2, 'textsmall', '')
121 | (-2, 'textsmall', '')
122 | (-2, 'textsmall', '')
123 | (-1, 'text', '')
124 | (-1, 'text', '')
125 | (-1, 'text', '')
126 | (-1, 'text', '')
127 | (-1, 'text', '')
128 | (-1, 'text', '')
129 | (3, 'predicted header', '')
130 | (-1, 'text', '')
131 | (-1, 'text', '')
132 | (-1, 'text', '')
133 | (4, 'predicted header', '')
134 | (-1, 'text', '')
135 | (-1, 'text', '')
136 | (-1, 'text', '')
137 | (-1, 'text', '')
138 | (-1, 'text', '')
139 | (-2, 'textsmall', '')
140 | (-2, 'textsmall', '')
141 | (-2, 'textsmall', '')
142 | (5, 'predicted header', '')
143 | (-1, 'text', '')
144 | (5, 'predicted header', '')
145 | (-1, 'text', '')
146 | (5, 'predicted header', '')
147 | (-1, 'text', '')
148 | (5, 'predicted header', '')
149 | (-1, 'text', '')
150 | (-2, 'textsmall', '')
151 | (-2, 'textsmall', '')
152 | (-2, 'textsmall', '')
153 | (6, 'predicted header', '')
154 | (-1, 'text', '')
155 | (-1, 'text', '')
156 | (-1, 'text', '')
157 | (4, 'predicted header', '')
158 | (-1, 'text', '')
159 | (-1, 'text', '')
160 | (-1, 'text', '')
161 | (5, 'predicted header', '')
162 | (-1, 'text', '')
163 | (-1, 'text', '')
164 | (-1, 'text', '')
165 | (-2, 'textsmall', '')
166 | (-2, 'textsmall', '')
167 | (-2, 'textsmall', '')
168 | (-1, 'text', '')
169 | (5, 'predicted header', '')
170 | (-1, 'text', '')
171 | (6, 'predicted header', '')
172 | (-1, 'text', '')
173 | (-1, 'text', '')
174 | (-1, 'text', '')
175 | (-1, 'text', '')
176 | (-1, 'text', '')
177 | (-1, 'text', '')
178 | (-2, 'textsmall', '')
179 | (-2, 'textsmall', '')
180 | (-2, 'textsmall', '')
181 | (4, 'predicted header', '')
182 | (-1, 'text', '')
183 | (-1, 'text', '')
184 | (-1, 'text', '')
185 | (-1, 'text', '')
186 | (-1, 'text', '')
187 | (5, 'predicted header', '')
188 | (-1, 'text', '')
189 | (-1, 'text', '')
190 | (-1, 'text', '')
191 | (-1, 'text', '')
192 | (-1, 'text', '')
193 | (-1, 'text', '')
194 | (-1, 'text', '')
195 | (-1, 'text', '')
196 | (-1, 'text', '')
197 | (-1, 'text', '')
198 | (-1, 'text', '')
199 | (-1, 'text', '')
200 | (-1, 'text', '')
201 | (5, 'predicted header', '')
202 | (-1, 'text', '')
203 | (-2, 'textsmall', '')
204 | (-2, 'textsmall', '')
205 | (-2, 'textsmall', '')
206 | (5, 'predicted header', '')
207 | (-1, 'text', '')
208 | (-1, 'text', '')
209 | (-1, 'text', '')
210 | (5, 'predicted header', '')
211 | (-1, 'text', '')
212 | (6, 'predicted header', '')
213 | (-1, 'text', '')
214 | (-1, 'text', '')
215 | (-1, 'text', '')
216 | (-1, 'text', '')
217 | (-2, 'textsmall', '')
218 | (-2, 'textsmall', '')
219 | (-2, 'textsmall', '')
220 | (3, 'predicted header', '')
221 | (-1, 'text', '')
222 | (-1, 'text', '')
223 | (-1, 'text', '')
224 | (3, 'predicted header', '')
225 | (4, 'predicted header', '')
226 | (-1, 'text', '')
227 | (-1, 'text', '')
228 | (-1, 'text', '')
229 | (-1, 'text', '')
230 | (-1, 'text', '')
231 | (-1, 'text', '')
232 | (-1, 'text', '')
233 | (-1, 'text', '')
234 | (-1, 'text', '')
235 | (-1, 'text', '')
236 | (-2, 'textsmall', '')
237 | (-2, 'textsmall', '')
238 | (-2, 'textsmall', '')
239 | (-1, 'text', '')
240 | (-1, 'text', '')
241 | (4, 'predicted header', '')
242 | (-1, 'text', '')
243 | (-1, 'text', '')
244 | (-1, 'text', '')
245 | (3, 'predicted header', '')
246 | (-1, 'text', '')
247 | (4, 'predicted header', '')
248 | (-1, 'text', '')
249 | (-1, 'text', '')
250 | (-2, 'textsmall', '')
251 | (-2, 'textsmall', '')
252 | (-2, 'textsmall', '')
253 | (4, 'predicted header', '')
254 | (-1, 'text', '')
255 | (-1, 'text', '')
256 | (4, 'predicted header', '')
257 | (-1, 'text', '')
258 | (-1, 'text', '')
259 | (-1, 'text', '')
260 | (3, 'predicted header', '')
261 | (-1, 'text', '')
262 | (-1, 'text', '')
263 | (-2, 'textsmall', '')
264 | (-2, 'textsmall', '')
265 | (-2, 'textsmall', '')
266 | (4, 'predicted header', '')
267 | (5, 'predicted header', '')
268 | (-1, 'text', '')
269 | (5, 'predicted header', '')
270 | (-1, 'text', '')
271 | (5, 'predicted header', '')
272 | (-1, 'text', '')
273 | (5, 'predicted header', '')
274 | (-1, 'text', '')
275 | (5, 'predicted header', '')
276 | (-1, 'text', '')
277 | (5, 'predicted header', '')
278 | (-1, 'text', '')
279 | (4, 'predicted header', '')
280 | (-1, 'text', '')
281 | (-1, 'text', '')
282 | (-1, 'text', '')
283 | (-2, 'textsmall', '')
284 | (-2, 'textsmall', '')
285 | (-2, 'textsmall', '')
286 | (3, 'predicted header', '')
287 | (-1, 'text', '')
288 | (3, 'predicted header', '')
289 | (-1, 'text', '')
290 | (-1, 'text', '')
291 | (3, 'predicted header', '')
292 | (-1, 'text', '')
293 | (-1, 'text', '')
294 | (-1, 'text', '')
295 | (-1, 'text', '')
296 | (-1, 'text', '')
297 | (-2, 'textsmall', '')
298 | (-2, 'textsmall', '')
299 | (-2, 'textsmall', '')
300 | (-1, 'text', '')
301 | (-1, 'text', '')
302 | (-1, 'text', '')
303 | (-1, 'text', '')
304 | (3, 'predicted header', '')
305 | (-1, 'text', '')
306 | (-1, 'text', '')
307 | (-1, 'text', '')
308 | (-1, 'text', '')
309 | (-1, 'text', '')
310 | (-1, 'text', '')
311 | (-1, 'text', '')
312 | (-1, 'text', '')
313 | (-1, 'text', '')
314 | (-1, 'text', '')
315 | (-2, 'textsmall', '')
316 | (-2, 'textsmall', '')
317 | (-2, 'textsmall', '')
318 | (1, 'item', 'item1a')
319 | (-1, 'text', '')
320 | (2, 'predicted header', '')
321 | (3, 'predicted header', '')
322 | (4, 'predicted header', '')
323 | (-1, 'text', '')
324 | (4, 'predicted header', '')
325 | (-1, 'text', '')
326 | (-1, 'text', '')
327 | (-1, 'text', '')
328 | (-1, 'text', '')
329 | (-2, 'textsmall', '')
330 | (-2, 'textsmall', '')
331 | (-2, 'textsmall', '')
332 | (-1, 'text', '')
333 | (4, 'predicted header', '')
334 | (-1, 'text', '')
335 | (-1, 'text', '')
336 | (-1, 'text', '')
337 | (-1, 'text', '')
338 | (-1, 'text', '')
339 | (-1, 'text', '')
340 | (-1, 'text', '')
341 | (3, 'predicted header', '')
342 | (-1, 'text', '')
343 | (-1, 'text', '')
344 | (-1, 'text', '')
345 | (-1, 'text', '')
346 | (-1, 'text', '')
347 | (-1, 'text', '')
348 | (-2, 'textsmall', '')
349 | (-2, 'textsmall', '')
350 | (-2, 'textsmall', '')
351 | (-1, 'text', '')
352 | (2, 'predicted header', '')
353 | (3, 'predicted header', '')
354 | (3, 'predicted header', '')
355 | (3, 'predicted header', '')
356 | (-2, 'textsmall', '')
357 | (-2, 'textsmall', '')
358 | (-2, 'textsmall', '')
359 | (2, 'predicted header', '')
360 | (3, 'predicted header', '')
361 | (4, 'predicted header', '')
362 | (-1, 'text', '')
363 | (-1, 'text', '')
364 | (-1, 'text', '')
365 | (-2, 'textsmall', '')
366 | (-2, 'textsmall', '')
367 | (-2, 'textsmall', '')
368 | (-1, 'text', '')
369 | (4, 'predicted header', '')
370 | (-1, 'text', '')
371 | (-1, 'text', '')
372 | (-1, 'text', '')
373 | (-1, 'text', '')
374 | (-2, 'textsmall', '')
375 | (-2, 'textsmall', '')
376 | (-2, 'textsmall', '')
377 | (4, 'predicted header', '')
378 | (-1, 'text', '')
379 | (-1, 'text', '')
380 | (-1, 'text', '')
381 | (3, 'predicted header', '')
382 | (3, 'predicted header', '')
383 | (-2, 'textsmall', '')
384 | (-2, 'textsmall', '')
385 | (-2, 'textsmall', '')
386 | (3, 'predicted header', '')
387 | (4, 'predicted header', '')
388 | (-1, 'text', '')
389 | (4, 'predicted header', '')
390 | (-1, 'text', '')
391 | (3, 'predicted header', '')
392 | (3, 'predicted header', '')
393 | (-2, 'textsmall', '')
394 | (-2, 'textsmall', '')
395 | (-2, 'textsmall', '')
396 | (2, 'predicted header', '')
397 | (3, 'predicted header', '')
398 | (3, 'predicted header', '')
399 | (-1, 'text', '')
400 | (-1, 'text', '')
401 | (-2, 'textsmall', '')
402 | (-2, 'textsmall', '')
403 | (-2, 'textsmall', '')
404 | (2, 'predicted header', '')
405 | (3, 'predicted header', '')
406 | (-1, 'text', '')
407 | (-1, 'text', '')
408 | (-1, 'text', '')
409 | (3, 'predicted header', '')
410 | (-2, 'textsmall', '')
411 | (-2, 'textsmall', '')
412 | (-2, 'textsmall', '')
413 | (-1, 'text', '')
414 | (3, 'predicted header', '')
415 | (-1, 'text', '')
416 | (-2, 'textsmall', '')
417 | (-2, 'textsmall', '')
418 | (-2, 'textsmall', '')
419 | (3, 'predicted header', '')
420 | (-1, 'text', '')
421 | (3, 'predicted header', '')
422 | (3, 'predicted header', '')
423 | (3, 'predicted header', '')
424 | (-2, 'textsmall', '')
425 | (-2, 'textsmall', '')
426 | (-2, 'textsmall', '')
427 | (-1, 'text', '')
428 | (-1, 'text', '')
429 | (3, 'predicted header', '')
430 | (2, 'predicted header', '')
431 | (3, 'predicted header', '')
432 | (-1, 'text', '')
433 | (-1, 'text', '')
434 | (3, 'predicted header', '')
435 | (-2, 'textsmall', '')
436 | (-2, 'textsmall', '')
437 | (-2, 'textsmall', '')
438 | (2, 'predicted header', '')
439 | (3, 'predicted header', '')
440 | (-1, 'text', '')
441 | (-1, 'text', '')
442 | (-1, 'text', '')
443 | (-1, 'text', '')
444 | (3, 'predicted header', '')
445 | (-1, 'text', '')
446 | (-1, 'text', '')
447 | (-1, 'text', '')
448 | (3, 'predicted header', '')
449 | (-2, 'textsmall', '')
450 | (-2, 'textsmall', '')
451 | (-2, 'textsmall', '')
452 | (-1, 'text', '')
453 | (-1, 'text', '')
454 | (-1, 'text', '')
455 | (3, 'predicted header', '')
456 | (3, 'predicted header', '')
457 | (-2, 'textsmall', '')
458 | (-2, 'textsmall', '')
459 | (-2, 'textsmall', '')
460 | (1, 'item', 'item1b')
461 | (-1, 'text', '')
462 | (1, 'item', 'item1c')
463 | (2, 'predicted header', '')
464 | (-1, 'text', '')
465 | (-1, 'text', '')
466 | (-1, 'text', '')
467 | (-1, 'text', '')
468 | (-1, 'text', '')
469 | (-1, 'text', '')
470 | (-1, 'text', '')
471 | (-2, 'textsmall', '')
472 | (-2, 'textsmall', '')
473 | (-2, 'textsmall', '')
474 | (-1, 'text', '')
475 | (-1, 'text', '')
476 | (-1, 'text', '')
477 | (-1, 'text', '')
478 | (2, 'predicted header', '')
479 | (-1, 'text', '')
480 | (-1, 'text', '')
481 | (-2, 'textsmall', '')
482 | (-2, 'textsmall', '')
483 | (-2, 'textsmall', '')
484 | (1, 'item', 'item2')
485 | (-1, 'text', '')
486 | (-1, 'text', '')
487 | (-1, 'text', '')
488 | (-1, 'text', '')
489 | (1, 'item', 'item3')
490 | (-1, 'text', '')
491 | (1, 'item', 'item4')
492 | (-1, 'text', '')
493 | (-2, 'textsmall', '')
494 | (-2, 'textsmall', '')
495 | (-2, 'textsmall', '')
496 | (0, 'part', 'partii')
497 | (1, 'item', 'item5')
498 | (2, 'predicted header', '')
499 | (-1, 'text', '')
500 | (2, 'predicted header', '')
501 | (-1, 'text', '')
502 | (-1, 'text', '')
503 | (-1, 'text', '')
504 | (-1, 'text', '')
505 | (-1, 'text', '')
506 | (-1, 'text', '')
507 | (-2, 'textsmall', '')
508 | (-2, 'textsmall', '')
509 | (-2, 'textsmall', '')
510 | (1, 'item', 'item6')
511 | (-2, 'textsmall', '')
512 | (-2, 'textsmall', '')
513 | (-2, 'textsmall', '')
514 | (1, 'item', 'item7')
515 | (-1, 'text', '')
516 | (2, 'predicted header', '')
517 | (-1, 'text', '')
518 | (-1, 'text', '')
519 | (-1, 'text', '')
520 | (-1, 'text', '')
521 | (-1, 'text', '')
522 | (-1, 'text', '')
523 | (-1, 'text', '')
524 | (-1, 'text', '')
525 | (-1, 'text', '')
526 | (-1, 'text', '')
527 | (-1, 'text', '')
528 | (-1, 'text', '')
529 | (-1, 'text', '')
530 | (-1, 'text', '')
531 | (-2, 'textsmall', '')
532 | (-2, 'textsmall', '')
533 | (-2, 'textsmall', '')
534 | (3, 'predicted header', '')
535 | (-1, 'text', '')
536 | (3, 'predicted header', '')
537 | (-1, 'text', '')
538 | (-1, 'text', '')
539 | (-1, 'text', '')
540 | (-1, 'text', '')
541 | (-1, 'text', '')
542 | (3, 'predicted header', '')
543 | (-1, 'text', '')
544 | (3, 'predicted header', '')
545 | (-1, 'text', '')
546 | (-2, 'textsmall', '')
547 | (-2, 'textsmall', '')
548 | (-2, 'textsmall', '')
549 | (3, 'predicted header', '')
550 | (-1, 'text', '')
551 | (-1, 'text', '')
552 | (3, 'predicted header', '')
553 | (-1, 'text', '')
554 | (-1, 'text', '')
555 | (4, 'predicted header', '')
556 | (-1, 'text', '')
557 | (-1, 'text', '')
558 | (-2, 'textsmall', '')
559 | (-2, 'textsmall', '')
560 | (-2, 'textsmall', '')
561 | (4, 'predicted header', '')
562 | (-1, 'text', '')
563 | (-1, 'text', '')
564 | (4, 'predicted header', '')
565 | (-1, 'text', '')
566 | (-1, 'text', '')
567 | (-2, 'textsmall', '')
568 | (-2, 'textsmall', '')
569 | (-2, 'textsmall', '')
570 | (2, 'predicted header', '')
571 | (-1, 'text', '')
572 | (-1, 'text', '')
573 | (4, 'predicted header', '')
574 | (-1, 'text', '')
575 | (-1, 'text', '')
576 | (-1, 'text', '')
577 | (-1, 'text', '')
578 | (-1, 'text', '')
579 | (-1, 'text', '')
580 | (-1, 'text', '')
581 | (-1, 'text', '')
582 | (-2, 'textsmall', '')
583 | (-2, 'textsmall', '')
584 | (-2, 'textsmall', '')
585 | (2, 'predicted header', '')
586 | (-1, 'text', '')
587 | (3, 'predicted header', '')
588 | (4, 'predicted header', '')
589 | (5, 'predicted header', '')
590 | (-1, 'text', '')
591 | (-1, 'text', '')
592 | (-1, 'text', '')
593 | (-1, 'text', '')
594 | (-1, 'text', '')
595 | (-1, 'text', '')
596 | (-1, 'text', '')
597 | (-1, 'text', '')
598 | (5, 'predicted header', '')
599 | (-1, 'text', '')
600 | (-1, 'text', '')
601 | (-1, 'text', '')
602 | (-2, 'textsmall', '')
603 | (-2, 'textsmall', '')
604 | (-2, 'textsmall', '')
605 | (-1, 'text', '')
606 | (-1, 'text', '')
607 | (-1, 'text', '')
608 | (5, 'predicted header', '')
609 | (-1, 'text', '')
610 | (-1, 'text', '')
611 | (-1, 'text', '')
612 | (-1, 'text', '')
613 | (-1, 'text', '')
614 | (-1, 'text', '')
615 | (-1, 'text', '')
616 | (-1, 'text', '')
617 | (2, 'predicted header', '')
618 | (3, 'predicted header', '')
619 | (-1, 'text', '')
620 | (-1, 'text', '')
621 | (4, 'predicted header', '')
622 | (-1, 'text', '')
623 | (3, 'predicted header', '')
624 | (-1, 'text', '')
625 | (-2, 'textsmall', '')
626 | (-2, 'textsmall', '')
627 | (-2, 'textsmall', '')
628 | (-1, 'text', '')
629 | (4, 'predicted header', '')
630 | (-1, 'text', '')
631 | (3, 'predicted header', '')
632 | (-1, 'text', '')
633 | (-1, 'text', '')
634 | (4, 'predicted header', '')
635 | (-1, 'text', '')
636 | (2, 'predicted header', '')
637 | (-1, 'text', '')
638 | (-1, 'text', '')
639 | (-1, 'text', '')
640 | (4, 'predicted header', '')
641 | (-1, 'text', '')
642 | (-2, 'textsmall', '')
643 | (-2, 'textsmall', '')
644 | (-2, 'textsmall', '')
645 | (2, 'predicted header', '')
646 | (3, 'predicted header', '')
647 | (-1, 'text', '')
648 | (-1, 'text', '')
649 | (-1, 'text', '')
650 | (-1, 'text', '')
651 | (3, 'predicted header', '')
652 | (-1, 'text', '')
653 | (-1, 'text', '')
654 | (2, 'predicted header', '')
655 | (-1, 'text', '')
656 | (-2, 'textsmall', '')
657 | (-2, 'textsmall', '')
658 | (-2, 'textsmall', '')
659 | (-1, 'text', '')
660 | (-1, 'text', '')
661 | (-1, 'text', '')
662 | (2, 'predicted header', '')
663 | (-1, 'text', '')
664 | (3, 'predicted header', '')
665 | (-1, 'text', '')
666 | (3, 'predicted header', '')
667 | (-1, 'text', '')
668 | (-2, 'textsmall', '')
669 | (-2, 'textsmall', '')
670 | (-2, 'textsmall', '')
671 | (-1, 'text', '')
672 | (3, 'predicted header', '')
673 | (-1, 'text', '')
674 | (3, 'predicted header', '')
675 | (-1, 'text', '')
676 | (3, 'predicted header', '')
677 | (-1, 'text', '')
678 | (-1, 'text', '')
679 | (-1, 'text', '')
680 | (-1, 'text', '')
681 | (-2, 'textsmall', '')
682 | (-2, 'textsmall', '')
683 | (-2, 'textsmall', '')
684 | (3, 'predicted header', '')
685 | (4, 'predicted header', '')
686 | (-1, 'text', '')
687 | (-1, 'text', '')
688 | (-1, 'text', '')
689 | (-1, 'text', '')
690 | (-1, 'text', '')
691 | (-1, 'text', '')
692 | (4, 'predicted header', '')
693 | (-1, 'text', '')
694 | (4, 'predicted header', '')
695 | (-1, 'text', '')
696 | (4, 'predicted header', '')
697 | (-1, 'text', '')
698 | (4, 'predicted header', '')
699 | (-1, 'text', '')
700 | (-2, 'textsmall', '')
701 | (-2, 'textsmall', '')
702 | (-2, 'textsmall', '')
703 | (2, 'predicted header', '')
704 | (-1, 'text', '')
705 | (2, 'predicted header', '')
706 | (-1, 'text', '')
707 | (3, 'predicted header', '')
708 | (-1, 'text', '')
709 | (-1, 'text', '')
710 | (-1, 'text', '')
711 | (-1, 'text', '')
712 | (-1, 'text', '')
713 | (3, 'predicted header', '')
714 | (-1, 'text', '')
715 | (-2, 'textsmall', '')
716 | (-2, 'textsmall', '')
717 | (-2, 'textsmall', '')
718 | (-1, 'text', '')
719 | (3, 'predicted header', '')
720 | (-1, 'text', '')
721 | (-1, 'text', '')
722 | (-1, 'text', '')
723 | (3, 'predicted header', '')
724 | (-1, 'text', '')
725 | (3, 'predicted header', '')
726 | (-1, 'text', '')
727 | (-2, 'textsmall', '')
728 | (-2, 'textsmall', '')
729 | (-2, 'textsmall', '')
730 | (3, 'predicted header', '')
731 | (-1, 'text', '')
732 | (3, 'predicted header', '')
733 | (-1, 'text', '')
734 | (-2, 'textsmall', '')
735 | (-2, 'textsmall', '')
736 | (-2, 'textsmall', '')
737 | (3, 'predicted header', '')
738 | (-1, 'text', '')
739 | (-1, 'text', '')
740 | (-1, 'text', '')
741 | (-1, 'text', '')
742 | (-1, 'text', '')
743 | (-2, 'textsmall', '')
744 | (-2, 'textsmall', '')
745 | (-2, 'textsmall', '')
746 | (1, 'item', 'item7a')
747 | (2, 'predicted header', '')
748 | (-1, 'text', '')
749 | (3, 'predicted header', '')
750 | (-1, 'text', '')
751 | (3, 'predicted header', '')
752 | (-1, 'text', '')
753 | (3, 'predicted header', '')
754 | (-1, 'text', '')
755 | (3, 'predicted header', '')
756 | (-1, 'text', '')
757 | (2, 'predicted header', '')
758 | (-1, 'text', '')
759 | (-1, 'text', '')
760 | (-2, 'textsmall', '')
761 | (-2, 'textsmall', '')
762 | (-2, 'textsmall', '')
763 | (1, 'item', 'item8')
764 | (2, 'predicted header', '')
765 | (-1, 'text', '')
766 | (-1, 'text', '')
767 | (-2, 'textsmall', '')
768 | (-2, 'textsmall', '')
769 | (-2, 'textsmall', '')
770 | (2, 'predicted header', '')
771 | (-1, 'text', '')
772 | (-1, 'text', '')
773 | (-2, 'textsmall', '')
774 | (-2, 'textsmall', '')
775 | (-2, 'textsmall', '')
776 | (2, 'predicted header', '')
777 | (-1, 'text', '')
778 | (-1, 'text', '')
779 | (-2, 'textsmall', '')
780 | (-2, 'textsmall', '')
781 | (-2, 'textsmall', '')
782 | (2, 'predicted header', '')
783 | (-1, 'text', '')
784 | (-1, 'text', '')
785 | (-2, 'textsmall', '')
786 | (-2, 'textsmall', '')
787 | (-2, 'textsmall', '')
788 | (2, 'predicted header', '')
789 | (-1, 'text', '')
790 | (-1, 'text', '')
791 | (-2, 'textsmall', '')
792 | (-2, 'textsmall', '')
793 | (-2, 'textsmall', '')
794 | (2, 'predicted header', '')
795 | (3, 'predicted header', '')
796 | (4, 'predicted header', '')
797 | (-1, 'text', '')
798 | (-1, 'text', '')
799 | (4, 'predicted header', '')
800 | (-1, 'text', '')
801 | (4, 'predicted header', '')
802 | (-1, 'text', '')
803 | (-1, 'text', '')
804 | (4, 'predicted header', '')
805 | (-1, 'text', '')
806 | (4, 'predicted header', '')
807 | (5, 'predicted header', '')
808 | (-1, 'text', '')
809 | (-1, 'text', '')
810 | (-2, 'textsmall', '')
811 | (-2, 'textsmall', '')
812 | (-2, 'textsmall', '')
813 | (5, 'predicted header', '')
814 | (-1, 'text', '')
815 | (6, 'predicted header', '')
816 | (-1, 'text', '')
817 | (-1, 'text', '')
818 | (-1, 'text', '')
819 | (-1, 'text', '')
820 | (-1, 'text', '')
821 | (-1, 'text', '')
822 | (6, 'predicted header', '')
823 | (-1, 'text', '')
824 | (-2, 'textsmall', '')
825 | (-2, 'textsmall', '')
826 | (-2, 'textsmall', '')
827 | (-1, 'text', '')
828 | (-1, 'text', '')
829 | (-1, 'text', '')
830 | (-1, 'text', '')
831 | (5, 'predicted header', '')
832 | (-1, 'text', '')
833 | (-1, 'text', '')
834 | (-1, 'text', '')
835 | (-1, 'text', '')
836 | (-1, 'text', '')
837 | (-1, 'text', '')
838 | (-2, 'textsmall', '')
839 | (-2, 'textsmall', '')
840 | (-2, 'textsmall', '')
841 | (-1, 'text', '')
842 | (-1, 'text', '')
843 | (-1, 'text', '')
844 | (-1, 'text', '')
845 | (-1, 'text', '')
846 | (-1, 'text', '')
847 | (5, 'predicted header', '')
848 | (-1, 'text', '')
849 | (-1, 'text', '')
850 | (4, 'predicted header', '')
851 | (-1, 'text', '')
852 | (-2, 'textsmall', '')
853 | (-2, 'textsmall', '')
854 | (-2, 'textsmall', '')
855 | (4, 'predicted header', '')
856 | (-1, 'text', '')
857 | (4, 'predicted header', '')
858 | (-1, 'text', '')
859 | (4, 'predicted header', '')
860 | (-1, 'text', '')
861 | (4, 'predicted header', '')
862 | (-1, 'text', '')
863 | (-1, 'text', '')
864 | (4, 'predicted header', '')
865 | (-1, 'text', '')
866 | (4, 'predicted header', '')
867 | (5, 'predicted header', '')
868 | (-1, 'text', '')
869 | (-2, 'textsmall', '')
870 | (-2, 'textsmall', '')
871 | (-2, 'textsmall', '')
872 | (-1, 'text', '')
873 | (-1, 'text', '')
874 | (-1, 'text', '')
875 | (5, 'predicted header', '')
876 | (-1, 'text', '')
877 | (-1, 'text', '')
878 | (-1, 'text', '')
879 | (-1, 'text', '')
880 | (4, 'predicted header', '')
881 | (-1, 'text', '')
882 | (-1, 'text', '')
883 | (-2, 'textsmall', '')
884 | (-2, 'textsmall', '')
885 | (-2, 'textsmall', '')
886 | (-1, 'text', '')
887 | (-1, 'text', '')
888 | (-1, 'text', '')
889 | (-1, 'text', '')
890 | (4, 'predicted header', '')
891 | (-1, 'text', '')
892 | (4, 'predicted header', '')
893 | (-1, 'text', '')
894 | (4, 'predicted header', '')
895 | (-1, 'text', '')
896 | (-1, 'text', '')
897 | (-2, 'textsmall', '')
898 | (-2, 'textsmall', '')
899 | (-2, 'textsmall', '')
900 | (-1, 'text', '')
901 | (4, 'predicted header', '')
902 | (-1, 'text', '')
903 | (4, 'predicted header', '')
904 | (-1, 'text', '')
905 | (4, 'predicted header', '')
906 | (-1, 'text', '')
907 | (4, 'predicted header', '')
908 | (5, 'predicted header', '')
909 | (-1, 'text', '')
910 | (5, 'predicted header', '')
911 | (-1, 'text', '')
912 | (3, 'predicted header', '')
913 | (-1, 'text', '')
914 | (-2, 'textsmall', '')
915 | (-2, 'textsmall', '')
916 | (-2, 'textsmall', '')
917 | (-1, 'text', '')
918 | (-1, 'text', '')
919 | (-1, 'text', '')
920 | (3, 'predicted header', '')
921 | (-1, 'text', '')
922 | (-1, 'text', '')
923 | (-1, 'text', '')
924 | (4, 'predicted header', '')
925 | (-1, 'text', '')
926 | (-1, 'text', '')
927 | (-1, 'text', '')
928 | (-1, 'text', '')
929 | (-2, 'textsmall', '')
930 | (-2, 'textsmall', '')
931 | (-2, 'textsmall', '')
932 | (3, 'predicted header', '')
933 | (4, 'predicted header', '')
934 | (-1, 'text', '')
935 | (-1, 'text', '')
936 | (-2, 'textsmall', '')
937 | (-2, 'textsmall', '')
938 | (-2, 'textsmall', '')
939 | (-1, 'text', '')
940 | (-1, 'text', '')
941 | (-1, 'text', '')
942 | (-2, 'textsmall', '')
943 | (-2, 'textsmall', '')
944 | (-2, 'textsmall', '')
945 | (4, 'predicted header', '')
946 | (-1, 'text', '')
947 | (-1, 'text', '')
948 | (-1, 'text', '')
949 | (-1, 'text', '')
950 | (4, 'predicted header', '')
951 | (-1, 'text', '')
952 | (-1, 'text', '')
953 | (-2, 'textsmall', '')
954 | (-2, 'textsmall', '')
955 | (-2, 'textsmall', '')
956 | (3, 'predicted header', '')
957 | (-1, 'text', '')
958 | (4, 'predicted header', '')
959 | (-1, 'text', '')
960 | (-1, 'text', '')
961 | (-1, 'text', '')
962 | (4, 'predicted header', '')
963 | (-1, 'text', '')
964 | (-1, 'text', '')
965 | (4, 'predicted header', '')
966 | (-1, 'text', '')
967 | (4, 'predicted header', '')
968 | (-1, 'text', '')
969 | (4, 'predicted header', '')
970 | (-1, 'text', '')
971 | (-2, 'textsmall', '')
972 | (-2, 'textsmall', '')
973 | (-2, 'textsmall', '')
974 | (-1, 'text', '')
975 | (-1, 'text', '')
976 | (4, 'predicted header', '')
977 | (-1, 'text', '')
978 | (-1, 'text', '')
979 | (-1, 'text', '')
980 | (-1, 'text', '')
981 | (-1, 'text', '')
982 | (-2, 'textsmall', '')
983 | (-2, 'textsmall', '')
984 | (-2, 'textsmall', '')
985 | (-1, 'text', '')
986 | (-1, 'text', '')
987 | (-1, 'text', '')
988 | (-1, 'text', '')
989 | (3, 'predicted header', '')
990 | (-1, 'text', '')
991 | (-1, 'text', '')
992 | (-2, 'textsmall', '')
993 | (-2, 'textsmall', '')
994 | (-2, 'textsmall', '')
995 | (3, 'predicted header', '')
996 | (-1, 'text', '')
997 | (-1, 'text', '')
998 | (-1, 'text', '')
999 | (-1, 'text', '')
1000 | (3, 'predicted header', '')
1001 | (4, 'predicted header', '')
1002 | (-1, 'text', '')
1003 | (-1, 'text', '')
1004 | (-1, 'text', '')
1005 | (-1, 'text', '')
1006 | (-1, 'text', '')
1007 | (-2, 'textsmall', '')
1008 | (-2, 'textsmall', '')
1009 | (-2, 'textsmall', '')
1010 | (-1, 'text', '')
1011 | (-1, 'text', '')
1012 | (-1, 'text', '')
1013 | (-1, 'text', '')
1014 | (-1, 'text', '')
1015 | (-1, 'text', '')
1016 | (-1, 'text', '')
1017 | (-1, 'text', '')
1018 | (4, 'predicted header', '')
1019 | (-1, 'text', '')
1020 | (-1, 'text', '')
1021 | (-1, 'text', '')
1022 | (-1, 'text', '')
1023 | (-1, 'text', '')
1024 | (-2, 'textsmall', '')
1025 | (-2, 'textsmall', '')
1026 | (-2, 'textsmall', '')
1027 | (-1, 'text', '')
1028 | (-1, 'text', '')
1029 | (3, 'predicted header', '')
1030 | (-1, 'text', '')
1031 | (-1, 'text', '')
1032 | (-1, 'text', '')
1033 | (-1, 'text', '')
1034 | (-1, 'text', '')
1035 | (4, 'predicted header', '')
1036 | (-1, 'text', '')
1037 | (-1, 'text', '')
1038 | (-2, 'textsmall', '')
1039 | (-2, 'textsmall', '')
1040 | (-2, 'textsmall', '')
1041 | (3, 'predicted header', '')
1042 | (-1, 'text', '')
1043 | (-1, 'text', '')
1044 | (-1, 'text', '')
1045 | (-1, 'text', '')
1046 | (-1, 'text', '')
1047 | (-1, 'text', '')
1048 | (-1, 'text', '')
1049 | (-1, 'text', '')
1050 | (-1, 'text', '')
1051 | (-2, 'textsmall', '')
1052 | (-2, 'textsmall', '')
1053 | (-2, 'textsmall', '')
1054 | (3, 'predicted header', '')
1055 | (4, 'predicted header', '')
1056 | (-1, 'text', '')
1057 | (4, 'predicted header', '')
1058 | (-1, 'text', '')
1059 | (-1, 'text', '')
1060 | (-1, 'text', '')
1061 | (-1, 'text', '')
1062 | (-1, 'text', '')
1063 | (-1, 'text', '')
1064 | (-2, 'textsmall', '')
1065 | (-2, 'textsmall', '')
1066 | (-2, 'textsmall', '')
1067 | (-1, 'text', '')
1068 | (-1, 'text', '')
1069 | (3, 'predicted header', '')
1070 | (4, 'predicted header', '')
1071 | (-1, 'text', '')
1072 | (-1, 'text', '')
1073 | (-1, 'text', '')
1074 | (-1, 'text', '')
1075 | (-2, 'textsmall', '')
1076 | (-2, 'textsmall', '')
1077 | (-2, 'textsmall', '')
1078 | (4, 'predicted header', '')
1079 | (-1, 'text', '')
1080 | (-1, 'text', '')
1081 | (-1, 'text', '')
1082 | (-1, 'text', '')
1083 | (-1, 'text', '')
1084 | (-2, 'textsmall', '')
1085 | (-2, 'textsmall', '')
1086 | (-2, 'textsmall', '')
1087 | (-1, 'text', '')
1088 | (-1, 'text', '')
1089 | (-1, 'text', '')
1090 | (-1, 'text', '')
1091 | (-1, 'text', '')
1092 | (-1, 'text', '')
1093 | (4, 'predicted header', '')
1094 | (-1, 'text', '')
1095 | (-2, 'textsmall', '')
1096 | (-2, 'textsmall', '')
1097 | (-2, 'textsmall', '')
1098 | (-1, 'text', '')
1099 | (-1, 'text', '')
1100 | (-1, 'text', '')
1101 | (-1, 'text', '')
1102 | (-1, 'text', '')
1103 | (-1, 'text', '')
1104 | (3, 'predicted header', '')
1105 | (-1, 'text', '')
1106 | (-1, 'text', '')
1107 | (-1, 'text', '')
1108 | (-1, 'text', '')
1109 | (-1, 'text', '')
1110 | (-2, 'textsmall', '')
1111 | (-2, 'textsmall', '')
1112 | (-2, 'textsmall', '')
1113 | (3, 'predicted header', '')
1114 | (-1, 'text', '')
1115 | (-1, 'text', '')
1116 | (-1, 'text', '')
1117 | (-1, 'text', '')
1118 | (-1, 'text', '')
1119 | (-1, 'text', '')
1120 | (-1, 'text', '')
1121 | (-2, 'textsmall', '')
1122 | (-2, 'textsmall', '')
1123 | (-2, 'textsmall', '')
1124 | (-1, 'text', '')
1125 | (-1, 'text', '')
1126 | (-1, 'text', '')
1127 | (3, 'predicted header', '')
1128 | (4, 'predicted header', '')
1129 | (-1, 'text', '')
1130 | (-1, 'text', '')
1131 | (-2, 'textsmall', '')
1132 | (-2, 'textsmall', '')
1133 | (-2, 'textsmall', '')
1134 | (4, 'predicted header', '')
1135 | (-1, 'text', '')
1136 | (4, 'predicted header', '')
1137 | (-1, 'text', '')
1138 | (-1, 'text', '')
1139 | (3, 'predicted header', '')
1140 | (4, 'predicted header', '')
1141 | (-1, 'text', '')
1142 | (-1, 'text', '')
1143 | (4, 'predicted header', '')
1144 | (-1, 'text', '')
1145 | (-1, 'text', '')
1146 | (-1, 'text', '')
1147 | (-1, 'text', '')
1148 | (-2, 'textsmall', '')
1149 | (-2, 'textsmall', '')
1150 | (-2, 'textsmall', '')
1151 | (-1, 'text', '')
1152 | (4, 'predicted header', '')
1153 | (-1, 'text', '')
1154 | (-1, 'text', '')
1155 | (-1, 'text', '')
1156 | (-2, 'textsmall', '')
1157 | (-2, 'textsmall', '')
1158 | (-2, 'textsmall', '')
1159 | (3, 'predicted header', '')
1160 | (-1, 'text', '')
1161 | (-1, 'text', '')
1162 | (3, 'predicted header', '')
1163 | (-1, 'text', '')
1164 | (-1, 'text', '')
1165 | (-1, 'text', '')
1166 | (4, 'predicted header', '')
1167 | (-1, 'text', '')
1168 | (-2, 'textsmall', '')
1169 | (-2, 'textsmall', '')
1170 | (-2, 'textsmall', '')
1171 | (5, 'predicted header', '')
1172 | (-1, 'text', '')
1173 | (5, 'predicted header', '')
1174 | (-1, 'text', '')
1175 | (-1, 'text', '')
1176 | (-1, 'text', '')
1177 | (-1, 'text', '')
1178 | (-1, 'text', '')
1179 | (-1, 'text', '')
1180 | (4, 'predicted header', '')
1181 | (-1, 'text', '')
1182 | (-1, 'text', '')
1183 | (-1, 'text', '')
1184 | (-1, 'text', '')
1185 | (-2, 'textsmall', '')
1186 | (-2, 'textsmall', '')
1187 | (-2, 'textsmall', '')
1188 | (4, 'predicted header', '')
1189 | (-1, 'text', '')
1190 | (3, 'predicted header', '')
1191 | (-1, 'text', '')
1192 | (-1, 'text', '')
1193 | (4, 'predicted header', '')
1194 | (-1, 'text', '')
1195 | (-1, 'text', '')
1196 | (-1, 'text', '')
1197 | (-1, 'text', '')
1198 | (-1, 'text', '')
1199 | (4, 'predicted header', '')
1200 | (-1, 'text', '')
1201 | (-1, 'text', '')
1202 | (-1, 'text', '')
1203 | (4, 'predicted header', '')
1204 | (-1, 'text', '')
1205 | (-1, 'text', '')
1206 | (-1, 'text', '')
1207 | (-2, 'textsmall', '')
1208 | (-2, 'textsmall', '')
1209 | (-2, 'textsmall', '')
1210 | (-1, 'text', '')
1211 | (-1, 'text', '')
1212 | (-1, 'text', '')
1213 | (-1, 'text', '')
1214 | (-1, 'text', '')
1215 | (-1, 'text', '')
1216 | (-1, 'text', '')
1217 | (-1, 'text', '')
1218 | (-1, 'text', '')
1219 | (-2, 'textsmall', '')
1220 | (-2, 'textsmall', '')
1221 | (-2, 'textsmall', '')
1222 | (-1, 'text', '')
1223 | (-1, 'text', '')
1224 | (-1, 'text', '')
1225 | (-1, 'text', '')
1226 | (-1, 'text', '')
1227 | (-1, 'text', '')
1228 | (-1, 'text', '')
1229 | (-2, 'textsmall', '')
1230 | (-2, 'textsmall', '')
1231 | (-2, 'textsmall', '')
1232 | (4, 'predicted header', '')
1233 | (-1, 'text', '')
1234 | (4, 'predicted header', '')
1235 | (-1, 'text', '')
1236 | (-1, 'text', '')
1237 | (4, 'predicted header', '')
1238 | (-1, 'text', '')
1239 | (-1, 'text', '')
1240 | (4, 'predicted header', '')
1241 | (-1, 'text', '')
1242 | (5, 'predicted header', '')
1243 | (6, 'predicted header', '')
1244 | (-1, 'text', '')
1245 | (-2, 'textsmall', '')
1246 | (-2, 'textsmall', '')
1247 | (-2, 'textsmall', '')
1248 | (-1, 'text', '')
1249 | (-1, 'text', '')
1250 | (-1, 'text', '')
1251 | (-1, 'text', '')
1252 | (-1, 'text', '')
1253 | (-1, 'text', '')
1254 | (6, 'predicted header', '')
1255 | (-1, 'text', '')
1256 | (-1, 'text', '')
1257 | (-1, 'text', '')
1258 | (-1, 'text', '')
1259 | (-1, 'text', '')
1260 | (-1, 'text', '')
1261 | (-1, 'text', '')
1262 | (-1, 'text', '')
1263 | (-1, 'text', '')
1264 | (5, 'predicted header', '')
1265 | (6, 'predicted header', '')
1266 | (-1, 'text', '')
1267 | (-2, 'textsmall', '')
1268 | (-2, 'textsmall', '')
1269 | (-2, 'textsmall', '')
1270 | (-1, 'text', '')
1271 | (6, 'predicted header', '')
1272 | (-1, 'text', '')
1273 | (-1, 'text', '')
1274 | (-1, 'text', '')
1275 | (-1, 'text', '')
1276 | (-1, 'text', '')
1277 | (-1, 'text', '')
1278 | (5, 'predicted header', '')
1279 | (6, 'predicted header', '')
1280 | (-1, 'text', '')
1281 | (-1, 'text', '')
1282 | (6, 'predicted header', '')
1283 | (-1, 'text', '')
1284 | (-1, 'text', '')
1285 | (-1, 'text', '')
1286 | (-2, 'textsmall', '')
1287 | (-2, 'textsmall', '')
1288 | (-2, 'textsmall', '')
1289 | (-1, 'text', '')
1290 | (-1, 'text', '')
1291 | (-1, 'text', '')
1292 | (-1, 'text', '')
1293 | (-1, 'text', '')
1294 | (-1, 'text', '')
1295 | (-1, 'text', '')
1296 | (-1, 'text', '')
1297 | (-2, 'textsmall', '')
1298 | (-2, 'textsmall', '')
1299 | (-2, 'textsmall', '')
1300 | (1, 'item', 'item9')
1301 | (-1, 'text', '')
1302 | (1, 'item', 'item9a')
1303 | (-1, 'text', '')
1304 | (2, 'predicted header', '')
1305 | (-1, 'text', '')
1306 | (-1, 'text', '')
1307 | (-2, 'textsmall', '')
1308 | (-2, 'textsmall', '')
1309 | (-2, 'textsmall', '')
1310 | (2, 'predicted header', '')
1311 | (-1, 'text', '')
1312 | (2, 'predicted header', '')
1313 | (-1, 'text', '')
1314 | (-1, 'text', '')
1315 | (-1, 'text', '')
1316 | (2, 'predicted header', '')
1317 | (-1, 'text', '')
1318 | (-1, 'text', '')
1319 | (2, 'predicted header', '')
1320 | (-1, 'text', '')
1321 | (-2, 'textsmall', '')
1322 | (-2, 'textsmall', '')
1323 | (-2, 'textsmall', '')
1324 | (-1, 'text', '')
1325 | (-1, 'text', '')
1326 | (-1, 'text', '')
1327 | (-1, 'text', '')
1328 | (-2, 'textsmall', '')
1329 | (-2, 'textsmall', '')
1330 | (-2, 'textsmall', '')
1331 | (1, 'item', 'item9b')
1332 | (2, 'predicted header', '')
1333 | (-1, 'text', '')
1334 | (1, 'item', 'item9c')
1335 | (-1, 'text', '')
1336 | (0, 'part', 'partiii')
1337 | (1, 'item', 'item10')
1338 | (-1, 'text', '')
1339 | (-1, 'text', '')
1340 | (-1, 'text', '')
1341 | (-1, 'text', '')
1342 | (-1, 'text', '')
1343 | (-1, 'text', '')
1344 | (-2, 'textsmall', '')
1345 | (-2, 'textsmall', '')
1346 | (-2, 'textsmall', '')
1347 | (1, 'item', 'item11')
1348 | (-1, 'text', '')
1349 | (1, 'item', 'item12')
1350 | (-1, 'text', '')
1351 | (1, 'item', 'item13')
1352 | (-1, 'text', '')
1353 | (1, 'item', 'item14')
1354 | (-1, 'text', '')
1355 | (-2, 'textsmall', '')
1356 | (-2, 'textsmall', '')
1357 | (-2, 'textsmall', '')
1358 | (0, 'part', 'partiv')
1359 | (1, 'item', 'item15')
1360 | (2, 'predicted header', '')
1361 | (-1, 'text', '')
1362 | (-1, 'text', '')
1363 | (2, 'predicted header', '')
1364 | (-1, 'text', '')
1365 | (-2, 'textsmall', '')
1366 | (-2, 'textsmall', '')
1367 | (-2, 'textsmall', '')
1368 | (-1, 'text', '')
1369 | (-2, 'textsmall', '')
1370 | (-2, 'textsmall', '')
1371 | (-2, 'textsmall', '')
1372 | (-1, 'text', '')
1373 | (-2, 'textsmall', '')
1374 | (-2, 'textsmall', '')
1375 | (-2, 'textsmall', '')
1376 | (-1, 'text', '')
1377 | (-2, 'textsmall', '')
1378 | (-2, 'textsmall', '')
1379 | (-2, 'textsmall', '')
1380 | (-1, 'text', '')
1381 | (-2, 'textsmall', '')
1382 | (-2, 'textsmall', '')
1383 | (-2, 'textsmall', '')
1384 | (-1, 'text', '')
1385 | (-2, 'textsmall', '')
1386 | (-2, 'textsmall', '')
1387 | (-2, 'textsmall', '')
1388 | (-1, 'text', '')
1389 | (-1, 'text', '')
1390 | (-1, 'text', '')
1391 | (-2, 'textsmall', '')
1392 | (-2, 'textsmall', '')
1393 | (-2, 'textsmall', '')
1394 | (1, 'item', 'item16')
1395 | (-1, 'text', '')
1396 | (-2, 'textsmall', '')
1397 | (0, 'signatures', 'signatures')
1398 | (-1, 'text', '')
1399 | (-1, 'text', '')
1400 | (-1, 'text', '')
1401 | (-1, 'text', '')
1402 | (-2, 'textsmall', '')
1403 |
--------------------------------------------------------------------------------
/doc2dict/doc2dict/html/convert_html_to_instructions.py:
--------------------------------------------------------------------------------
1 | from ..utils.strings import check_string_style
2 | # params
3 | tag_groups = {
4 | "bold": ["b", "strong"],
5 | "italic": ["i", "em"],
6 | "underline": ["u", "ins"],
7 | }
8 |
9 | EMPTY_CHARS = ' \t\n\r\xa0'
10 | EMPTY_TABLE_CHARS = ['', '–', '-']
11 | LEFT_TABLE_CHARS = ['$','(']
12 | RIGHT_TABLE_CHARS = [')','%']
13 |
14 | def remove_leading_empty_instructions(instructions):
15 | """Remove leading empty/whitespace-only instructions from the list"""
16 | if not instructions:
17 | return instructions
18 |
19 | # Find the first non-empty instruction
20 | first_meaningful_index = 0
21 | for i, instruction in enumerate(instructions):
22 |         # Tables and images count as meaningful content, so stop at the first one
23 | if 'image' in instruction or 'table' in instruction:
24 | first_meaningful_index = i
25 | break
26 |
27 | # Check if text instruction has meaningful content
28 | if 'text' in instruction:
29 | text = instruction['text'].strip(EMPTY_CHARS)
30 | if text: # Non-empty after stripping
31 | first_meaningful_index = i
32 | break
33 | else:
34 | # If we get here, all instructions were empty text or whitespace-only
35 | return []
36 |
37 | # Return sliced list starting from first meaningful instruction
38 | return instructions[first_meaningful_index:]
39 |
40 | def is_empty_instructions(instructions):
41 | """Check if an instruction block contains only whitespace/empty content"""
42 | if not instructions:
43 | return True
44 |
45 | for instruction in instructions:
46 |         # Tables and images count as meaningful content, so the block is not empty
47 | if 'image' in instruction or 'table' in instruction:
48 | return False
49 |
50 | # Check if text instruction has meaningful content
51 | if 'text' in instruction:
52 | text = instruction['text'].strip(EMPTY_CHARS)
53 | if text: # Non-empty after stripping
54 | return False
55 |
56 | # All instructions were either empty text or whitespace-only
57 | return True
58 |
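# Hand-made examples of what the two helpers above do (the dicts here are hypothetical
# instructions; only the 'text' / 'image' / 'table' keys matter for these checks):
_lead_demo = remove_leading_empty_instructions([{'text': ' \xa0\n'}, {'text': 'Revenue'}, {'text': ''}])
# _lead_demo == [{'text': 'Revenue'}, {'text': ''}]
# is_empty_instructions([{'text': '  '}, {'text': '\t'}]) -> True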
59 | # utils
60 | def walk(node):
61 | yield ("start",node)
62 | for child in node.iter(include_text=True):
63 | yield from walk(child)
64 |
65 | yield ("end",node)
66 |
67 |
68 | def style_to_dict(style_string):
69 | result = {}
70 | if not style_string:
71 | return result
72 | # send to lower case
73 | style_string = style_string.lower()
74 | style_list = [attr.strip(EMPTY_CHARS) for attr in style_string.split(';') if attr.strip(EMPTY_CHARS)]
75 |
76 | for item in style_list:
77 | if ':' in item:
78 | key, value = item.split(':', 1)
79 | result[key.strip(EMPTY_CHARS)] = value.strip(EMPTY_CHARS)
80 | return result
81 |
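# Quick sanity example for style_to_dict: it lower-cases the string, then splits on ';' and ':'.
_style_demo = style_to_dict("Font-Weight: Bold; margin-left: 12px")
# _style_demo == {'font-weight': 'bold', 'margin-left': '12px'}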
82 |
83 | def parse_font_shorthand(font_value):
84 | """
85 | Parse CSS font shorthand property into individual components.
86 |
87 | Font shorthand syntax: [font-style] [font-variant] [font-weight] font-size [/line-height] font-family
88 | Required: font-size and font-family
89 | Optional (in order): font-style, font-variant, font-weight, line-height
90 |
91 | Examples:
92 | - "bold 10pt Times New Roman" -> {'font-weight': 'bold', 'font-size': '10pt', 'font-family': 'Times New Roman'}
93 | - "italic bold 12px Arial" -> {'font-style': 'italic', 'font-weight': 'bold', 'font-size': '12px', 'font-family': 'Arial'}
94 | """
95 | if not font_value:
96 | return {}
97 |
98 | # Clean and split the font value
99 | parts = font_value.strip().split()
100 | if len(parts) < 2: # Must have at least font-size and font-family
101 | return {}
102 |
103 | result = {}
104 | i = 0
105 |
106 | # Parse optional properties in order: font-style, font-variant, font-weight
107 |
108 | # Check for font-style (italic, oblique, normal)
109 | if i < len(parts) and parts[i].lower() in ['italic', 'oblique', 'normal']:
110 | if parts[i].lower() == 'italic':
111 | result['font-style'] = 'italic'
112 | i += 1
113 |
114 | # Check for font-variant (small-caps, normal) - we'll skip this for now
115 | if i < len(parts) and parts[i].lower() in ['small-caps', 'normal']:
116 | # Skip font-variant for now since we don't handle it
117 | i += 1
118 |
119 | # Check for font-weight (bold, normal, 100-900, lighter, bolder)
120 | if i < len(parts):
121 | weight = parts[i].lower()
122 | if weight in ['bold', '700']:
123 | result['font-weight'] = 'bold'
124 | i += 1
125 | elif weight in ['normal', '400']:
126 | result['font-weight'] = 'normal'
127 | i += 1
128 | elif weight in ['100', '200', '300', '500', '600', '800', '900', 'lighter', 'bolder']:
129 | result['font-weight'] = weight
130 | i += 1
131 |
132 | # Next must be font-size (required)
133 | if i < len(parts):
134 | size_part = parts[i]
135 | # Handle font-size/line-height format (e.g., "12px/1.5")
136 | if '/' in size_part:
137 | size, line_height = size_part.split('/', 1)
138 | result['font-size'] = size
139 | result['line-height'] = line_height
140 | else:
141 | result['font-size'] = size_part
142 | i += 1
143 |
144 | # Remaining parts are font-family (required)
145 | if i < len(parts):
146 | # Join remaining parts for font family (handles "Times New Roman" etc.)
147 | font_family = ' '.join(parts[i:])
148 | # Remove quotes if present
149 | font_family = font_family.strip('\'"')
150 | result['font-family'] = font_family
151 |
152 | return result
153 |
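# One more shorthand case, exercising the font-size/line-height branch handled above:
_font_demo = parse_font_shorthand("bold 10pt/1.2 Times New Roman")
# _font_demo == {'font-weight': 'bold', 'font-size': '10pt',
#                'line-height': '1.2', 'font-family': 'Times New Roman'}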
154 | def get_style(node):
155 | increments = []
156 | stacks = []
157 | style = node.attributes.get('style', '')
158 | style_dict = style_to_dict(style)
159 |
160 | # Parse font shorthand if present
161 | if 'font' in style_dict:
162 | font_properties = parse_font_shorthand(style_dict['font'])
163 | # Merge parsed properties into style_dict
164 | style_dict.update(font_properties)
165 |
166 | if 'font-weight' in style_dict:
167 | if style_dict['font-weight'] == 'bold':
168 | increments.append('bold')
169 | elif style_dict['font-weight'] == '700':
170 | increments.append('bold')
171 |
172 | if 'font-style' in style_dict:
173 | if style_dict['font-style'] == 'italic':
174 | increments.append('italic')
175 |
176 | if 'text-decoration' in style_dict:
177 | if style_dict['text-decoration'] == 'underline':
178 | increments.append('underline')
179 |
180 | if 'text-align' in style_dict:
181 | if style_dict['text-align'] == 'center':
182 | increments.append('text-center')
183 |
184 |
185 | left_indent = 0
186 |
187 | if 'font-size' in style_dict:
188 | font_size = style_dict['font-size']
189 | font_size = normalize_to_px(font_size)
190 | stacks.append({'font-size': font_size})
191 |
192 | if 'text-indent' in style_dict:
193 | indent = style_dict['text-indent']
194 | indent = normalize_to_px(indent)
195 | left_indent += indent
196 |
197 | if 'padding' in style_dict:
198 | padding_value = style_dict['padding']
199 | # Handle four-value format: top right bottom left
200 | if padding_value.count(' ') == 3:
201 | _, _, _, left = padding_value.split(' ')
202 | left = normalize_to_px(left)
203 | left_indent += left
204 | # Handle three-value format: top right/left bottom
205 | elif padding_value.count(' ') == 2:
206 | _, right_left, _ = padding_value.split(' ')
207 | right_left = normalize_to_px(right_left)
208 | left_indent += right_left
209 | # Handle two-value format: top/bottom right/left
210 | elif padding_value.count(' ') == 1:
211 | _, right_left = padding_value.split(' ')
212 | right_left = normalize_to_px(right_left)
213 | left_indent += right_left
214 | # Handle single-value format: all sides
215 | else:
216 | padding_value = normalize_to_px(padding_value)
217 | left_indent += padding_value
218 |
219 | # Also handle direct padding-left if specified
220 | if 'padding-left' in style_dict:
221 | padding_left = style_dict['padding-left']
222 | padding_left = normalize_to_px(padding_left)
223 | left_indent += padding_left
224 |
225 | # Handle margin with the same logic as padding
226 | if 'margin' in style_dict:
227 | margin_value = style_dict['margin']
228 | # Handle four-value format: top right bottom left
229 | if margin_value.count(' ') == 3:
230 | _, _, _, left = margin_value.split(' ')
231 | left = normalize_to_px(left)
232 | left_indent += left
233 | # Handle three-value format: top right/left bottom
234 | elif margin_value.count(' ') == 2:
235 | _, right_left, _ = margin_value.split(' ')
236 | right_left = normalize_to_px(right_left)
237 | left_indent += right_left
238 | # Handle two-value format: top/bottom right/left
239 | elif margin_value.count(' ') == 1:
240 | _, right_left = margin_value.split(' ')
241 | right_left = normalize_to_px(right_left)
242 | left_indent += right_left
243 | # Handle single-value format: all sides
244 | else:
245 | margin_value = normalize_to_px(margin_value)
246 | left_indent += margin_value
247 |
248 | # Handle direct margin-left if specified
249 | if 'margin-left' in style_dict:
250 | margin_left = style_dict['margin-left']
251 | margin_left = normalize_to_px(margin_left)
252 | left_indent += margin_left
253 |
254 | if 'display' in style_dict:
255 | if style_dict['display'] == 'none':
256 | increments.append('display-none')
257 |
258 | if left_indent != 0:
259 | stacks.append({'left-indent': str(left_indent)})
260 | return increments, stacks
261 |
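# get_style only reads node.attributes, so a tiny stand-in object (hypothetical, not the
# real parser node type) is enough to sketch what it returns:
class _StyledStub:
    def __init__(self, style):
        self.attributes = {'style': style}

_inc, _stacks = get_style(_StyledStub("font-weight:bold; font-size:10pt; margin-left:20px"))
# _inc is ['bold']; _stacks holds the font-size stack entry (~13.33px for 10pt)
# followed by {'left-indent': '20.0'} from the margin-left.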
262 | def parse_css_value(value_str):
263 | """Extract numeric value and unit from CSS value string"""
264 | if not value_str or not isinstance(value_str, str):
265 | return 0, 'px'
266 |
267 | value_str = value_str.strip(EMPTY_CHARS)
268 |
269 | # Handle non-numeric values
270 | if value_str in ['auto', 'inherit', 'initial']:
271 | return 0, value_str
272 |
273 | # Find where the number ends
274 | numeric_part = ''
275 | for i, char in enumerate(value_str):
276 | if char.isdigit() or char == '.':
277 | numeric_part += char
278 | elif char == '-' and i == 0: # Handle negative values
279 | numeric_part += char
280 | else:
281 | unit = value_str[i:].strip(EMPTY_CHARS)
282 | break
283 | else:
284 | unit = 'px' # Default if no unit specified
285 |
286 | # Convert numeric part to float
287 | try:
288 | value = float(numeric_part) if numeric_part else 0
289 | except ValueError:
290 | value = 0
291 |
292 | return value, unit
293 |
294 |
295 | def normalize_to_px(value_str, font_context=None):
296 | """Convert any CSS measurement to pixels based on context"""
297 | if not value_str:
298 | return 0
299 |
300 | # Parse the value
301 | value, unit = parse_css_value(value_str)
302 |
303 | # Early return for non-numeric values
304 | if unit in ['auto', 'inherit', 'initial']:
305 | return 0
306 |
307 | # Get font context in pixels
308 | current_font_size = 16 # Default
309 | if font_context:
310 | font_value, font_unit = parse_css_value(font_context)
311 | if font_unit == 'px':
312 | current_font_size = font_value
313 | elif font_unit == 'pt':
314 | current_font_size = font_value * 1.333
315 | else:
316 |             # Other units: approximate by scaling the value against a 16px base
317 | current_font_size = font_value * 16 if font_value else 16
318 |
319 | # Convert to pixels
320 | if unit == 'px':
321 | return value
322 | elif unit == 'pt':
323 | return value * 1.333
324 | elif unit == 'em':
325 | return value * current_font_size
326 | elif unit == 'rem':
327 | return value * 16 # Root em always based on root font size
328 | elif unit == '%':
329 | return value * current_font_size / 100 # % of font size
330 | elif unit == 'ex':
331 | return value * current_font_size / 2 # Roughly half the font size
332 | elif unit == 'ch':
333 | return value * current_font_size * 0.5 # Approximate character width
334 | elif unit in ['vh', 'vw', 'vmin', 'vmax']:
335 | return value # Cannot accurately convert viewport units without screen size
336 | elif unit == 'cm':
337 | return value * 37.8 # Approximate for screen (96dpi)
338 | elif unit == 'mm':
339 | return value * 3.78 # 1/10th of cm
340 | elif unit == 'in':
341 | return value * 96 # Standard 96dpi
342 | elif unit == 'pc':
343 | return value * 16 # 1pc = 12pt
344 | else:
345 | return value # Unknown unit, return as is
346 |
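# A few conversions the branches above produce (pt uses the 1.333 px-per-pt factor; em
# scales by the optional font_context, defaulting to a 16px base):
_px_demo = normalize_to_px('12pt')                  # ≈ 15.996  (12 * 1.333)
# normalize_to_px('1.5em')                          -> 24.0     (1.5 * 16)
# normalize_to_px('1.5em', font_context='10pt')     -> ≈ 20.0
# normalize_to_px('auto')                           -> 0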
347 | def safe_increment(dct,key):
348 | if key not in dct:
349 | dct[key] = 0
350 |
351 | dct[key] += 1
352 |
353 | def safe_decrement(dct,key):
354 | if key not in dct:
355 | dct[key] = 0
356 |
357 | dct[key] -= 1
358 | if dct[key] < 0:
359 | dct[key] = 0
360 |
361 | def safe_stack(dct,key,val):
362 | if key not in dct:
363 | dct[key] = []
364 |
365 | dct[key].append(val)
366 |
367 | def safe_unstack(dct,key):
368 | if key not in dct:
369 | dct[key] = []
370 |
371 | if len(dct[key]) > 0:
372 | dct[key].pop()
373 | else:
374 | dct[key] = []
375 |
376 | def parse_start_style(current_attributes,node):
377 | increments, stacks = get_style(node)
378 | if 'display-none' in increments:
379 | return 'skip'
380 |
381 | for key in increments:
382 | safe_increment(current_attributes,key)
383 |
384 | for stack in stacks:
385 | for key in stack:
386 | safe_stack(current_attributes,key,stack[key])
387 |
388 | return ''
389 | def parse_end_style(current_attributes,node):
390 | increments,stacks = get_style(node)
391 | if 'display-none' in increments:
392 | return 'skip'
393 |
394 | for key in increments:
395 | safe_decrement(current_attributes,key)
396 |
397 | for stack in stacks:
398 | for key in stack:
399 | safe_unstack(current_attributes,key)
400 |
401 | return ''
402 |
403 | def parse_start_tag(current_attributes,node):
404 | tag = node.tag
405 |
406 | if tag == 'table':
407 | return 'table'
408 | elif tag == '-text':
409 | return 'text'
410 | elif tag == 'a':
411 | href = node.attributes.get('href', '')
412 | safe_stack(current_attributes, 'href', href)
413 | return ''
414 | elif tag == 'img':
415 | return 'image'
416 |
417 | for tag in tag_groups:
418 | if node.tag in tag_groups[tag]:
419 | safe_increment(current_attributes,tag)
420 | return ''
421 |
422 | def parse_end_tag(current_attributes,node):
423 | tag = node.tag
424 |
425 | if tag == 'table':
426 | return 'table'
427 | elif tag in ['p', 'div', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'li','br']:
428 | return 'newline'
429 | elif tag == 'a':
430 | safe_unstack(current_attributes, 'href')
431 | return ''
432 |
433 | for tag in tag_groups:
434 | if node.tag in tag_groups[tag]:
435 | safe_decrement(current_attributes,tag)
436 | return ''
437 |
438 | # note: the merge helper below was AI-generated
439 | def merge_instructions(instructions):
440 | if not instructions or len(instructions) <= 1:
441 | return instructions
442 |
443 | result = [instructions[0]]
444 |
445 | for i in range(1, len(instructions)):
446 | current = instructions[i]
447 | prev = result[-1]
448 |
449 | # Skip merging if either instruction is an image
450 | if 'image' in current or 'image' in prev:
451 | result.append(current)
452 | continue
453 |
454 | # Case 1: Empty string after strip
455 | if current.get('text', '').strip(EMPTY_CHARS) == '':
456 | prev['text'] += current.get('text', '')
457 | continue
458 |
459 | # Case 2: Attributes match with previous
460 | attrs_to_check = ['bold', 'text-center', 'italic', 'underline', 'font-size']
461 | attrs_match = all(current.get(attr) == prev.get(attr) for attr in attrs_to_check)
462 |
463 | if attrs_match:
464 | prev['text'] += current.get('text', '')
465 | continue
466 |
467 | # Case 3: Check if attributes match with any earlier instruction
468 | # This handles the case where instructions a and c match but b doesn't
469 | merged = False
470 | for j in range(len(result) - 1, -1, -1): # Check all previous instructions
471 | earlier = result[j]
472 | if 'image' not in earlier and all(current.get(attr) == earlier.get(attr) for attr in attrs_to_check):
473 | # Combine all instructions from j to the current one
474 | combined_text = earlier['text']
475 | for k in range(j + 1, len(result)):
476 | if 'text' in result[k]:
477 | combined_text += result[k].get('text', '')
478 | combined_text += current.get('text', '')
479 |
480 | earlier['text'] = combined_text
481 | # Remove the instructions that were merged
482 | result = result[:j+1]
483 | merged = True
484 | break
485 |
486 | if not merged:
487 | result.append(current)
488 |
489 | return result
490 | # end of AI-generated merge helper
491 |
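# Sketch of what merge_instructions does: consecutive instructions whose styling attributes
# match (or that are whitespace-only) get folded into a single text run.
_merged_demo = merge_instructions([
    {'text': 'Total '},
    {'text': '  '},                  # whitespace-only -> absorbed into the previous run
    {'text': 'revenue'},             # same (absent) styling attrs -> merged as well
    {'text': 'Note 1', 'bold': 1},   # different 'bold' attr -> kept separate
])
# _merged_demo == [{'text': 'Total   revenue'}, {'text': 'Note 1', 'bold': 1}]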
492 | def is_subset(items1, items2, empty_chars):
493 | """returns true if items1 is a subset of items2"""
494 | return all(item1.get('text', '') in empty_chars or item1.get('text', '') == item2.get('text', '') for item1, item2 in zip(items1, items2))
495 |
496 | def remove_subset_rows(table, empty_chars, direction="bottom_to_top"):
497 | """
498 | Remove subset rows from the table.
499 | direction: "bottom_to_top" or "top_to_bottom"
500 | """
501 | if not table:
502 | return table
503 |
504 | keep_rows = [True] * len(table)
505 |
506 | if direction == "bottom_to_top":
507 | # Compare each row with the row above it
508 | for i in range(len(table)-1, 0, -1):
509 | if is_subset(table[i], table[i-1], empty_chars):
510 | keep_rows[i] = False
511 | else: # top_to_bottom
512 | # Compare each row with the row below it
513 | for i in range(len(table)-1):
514 | if is_subset(table[i], table[i+1], empty_chars):
515 | keep_rows[i] = False
516 |
517 | return [table[i] for i in range(len(table)) if keep_rows[i]]
518 |
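# Tiny example of the row de-duplication above, using the module's EMPTY_TABLE_CHARS as the
# empty-cell set: a row whose cells are empty or repeat the row above it is dropped.
_rows_demo = remove_subset_rows(
    [
        [{'text': 'Revenue'}, {'text': '100'}],
        [{'text': 'Revenue'}, {'text': ''}],   # subset of the row above -> removed
        [{'text': 'Cost'},    {'text': '40'}],
    ],
    EMPTY_TABLE_CHARS,
)
# _rows_demo == [[{'text': 'Revenue'}, {'text': '100'}], [{'text': 'Cost'}, {'text': '40'}]]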
519 | def remove_subset_columns(table, empty_chars, direction="left_to_right"):
520 | """
521 | Remove subset columns from the table.
522 | direction: "left_to_right" or "right_to_left"
523 | """
524 | if not table or not table[0]:
525 | return table
526 |
527 | num_cols = len(table[0])
528 | keep_cols = [True] * num_cols
529 |
530 | if direction == "left_to_right":
531 | # Compare each column with the column to its right
532 | for j in range(num_cols-1):
533 | col1 = [row[j] for row in table]
534 | col2 = [row[j+1] for row in table]
535 | if is_subset(col1, col2, empty_chars):
536 | keep_cols[j] = False
537 | else: # right_to_left
538 | # Compare each column with the column to its left
539 | for j in range(num_cols-1, 0, -1):
540 | col1 = [row[j] for row in table]
541 | col2 = [row[j-1] for row in table]
542 | if is_subset(col1, col2, empty_chars):
543 | keep_cols[j] = False
544 |
545 | return [[row[j] for j in range(num_cols) if keep_cols[j]] for row in table]
546 |
547 |
548 |
549 | def is_left_char_cell(cell):
550 |     """Check if cell contains only LEFT_TABLE_CHARS + EMPTY_TABLE_CHARS"""
551 | if 'image' in cell:
552 | return False
553 | text = cell.get('text', '')
554 | if not text:
555 | return False
556 | # Check if all characters in text are either left chars or empty chars
557 | return all(char in LEFT_TABLE_CHARS + EMPTY_TABLE_CHARS for char in text)
558 |
559 | def is_right_char_cell(cell):
560 |     """Check if cell contains only RIGHT_TABLE_CHARS + EMPTY_TABLE_CHARS"""
561 | if 'image' in cell:
562 | return False
563 | text = cell.get('text', '')
564 | if not text:
565 | return False
566 | # Check if all characters in text are either right chars or empty chars
567 | return all(char in RIGHT_TABLE_CHARS + EMPTY_TABLE_CHARS for char in text)
568 |
569 | def is_content_cell(cell):
570 | """Check if cell has meaningful content (not just formatting chars)"""
571 | if 'image' in cell:
572 | return True
573 | text = cell.get('text', '')
574 | if not text:
575 | return False
576 | # Content cell if it has chars that aren't formatting or empty
577 | all_formatting_chars = LEFT_TABLE_CHARS + RIGHT_TABLE_CHARS + EMPTY_TABLE_CHARS
578 | return any(char not in all_formatting_chars for char in text)
579 |
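# Note: LEFT_TABLE_CHARS, RIGHT_TABLE_CHARS and EMPTY_TABLE_CHARS are defined
# elsewhere in the package. The helpers above only classify a cell by whether its
# text is made purely of "formatting" characters (for example a lone currency or
# percent sign, assuming those sets contain such characters) or carries real content.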
580 | def find_next_content_cell(row, start_col):
581 | """Find next cell with content to the right"""
582 | for col in range(start_col + 1, len(row)):
583 | if is_content_cell(row[col]):
584 | return col
585 | return None
586 |
587 | def find_prev_content_cell(row, start_col):
588 | """Find previous cell with content to the left"""
589 | for col in range(start_col - 1, -1, -1):
590 | if is_content_cell(row[col]):
591 | return col
592 | return None
593 |
594 | def merge_cell_content(source_cell, target_cell, direction):
595 | """Merge source cell text into target cell"""
596 | source_text = source_cell.get('text', '')
597 | target_text = target_cell.get('text', '')
598 |
599 | # Create a copy of target cell to preserve its attributes
600 | merged_cell = target_cell.copy()
601 |
602 | if direction == 'left':
603 | # Source goes to the left of target
604 | merged_cell['text'] = source_text + target_text
605 | else: # direction == 'right'
606 | # Source goes to the right of target
607 | merged_cell['text'] = target_text + source_text
608 |
609 | return merged_cell
610 |
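# Illustrative sketch (assuming '$' is listed in LEFT_TABLE_CHARS):
#   merge_cell_content({'text': '$'}, {'text': '1,000', 'bold': True}, 'left')
#   -> {'text': '$1,000', 'bold': True}
# The target cell's attributes win because the merge starts from a copy of it.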
611 |
612 | def merge_cell_instructions(instructions):
613 | """
614 | Merge all text from cell instructions into a single instruction.
615 | Discard images, concatenate all text, collect ALL attributes from ALL instructions.
616 | For boolean attributes (bold, italic, etc.), if ANY instruction has it, the result has it.
617 | For value attributes (font-size, href, etc.), keep the last non-empty value.
618 | """
619 | if not instructions:
620 | return {'text': ''}
621 |
622 | # Collect all text and all attributes
623 | combined_text = ''
624 | all_attributes = {}
625 |
626 | for instruction in instructions:
627 | # Skip images completely
628 | if 'image' in instruction:
629 | continue
630 |
631 | # Add any text content
632 | if 'text' in instruction:
633 | combined_text += instruction['text']
634 |
635 | # Collect all attributes except 'text'
636 | for key, value in instruction.items():
637 | if key == 'text':
638 | continue
639 |
640 | if key not in all_attributes:
641 | all_attributes[key] = []
642 | all_attributes[key].append(value)
643 |
644 | # Create final cell instruction
645 | result = {'text': combined_text}
646 |
647 | # Process collected attributes
648 | for key, values in all_attributes.items():
649 | # Remove None/empty values
650 | non_empty_values = [v for v in values if v is not None and v != '']
651 |
652 | if not non_empty_values:
653 | continue
654 |
655 | # For boolean attributes (True/False), if ANY instruction has True, result is True
656 | if all(isinstance(v, bool) for v in non_empty_values):
657 | result[key] = any(non_empty_values)
658 |
659 | # For numeric attributes, use the last value
660 | elif all(isinstance(v, (int, float)) for v in non_empty_values):
661 | result[key] = non_empty_values[-1]
662 |
663 | # For string attributes, use the last non-empty value
664 | else:
665 | result[key] = non_empty_values[-1]
666 |
667 | return result
668 |
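# Illustrative example (not in the original source):
#   merge_cell_instructions([{'text': 'Net ', 'bold': True},
#                            {'text': 'income', 'font-size': 10},
#                            {'image': {'src': 'logo.png'}}])
#   -> {'text': 'Net income', 'bold': True, 'font-size': 10}
# The image is dropped, the bold flag survives because one piece had it, and the
# last non-empty font-size wins.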
669 | def merge_table_formatting(table):
670 | """Merge formatting characters with adjacent content"""
671 | if not table or not table[0]:
672 | return table
673 |
674 | # Create a working copy
675 | result_table = [row[:] for row in table]
676 |
677 | # Left merging pass - merge LEFT_TABLE_CHARS with content to their right
678 | for row_idx, row in enumerate(result_table):
679 | for col_idx, cell in enumerate(row):
680 | if is_left_char_cell(cell):
681 | # Find next content cell to the right
682 | target_col = find_next_content_cell(row, col_idx)
683 | if target_col is not None:
684 | # Merge this cell's content with the target cell
685 | merged_cell = merge_cell_content(cell, row[target_col], 'left')
686 | result_table[row_idx][target_col] = merged_cell
687 | # Mark source cell as empty
688 | result_table[row_idx][col_idx] = {'text': ''}
689 |
690 | # Right merging pass - merge RIGHT_TABLE_CHARS with content to their left
691 | for row_idx, row in enumerate(result_table):
692 | for col_idx, cell in enumerate(row):
693 | if is_right_char_cell(cell):
694 | # Find previous content cell to the left
695 | target_col = find_prev_content_cell(row, col_idx)
696 | if target_col is not None:
697 | # Merge this cell's content with the target cell
698 | merged_cell = merge_cell_content(cell, row[target_col], 'right')
699 | result_table[row_idx][target_col] = merged_cell
700 | # Mark source cell as empty
701 | result_table[row_idx][col_idx] = {'text': ''}
702 |
703 | return result_table
704 |
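# Rough worked example (assumes '$' is in LEFT_TABLE_CHARS and '%' in RIGHT_TABLE_CHARS):
#   row [{'text': '$'}, {'text': '1,000'}, {'text': '%'}]
#   left pass  -> [{'text': ''}, {'text': '$1,000'}, {'text': '%'}]
#   right pass -> [{'text': ''}, {'text': '$1,000%'}, {'text': ''}]
# Columns that end up fully empty are removed later by clean_table's empty-column pass.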
705 | def clean_table(table):
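# clean_table returns (table, status): "dirty" when the table is empty or its rows are
# ragged, "not_table" for single-row tables that are really layout scaffolding, and
# "cleaned" once formatting characters are merged, image cells are converted to
# "[IMAGE: src]" text, and empty / subset rows and columns are stripped.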
706 | if len(table) == 0:
707 | return table, "dirty"
708 |
709 | # First check if table has same number of columns
710 | same_length = all([len(row) == len(table[0]) for row in table])
711 | if not same_length:
712 | return table, "dirty"
713 |
714 | # Table detection: single-row tables are likely layout formatting, not data
715 | if len(table) == 1:
716 | return table, "not_table"
717 |
718 | # Merge formatting characters with adjacent content
719 | table = merge_table_formatting(table)
720 |
721 | # Convert image cells to text cells with [IMAGE: {src}] format
722 | for row_idx, row in enumerate(table):
723 | for col_idx, cell in enumerate(row):
724 | if 'image' in cell:
725 | src = cell['image'].get('src', '')
726 | # Create new text cell preserving other attributes
727 | new_cell = {k: v for k, v in cell.items() if k != 'image'}
728 | new_cell['text'] = f'[IMAGE: {src}]'
729 | table[row_idx][col_idx] = new_cell
730 |
731 | empty_chars = EMPTY_TABLE_CHARS
732 |
733 | # Remove empty rows - now only need to check text since all images are converted
734 | table = [row for row in table if any(
735 | (cell.get('text', '') not in empty_chars)
736 | for cell in row
737 | )]
738 |
739 | # Remove empty columns - now only need to check text since all images are converted
740 | if table and table[0]:
741 | keep_cols = [j for j in range(len(table[0])) if any(
742 | (table[i][j].get('text', '') not in empty_chars)
743 | for i in range(len(table))
744 | )]
745 | table = [[row[j] for j in keep_cols] for row in table]
746 |
747 | # Remove subset rows and columns
748 | table = remove_subset_rows(table, empty_chars, "bottom_to_top")
749 | table = remove_subset_rows(table, empty_chars, "top_to_bottom")
750 | table = remove_subset_columns(table, empty_chars, "left_to_right")
751 | table = remove_subset_columns(table, empty_chars, "right_to_left")
752 |
753 | return table, "cleaned"
754 |
755 | # TODO: verify handling of ragged tables (rows whose td/th counts differ)
756 | def convert_html_to_instructions(root):
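# Sketch of the output shape (inferred from the code below, not an official spec):
# the function walks the DOM with start/end signals and returns a list of "blocks",
# roughly one per visual line; each block is a list of instruction dicts such as
# {'text': ..., 'bold': True}, {'image': {'src': ..., 'alt': ...}} or, for tables,
# a single {'table': matrix, 'cleaned': bool} entry.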
757 | skip_node = False
758 | in_table = False
759 | in_cell = False
760 |
761 | instructions_list = []
762 | instructions = []
763 | current_attributes = {}
764 |
765 | # Dictionary-based approach for table cells
766 | table_cells = {}
767 | max_row = -1
768 | max_col = -1
769 | occupied_positions = set()
770 | current_cell_instructions = []
771 |
772 | # table position state (current row/column and the span of the cell being parsed)
773 | row_id = 0
774 | col_id = 0
775 | rowspan = 1
776 | colspan = 1
777 |
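# Table cells are collected in table_cells keyed by (row, col); a cell with rowspan
# or colspan is written to every position it covers, and those positions are recorded
# in occupied_positions so later cells in the same row slide past them. For example
# (illustrative), a <td colspan="2"> at row 0, col 0 fills (0,0) and (0,1), and the
# next <td> in that row lands at col 2.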
778 | for signal, node in walk(root):
779 | if signal == "start":
780 | # skip invisible elements
781 | if skip_node:
782 | continue
783 | elif in_table and node.tag in ['td', 'th']:
784 | in_cell = True
785 | colspan = int(node.attributes.get('colspan', 1))
786 | rowspan = int(node.attributes.get('rowspan', 1))
787 | current_cell_instructions = []
788 | elif in_table and node.tag == 'tr':
789 | pass  # nothing to do at row start; row/col counters update on the matching end tag
790 |
791 | style_command = parse_start_style(current_attributes, node)
792 | if style_command == 'skip':
793 | skip_node = True
794 | continue
795 |
796 | tag_command = parse_start_tag(current_attributes, node)
797 | if tag_command == 'table':
798 | in_table = True
799 | # Reset table variables
800 | table_cells = {}
801 | max_row = -1
802 | max_col = -1
803 | occupied_positions = set()
804 | row_id = 0
805 | col_id = 0
806 | if len(instructions) > 0:
807 | if not is_empty_instructions(instructions):
808 | instructions_list.append(instructions)
809 | instructions = []
810 | continue
811 | elif tag_command == 'text':
812 | text = node.text_content
813 |
814 | # strip leading whitespace when this is the first text of a new line block
815 | if len(instructions) == 0:
816 | text = text.lstrip()
817 | if len(text) == 0:
818 | continue
819 |
820 | instruction = {'text': text}
821 |
822 | text_styles = check_string_style(text)
823 | instruction.update(text_styles)
824 |
825 | for key in current_attributes:
826 | val = current_attributes[key]
827 | if isinstance(val, list):
828 | if len(val) > 0:
829 | instruction[key] = val[-1]
830 | elif isinstance(val, int):
831 | if val > 0:
832 | instruction[key] = True
833 |
834 | # Redirect instruction output based on context
835 | if in_cell:
836 | current_cell_instructions.append(instruction)
837 | else:
838 | instructions.append(instruction)
839 | elif tag_command == 'image':
840 | src = node.attributes.get('src', '')
841 | alt = node.attributes.get('alt', '')
842 |
843 | instruction = {'image': {'src': src, 'alt': alt}}
844 |
845 | for key in current_attributes:
846 | val = current_attributes[key]
847 | if isinstance(val, list):
848 | if len(val) > 0:
849 | instruction[key] = val[-1]
850 | elif isinstance(val, int):
851 | if val > 0:
852 | instruction[key] = True
853 |
854 | # Redirect instruction output based on context
855 | if in_cell:
856 | current_cell_instructions.append(instruction)
857 | else:
858 | instructions.append(instruction)
859 |
860 | elif signal == "end":
861 | style_command = parse_end_style(current_attributes, node)
862 | if style_command == 'skip':
863 | skip_node = False
864 | continue
865 |
866 | tag_command = parse_end_tag(current_attributes, node)
867 | if tag_command == 'table':
868 |
869 | # Create a properly sized matrix from the collected data
870 | if max_row >= 0 and max_col >= 0: # Only if we have cells
871 | matrix = [[{'text': ''} for _ in range(max_col + 1)] for _ in range(max_row + 1)]
872 |
873 |
874 | # Fill in the cells
875 | for (r, c), cell_data in table_cells.items():
876 | if 'text' in cell_data:
877 | # Create a copy and strip the text
878 | cleaned_cell = cell_data.copy()
879 | cleaned_cell['text'] = cell_data['text'].strip(EMPTY_CHARS)
880 | matrix[r][c] = cleaned_cell
881 | else:
882 | matrix[r][c] = cell_data
883 |
884 |
885 | # clean the matrix
886 | matrix,cleaning_status = clean_table(matrix)
887 | if cleaning_status == "not_table":
888 | # Combine all cells into one instruction block (same line)
889 | all_cells = []
890 | for cell in matrix[0]:
891 | if 'text' in cell and cell['text'].strip(EMPTY_CHARS):
892 | all_cells.append(cell)
893 | if all_cells:
894 | instructions_list.append(all_cells) # One block = One line
895 | elif len(matrix) == 1:
896 | # Fallback for single-row matrices that were not flagged as "not_table" above
897 | cell_texts = []
898 | for cell in matrix[0]:
899 | if 'image' in cell:
900 | cell_texts.append(f"[Image: {cell['image'].get('alt', 'No alt text')}]")
901 | else:
902 | cell_texts.append(cell.get('text', ''))
903 | matrix_text = ' '.join(cell_texts)
904 | instructions_list.append([{'text': matrix_text, 'fake_table': True}])
905 | else:
906 | # Multi-row table (cleaned or dirty)
907 | instructions_list.append([{'table': matrix, 'cleaned': cleaning_status == "cleaned"}])
908 |
909 |
910 | # Reset table state
911 | table_cells = {}
912 | occupied_positions = set()
913 | current_cell_instructions = []
914 | in_table = False
915 | continue
916 | elif in_table:
917 | if node.tag in ['p', 'div', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'li', 'br']:
918 | # Add newline to current cell if we're in a cell
919 | if in_cell:
920 | if current_cell_instructions:
921 | last_instruction = current_cell_instructions[-1]
922 | if 'text' in last_instruction:
923 | last_instruction['text'] += '\n'
924 | elif node.tag == 'tr':
925 | row_id += 1
926 | col_id = 0
927 | elif node.tag in ['td', 'th']:
928 | # Process accumulated cell instructions
929 | if current_cell_instructions:
930 | cell_data = merge_cell_instructions(current_cell_instructions)
931 |
932 | else:
933 | cell_data = {'text': ''}
934 |
935 | # Find next available position if current is occupied
936 | while (row_id, col_id) in occupied_positions:
937 | col_id += 1
938 |
939 | # Store the cell_data at EVERY position this cell occupies
940 | for y in range(rowspan):
941 | for x in range(colspan):
942 | # Store cell data at this position
943 | table_cells[(row_id + y, col_id + x)] = cell_data
944 | # Mark position as occupied
945 | occupied_positions.add((row_id + y, col_id + x))
946 |
947 | # Update maximum dimensions
948 | max_row = max(max_row, row_id + rowspan - 1)
949 | max_col = max(max_col, col_id + colspan - 1)
950 |
951 | # Move to next position
952 | col_id += colspan
953 | current_cell_instructions = []
954 | in_cell = False
955 |
956 | elif tag_command == 'newline':
957 | if len(instructions) > 0:
958 | instructions = remove_leading_empty_instructions(instructions)
959 | instructions = merge_instructions(instructions)
960 | if len(instructions) == 1:
961 | # strip text if it's a text instruction
962 | if 'text' in instructions[0]:
963 | instructions[0]['text'] = instructions[0]['text'].strip(EMPTY_CHARS)
964 | if not is_empty_instructions(instructions):
965 | instructions_list.append(instructions)
966 | instructions = []
967 | continue
968 |
969 | # add any remaining instructions that were not followed by an explicit newline
970 | if instructions:
971 | instructions = remove_leading_empty_instructions(instructions)
972 | instructions = merge_instructions(instructions)
973 | if len(instructions) == 1:
974 | # strip text if it's a text instruction
975 | if 'text' in instructions[0]:
976 | instructions[0]['text'] = instructions[0]['text'].strip(EMPTY_CHARS)
977 | if not is_empty_instructions(instructions):
978 | instructions_list.append(instructions)
979 | return instructions_list
--------------------------------------------------------------------------------