├── doc2dict ├── doc2dict │ ├── pdf │ │ ├── __init__.py │ │ ├── mapping.py │ │ ├── __pycache__ │ │ │ ├── utils.cpython-313.pyc │ │ │ ├── mapping.cpython-313.pyc │ │ │ ├── __init__.cpython-313.pyc │ │ │ ├── pdf2dict.cpython-313.pyc │ │ │ ├── pdf_utils.cpython-313.pyc │ │ │ └── convert_pdf_to_instructions.cpython-313.pyc │ │ ├── pdf2dict.py │ │ ├── convert_pdf_to_instructions.py │ │ ├── pdf_utils.py │ │ └── utils.py │ ├── txt │ │ ├── __init__.py │ │ ├── __pycache__ │ │ │ ├── parser.cpython-313.pyc │ │ │ └── __init__.cpython-313.pyc │ │ ├── convert_txt_to_instructions.py │ │ └── txt2dict.py │ ├── xml │ │ ├── __init__.py │ │ ├── __pycache__ │ │ │ ├── mapping.cpython-311.pyc │ │ │ ├── parser.cpython-311.pyc │ │ │ ├── parser.cpython-313.pyc │ │ │ ├── __init__.cpython-311.pyc │ │ │ ├── __init__.cpython-313.pyc │ │ │ └── mapping_dicts.cpython-311.pyc │ │ └── parser.py │ ├── html │ │ ├── __init__.py │ │ ├── __pycache__ │ │ │ ├── __init__.cpython-313.pyc │ │ │ ├── html2dict.cpython-313.pyc │ │ │ ├── mapping.cpython-313.pyc │ │ │ ├── visualize_dict.cpython-313.pyc │ │ │ ├── visualize_instructions.cpython-313.pyc │ │ │ ├── convert_html_to_instructions.cpython-313.pyc │ │ │ └── convert_instructions_to_dict.cpython-313.pyc │ │ ├── mapping.py │ │ ├── html2dict.py │ │ ├── visualize_instructions.py │ │ ├── visualize_dict.py │ │ └── convert_html_to_instructions.py │ ├── utils │ │ ├── __init__.py │ │ ├── __pycache__ │ │ │ ├── utils.cpython-313.pyc │ │ │ ├── __init__.cpython-313.pyc │ │ │ └── storage.cpython-313.pyc │ │ ├── strings.py │ │ ├── utils.py │ │ └── format_dict.py │ ├── __pycache__ │ │ ├── utils.cpython-313.pyc │ │ ├── __init__.cpython-313.pyc │ │ ├── dict2dict.cpython-313.pyc │ │ └── mapping.cpython-313.pyc │ ├── __init__.py │ ├── dict2dict.py │ ├── convert_instructions_to_dict.py │ └── mapping.py ├── doc2dict.egg-info │ ├── dependency_links.txt │ ├── top_level.txt │ ├── requires.txt │ ├── PKG-INFO │ └── SOURCES.txt ├── setup.py └── docs │ ├── docs │ ├── index.md │ ├── parsing │ │ ├── todo.md │ │ ├── pdf.md │ │ └── html.md │ └── whitepaper.md │ └── mkdocs.yml ├── .gitignore ├── contributors.md ├── .github └── workflows │ ├── deploy-docs.yml │ └── build_wheels.yml ├── LICENSE ├── readme.md └── example_output └── html ├── unnest.txt └── levels.txt /doc2dict/doc2dict/pdf/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /doc2dict/doc2dict/txt/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /doc2dict/doc2dict/xml/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /doc2dict/doc2dict/html/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /doc2dict/doc2dict/utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *__pycache__/* 2 | data/* 3 | *.pyc 4 | -------------------------------------------------------------------------------- 
/doc2dict/doc2dict.egg-info/dependency_links.txt: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /doc2dict/doc2dict.egg-info/top_level.txt: -------------------------------------------------------------------------------- 1 | doc2dict 2 | -------------------------------------------------------------------------------- /doc2dict/doc2dict.egg-info/requires.txt: -------------------------------------------------------------------------------- 1 | selectolax 2 | xmltodict -------------------------------------------------------------------------------- /contributors.md: -------------------------------------------------------------------------------- 1 | * John Friedman 2 | * Benedetto Leto 3 | * Rian Dolphin 4 | * Gal Skarishevsky -------------------------------------------------------------------------------- /doc2dict/doc2dict/pdf/mapping.py: -------------------------------------------------------------------------------- 1 | pdf_base_mapping_dict = { 2 | 'rules': {'use_font_size_only_for_level':True} 3 | } --------------------------------------------------------------------------------
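`pdf_base_mapping_dict` above is the default rule set that `pdf2dict` falls back on when no `mapping_dict` is supplied (see `pdf/pdf2dict.py` later in this listing). A minimal sketch of copying and adjusting it; the `doc2dict.pdf.mapping` import path is inferred from the directory tree, and toggling the flag's value is an assumption, since only `True` appears in the source:

```python
from doc2dict import pdf2dict
from doc2dict.pdf.mapping import pdf_base_mapping_dict  # import path assumed from the package layout

# Copy the shipped defaults so the module-level dict is not mutated.
custom_mapping = {'rules': dict(pdf_base_mapping_dict['rules'])}
custom_mapping['rules']['use_font_size_only_for_level'] = False  # assumed: only True is shown in mapping.py

with open('some_filing.pdf', 'rb') as f:  # placeholder filename
    content = f.read()

dct = pdf2dict(content, mapping_dict=custom_mapping)
```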
-------------------------------------------------------------------------------- /doc2dict/doc2dict.egg-info/PKG-INFO: -------------------------------------------------------------------------------- 1 | Metadata-Version: 2.4 2 | Name: doc2dict 3 | Version: 0.2.6 4 | Requires-Python: >=3.8 5 | Requires-Dist: selectolax 6 | Requires-Dist: xmltodict 7 | Dynamic: requires-dist 8 | Dynamic: requires-python 9 | -------------------------------------------------------------------------------- /doc2dict/setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | setup( 4 | name="doc2dict", 5 | version="0.4.9", 6 | packages=find_packages(), 7 | install_requires=['selectolax','xmltodict','pypdfium2' 8 | ] 9 | ) -------------------------------------------------------------------------------- /doc2dict/doc2dict/html/mapping.py: -------------------------------------------------------------------------------- 1 | # This will be heavily reworked in the future. 2 | 3 | dict_10k_html = { 4 | ('part',r'^part\s*([ivx]+)$') : 0, 5 | ('signatures',r'^signatures?\.*$') : 0, 6 | ('item',r'^item\s*(\d+)\.?([a-z])?') : 1, 7 | } -------------------------------------------------------------------------------- /doc2dict/docs/docs/index.md: -------------------------------------------------------------------------------- 1 | # Welcome to doc2dict 2 | 3 | doc2dict is a package to quickly parse documents in `pdf`, `html`, `xml`, and `txt` formats. It supports the [datamule](https://github.com/john-friedman/datamule-python) project. 4 | 5 | ???+ warning "Package is in early development" -------------------------------------------------------------------------------- /doc2dict/docs/docs/parsing/todo.md: -------------------------------------------------------------------------------- 1 | # TODO 2 | * Separate instructions into their own directory 3 | * add a rules option to relax header restrictions, e.g. useful for form 10-D https://www.sec.gov/Archives/edgar/data/1766367/000188852425005427/dma19b10_10d-202503.htm 4 | * modify base pdf parsing mapping dict to use underline - then test with APP NTC form.
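`dict_10k_html` in `html/mapping.py` above is the kind of mapping the parsers accept. A rough usage sketch, assuming the `doc2dict.html.mapping` import path implied by the directory tree and the `html2dict` signature shown in `html/html2dict.py` below; the filename is a placeholder:

```python
from doc2dict import html2dict
from doc2dict.html.mapping import dict_10k_html  # import path assumed from the package layout

with open('some_10k.htm', 'r') as f:  # placeholder filename
    content = f.read()

# 'part' and 'signatures' headers map to level 0 (root); 'item' headers map to level 1 beneath them.
dct = html2dict(content, mapping_dict=dict_10k_html)
```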
-------------------------------------------------------------------------------- /doc2dict/doc2dict.egg-info/SOURCES.txt: -------------------------------------------------------------------------------- 1 | setup.py 2 | doc2dict/__init__.py 3 | doc2dict/dict2dict.py 4 | doc2dict/mapping.py 5 | doc2dict.egg-info/PKG-INFO 6 | doc2dict.egg-info/SOURCES.txt 7 | doc2dict.egg-info/dependency_links.txt 8 | doc2dict.egg-info/requires.txt 9 | doc2dict.egg-info/top_level.txt 10 | doc2dict/txt/__init__.py 11 | doc2dict/txt/parser.py 12 | doc2dict/xml/__init__.py 13 | doc2dict/xml/parser.py -------------------------------------------------------------------------------- /doc2dict/doc2dict/html/html2dict.py: -------------------------------------------------------------------------------- 1 | from .convert_html_to_instructions import convert_html_to_instructions 2 | from ..convert_instructions_to_dict import convert_instructions_to_dict 3 | from selectolax.parser import HTMLParser 4 | def html2dict(content,mapping_dict=None): 5 | parser = HTMLParser(content) 6 | 7 | body = parser.body 8 | instructions = convert_html_to_instructions(body) 9 | dct = convert_instructions_to_dict(instructions, mapping_dict) 10 | return dct -------------------------------------------------------------------------------- /doc2dict/doc2dict/pdf/pdf2dict.py: -------------------------------------------------------------------------------- 1 | from .convert_pdf_to_instructions import convert_pdf_to_instructions 2 | from ..convert_instructions_to_dict import convert_instructions_to_dict 3 | from .mapping import pdf_base_mapping_dict 4 | def pdf2dict(content,mapping_dict=None): 5 | instructions = convert_pdf_to_instructions(content) 6 | if mapping_dict is None: 7 | mapping_dict=pdf_base_mapping_dict 8 | dct = convert_instructions_to_dict(instructions, mapping_dict) 9 | return dct -------------------------------------------------------------------------------- /doc2dict/doc2dict/__init__.py: -------------------------------------------------------------------------------- 1 | from .xml.parser import xml2dict 2 | from .txt.txt2dict import txt2dict 3 | from .dict2dict import dict2dict 4 | 5 | from .html.convert_html_to_instructions import convert_html_to_instructions 6 | from .convert_instructions_to_dict import convert_instructions_to_dict 7 | from .html.visualize_instructions import visualize_instructions 8 | from .html.visualize_dict import visualize_dict 9 | from .html.html2dict import html2dict 10 | 11 | from .pdf.pdf2dict import pdf2dict 12 | 13 | from .utils.utils import get_title 14 | from .utils.format_dict import unnest_dict, flatten_dict -------------------------------------------------------------------------------- /doc2dict/docs/mkdocs.yml: -------------------------------------------------------------------------------- 1 | site_name: doc2dict 2 | 3 | theme: 4 | name: material 5 | palette: 6 | primary: indigo 7 | accent: indigo 8 | features: 9 | - navigation.instant 10 | - navigation.tracking 11 | - navigation.expand 12 | - content.code.copy 13 | 14 | nav: 15 | - Home: index.md 16 | - doc2dict: 17 | - html: parsing/html.md 18 | - pdf: parsing/pdf.md 19 | - White Paper: whitepaper.md 20 | 21 | markdown_extensions: 22 | - pymdownx.superfences 23 | - pymdownx.highlight 24 | - def_list 25 | - admonition 26 | - pymdownx.details 27 | - toc: 28 | permalink: true 29 | 30 | repo_url: https://github.com/john-friedman/doc2dict 31 | repo_name: john-friedman/doc2dict 
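`__init__.py` above is the public surface of the package: the `*2dict` parsers, the instruction and visualization helpers, and utilities such as `get_title` and the `format_dict` helpers. A hedged sketch of the parse-then-query flow, based on the `get_title` definition in `utils/utils.py` further down (it matches on `title`/`standardized_title`, optionally filtered by class, and returns `(parent_id, node)` pairs); the filename and regex are illustrative only:

```python
from doc2dict import html2dict, get_title

with open('some_10k.htm', 'r') as f:  # placeholder filename
    content = f.read()

dct = html2dict(content, mapping_dict=None)

# Exactly one of title= or title_regex= must be given; matching is case-insensitive.
matches = get_title(dct, title_regex=r'^item\s*1a')
for parent_id, node in matches:
    print(parent_id, node.get('title'))
```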
-------------------------------------------------------------------------------- /doc2dict/doc2dict/utils/strings.py: -------------------------------------------------------------------------------- 1 | import re 2 | def check_string_style(text): 3 | if not text or not text.strip(): 4 | return {} 5 | 6 | styles = {} 7 | 8 | if text.isupper(): 9 | styles['all_caps'] = True 10 | else: 11 | # Stop words that can be lowercase in proper case 12 | stop_words = r'\b(and|or|of|the|in|on|at|to|for|with|by|a|an)\b' 13 | 14 | # Replace stop words with placeholder, check if remaining words are proper case 15 | text_no_stops = re.sub(stop_words, 'STOP', text, flags=re.IGNORECASE) 16 | 17 | # Check if all non-stop words start with capital and have at least one capital 18 | if re.match(r'^[A-Z][a-zA-Z]*(\s+(STOP|[A-Z][a-zA-Z]*))*$', text_no_stops) and re.search(r'[A-Z]', text): 19 | styles['proper_case'] = True 20 | 21 | return styles -------------------------------------------------------------------------------- /.github/workflows/deploy-docs.yml: -------------------------------------------------------------------------------- 1 | name: Deploy MkDocs 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | workflow_dispatch: # Allows manual triggering 8 | 9 | permissions: 10 | contents: write 11 | 12 | jobs: 13 | deploy: 14 | runs-on: ubuntu-latest 15 | steps: 16 | - uses: actions/checkout@v3 17 | 18 | - name: Set up Python 19 | uses: actions/setup-python@v4 20 | with: 21 | python-version: '3.x' 22 | 23 | - name: Install dependencies 24 | run: | 25 | python -m pip install --upgrade pip 26 | pip install mkdocs mkdocs-material pymdown-extensions 27 | 28 | - name: Build and deploy MkDocs 29 | run: | 30 | # Navigate to the docs directory 31 | cd doc2dict/docs 32 | 33 | # Build the site 34 | mkdocs build 35 | 36 | # Deploy to GitHub Pages 37 | mkdocs gh-deploy --force -------------------------------------------------------------------------------- /.github/workflows/build_wheels.yml: -------------------------------------------------------------------------------- 1 | name: Build and Upload to PyPI 2 | 3 | on: 4 | push: 5 | tags: 6 | - 'v*' 7 | 8 | jobs: 9 | deploy: 10 | runs-on: ubuntu-latest 11 | steps: 12 | - uses: actions/checkout@v4 13 | 14 | - name: Set up Python 15 | uses: actions/setup-python@v4 16 | with: 17 | python-version: '3.11' 18 | 19 | - name: Install dependencies 20 | working-directory: ./doc2dict # Added this 21 | run: | 22 | python -m pip install --upgrade pip 23 | pip install setuptools wheel twine 24 | 25 | - name: Build package 26 | working-directory: ./doc2dict # Added this 27 | run: | 28 | python setup.py sdist bdist_wheel 29 | 30 | - name: Upload to PyPI 31 | env: 32 | TWINE_USERNAME: __token__ 33 | TWINE_PASSWORD: ${{ secrets.PYPI_API_TOKEN }} 34 | working-directory: ./doc2dict # Added this 35 | run: | 36 | twine upload dist/* -------------------------------------------------------------------------------- /doc2dict/doc2dict/xml/parser.py: -------------------------------------------------------------------------------- 1 | import xmltodict 2 | from ..mapping import JSONTransformer 3 | 4 | def remove_namespace_and_none(path, key, value): 5 | # Skip this key-value pair if value is None 6 | if value is None: 7 | return None # Return None to exclude this key-value pair 8 | 9 | # Remove xmlns attribute altogether 10 | if key == '@xmlns': 11 | return None 12 | 13 | # Remove namespace from keys 14 | if ':' in key: 15 | # Keep only the part after the last colon 16 | return key.split(':')[-1], 
value 17 | 18 | return key, value 19 | 20 | def xml2dict(content, mapping_dict=None): 21 | data = xmltodict.parse( 22 | content, 23 | postprocessor=remove_namespace_and_none, 24 | process_namespaces=True, # Handle namespaces 25 | namespaces={} 26 | ) 27 | 28 | if mapping_dict is None: 29 | return data 30 | 31 | transformer = JSONTransformer(mapping_dict) 32 | transformed_data = transformer.transform(data) 33 | return transformed_data -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2025 John Friedman 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /doc2dict/doc2dict/txt/convert_txt_to_instructions.py: -------------------------------------------------------------------------------- 1 | 2 | # need to rememember how html 2 instructions treats empty lines 3 | # may need to rejig to standardize 4 | 5 | from ..utils.strings import check_string_style 6 | 7 | TAB_SIZE = 4 8 | 9 | def get_left_indent(line): 10 | count = 0 11 | for c in line: 12 | if c == '\t': 13 | count += TAB_SIZE 14 | elif c.isspace() and c not in '\r\n\f\v': 15 | count += 1 16 | else: 17 | break 18 | return count 19 | 20 | def convert_txt_to_instructions(content): 21 | lines = content.split('\n') 22 | instructions_list = [] 23 | 24 | for line in lines: 25 | instructions = [] 26 | if len(line) != 0: 27 | instruction = {'text':line} 28 | left_indent = get_left_indent(line) 29 | if left_indent != 0: 30 | instruction['left-indent'] = str(left_indent) 31 | 32 | # style 33 | styles = check_string_style(line) 34 | instruction.update(styles) 35 | 36 | instructions.append(instruction) 37 | instructions_list.append(instructions) 38 | else: 39 | instructions_list.append([]) 40 | 41 | return instructions_list -------------------------------------------------------------------------------- /doc2dict/doc2dict/pdf/convert_pdf_to_instructions.py: -------------------------------------------------------------------------------- 1 | import pypdfium2 as pdfium 2 | from .pdf_utils import get_text, get_font_name, get_font, get_font_size 3 | from .utils import get_font_attributes, assign_line, standardize_font_size 4 | 5 | 6 | def convert_pdf_to_instructions(content): 7 | 8 | # Open the PDF 9 | pdf = pdfium.PdfDocument(content) 10 | 11 | instructions_stream = [] 12 | # 
Extract text and font info from each page 13 | for page_index in range(len(pdf)): 14 | page = pdf[page_index] 15 | text_page = page.get_textpage() 16 | page_width = page.get_width() 17 | 18 | 19 | # Get page objects 20 | for obj in page.get_objects(): 21 | text = get_text(text_page, obj) 22 | font = get_font(obj) 23 | font_name = get_font_name(font) 24 | font_attributes = get_font_attributes(font_name) # mild duplication 25 | 26 | font_size = get_font_size(obj) 27 | 28 | 29 | 30 | # left, bottom, right, top 31 | coords_tuple = obj.get_pos() 32 | 33 | # skip items with no font size 34 | if font_size is None: 35 | continue 36 | else: 37 | instruction = {'text': text} | {'coords': coords_tuple, 'font-size': font_size, 'font-name': font_name} | font_attributes 38 | instructions_stream.append(instruction) 39 | 40 | 41 | # Clean up resources 42 | pdf.close() 43 | 44 | #instructions_stream = standardize_font_size(instructions_stream) 45 | instructions_list = assign_line(instructions_stream) 46 | 47 | 48 | return instructions_list -------------------------------------------------------------------------------- /doc2dict/docs/docs/parsing/pdf.md: -------------------------------------------------------------------------------- 1 | # PDF 2 | 3 | ???+ warning "Very Early Stage" 4 | This code is in a very early stage. 5 | 6 | ## Quickstart 7 | ``` 8 | # Load your pdf file 9 | with open('apple_10k_2024.pdf','rb') as f: 10 | content = f.read() 11 | 12 | # Convert to dictionary 13 | dct = pdf2dict(content,mapping_dict=None) 14 | ``` 15 | 16 | 17 | ## Benchmarks 18 | * About 200 pages per second single threaded. 19 | 20 | ???+ warning "multithreading" 21 | pdf2dict can't be run multithreaded due to the limitations of pypdfium2 22 | 23 | 24 | ## Compatibility 25 | Requires PDFs with an underlying text layer, so scanned documents are not supported yet. 26 | 27 | `convert_scan_to_instructions` would be fairly straightforward to implement. Font-size can be inferred from bounding boxes, as can line alignment. Rotation probably won't be an issue for decent scans like the ones submitted to the SEC. 28 | 29 | The issue is performance. 30 | 31 | The point of `doc2dict` is mostly that it's fast. Local OCR such as pytesseract would put a hard cap of 10 pages per second. 32 | 33 | This is too slow to be useful for my use-case. Here's a benchmark. 34 | 35 | **Convert all 2024 Annual Reports to Shareholders to dict form** 36 | 2,000 reports a year * a mean of 50 pages / 200 pages per second = 500 seconds = ~10 minutes. (PDF Parser) 37 | 38 | Whereas a scan parser would take at least 200 minutes, roughly 3 hours. 39 | 40 | I think the solution will be to write a scan parser that takes bounding boxes / the minimum required features as input. Users can then use their preferred OCR - e.g. local, Google, AWS, etc. for the slow part. 41 | 42 | ## TODO 43 | think about tables 44 | get center 45 | get other old attributes like indent 46 | 47 | ## Issues 48 | * Adobe PDF encodings return weird characters. 
49 | -------------------------------------------------------------------------------- /doc2dict/doc2dict/utils/utils.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | def get_title(dct, title=None, title_regex=None, title_class=None): 4 | results = [] 5 | 6 | # Ensure exactly one of title or title_regex is specified 7 | if (title is None and title_regex is None) or (title is not None and title_regex is not None): 8 | raise ValueError("Exactly one of 'title' or 'title_regex' must be specified") 9 | 10 | title_class = title_class.lower() if title_class else None 11 | 12 | if title_regex: 13 | title_pattern = re.compile(title_regex, re.IGNORECASE) 14 | else: 15 | title_lower = title.lower() 16 | 17 | def search(node, parent_id=None): 18 | if isinstance(node, dict): 19 | node_title = node.get('title', '') 20 | node_class = node.get('class', '').lower() 21 | node_standardized_title = node.get('standardized_title', '') 22 | 23 | # Check title match based on which parameter was provided 24 | if title_regex: 25 | title_match = (title_pattern.match(node_title) or 26 | title_pattern.match(node_standardized_title)) 27 | else: 28 | title_match = (node_title.lower() == title_lower or 29 | node_standardized_title.lower() == title_lower) 30 | 31 | if title_match and (title_class is None or node_class == title_class): 32 | results.append((parent_id, node)) 33 | 34 | contents = node.get('contents', {}) 35 | for key, value in contents.items(): 36 | search(value, key) 37 | 38 | if 'document' in dct: 39 | for doc_id, doc_node in dct['document'].items(): 40 | search(doc_node, doc_id) 41 | 42 | return results 43 | -------------------------------------------------------------------------------- /doc2dict/doc2dict/dict2dict.py: -------------------------------------------------------------------------------- 1 | def dict2dict(data): 2 | result = {} 3 | 4 | def process_item(item): 5 | # If item is a string, return it directly 6 | if isinstance(item, str): 7 | return item.strip() 8 | 9 | # If item is not a dict, return string version 10 | if not isinstance(item, dict): 11 | return str(item).strip() 12 | 13 | # Base case: if there's no further content, return the item itself 14 | if 'content' not in item: 15 | return item 16 | 17 | # If there's a text key, use it as the dict key, otherwise use the type 18 | key = item.get('text', item.get('type', '')) 19 | 20 | # Process the content 21 | if isinstance(item['content'], list): 22 | # Check if content contains dictionaries with type/text 23 | if any(isinstance(x, dict) and ('type' in x or 'text' in x) for x in item['content']): 24 | nested_result = {} 25 | for content_item in item['content']: 26 | if isinstance(content_item, dict): 27 | nested_key = content_item.get('text', content_item.get('type', '')) 28 | nested_result[nested_key] = process_item(content_item) 29 | return nested_result 30 | # If content items are simple values (strings/numbers), join with newlines and strip 31 | else: 32 | return '\n'.join(str(x) for x in item['content']).strip() 33 | else: 34 | return str(item['content']).strip() 35 | 36 | # Handle case where data itself might be a string 37 | if isinstance(data, str): 38 | return data.strip() 39 | 40 | # Handle case where content is a list directly 41 | if isinstance(data.get('content', []), list): 42 | for item in data['content']: 43 | if isinstance(item, dict): 44 | key = item.get('text', item.get('type', '')) 45 | result[key] = process_item(item) 46 | else: 47 | # If we have a string in 
content, use it as both key and value 48 | result[str(item).strip()] = str(item).strip() 49 | 50 | return result -------------------------------------------------------------------------------- /doc2dict/doc2dict/pdf/pdf_utils.py: -------------------------------------------------------------------------------- 1 | import pypdfium2 as pdfium 2 | import pypdfium2.raw as pdfium_c 3 | from ctypes import c_ushort, c_ulong, POINTER, c_float, c_void_p, c_size_t, c_uint8, c_int 4 | 5 | 6 | def get_text(text_page,obj): 7 | text_len = pdfium_c.FPDFTextObj_GetText( 8 | obj.raw, # FPDF_PAGEOBJECT 9 | text_page.raw, # FPDF_TEXTPAGE (NULL in this case) 10 | None, # POINTER(FPDF_WCHAR) - NULL to get the length 11 | c_ulong(0) # c_ulong - specify 0 to get the required buffer size 12 | ) 13 | 14 | # Create buffer for the text 15 | buffer = pdfium_c.create_string_buffer(text_len * 2) # UTF-16LE encoding 16 | text_ptr = pdfium_c.cast(buffer, pdfium_c.POINTER(pdfium_c.c_ushort)) 17 | 18 | # Second call to actually get the text 19 | chars_copied = pdfium_c.FPDFTextObj_GetText( 20 | obj.raw, # FPDF_PAGEOBJECT 21 | text_page.raw, # FPDF_TEXTPAGE (NULL in this case) 22 | text_ptr, # POINTER(FPDF_WCHAR) - pointer to our buffer 23 | c_ulong(text_len) # c_ulong - the buffer size 24 | ) 25 | 26 | # Convert UTF-16LE to string 27 | # Only convert the number of characters actually copied 28 | text = buffer.raw[:chars_copied*2].decode('utf-16le', errors='ignore') 29 | 30 | # remove buffer 31 | text = text.strip('\x00') 32 | return text 33 | 34 | 35 | def get_font_size(obj): 36 | # Create a c_float to receive the font size value 37 | font_size = c_float(0.0) 38 | 39 | # Call the PDFium function to get the font size 40 | result = pdfium_c.FPDFTextObj_GetFontSize( 41 | obj.raw, # FPDF_PAGEOBJECT 42 | pdfium_c.byref(font_size) # POINTER(c_float) 43 | ) 44 | 45 | # Check if the function call was successful 46 | if result: 47 | matrix = obj.get_matrix().get() 48 | # Apply the transformation matrix to the font size 49 | mean_scale = (matrix[0] + matrix[3]) / 2 50 | 51 | return round(font_size.value * mean_scale,2) 52 | else: 53 | return None 54 | 55 | 56 | def get_font(obj): 57 | font = pdfium_c.FPDFTextObj_GetFont(obj.raw) 58 | return font 59 | 60 | def get_font_name(font): 61 | # Get font name 62 | name_len = pdfium_c.FPDFFont_GetBaseFontName(font, None, 0) 63 | name_buffer = pdfium_c.create_string_buffer(name_len) 64 | pdfium_c.FPDFFont_GetBaseFontName(font, name_buffer, name_len) 65 | font_name = name_buffer.value.decode('utf-8', errors='ignore') 66 | 67 | 68 | return font_name 69 | -------------------------------------------------------------------------------- /doc2dict/docs/docs/parsing/html.md: -------------------------------------------------------------------------------- 1 | # HTML 2 | 3 | ## Quickstart 4 | ``` 5 | # Load your html file 6 | with open('apple_10k_2024.htm','r') as f: 7 | content = f.read() 8 | 9 | # Convert to dictionary 10 | dct = html2dict(content,mapping_dict=None) 11 | ``` 12 | 13 | ### Example 14 | ``` 15 | ... 16 | "37": { 17 | "title": "PART I", 18 | "standardized_title": "parti", 19 | "class": "part", 20 | "contents": { 21 | "38": { 22 | "title": "ITEM 1. BUSINESS", 23 | "standardized_title": "item1", 24 | "class": "item", 25 | "contents": { 26 | "39": { 27 | "title": "GENERAL", 28 | "standardized_title": "", 29 | "class": "predicted header", 30 | "contents": { 31 | "40": { 32 | "title": "Embracing Our Future", 33 | ... 
34 | "292": { 35 | "table": [ 36 | [ 37 | "Name", 38 | "Age", 39 | "Position with the Company" 40 | ], 41 | [ 42 | "Satya Nadella", 43 | "56", 44 | "Chairman and Chief Executive Officer" 45 | ], 46 | ... 47 | ``` 48 | 49 | 50 | 51 | ## Tweaking the engine for your use case 52 | 53 | ???+ note "I will make this section better soon" 54 | I just want to get the basic docs out! 55 | 56 | ### Debugging 57 | ``` 58 | from doc2dict import convert_html_to_instructions, convert_instructions_to_dict, visualize_instructions, visualize_dict 59 | 60 | # load your html file 61 | with open('tesla10k.htm','r') as f: 62 | content = f.read() 63 | 64 | # convert html to a series of instructions 65 | instructions = convert_html_to_instructions(content) 66 | 67 | # visualize the conversion 68 | visualize_instructions(instructions) 69 | 70 | # convert instructions to dictionary 71 | dct = html2dict(content,mapping_dict=None) 72 | 73 | # visualize dictionary 74 | visualize_dict(dct) 75 | ``` 76 | 77 | ### Writing your own mapping dictionaries 78 | 79 | ???+ warning "Experimental" 80 | If you write a mapping dict, and I change something so it stops working - please [email me](mailto:johnfriedman@datamule.xyz). 81 | 82 | Mapping dicts currently work by specifying the class of the section header: `part`, regex for section header `r'^part\s*([ivx]+)$'` where the capture group `([ivx]+)` and class `part` determine the `standardized_title`, and the level, where `0` is the root. 83 | 84 | In this example, `items` will always be nested under `parts`. 85 | ``` 86 | dict_10k_html = { 87 | ('part',r'^part\s*([ivx]+)$') : 0, 88 | ('signatures',r'^signatures?\.*$') : 0, 89 | ('item',r'^item\s*(\d+)\.?([a-z])?') : 1, 90 | } 91 | ``` 92 | -------------------------------------------------------------------------------- /doc2dict/doc2dict/txt/txt2dict.py: -------------------------------------------------------------------------------- 1 | from .convert_txt_to_instructions import convert_txt_to_instructions 2 | from ..convert_instructions_to_dict import convert_instructions_to_dict 3 | 4 | 5 | # FIX THIS # TODO TODO 6 | def combine_text_wraparound(instructions_list): 7 | """Used for e.g. 
text files where the next line is meant to be part of the same paragraph, but the next next line is a new paragraph""" 8 | 9 | # merge instructions 10 | new_instructions_list = [] 11 | current_instructions = [] 12 | 13 | for line_num in range(len(instructions_list) - 1): 14 | instructions = instructions_list[line_num] 15 | # Add wraparound attribute to each instruction 16 | for instruction in instructions: 17 | instruction['wraparound'] = True 18 | 19 | # Only add space if this is NOT the first line of the paragraph 20 | if current_instructions and 'text' in instructions[0]: 21 | instructions[0]['text'] = ' ' + instructions[0]['text'] 22 | 23 | # Extend current_instructions with this line's instructions 24 | current_instructions.extend(instructions) 25 | 26 | if instructions_list[line_num + 1] == []: # Next line is empty 27 | if current_instructions: # Only append if not empty 28 | new_instructions_list.append(current_instructions) 29 | current_instructions = [] # Reset for new paragraph 30 | 31 | # Handle the last line 32 | if instructions_list: # Check if list is not empty 33 | last_instructions = instructions_list[-1] 34 | 35 | # Only add space if this is NOT the first line of the paragraph 36 | if current_instructions and 'text' in last_instructions[0]: 37 | last_instructions[0]['text'] = ' ' + last_instructions[0]['text'] 38 | 39 | current_instructions.extend(last_instructions) 40 | if current_instructions: # Only append if not empty 41 | new_instructions_list.append(current_instructions) 42 | 43 | return new_instructions_list 44 | 45 | 46 | def txt2dict(content,mapping_dict=None,encoding='utf-8'): 47 | content = content.decode(encoding=encoding) 48 | instructions_list = convert_txt_to_instructions(content=content) 49 | 50 | # we need to add a filter here, ideally via mapping 51 | # should use whether ends with '.' to merge. into blocks 52 | # probably add default and if detected for the pdf use case 53 | 54 | instructions_list = combine_text_wraparound(instructions_list=instructions_list) 55 | 56 | # handle dash headers e.g. [{'text': 'Item 2. Properties', 'wraparound': True}, {'text': ' -------------------', 'wraparound': True}] 57 | # duct tape solution TODO fix 58 | for instructions in instructions_list: 59 | if 'text' in instructions[-1]: 60 | if set(instructions[-1]['text'].replace(' ','')) == {'-'}: 61 | # add bold to all instructions 62 | [item.update({'bold': True}) or item for item in instructions] 63 | instructions.pop() 64 | 65 | instructions_list = [item for item in instructions_list if item !=[]] 66 | 67 | dct = convert_instructions_to_dict(instructions_list=instructions_list,mapping_dict=mapping_dict) 68 | return dct 69 | -------------------------------------------------------------------------------- /readme.md: -------------------------------------------------------------------------------- 1 | # doc2dict 2 | 3 | Convert HTML, XML, and PDFs into dictionaries. 4 | 5 | * [Documentation](https://john-friedman.github.io/doc2dict/) 6 | 7 | Note that `doc2dict` is in an early stage. The goal is to create a fast, generalized, algorithmic parser that can be easily tweaked depending on the document. 8 | 9 | `doc2dict` supports the [datamule](https://github.com/john-friedman/datamule-python) project. 10 | 11 | ## Parsers 12 | 13 | 1. HTML Parser 14 | 2. PDF Parser - very early stage, currently only supports some pdf types. 15 | 3. XML Parser - please use Martin Blech's excellent xmltodict. doc2dict's xml2dict is currently a mess. 
16 | 17 | ## Installation 18 | 19 | ```bash 20 | pip install doc2dict 21 | ``` 22 | 23 | ## HTML 24 | 25 | ### Examples 26 | 27 | Parsed HTML in Dictionary Form: 28 | [example](example_output/html/dict.json) 29 | 30 | Dictionary Form converted to HTML for easy visualization: 31 | [example](example_output/html/document_visualization.html) 32 | 33 | ### Quickstart 34 | 35 | ```python 36 | from doc2dict import html2dict, visualize_dict 37 | 38 | # Load your html file 39 | with open('apple_10k_2024.html','r') as f: 40 | content = f.read() 41 | 42 | # Parse 43 | dct = html2dict(content,mapping_dict=None) 44 | 45 | # Visualize Parsing 46 | visualize_dict(dct) 47 | ``` 48 | 49 | ### Mapping Dicts 50 | 51 | Mapping dictionaries are rules that you pass into the parser to tweak its functionality. 52 | 53 | The below mapping dict tells the parser that "item" headers should be nested under "part" headers. 54 | 55 | ```python 56 | tenk_mapping_dict = { 57 | ('part',r'^part\s*([ivx]+)$') : 0, 58 | ('signatures',r'^signatures?\.*$') : 0, 59 | ('item',r'^item\s*(\d+)') : 1, 60 | } 61 | ``` 62 | 63 | ### Debugging 64 | 65 | ```python 66 | from doc2dict import * 67 | from selectolax.parser import HTMLParser 68 | 69 | # Load your html file 70 | with open('apple_10k_2024.htm','r') as f: 71 | content = f.read() 72 | 73 | 74 | body = HTMLParser(content).body 75 | 76 | # convert html to a series of instructions 77 | instructions = convert_html_to_instructions(body) 78 | 79 | # visualize the conversion 80 | visualize_instructions(instructions) 81 | 82 | # convert instructions to dictionary 83 | dct = html2dict(content,mapping_dict=None) 84 | 85 | # visualize dictionary 86 | visualize_dict(dct) 87 | ``` 88 | 89 | ### Benchmarks 90 | 91 | Based on my personal (potato) laptop: 92 | * About 500 pages per second single threaded. 93 | * Parses the 57-page Apple 10-K in 160 milliseconds. 94 | 95 | ## PDF 96 | 97 | The pdf parser is in a very early stage. It does not always handle encoding issues and the resulting hierarchies can be quite odd. 98 | 99 | I've released this because it may be useful to you, and as a proof of concept that fast pdf to dictionary parsing is possible. I plan to develop this further when presented with an interesting use case. 100 | 101 | ### Quickstart 102 | 103 | ```python 104 | from doc2dict import pdf2dict, visualize_dict 105 | 106 | # Load your pdf file 107 | with open('apple_10k_2024.pdf','rb') as f: 108 | content = f.read() 109 | 110 | # Parse 111 | dct = pdf2dict(content,mapping_dict=None) 112 | 113 | # Visualize Parsing 114 | visualize_dict(dct) 115 | ``` 116 | 117 | ### Benchmarks 118 | 119 | * About 200 pages per second single threaded. 120 | 121 | ### Other Functions: 122 | - flatten_dict(dct, format='markdown') or flatten_dict(dct, format='text') 123 | - unnest_dict(dct) - returns a list of tuples in the form (id, type, content, level); see the usage sketch below 124 | 125 | # TODO 126 | - generalize instructions to dict 127 | - add github workflow to run parser on examples after each push. 
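The "Other Functions" listed above have no example in the readme. A minimal usage sketch based on their definitions in `utils/format_dict.py` (the next file in this listing); the filename is a placeholder:

```python
from doc2dict import html2dict, unnest_dict, flatten_dict

with open('some_10k.htm', 'r') as f:  # placeholder filename
    content = f.read()

dct = html2dict(content, mapping_dict=None)

# unnest_dict returns a flat list of (id, type, content, level) tuples.
tuples_list = unnest_dict(dct)

# flatten_dict renders either the dict or a precomputed tuples_list;
# 'markdown' turns titles into #-headings and tables into pipe tables, 'text' keeps plain lines.
markdown_text = flatten_dict(dct, format='markdown')
plain_text = flatten_dict(tuples_list=tuples_list, format='text')
```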
-------------------------------------------------------------------------------- /doc2dict/doc2dict/utils/format_dict.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | def _clean_cell_content(cell_content): 4 | 5 | text = str(cell_content) 6 | 7 | # Replace non-breaking space 8 | text = text.replace('\u00a0', '') 9 | 10 | # Replace tabs with spaces 11 | text = text.replace('\t', ' ') 12 | 13 | # Replace multiple newlines with single spaces 14 | text = text.replace('\n\n', ' ') 15 | text = text.replace('\n', ' ') 16 | 17 | # Replace multiple spaces with single spaces 18 | text = re.sub(r'\s+', ' ', text) 19 | 20 | # Strip leading/trailing whitespace 21 | text = text.strip() 22 | 23 | return text 24 | 25 | def _format_table(table_data): 26 | if not table_data: 27 | return [] 28 | 29 | # Clean all cell content first 30 | cleaned_data = [] 31 | for row in table_data: 32 | cleaned_row = [_clean_cell_content(cell) for cell in row] 33 | cleaned_data.append(cleaned_row) 34 | 35 | # Calculate column widths using cleaned data 36 | col_widths = [] 37 | for row in cleaned_data: 38 | for i, cell in enumerate(row): 39 | cell_len = len(cell) 40 | if i >= len(col_widths): 41 | col_widths.append(cell_len) 42 | else: 43 | col_widths[i] = max(col_widths[i], cell_len) 44 | 45 | formatted_rows = [] 46 | formatted_rows.append('') # Empty line before table 47 | 48 | for i, row in enumerate(cleaned_data): 49 | padded_cells = [cell.ljust(col_widths[j]) for j, cell in enumerate(row)] 50 | formatted_rows.append('| ' + ' | '.join(padded_cells) + ' |') 51 | 52 | # Add separator after first row (header) 53 | if i == 0: 54 | separator = '|' + '|'.join('-' * (w + 2) for w in col_widths) + '|' 55 | formatted_rows.append(separator) 56 | 57 | formatted_rows.append('') # Empty line after table 58 | return formatted_rows 59 | 60 | 61 | def _format_title(text, level): 62 | # Ensure level is at least 1 for proper markdown heading 63 | markdown_level = max(1, min(level + 1, 6)) 64 | return "#" * markdown_level + " " + text 65 | 66 | def unnest_dict(dct): 67 | result = [] 68 | 69 | def process_content(content, current_id=None, level=0): 70 | if not isinstance(content, dict): 71 | return 72 | 73 | # Process title, text, textsmall, and table directly 74 | for key in ['title', 'text', 'textsmall', 'table']: 75 | if key in content: 76 | # skip introduction filler 77 | if current_id == -1: 78 | pass 79 | else: 80 | result.append((current_id, key, content[key], level)) 81 | 82 | # Process contents recursively in numeric order 83 | contents = content.get('contents', {}) 84 | if contents: 85 | for key in contents.keys(): 86 | process_content(contents[key], key, level + 1) 87 | 88 | # Start processing from document 89 | if 'document' in dct: 90 | document = dct['document'] 91 | for key in document.keys(): 92 | process_content(document[key], key, 0) 93 | else: 94 | # If no document key, process the entire dictionary 95 | process_content(dct, level=0) 96 | 97 | return result 98 | 99 | def flatten_dict(dct=None, format='markdown',tuples_list=None): 100 | if tuples_list is None: 101 | tuples_list = unnest_dict(dct) 102 | results = [] 103 | if format == 'markdown': 104 | for tuple in tuples_list: 105 | tuple_type = tuple[1] 106 | content = tuple[2] 107 | level = tuple[3] 108 | if tuple_type == 'table': 109 | results.extend(_format_table(content)) 110 | elif tuple_type == 'text': 111 | results.append(content) 112 | elif tuple_type == 'textsmall': 113 | results.append(f'{content}') 114 | 
elif tuple_type == 'title': 115 | results.append(_format_title(content,level)) 116 | 117 | return '\n'.join(results) 118 | elif format == 'text': 119 | for tuple in tuples_list: 120 | tuple_type = tuple[1] 121 | content = tuple[2] 122 | level = tuple[3] 123 | 124 | # reuse markdown format 125 | if tuple_type == 'table': 126 | results.extend(_format_table(content)) 127 | elif tuple_type == 'text': 128 | results.append(content) 129 | elif tuple_type == 'textsmall': 130 | results.append(content) 131 | elif tuple_type == 'title': 132 | results.append('') 133 | results.append(content) 134 | results.append('') 135 | 136 | return '\n'.join(results) 137 | else: 138 | raise ValueError(f'Format not found: {format}') -------------------------------------------------------------------------------- /doc2dict/doc2dict/pdf/utils.py: -------------------------------------------------------------------------------- 1 | 2 | # TODO, modify for e.g. BOLD AND ITALIC or IT etc name variations 3 | def get_font_attributes(font_name): 4 | dct = {} 5 | attribute = font_name.split('-') 6 | if len(attribute) > 1: 7 | key = attribute[-1].lower() 8 | dct[key] = True 9 | return dct 10 | 11 | def get_font_size(coords_tuple): 12 | left = coords_tuple[0] 13 | bottom = coords_tuple[1] 14 | right = coords_tuple[2] 15 | top = coords_tuple[3] 16 | height = top - bottom 17 | font_size = height / 2 18 | return font_size * 4 # Multiplying just because why not? 19 | 20 | # TODO REMOVE. we do need to find how to get actual font size 21 | def standardize_font_size(instructions_stream): 22 | """ 23 | Standardize font sizes in the instructions stream by merging font sizes that are close to each other. 24 | 25 | Args: 26 | instructions_stream (list): List of dictionaries containing text elements with font-size information 27 | 28 | Returns: 29 | list: The instructions stream with standardized font sizes 30 | """ 31 | if not instructions_stream: 32 | return [] 33 | 34 | # First, extract all unique font sizes 35 | font_sizes = [] 36 | for item in instructions_stream: 37 | if 'font-size' in item: 38 | font_sizes.append(item['font-size']) 39 | 40 | # If no font sizes found, return original stream 41 | if not font_sizes: 42 | return instructions_stream 43 | 44 | # Sort font sizes 45 | font_sizes = sorted(set(font_sizes)) 46 | 47 | # Group similar font sizes 48 | standardized_sizes = [] 49 | current_group = [font_sizes[0]] 50 | 51 | for i in range(1, len(font_sizes)): 52 | # Calculate relative difference between consecutive font sizes 53 | current_size = font_sizes[i] 54 | prev_size = font_sizes[i-1] 55 | relative_diff = abs(current_size - prev_size) / max(current_size, prev_size) 56 | 57 | # If the difference is less than a threshold (e.g., 5%), group them 58 | if relative_diff < 0.05: 59 | current_group.append(current_size) 60 | else: 61 | # Calculate average for the current group 62 | avg_size = sum(current_group) / len(current_group) 63 | standardized_sizes.append((current_group, avg_size)) 64 | current_group = [current_size] 65 | 66 | # Add the last group 67 | if current_group: 68 | avg_size = sum(current_group) / len(current_group) 69 | standardized_sizes.append((current_group, avg_size)) 70 | 71 | # Create a mapping from original sizes to standardized sizes 72 | size_mapping = {} 73 | for group, avg_size in standardized_sizes: 74 | for size in group: 75 | size_mapping[size] = avg_size 76 | 77 | # Apply the mapping to the instructions stream 78 | for item in instructions_stream: 79 | if 'font-size' in item and item['font-size'] in 
size_mapping: 80 | item['font-size'] = size_mapping[item['font-size']] 81 | 82 | return instructions_stream 83 | 84 | def assign_line(instructions_stream): 85 | """ 86 | Assign line numbers to text elements that are positioned on the same line. 87 | Only compares with the next neighbor in the list. 88 | """ 89 | 90 | # Initialize with first element 91 | current_line = 0 92 | instructions_list = [] 93 | instructions = [instructions_stream[0]] 94 | 95 | # Process remaining elements 96 | for i in range(len(instructions_stream) - 1): 97 | current = instructions_stream[i] 98 | next_item = instructions_stream[i + 1] 99 | 100 | # Extract y-coordinates (bottom of text) 101 | current_y = current['coords'][1] # bottom y of current 102 | next_y = next_item['coords'][1] # bottom y of next 103 | 104 | # Get font sizes for tolerance calculation 105 | current_font_size = current['font-size'] 106 | next_font_size = next_item['font-size'] 107 | 108 | # Calculate tolerance based on larger font size 109 | tolerance = max(current_font_size, next_font_size) * 0.5 110 | 111 | # Check if next item is on the same line 112 | if abs(current_y - next_y) <= tolerance: 113 | # if font-name and font-size are the same, then we can merge them. We can do this, because font name contains bold/italic 114 | if current['font-name'] == next_item['font-name'] and current['font-size'] == next_item['font-size']: 115 | # Merge the two items 116 | current['text'] += next_item['text'] 117 | current['coords'] = ( 118 | min(current['coords'][0], next_item['coords'][0]), # left 119 | min(current['coords'][1], next_item['coords'][1]), # bottom 120 | max(current['coords'][2], next_item['coords'][2]), # right 121 | max(current['coords'][3], next_item['coords'][3]) # top 122 | ) 123 | else: 124 | instructions.append(next_item) 125 | else: 126 | instructions_list.append(instructions) 127 | instructions = [next_item] 128 | 129 | return instructions_list 130 | 131 | # so these need to be modified to look at all the dicts. 132 | def get_left_indent(coords_tuple): 133 | return 134 | 135 | def get_is_centered(coords_tuple): 136 | return -------------------------------------------------------------------------------- /doc2dict/docs/docs/whitepaper.md: -------------------------------------------------------------------------------- 1 | # High Speed Document Algorithmic Parsing 2 | 3 | ## Abstract 4 | Parsing documents that are human readable into machine readable form is difficult due to under the hood variation. Here is my attempt at providing a fast, robust generalized approach, that can be easily modified to account for variation in documents. 5 | 6 |
7 | Download as PDF 8 |
9 | 10 | ???+ note "Caveats" 11 | This is not meant to be a perfect parsing approach. It's meant to be a "good enough" approach that is fast enough to parse the entire SEC corpus on a personal laptop. This is also in an early stage - things will change. 12 | 13 | ???+ note "Terminology" 14 | I don't know the right words to use. If you do, please [email me](mailto:johnfriedman@datamule.xyz) and/or bully me into correcting the terminology. 15 | 16 | 17 | ## General 18 | 19 | ### Approach 20 | 1. Convert a messy document into a simple list of instructions. 21 | 2. Convert the list of instructions into a dictionary using a set of rules that can be easily tailored for the document. 22 | 23 | The idea here is to turn a complex problem that is hard to solve into a simple problem that is easy to solve. 24 | * Nested html is hard to understand -> the same html in list form is easy 25 | * Raw pdfs are hard to understand -> the same pdf in list form is easy 26 | 27 | We can then convert from the (flat) list form into a nested dictionary by using simple rules like "bigger headers have higher nesting", as well as specify where certain headers go - "item 1a risk factors should be nested under part i". 28 | 29 | This also makes the parsing easier to modify for less technical users. A grad student in economics is unlikely to be able to modify the walk through an html document to properly account for style inheritance, but likely can modify rules such as "ignore italics for header selection". 30 | 31 | #### Examples 32 | Instructions List: 33 | ``` 34 | [{'text': 'PART I', 'text-style': 'all_caps', 'left-indent': 8.0, 'font-size': 13.33, 'text-center': True, 'bold': True}] 35 | [{'text': 'ITEM 1. BUSINESS', 'text-style': 'all_caps', 'left-indent': 8.0, 'font-size': 15.995999999999999, 'text-center': True, 'bold': True}] 36 | [{'text': 'GENERAL', 'text-style': 'all_caps', 'left-indent': 8.0, 'font-size': 13.33, 'text-center': True, 'underline': True}] 37 | [{'text': 'Embracing Our Future', 'left-indent': 8.0, 'font-size': 13.33, 'bold': True}]... 38 | ``` 39 | 40 | Dictionary 41 | ``` 42 | "37": { 43 | "title": "PART I", 44 | "standardized_title": "parti", 45 | "class": "part", 46 | "contents": { 47 | "38": { 48 | "title": "ITEM 1. BUSINESS", 49 | "standardized_title": "item1", 50 | "class": "item", 51 | "contents": { 52 | "39": { 53 | "title": "GENERAL", 54 | "standardized_title": "", 55 | "class": "predicted header", 56 | "contents": { 57 | "40": { 58 | "title": "Embracing Our Future",... 59 | ``` 60 | 61 | 62 | 63 | ### Mapping Dictionaries 64 | I call the set of rules used to convert the list of instructions into a dictionary a "mapping dict". The idea is that a less technical user who would have trouble tweaking the engine can easily modify a list of rules that tweak the output. 65 | 66 | #### Example 67 | ``` 68 | dict_10k_html = { 69 | ('part',r'^part\s*([ivx]+)$') : 0, 70 | ('signatures',r'^signatures?\.*$') : 0, 71 | ('item',r'^item\s*(\d+)\.?([a-z])?') : 1, 72 | } 73 | ``` 74 | 75 | The above mapping dict tells the parser to assign the class 'part' to matching predicted headers and to give them hierarchy '0', the root level. It then uses the capture group `([ivx]+)` and the class to determine the `standardized_title`. 76 | 77 | ## HTML 78 | 79 | The basic html approach has already been implemented. Ballpark speed is about 500 pages per second on my two-year-old personal laptop. 80 | 81 | ### Approach 82 | 83 | 1. 
77 | ## HTML 78 | 79 | The basic html approach has already been implemented. Ballpark speed is about 500 pages per second on my two-year-old personal laptop. 80 | 81 | ### Approach 82 | 83 | 1. Iterate through the html file, keeping track of the attributes that apply to each text node, with special handling for tables, to create the instructions list. Text nodes that visually appear on the same line are output as instructions on the same line. 84 | 2. For the instructions list, determine which instructions are likely to be headers. If an instruction is a header, determine its hierarchy with the aid of a mapping dict if present. 85 | 86 | ### Tables 87 | 88 | 1. Construct a matrix with each cell representing a cell in the table 89 | 2. If a cell spans multiple rows or columns, duplicate the cell in the matrix 90 | 3. Remove rows and columns that are considered empty - e.g. have only empty characters 91 | 4. Remove rows and columns that contain no unique information - e.g. if a column is a subset of another column, remove it (a rough sketch of these steps follows the note below). 92 | 93 | TODO 94 | 95 | * Currently removes unmatched parenthesis columns, in the future will merge them 96 | * Currently does not handle indents - many tables can be split into multiple tables using information from indents 97 | 98 | ???+ note "Goal" 99 | The goal here is not to perfectly parse tables. We can get close, but often the information for html tables is above the table in a separate block. 100 |
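Below is a rough sketch of what steps 3 and 4 of the table cleanup could look like, assuming each cell is already a plain string and spanning cells have already been duplicated (step 2). The `clean_table` name and the set-based subset test are illustrative assumptions rather than the actual implementation, which works on richer cell dicts and also deals with the parenthesis columns mentioned in the TODO.

```
def clean_table(matrix):
    # Step 3: drop rows that contain only empty characters
    matrix = [row for row in matrix if any(cell.strip() for cell in row)]
    if not matrix:
        return matrix

    # Step 3: drop columns that contain only empty characters
    keep = [i for i in range(len(matrix[0])) if any(row[i].strip() for row in matrix)]
    matrix = [[row[i] for i in keep] for row in matrix]

    # Step 4: drop a column whose cells are a subset of another column's cells
    columns = [tuple(row[i] for row in matrix) for i in range(len(matrix[0]))]
    drop = set()
    for i in range(len(columns)):
        for j in range(len(columns)):
            if i == j or j in drop:
                continue
            subset = set(columns[i]) <= set(columns[j])
            duplicate = set(columns[i]) == set(columns[j])
            # drop strict subsets, and for exact duplicates keep only the first occurrence
            if subset and (not duplicate or j < i):
                drop.add(i)  # column i adds no unique information
                break
    return [[cell for i, cell in enumerate(row) if i not in drop] for row in matrix]

table = [
    ['Exhibit', 'Exhibit', 'Description'],
    ['', '', ''],
    ['3.1', '3.1', 'Articles of Incorporation'],
]
print(clean_table(table))
# [['Exhibit', 'Description'], ['3.1', 'Articles of Incorporation']]
```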
101 | ### Visualization 102 | 103 | Visualization is important for both the instructions_list stage and the final dict stage. Visualization lets users quickly debug whether the parser is working as expected, and what to tweak. -------------------------------------------------------------------------------- /doc2dict/doc2dict/html/visualize_instructions.py: -------------------------------------------------------------------------------- 1 | import webbrowser 2 | import os 3 | 4 | def format_dct_style(line): 5 | text = line.get('text', '') 6 | href = line.get('href', '') 7 | 8 | style_properties = [] 9 | # might have issues here in the future 10 | if 'bold' in line: 11 | style_properties.append('font-weight: bold') 12 | if 'italic' in line: 13 | style_properties.append('font-style: italic') 14 | if 'underline' in line: 15 | style_properties.append('text-decoration: underline') 16 | if 'font-size' in line: 17 | font_size = line['font-size'] 18 | if font_size: 19 | style_properties.append(f'font-size: {font_size}') 20 | if 'left-indent' in line: 21 | left_indent = line['left-indent'] 22 | if left_indent: 23 | style_properties.append(f'margin-left: {left_indent}px') 24 | 25 | return style_properties, text, href 26 | 27 | def format_table(table): 28 | table_html = "" 29 | for idx, row in enumerate(table): 30 | table_html += "" 31 | for cell in row: 32 | if 'image' in cell: 33 | image_data = cell['image'] 34 | src = image_data.get('src', '') 35 | alt = image_data.get('alt', '') 36 | cell_content = f"{alt}" 37 | else: 38 | cell_text = cell.get('text', '') 39 | cell_href = cell.get('href', '') 40 | 41 | if cell_href: 42 | cell_content = f"{cell_text}" 43 | else: 44 | cell_content = cell_text 45 | 46 | if idx == 0: 47 | table_html += f"
{cell_content}{cell_content}
" 53 | return table_html 54 | 55 | def visualize_instructions(instructions_list): 56 | # Simplified color scheme 57 | single_line_color = '#E8EAF6' # Light indigo - clean, professional 58 | multi_first_color = '#DCEDC8' # Light sage green - clear starting point 59 | multi_rest_color = '#F9FBE7' # Very pale yellow-green - subtle continuation 60 | 61 | table_uncleaned_color = '#FFECB3' # Warm amber - intuitive "needs attention" 62 | table_cleaned_color = '#B2DFDB' # Teal - fresh and clean feeling 63 | 64 | html_content = """ 65 | 66 | 67 | 126 | 127 | """ 128 | 129 | for instructions in instructions_list: 130 | if len(instructions) == 1: 131 | if 'table' in instructions[0]: 132 | table_html = format_table(instructions[0]['table']) 133 | html_content += "
" 134 | if instructions[0].get('cleaned', False): 135 | html_content += f"
{table_html}
" 136 | else: 137 | html_content += f"
{table_html}
" 138 | html_content += "
" 139 | continue 140 | elif 'image' in instructions[0]: 141 | image_data = instructions[0]['image'] 142 | src = image_data.get('src', '') 143 | alt = image_data.get('alt', '') 144 | 145 | html_content += "
" 146 | html_content += f"{alt}" 147 | html_content += "
" 148 | continue 149 | 150 | first_instruction = instructions[0] 151 | is_centered = first_instruction.get('text-center', False) 152 | div_style = '' 153 | 154 | if is_centered: 155 | div_style = 'text-align: center;' 156 | 157 | html_content += f"
" 158 | for idx, instruction in enumerate(instructions): 159 | if 'image' in instruction: 160 | # Handle image instructions 161 | image_data = instruction['image'] 162 | src = image_data.get('src', '') 163 | alt = image_data.get('alt', '') 164 | 165 | if len(instructions) == 1: 166 | color = single_line_color 167 | elif idx == 0: 168 | color = multi_first_color 169 | else: 170 | color = multi_rest_color 171 | 172 | html_content += f"" 173 | html_content += f"{alt}" 174 | html_content += "" 175 | else: 176 | # Handle text instructions 177 | style_properties, text, href = format_dct_style(instruction) 178 | 179 | if len(instructions) == 1: 180 | color = single_line_color 181 | elif idx == 0: 182 | color = multi_first_color 183 | else: 184 | color = multi_rest_color 185 | 186 | style_properties.append(f'background-color: {color}') 187 | style = '; '.join(style_properties) 188 | 189 | if href: 190 | span_content = f"{text}" 191 | else: 192 | span_content = text 193 | 194 | html_content += f"{span_content}" 195 | 196 | html_content += "
" 197 | 198 | html_content += """ 199 | 200 | """ 201 | 202 | # Write HTML content to a temporary file 203 | with open('instructions_visualization.html', 'w', encoding='utf-8') as f: 204 | f.write(html_content) 205 | 206 | # Get the absolute path of the file 207 | file_path = os.path.abspath('instructions_visualization.html') 208 | 209 | # Open the HTML file in the default web browser 210 | webbrowser.open('file://' + file_path) -------------------------------------------------------------------------------- /doc2dict/doc2dict/html/visualize_dict.py: -------------------------------------------------------------------------------- 1 | import webbrowser 2 | import os 3 | 4 | def visualize_dict(data_dict, filename='document_visualization.html', open_browser=True): 5 | """ 6 | Convert nested dictionary to HTML visualization and open in browser 7 | 8 | Parameters: 9 | data_dict (dict): The nested dictionary to visualize 10 | filename (str): The name of the HTML file to create 11 | open_browser (bool): Whether to automatically open in browser 12 | 13 | Returns: 14 | str: The path to the created HTML file 15 | """ 16 | html = [] 17 | 18 | # Add HTML document opening tags and CSS 19 | html.append(""" 20 | 21 | 22 | 23 | 24 | 25 | Document Visualization 26 | 104 | 105 | 106 | """) 107 | 108 | # Add metadata box 109 | if "metadata" in data_dict: 110 | html.append('
') 111 | html.append('
Parser Metadata
') 112 | metadata = data_dict["metadata"] 113 | for key, value in metadata.items(): 114 | html.append(f'
{key}: {value}
') 115 | html.append('
') 116 | 117 | # Process the document structure 118 | if "document" in data_dict: 119 | html.append('
') 120 | process_document(data_dict["document"], html, 1) 121 | html.append('
') 122 | 123 | # Add HTML closing tags 124 | html.append(""" 125 | 126 | 127 | """) 128 | 129 | html_content = ''.join(html) 130 | 131 | # Write HTML content to a file 132 | with open(filename, 'w', encoding='utf-8') as f: 133 | f.write(html_content) 134 | 135 | # Get the absolute path of the file 136 | file_path = os.path.abspath(filename) 137 | 138 | # Open the HTML file in the default web browser if requested 139 | if open_browser: 140 | webbrowser.open('file://' + file_path) 141 | 142 | return file_path 143 | 144 | def process_document(doc_dict, html, level): 145 | """Process document elements recursively""" 146 | # Sort keys to ensure numerical order for items like "1", "2", etc. 147 | try: 148 | sorted_keys = sorted(doc_dict.keys(), key=lambda x: (not x.lstrip('-').isdigit(), int(x) if x.lstrip('-').isdigit() else x)) 149 | except: 150 | # Fallback if sorting fails 151 | sorted_keys = list(doc_dict.keys()) 152 | 153 | for key in sorted_keys: 154 | value = doc_dict[key] 155 | 156 | if isinstance(value, dict): 157 | section_title = value.get("title", "") 158 | 159 | # Output the section title 160 | if section_title: 161 | heading_level = min(level, 6) # Limit to h6 162 | html.append(f'{section_title}') 163 | 164 | # Process the section content 165 | html.append('
') 166 | 167 | # Handle direct content fields 168 | for attr_key, attr_value in value.items(): 169 | if attr_key not in ["title", "class", "contents", "standardized_title"]: 170 | process_content(attr_key, attr_value, html) 171 | 172 | # Process contents dictionary if it exists 173 | if "contents" in value and value["contents"]: 174 | process_document(value["contents"], html, level + 1) 175 | 176 | html.append('
') 177 | else: 178 | # Direct content 179 | process_content(key, value, html) 180 | 181 | def process_content(content_type, content, html): 182 | """Process specific content types""" 183 | if content_type == "text": 184 | # Preserve bullet points and other formatting 185 | html.append(f'

{content}

') 186 | elif content_type == "textsmall": 187 | html.append(f'

{content}

') 188 | elif content_type == "image": 189 | process_image(content, html) 190 | elif content_type == "table": 191 | process_table(content, html) 192 | else: 193 | pass 194 | 195 | def process_image(image_data, html): 196 | """Convert image data to HTML img tag""" 197 | src = image_data.get('src', '') 198 | alt = image_data.get('alt', 'Image') 199 | 200 | html.append('
') 201 | html.append(f'{alt}') 202 | html.append('
') 203 | 204 | def process_table_cell(cell): 205 | """Process a single table cell that may contain text or image data""" 206 | if isinstance(cell, dict): 207 | if 'image' in cell: 208 | # Cell contains an image 209 | image_data = cell['image'] 210 | src = image_data.get('src', '') 211 | alt = image_data.get('alt', 'Image') 212 | return f'{alt}' 213 | elif 'text' in cell: 214 | # Cell contains structured text data 215 | return cell['text'] 216 | else: 217 | # Cell is a dict but doesn't match expected structure 218 | return str(cell) 219 | else: 220 | # Cell is a string or other simple type 221 | return str(cell) 222 | 223 | def process_table(table_data, html): 224 | """Convert table data to HTML table""" 225 | html.append('') 226 | 227 | # Check if first row should be treated as header 228 | has_header = False 229 | if len(table_data) > 1: 230 | # Heuristic: if first row contains mostly text content, treat as header 231 | first_row = table_data[0] 232 | text_cells = 0 233 | for cell in first_row: 234 | if isinstance(cell, str) and cell.strip(): 235 | text_cells += 1 236 | elif isinstance(cell, dict) and cell.get('text', '').strip(): 237 | text_cells += 1 238 | 239 | if text_cells >= len(first_row) / 2: # At least half the cells have text 240 | has_header = True 241 | 242 | for i, row in enumerate(table_data): 243 | html.append('') 244 | for cell in row: 245 | # Use th for header cells, otherwise td 246 | tag = 'th' if has_header and i == 0 else 'td' 247 | cell_content = process_table_cell(cell) 248 | html.append(f'<{tag}>{cell_content}') 249 | html.append('') 250 | 251 | html.append('
') -------------------------------------------------------------------------------- /example_output/html/unnest.txt: -------------------------------------------------------------------------------- 1 | PART IV 2 | ITEM 15. EXHIBIT AND FINANCIAL STATEMENT SCHEDULES 3 | (a)Financial Statements and Schedules 4 | The financial statements are set forth under Part II, Item 8 of this Form 10-K, as indexed below. Financial statement schedules have been omitted since they either are not required, not applicable, or the information is otherwise included. 5 | Index to Financial Statements Page 6 | Income Statements 56 7 | Comprehensive Income Statements 57 8 | Balance Sheets 58 9 | Cash Flows Statements 59 10 | Stockholders’ Equity Statements 60 11 | Notes to Financial Statements 61 12 | Report of Independent Registered Public Accounting Firm 94 13 | (b)Exhibit Listing 14 | Incorporated by Reference Incorporated by Reference Incorporated by Reference Incorporated by Reference 15 | Exhibit 16 | Number Exhibit Description Filed 17 | Herewith Form Period 18 | Ending Exhibit Filing Date 19 | 3.1 Amended and Restated Articles of Incorporation of Microsoft Corporation 8-K 3.1 12/1/2016 20 | 3.2 Bylaws of Microsoft Corporation 8-K 3.2 7/3/2023 21 | 4.1 Indenture, dated as of May 18, 2009, between Microsoft Corporation and The Bank of New York Mellon Trust Company, N.A., as Trustee (“Base Indenture”) S-3ASR 4.1 10/29/2015 22 | 4.2 Form of First Supplemental Indenture for 2.95% Notes due 2014, 4.20% Notes due 2019, and 5.20% Notes due 2039, dated as of May 18, 2009, between Microsoft Corporation and The Bank of New York Mellon Trust Company, N.A., as Trustee, to the Base Indenture 8-K 4.2 5/15/2009 23 | 4.5 Form of Second Supplemental Indenture for 0.875% Notes due 2013, 1.625% Notes due 2015, 3.00% Notes due 2020, and 4.50% Notes due 2040, dated as of September 27, 2010, between Microsoft Corporation and The Bank of New York Mellon Trust Company, N.A., as Trustee, to the Indenture, dated as of May 18, 2009, between Microsoft Corporation and The Bank of New York Mellon Trust Company, N.A., as Trustee 8-K 4.2 9/27/2010 24 | 103 25 | PART IV 26 | Item 15 27 | Incorporated by Reference Incorporated by Reference Incorporated by Reference Incorporated by Reference 28 | Exhibit 29 | Number Exhibit Description Filed 30 | Herewith Form Period 31 | Ending Exhibit Filing Date 32 | 4.6 Third Supplemental Indenture for 2.500% Notes due 2016, 4.000% Notes due 2021, and 5.300% Notes due 2041, dated as of February 8, 2011, between Microsoft Corporation and The Bank of New York Mellon Trust Company, N.A., as Trustee, to the Indenture, dated as of May 18, 2009, between Microsoft Corporation and The Bank of New York Mellon Trust Company, N.A., as Trustee 8-K 4.2 2/8/2011 33 | 4.7 Fourth Supplemental Indenture for 0.875% Notes due 2017, 2.125% Notes due 2022, and 3.500% Notes due 2042, dated as of November 7, 2012, between Microsoft Corporation and The Bank of New York Mellon Trust Company, N.A., as Trustee, to the Indenture, dated as of May 18, 2009, between Microsoft Corporation and The Bank of New York Mellon Trust Company, N.A., as Trustee 8-K 4.1 11/7/2012 34 | 4.8 Fifth Supplemental Indenture for 2.625% Notes due 2033, dated as of May 2, 2013, between Microsoft Corporation and The Bank of New York Mellon Trust Company, N.A., as Trustee, to the Indenture, dated as of May 18, 2009, between Microsoft Corporation and The Bank of New York Mellon Trust Company, N.A., as Trustee 8-K 4.1 5/1/2013 35 | 4.9 Sixth Supplemental 
Indenture for 1.000% Notes due 2018, 2.375% Notes due 2023, and 3.750% Notes due 2043, dated as of May 2, 2013, between Microsoft Corporation and The Bank of New York Mellon Trust Company, N.A., as Trustee, to the Indenture, dated as of May 18, 2009, between Microsoft Corporation and The Bank of New York Mellon Trust Company, N.A., as Trustee 8-K 4.2 5/1/2013 36 | 4.10 Seventh Supplemental Indenture for 2.125% Notes due 2021 and 3.125% Notes due 2028, dated as of December 6, 2013, between Microsoft Corporation and The Bank of New York Mellon Trust Company, N.A., as Trustee, to the Indenture, dated as of May 18, 2009, between Microsoft Corporation and The Bank of New York Mellon Trust Company, N.A., as Trustee 8-K 4.1 12/6/2013 37 | 104 38 | PART IV 39 | Item 15 40 | Incorporated by Reference Incorporated by Reference Incorporated by Reference Incorporated by Reference 41 | Exhibit 42 | Number Exhibit Description Filed 43 | Herewith Form Period 44 | Ending Exhibit Filing Date 45 | 4.11 Eighth Supplemental Indenture for 1.625% Notes due 2018, 3.625% Notes due 2023, and 4.875% Notes due 2043, dated as of December 6, 2013, between Microsoft Corporation and The Bank of New York Mellon Trust Company, N.A., as Trustee, to the Indenture, dated as of May 18, 2009, between Microsoft Corporation and The Bank of New York Mellon Trust Company, N.A., as Trustee 8-K 4.2 12/6/2013 46 | 4.12 Ninth Supplemental Indenture for 1.850% Notes due 2020, 2.375% Notes due 2022, 2.700% Notes due 2025, 3.500% Notes due 2035, 3.750% Notes due 2045, and 4.000% Notes due 2055, dated as of February 12, 2015, between Microsoft Corporation and U.S. Bank National Association, as Trustee, to the Indenture, dated as of May 18, 2009, between Microsoft Corporation and The Bank of New York Mellon Trust Company, N.A., as trustee 8-K 4.1 2/12/2015 47 | 4.13 Tenth Supplemental Indenture for 1.300% Notes due 2018, 2.000% Notes due 2020, 2.650% Notes due 2022, 3.125% Notes due 2025, 4.200% Notes due 2035, 4.450% Notes due 2045, and 4.750% Notes due 2055, dated as of November 3, 2015, between Microsoft Corporation and U.S. Bank National Association, as Trustee, to the Indenture, dated as of May 18, 2009, between Microsoft Corporation and The Bank of New York Mellon Trust Company, N.A., as trustee 8-K 4.1 11/3/2015 48 | 4.14 Eleventh Supplemental Indenture for 1.100% Notes due 2019, 1.550% Notes due 2021, 2.000% Notes due 2023, 2.400% Notes due 2026, 3.450% Notes due 2036, 3.700% Notes due 2046, and 3.950% Notes due 2056, dated as of August 8, 2016, between Microsoft Corporation and U.S. 
Bank, National Association, as Trustee, to the Indenture, dated as of May 18, 2009, between Microsoft Corporation and The Bank of New York Mellon Trust Company, N.A., as trustee 8-K 4.1 8/5/2016 49 | 105 50 | PART IV 51 | Item 15 52 | Incorporated by Reference Incorporated by Reference Incorporated by Reference Incorporated by Reference 53 | Exhibit 54 | Number Exhibit Description Filed 55 | Herewith Form Period 56 | Ending Exhibit Filing Date 57 | 4.15 Twelfth Supplemental Indenture for 1.850% Notes due 2020, 2.400% Notes due 2022, 2.875% Notes due 2024, 3.300% Notes due 2027, 4.100% Notes due 2037, 4.250% Notes due 2047, and 4.500% Notes due 2057, dated as of February 6, 2017, between Microsoft Corporation and The Bank of New York Mellon Trust Company, N.A., as Trustee, to the Indenture, dated as of May 18, 2009, between Microsoft Corporation and The Bank of New York Mellon Trust Company, N.A., as trustee 8-K 4.1 2/3/2017 58 | 4.16 Thirteenth Supplemental Indenture for 2.525% Notes due 2050 and 2.675% Notes due 2060, dated as of June 1, 2020, between Microsoft Corporation and U.S. Bank National Association, as Trustee, to the Indenture, dated as of May 18, 2009, between Microsoft Corporation and The Bank of New York Mellon Trust Company, N.A., as trustee 8-K 4.1 6/1/2020 59 | 4.17 Fourteenth Supplemental Indenture for 2.921% Notes due 2052 and 3.041% Notes due 2062, dated as of March 17, 2021, between Microsoft Corporation and The Bank of New York Mellon Trust Company, N.A., as Trustee, to the Indenture, dated as of May 18, 2009, between Microsoft Corporation and The Bank of New York Mellon Trust Company, N.A., as trustee 8-K 4.1 3/17/2021 60 | 4.18 Fifteenth Supplemental Indenture, dated as of November 6, 2023, by and between Microsoft Corporation and The Bank of New York Mellon Trust Company, N.A., as Trustee 8-K 4.2 11/6/2023 61 | 4.19 Indenture, dated as of September 19, 2016, by and between Activision Blizzard, Inc. and Wells Fargo Bank, National Association, as Trustee, with respect to Activision Blizzard, Inc.’s 3.400% Senior Notes due 2026 8-K 4.9 11/6/2023 62 | 106 63 | PART IV 64 | Item 15 65 | Incorporated by Reference Incorporated by Reference Incorporated by Reference Incorporated by Reference 66 | Exhibit 67 | Number Exhibit Description Filed 68 | Herewith Form Period 69 | Ending Exhibit Filing Date 70 | 4.20 Base Indenture, dated as of May 26, 2017, by and between Activision Blizzard, Inc. and Wells Fargo Bank, National Association, as Trustee, with respect to Activision Blizzard, Inc.’s 3.400% Senior Notes due 2027, 1.350% Senior Notes due 2030, 4.500% Senior Notes due 2047 and 2.500% Senior Notes due 2050 8-K 4.10 11/6/2023 71 | 4.21 First Supplemental Indenture, dated as of May 26, 2017, by and between Activision Blizzard, Inc. and Wells Fargo Bank, National Association, as Trustee, with respect to Activision Blizzard, Inc.’s 3.400% Senior Notes due 2027 and 4.500% Senior Notes due 2047 8-K 4.11 11/6/2023 72 | 4.22 Second Supplemental Indenture, dated as of August 10, 2020, by and between Activision Blizzard, Inc. and Wells Fargo Bank, National Association, as Trustee, with respect to Activision Blizzard, Inc.’s 1.350% Senior Notes due 2030 and 2.500% Senior Notes due 2050 8-K 4.12 11/6/2023 73 | 4.23 First Supplemental Indenture, dated as of October 27, 2023, by and between Activision Blizzard, Inc. 
and Computershare Trust Company, N.A., with respect to Activision Blizzard, Inc.’s 3.400% Senior Notes due 2026 8-K 4.13 11/6/2023 74 | 4.24 Third Supplemental Indenture, dated as of October 27, 2023, by and between Activision Blizzard, Inc. and Computershare Trust Company, N.A., with respect to Activision Blizzard, Inc.’s 3.400% Senior Notes due 2027 and 4.500% Senior Notes due 2047 8-K 4.14 11/6/2023 75 | 4.25 Fourth Supplemental Indenture, dated as of October 27, 2023, by and between Activision Blizzard, Inc. and Computershare Trust Company, N.A., with respect to Activision Blizzard, Inc.’s 1.350% Senior Notes due 2030 and 2.500% Senior Notes due 2050 8-K 4.15 11/6/2023 76 | 4.26 Description of Securities X 77 | 10.1* Microsoft Corporation 2001 Stock Plan 10-Q 9/30/2016 10.1 10/20/2016 78 | 10.4* Microsoft Corporation Employee Stock Purchase Plan 10-K 6/30/2012 10.4 7/26/2012 79 | 107 80 | PART IV 81 | Item 15 82 | Incorporated by Reference Incorporated by Reference Incorporated by Reference Incorporated by Reference 83 | Exhibit 84 | Number Exhibit Description Filed 85 | Herewith Form Period 86 | Ending Exhibit Filing Date 87 | 10.5* Microsoft Corporation Deferred Compensation Plan X 88 | 10.6* Microsoft Corporation 2017 Stock Plan DEF14A Annex C 10/16/2017 89 | 10.7* Form of Stock Award Agreement Under the Microsoft Corporation 2017 Stock Plan 10-Q 3/31/2018 10.26 4/26/2018 90 | 10.8* Form of Performance Stock Award Agreement Under the Microsoft Corporation 2017 Stock Plan 10-Q 3/31/2018 10.27 4/26/2018 91 | 10.9 Amended and Restated Officers’ Indemnification Trust Agreement between Microsoft Corporation and The Bank of New York Mellon Trust Company, N.A., as trustee 10-Q 9/30/2016 10.12 10/20/2016 92 | 10.10 Assumption of Beneficiaries’ Representative Obligations Under Amended and Restated Officers’ Indemnification Trust Agreement 10-K 6/30/2020 10.25 7/30/2020 93 | 10.11 Form of Indemnification Agreement and Amended and Restated Directors’ Indemnification Trust Agreement between Microsoft Corporation and The Bank of New York Mellon Trust Company, N.A., as trustee 10-K 6/30/2019 10.13 8/1/2019 94 | 10.12 Assumption of Beneficiaries’ Representative Obligations Under Amended and Restated Directors’ Indemnification Trust Agreement 10-K 6/30/2020 10.26 7/30/2020 95 | 10.14* Microsoft Corporation Deferred Compensation Plan for Non-Employee Directors 10-Q 12/31/2017 10.14 1/31/2018 96 | 10.15* Microsoft Corporation Executive Incentive Plan 8-K 10.1 9/19/2018 97 | 10.19* Microsoft Corporation Executive Incentive Plan 10-Q 9/30/2016 10.17 10/20/2016 98 | 10.20* Form of Executive Incentive Plan (Executive Officer SAs) Stock Award Agreement under the Microsoft Corporation 2001 Stock Plan 10-Q 9/30/2016 10.18 10/20/2016 99 | 10.21* Form of Executive Incentive Plan Performance Stock Award Agreement under the Microsoft Corporation 2001 Stock Plan 10-Q 9/30/2016 10.25 10/20/2016 100 | 10.22* Senior Executive Severance Benefit Plan 10-Q 9/30/2016 10.22 10/20/2016 101 | 10.23* Offer Letter, dated February 3, 2014, between Microsoft Corporation and Satya Nadella 8-K 10.1 2/4/2014 102 | 108 103 | PART IV 104 | Item 15 105 | Incorporated by Reference Incorporated by Reference Incorporated by Reference Incorporated by Reference 106 | Exhibit 107 | Number Exhibit Description Filed 108 | Herewith Form Period 109 | Ending Exhibit Filing Date 110 | 10.24* Long-Term Performance Stock Award Agreement between Microsoft Corporation and Satya Nadella 10-Q 12/31/2014 10.24 1/26/2015 111 | 10.25* Offer Letter, 
dated October 25, 2020, between Microsoft Corporation and Christopher Young 10-Q 9/30/2021 10.27 10/26/2021 112 | 19.1 General Insider Trading Policy X 113 | 19.2 Restricted Trading Window Policy X 114 | 19.3 Insider Trading Compliance and Preclearance Policies for Section 16 Officers and Directors of Microsoft X 115 | 21 Subsidiaries of Registrant X 116 | 23.1 Consent of Independent Registered Public Accounting Firm X 117 | 31.1 Certification of Chief Executive Officer Pursuant to Section 302 of the Sarbanes-Oxley Act of 2002 X 118 | 31.2 Certification of Chief Financial Officer Pursuant to Section 302 of the Sarbanes-Oxley Act of 2002 X 119 | 32.1** Certification of Chief Executive Officer Pursuant to Section 906 of the Sarbanes-Oxley Act of 2002 X 120 | 32.2** Certification of Chief Financial Officer Pursuant to Section 906 of the Sarbanes-Oxley Act of 2002 X 121 | 97.1* Microsoft Corporation Executive Compensation Recovery Policy X 122 | 101.INS Inline XBRL Instance Document—the instance document does not appear in the Interactive Data File as its XBRL tags are embedded within the Inline XBRL document X 123 | 101.SCH Inline XBRL Taxonomy Extension Schema With Embedded Linkbase Documents X 124 | 104 Cover page formatted as Inline XBRL and contained in Exhibit 101 X 125 | * Indicates a management contract or compensatory plan or arrangement. 126 | ** Furnished, not filed. 127 | 109 128 | PART IV 129 | Item 16 130 | ITEM 16. FORM 10-K SUMMARY 131 | None. 132 | 110 -------------------------------------------------------------------------------- /doc2dict/doc2dict/convert_instructions_to_dict.py: -------------------------------------------------------------------------------- 1 | # TODO 2 | # rewrite this to set up modular stuff 3 | # e.g. preprocessing like wraparound 4 | 5 | import re 6 | from importlib.metadata import version 7 | 8 | __version__ = version("doc2dict") 9 | 10 | LIKELY_HEADER_ATTRIBUTES = ['bold', 'italic', 'underline', 'text-center', 'all_caps', 'fake_table','proper_case'] 11 | 12 | def remove_empty_contents(obj): 13 | """Recursively remove empty contents dictionaries""" 14 | if isinstance(obj, dict): 15 | if 'contents' in obj and not obj['contents']: 16 | del obj['contents'] 17 | else: 18 | for value in obj.values(): 19 | remove_empty_contents(value) 20 | 21 | def create_level(level_num=-1, class_name='text', title='', attributes=None): 22 | """Factory function to create level dictionaries with all required fields""" 23 | return { 24 | 'level': level_num, 25 | 'class': class_name, 26 | 'standardized_title': title, 27 | 'attributes': attributes or {} 28 | } 29 | 30 | 31 | def split_header_instructions(instructions_list): 32 | """ 33 | Splits instruction groups where the first instruction would be classified as a header. 
34 | 35 | Args: 36 | instructions_list: List of instruction groups (each group is a list of instructions) 37 | 38 | Returns: 39 | New list of instruction groups with headers separated from their content 40 | """ 41 | 42 | 43 | # First, detect big_script like in determine_levels 44 | text_instructions = [instr[0] for instr in instructions_list if 'text' in instr[0]] 45 | font_size_counts = {size: sum(1 for item in text_instructions if item.get('font-size') == size) 46 | for size in set(item.get('font-size') for item in text_instructions if item.get('font-size') is not None)} 47 | 48 | big_script = [False] * len(instructions_list) 49 | if font_size_counts: 50 | most_common_font_size, font_count = max(font_size_counts.items(), key=lambda x: x[1]) 51 | if font_count > (0.5 * len(instructions_list)): 52 | # Check for big script (>20% larger than most common) 53 | for idx, instructions in enumerate(instructions_list): 54 | first = instructions[0] 55 | if 'text' in first and first.get('font-size') is not None: 56 | if first.get('font-size') > (1.2 * most_common_font_size): 57 | big_script[idx] = True 58 | 59 | # Now split instruction groups 60 | new_instructions_list = [] 61 | 62 | for idx, instructions in enumerate(instructions_list): 63 | # Skip if only one instruction - nothing to split 64 | if len(instructions) <= 1: 65 | new_instructions_list.append(instructions) 66 | continue 67 | 68 | first_instruction = instructions[0] 69 | 70 | # Check if first instruction would be classified as a header 71 | is_header = False 72 | if 'text' in first_instruction: 73 | # Check for header attributes or big_script 74 | has_header_attrs = any(first_instruction.get(attr, False) for attr in LIKELY_HEADER_ATTRIBUTES) 75 | if has_header_attrs or big_script[idx]: 76 | is_header = True 77 | 78 | if is_header: 79 | # Split: first instruction becomes its own group, rest become another group 80 | new_instructions_list.append([first_instruction]) 81 | if len(instructions) > 1: # Add remaining instructions as separate group 82 | new_instructions_list.append(instructions[1:]) 83 | else: 84 | # Keep as is - no splitting needed 85 | new_instructions_list.append(instructions) 86 | 87 | return new_instructions_list 88 | 89 | 90 | # AI GENERATED CODE BC I WANT TO PUSH TO PROD # 91 | def determine_predicted_header_levels(levels): 92 | """ 93 | Assigns hierarchy levels to predicted headers based on their attributes, 94 | maintaining consistency within each section defined by known headers. 
95 | 96 | Args: 97 | levels: List of dictionaries containing level, class, and attributes 98 | 99 | Returns: 100 | List of tuples in the format (level, class) 101 | """ 102 | # Find the base level for predicted headers 103 | predicted_headers = [l for l in levels if l['class'] == 'predicted header'] 104 | if not predicted_headers: 105 | return [(level['level'], level['class'], level.get('standardized_title','')) for level in levels] 106 | 107 | base_level = min(h['level'] for h in predicted_headers) 108 | 109 | # Create a copy of levels that we'll modify 110 | updated_levels = levels.copy() 111 | 112 | # Track the last known header level 113 | current_section_level = -1 114 | 115 | # Dictionary to map attribute combinations to levels within the current section 116 | # Format: {attribute_key: assigned_level} 117 | attr_level_map = {} 118 | 119 | # Helper function to create a key from attributes dictionary 120 | def attr_to_key(attrs): 121 | if not attrs: 122 | return "no_attributes" 123 | # Sort keys to ensure consistent mapping regardless of order 124 | return "_".join(sorted([k for k, v in attrs.items() if v])) 125 | 126 | # Process each item 127 | for i, item in enumerate(updated_levels): 128 | # When we hit a known header, reset our attribute mapping 129 | if item['class'] != 'predicted header' and item['class'] not in ['text', 'textsmall']: 130 | if item['level'] <= current_section_level: 131 | # We've entered a new section at same or higher level, reset mappings 132 | attr_level_map = {} 133 | current_section_level = item['level'] 134 | continue 135 | 136 | # Skip non-header items 137 | if item['class'] != 'predicted header': 138 | continue 139 | 140 | # Create a key for this item's attributes 141 | attr_key = attr_to_key(item.get('attributes', {})) 142 | 143 | # If we haven't seen this attribute combination in this section, 144 | # assign it the next available level 145 | if attr_key not in attr_level_map: 146 | attr_level_map[attr_key] = base_level + len(attr_level_map) 147 | 148 | # Assign the level based on the mapping 149 | item['level'] = attr_level_map[attr_key] 150 | 151 | # Return in the required format 152 | return [(level['level'], level['class'], level.get('standardized_title','')) for level in updated_levels] 153 | # AI GENERATED CODE BC I WANT TO PUSH TO PROD # 154 | 155 | def extract_cell_content(cell): 156 | """Helper function to extract content from table cells that may contain text or images""" 157 | if 'image' in cell: 158 | return cell # Return the full cell structure for images 159 | else: 160 | return cell.get("text", "") # Return text content or empty string 161 | 162 | def determine_levels(instructions_list, mapping_dict=None): 163 | if mapping_dict is None: 164 | predicted_header_level = 0 165 | #TODO bandaid fix 166 | elif 'rules' in mapping_dict: 167 | predicted_header_level = 0 168 | else: 169 | predicted_header_level = max(mapping_dict.values()) + 1 170 | 171 | # filter out tables, include both text and image instructions 172 | headers = [] 173 | for instructions in instructions_list: 174 | first_instruction = instructions[0] 175 | if 'text' in first_instruction or 'image' in first_instruction: 176 | headers.append(first_instruction) 177 | else: 178 | headers.append({}) 179 | 180 | 181 | # count font-size (only for text instructions) 182 | small_script = [False] * len(headers) 183 | big_script = [False] * len(headers) 184 | text_instructions = [instr[0] for instr in instructions_list if 'text' in instr[0]] 185 | font_size_counts = {size: sum(1 for item in 
text_instructions if item.get('font-size') == size) for size in set(item.get('font-size') for item in text_instructions if item.get('font-size') is not None)} 186 | 187 | # use only font size goes here 188 | if mapping_dict is not None: 189 | if 'rules' in mapping_dict: 190 | if 'use_font_size_only_for_level' in mapping_dict['rules']: 191 | # Filter headers first for this special case 192 | headers = [item if 'text' in item and any([item.get(attr, False) for attr in LIKELY_HEADER_ATTRIBUTES]) else {} for item in headers] 193 | 194 | most_common_font_size, font_count = max(font_size_counts.items(), key=lambda x: x[1]) 195 | 196 | # Get all unique font sizes and sort them in descending order (largest font = level 0, next = level 1, etc.) 197 | unique_font_sizes = sorted(font_size_counts.keys(), reverse=True) 198 | 199 | # Create a mapping from font size to level (largest font = level 0, next = level 1, etc.) 200 | font_size_to_level = {size: idx for idx, size in enumerate(unique_font_sizes)} 201 | 202 | levels = [] 203 | for idx, header in enumerate(headers): 204 | if 'text' in header and header.get('font-size') is not None: 205 | font_size = header.get('font-size') 206 | 207 | if font_size < most_common_font_size: 208 | # Assign small script for fonts smaller than most common 209 | level = (-2,'textsmall','') 210 | else: 211 | # Assign level based on font size hierarchy 212 | hierarchy_level = font_size_to_level[font_size] 213 | level = (hierarchy_level, 'predicted header','') 214 | else: 215 | # No font size information or not text, treat as regular text 216 | level = (-1, 'text','') 217 | 218 | levels.append(level) 219 | 220 | return levels 221 | 222 | # Detect font sizes first (before filtering headers) 223 | if font_size_counts != {}: 224 | most_common_font_size, font_count = max(font_size_counts.items(), key=lambda x: x[1]) 225 | if font_count > (.5 * len(instructions_list)): 226 | # assume anything with less than this font size is small script 227 | small_script = [True if 'text' in item and item.get('font-size') is not None and item.get('font-size') < most_common_font_size else False for item in headers] 228 | 229 | # assume anything with more than 20% of the most common font size is big script 230 | big_script = [True if 'text' in item and item.get('font-size') is not None and item.get('font-size') > (1.2 * most_common_font_size) else False for item in headers] 231 | 232 | # NOW filter headers after font size detection (includes big_script in the filtering) 233 | headers = [item if 'text' in item and (any([item.get(attr, False) for attr in LIKELY_HEADER_ATTRIBUTES]) or big_script[idx]) else {} for idx, item in enumerate(headers)] 234 | 235 | levels = [] 236 | for idx,header in enumerate(headers): 237 | level = None 238 | attributes = {attr: header.get(attr, False) for attr in LIKELY_HEADER_ATTRIBUTES if attr in header} 239 | 240 | if small_script[idx]: 241 | level = create_level(-2, 'textsmall') 242 | elif 'text' in header: 243 | if mapping_dict is not None: 244 | text = header['text'].lower() 245 | regex_tuples = [(item[0][1], item[0][0], item[1]) for item in mapping_dict.items()] 246 | 247 | for regex, header_class, hierarchy_level in regex_tuples: 248 | match = re.match(regex, text.strip()) 249 | if match: 250 | # create a dictionary of attributes from LIKELY_HEADER_ATTRIBUTES 251 | match_groups = match.groups() 252 | if len(match_groups) > 0: 253 | string = ''.join([str(x) for x in match_groups if x is not None]) 254 | standardized_title = f'{header_class}{string}' 255 | else: 
256 | standardized_title = f'{header_class}' 257 | level = create_level(hierarchy_level, header_class, standardized_title, attributes) 258 | break 259 | 260 | if level is None: 261 | # Check for header attributes OR big_script 262 | if any([header.get(attr,False) for attr in LIKELY_HEADER_ATTRIBUTES]) or big_script[idx]: 263 | level = create_level(predicted_header_level, 'predicted header', '', attributes) 264 | 265 | if level is None: 266 | level = create_level(-1, 'text') 267 | 268 | levels.append(level) 269 | 270 | # NOW USE SEQUENCE AND ATTRIBUTES IN THE LEVELS TO DETERMINE HIERARCHY FOR PREDICTED HEADERS 271 | levels = determine_predicted_header_levels(levels) 272 | return levels 273 | 274 | def convert_instructions_to_dict(instructions_list, mapping_dict=None): 275 | 276 | # add filtering stage here 277 | 278 | # CHANGE: Split mixed header-content groups first 279 | instructions_list = split_header_instructions(instructions_list) 280 | 281 | # Get pre-calculated levels for each instruction 282 | levels = determine_levels(instructions_list, mapping_dict) 283 | 284 | # Initialize document structure 285 | document = {'contents': {}} 286 | 287 | # Create an introduction section 288 | introduction = {'title': 'introduction', 'class': 'introduction', 'contents': {}} 289 | 290 | # Add the introduction to the document 291 | document['contents'][-1] = introduction 292 | 293 | # Keep track of current position in hierarchy 294 | current_section = introduction 295 | current_path = [document, introduction] # Path from root to current section 296 | current_levels = [-1, 0] # Corresponding hierarchy levels 297 | 298 | # Process each instruction using pre-calculated levels 299 | for idx, instructions in enumerate(instructions_list): 300 | instruction = instructions[0] 301 | level, level_class, standardized_title = levels[idx] 302 | 303 | if level >= 0: 304 | # This is a section header 305 | 306 | # Pop hierarchy until finding appropriate parent 307 | while len(current_levels) > 1 and current_levels[-1] >= level: 308 | current_path.pop() 309 | current_levels.pop() 310 | 311 | # Extract title from the instruction (only text instructions can be headers) 312 | if 'text' in instruction: 313 | title = ''.join([instr['text'] for instr in instructions if 'text' in instr]) 314 | else: 315 | title = '[Non-text header]' # Fallback, though this shouldn't happen 316 | 317 | # Create new section, in correct order 318 | new_section = {'title': title} 319 | if standardized_title: # Add right after title 320 | new_section['standardized_title'] = standardized_title 321 | new_section['class'] = level_class 322 | new_section['contents'] = {} 323 | 324 | # Add section to parent's contents with index as key 325 | parent = current_path[-1] 326 | parent['contents'][idx] = new_section 327 | 328 | # Update tracking 329 | current_path.append(new_section) 330 | current_levels.append(level) 331 | current_section = new_section 332 | 333 | # CHANGE: Removed mixed content handling here since groups are now pure 334 | 335 | # CHANGE: Simplified - only process regular content (no mixed groups anymore) 336 | if level in [-1, -2]: 337 | for instruction in instructions: 338 | if 'text' in instruction: 339 | if not current_section['contents'].get(idx): 340 | current_section['contents'][idx] = {level_class: ''} 341 | if level_class in current_section['contents'][idx]: 342 | current_section['contents'][idx][level_class] += instruction['text'] 343 | else: 344 | current_section['contents'][idx][level_class] = instruction['text'] 345 | elif 
'image' in instruction: 346 | current_section['contents'][idx] = {'image': instruction['image']} 347 | elif 'table' in instruction: 348 | current_section['contents'][idx] = {'table': [[extract_cell_content(cell) for cell in row] for row in instruction['table']]} 349 | 350 | # Create final result with metadata 351 | result = { 352 | 'metadata': { 353 | 'parser': 'doc2dict', 354 | 'github': 'https://github.com/john-friedman/doc2dict', 355 | "version": __version__, 356 | }, 357 | 'document': document['contents'] 358 | } 359 | 360 | remove_empty_contents(result) 361 | return result -------------------------------------------------------------------------------- /doc2dict/doc2dict/mapping.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | def flatten_hierarchy(content, sep='\n'): 4 | result = [] 5 | 6 | def process_node(node): 7 | if isinstance(node, str): 8 | if node.strip(): 9 | result.append(node.strip()) 10 | return 11 | 12 | if isinstance(node, list): 13 | for item in node: 14 | process_node(item) 15 | return 16 | 17 | if isinstance(node, dict): 18 | if node.get('text') and not node.get('content'): 19 | result.append(node['text'].strip()) 20 | 21 | if node.get('content'): 22 | process_node(node['content']) 23 | 24 | for key, value in node.items(): 25 | if key not in ('type', 'text', 'content'): 26 | process_node(value) 27 | 28 | process_node(content) 29 | return sep.join(result) 30 | 31 | class JSONTransformer: 32 | def __init__(self, mapping_dict): 33 | """Initialize transformer with mapping dictionary.""" 34 | self.mapping_dict = mapping_dict 35 | self.id_to_text = {} 36 | self.used_matches = set() 37 | 38 | def _find_refs(self, data, search_key): 39 | """Find all references based on search key in the data.""" 40 | matches = [] 41 | 42 | if isinstance(data, dict): 43 | if search_key in data: 44 | matches.append(data) 45 | for value in data.values(): 46 | matches.extend(self._find_refs(value, search_key)) 47 | elif isinstance(data, list): 48 | for item in data: 49 | matches.extend(self._find_refs(item, search_key)) 50 | 51 | return matches 52 | 53 | def _extract_ref_ids(self, ref_data, search_id): 54 | """Extract reference IDs from either dict or list data.""" 55 | if isinstance(ref_data, dict): 56 | ref_id = ref_data.get(search_id) 57 | return [ref_id] if ref_id is not None else [] 58 | elif isinstance(ref_data, list): 59 | ids = [] 60 | for item in ref_data: 61 | if isinstance(item, dict): 62 | ref_id = item.get(search_id) 63 | if ref_id is not None: 64 | ids.append(ref_id) 65 | return ids 66 | return [] 67 | 68 | def _find_content(self, data, match_identifier, match_content): 69 | """Find all content entries in the data that match the identifier and content pattern.""" 70 | matches = [] 71 | 72 | if isinstance(data, dict): 73 | if match_identifier in data and match_content in data: 74 | matches.append(data) 75 | for value in data.values(): 76 | matches.extend(self._find_content(value, match_identifier, match_content)) 77 | elif isinstance(data, list): 78 | for item in data: 79 | matches.extend(self._find_content(item, match_identifier, match_content)) 80 | 81 | return matches 82 | 83 | def _build_mapping(self, data, transformation): 84 | """Build mapping between identifiers and their content.""" 85 | match_rule = transformation['match'] 86 | id_key = match_rule['identifier'] 87 | content_key = match_rule['content'] 88 | 89 | content_matches = self._find_content(data, id_key, content_key) 90 | 91 | for match in content_matches: 92 
| if id_key in match and content_key in match: 93 | self.id_to_text[match[id_key]] = match[content_key] 94 | if match_rule.get('remove_after_use', False): 95 | self.used_matches.add(match[id_key]) 96 | 97 | def _remove_used_content(self, data, match_rule): 98 | """Remove the used content entries based on match rule.""" 99 | if isinstance(data, dict): 100 | id_key = match_rule['identifier'] 101 | 102 | if id_key in data and data.get(id_key) in self.used_matches: 103 | return None 104 | 105 | result = {} 106 | for k, v in data.items(): 107 | processed = self._remove_used_content(v, match_rule) 108 | if processed is not None: 109 | result[k] = processed 110 | 111 | return result if result else None 112 | 113 | elif isinstance(data, list): 114 | result = [item for item in data 115 | if (processed := self._remove_used_content(item, match_rule)) is not None] 116 | return result if result else None 117 | 118 | return data 119 | 120 | def _apply_standardization(self, data, transformation): 121 | """Apply standardization rules to transform text based on regex pattern.""" 122 | if isinstance(data, dict): 123 | if data.get('type') == transformation['match']['type'] and 'text' in data: 124 | pattern = transformation['match']['text_pattern'] 125 | match = re.match(pattern, data['text']) 126 | if match: 127 | value = match.group(1) 128 | output_field = transformation['output'].get('field', 'text') 129 | data[output_field] = transformation['output']['format'].format(value.lower()) 130 | 131 | for value in data.values(): 132 | if isinstance(value, (dict, list)): 133 | self._apply_standardization(value, transformation) 134 | 135 | elif isinstance(data, list): 136 | for item in data: 137 | if isinstance(item, (dict, list)): 138 | self._apply_standardization(item, transformation) 139 | 140 | def _apply_trim(self, data, transformation): 141 | if not isinstance(data, dict) or 'content' not in data: 142 | return data 143 | 144 | match_type = transformation['match']['type'] 145 | expected = transformation['match'].get('expected') 146 | output_type = transformation['output']['type'] 147 | 148 | matches = [] 149 | def find_matches(content, current_path=[]): 150 | for i, item in enumerate(content): 151 | if isinstance(item, dict): 152 | if item.get('type') == match_type and 'text' in item: 153 | matches.append({ 154 | 'path': current_path + [i], 155 | 'text': item['text'] 156 | }) 157 | if 'content' in item: 158 | find_matches(item['content'], current_path + [i, 'content']) 159 | 160 | find_matches(data['content']) 161 | if not matches: 162 | return data 163 | 164 | text_groups = {} 165 | for match in matches: 166 | text = match['text'] 167 | if text not in text_groups: 168 | text_groups[text] = [] 169 | text_groups[text].append(match['path']) 170 | 171 | result = {'type': output_type} 172 | for text, paths in text_groups.items(): 173 | if len(paths) > expected: 174 | if expected == 0: 175 | result['content'] = [flatten_hierarchy(data['content'])] 176 | data['content'] = [result] 177 | else: 178 | split_path = paths[expected] 179 | split_idx = split_path[0] 180 | before_content = data['content'][:split_idx] 181 | result['content'] = [flatten_hierarchy(before_content)] 182 | data['content'] = data['content'][split_idx:] 183 | data['content'].insert(0, result) 184 | break 185 | 186 | return data 187 | 188 | def _apply_consecutive_merge(self, data, transformation): 189 | """Merge consecutive sections with same type and text.""" 190 | if isinstance(data, dict): 191 | if 'content' in data and isinstance(data['content'], 
list): 192 | new_content = [] 193 | current_section = None 194 | 195 | for item in data['content']: 196 | if (isinstance(item, dict) and 197 | item.get('type') in transformation['match']['types'] and 198 | 'text' in item): 199 | if (current_section and 200 | current_section['type'] == item['type'] and 201 | current_section['text'] == item['text']): 202 | current_section['content'].extend(item['content']) 203 | else: 204 | if current_section: 205 | new_content.append(current_section) 206 | current_section = item 207 | else: 208 | if current_section: 209 | new_content.append(current_section) 210 | current_section = None 211 | new_content.append(item) 212 | 213 | if current_section: 214 | new_content.append(current_section) 215 | 216 | data['content'] = new_content 217 | 218 | for value in data.values(): 219 | if isinstance(value, (dict, list)): 220 | self._apply_consecutive_merge(value, transformation) 221 | 222 | elif isinstance(data, list): 223 | for item in data: 224 | if isinstance(item, (dict, list)): 225 | self._apply_consecutive_merge(item, transformation) 226 | 227 | def transform(self, data): 228 | """Transform the data according to the mapping dictionary.""" 229 | result = data.copy() 230 | 231 | for transformation in self.mapping_dict['transformations']: 232 | if transformation.get('type') == 'standardize': 233 | self._apply_standardization(result, transformation) 234 | elif transformation.get('type') == 'merge_consecutive': 235 | self._apply_consecutive_merge(result, transformation) 236 | elif transformation.get('type') == 'trim': 237 | self._apply_trim(result, transformation) 238 | else: 239 | # Reference replacement logic 240 | self._build_mapping(result, transformation) 241 | 242 | search_key = transformation['search']['key'] 243 | search_id = transformation['search']['identifier'] 244 | output_key = transformation['output']['key'] 245 | 246 | refs = self._find_refs(result, search_key) 247 | 248 | for ref in refs: 249 | ref_ids = self._extract_ref_ids(ref[search_key], search_id) 250 | if ref_ids: 251 | # Create a list of referenced content 252 | referenced_content = [ 253 | self.id_to_text[ref_id] 254 | for ref_id in ref_ids 255 | if ref_id in self.id_to_text 256 | ] 257 | if referenced_content: 258 | ref[output_key] = referenced_content 259 | del ref[search_key] 260 | 261 | if transformation['match'].get('remove_after_use', False): 262 | result = self._remove_used_content(result, transformation['match']) 263 | 264 | return result 265 | 266 | class RuleProcessor: 267 | def __init__(self, rules_dict): 268 | self.rules = rules_dict 269 | 270 | def _apply_remove_rules(self, lines): 271 | if 'remove' not in self.rules: 272 | return lines 273 | 274 | result = lines.copy() 275 | for rule in self.rules['remove']: 276 | pattern = rule['pattern'] 277 | result = [line for line in result if not re.match(pattern, line)] 278 | 279 | return result 280 | 281 | def _join_consecutive_strings(self, content_list): 282 | """Join consecutive strings in a content list.""" 283 | if not content_list: 284 | return content_list 285 | 286 | result = [] 287 | current_strings = [] 288 | 289 | for item in content_list: 290 | if isinstance(item, str): 291 | current_strings.append(item) 292 | else: 293 | if current_strings: 294 | result.append(self.rules.get('join_text').join(current_strings)) 295 | current_strings = [] 296 | if isinstance(item, dict) and 'content' in item: 297 | item['content'] = self._join_consecutive_strings(item['content']) 298 | result.append(item) 299 | 300 | if current_strings: 301 | 
result.append(self.rules.get('join_text').join(current_strings)) 302 | 303 | return result 304 | 305 | def _find_matching_end(self, lines, start_idx, end_pattern): 306 | """Find matching end pattern considering nesting.""" 307 | pattern_name = None 308 | nesting_level = 1 309 | 310 | for i in range(start_idx + 1, len(lines)): 311 | line = lines[i] 312 | 313 | if pattern_name and re.match(pattern_name, line): 314 | nesting_level += 1 315 | elif re.match(end_pattern, line): 316 | nesting_level -= 1 317 | if nesting_level == 0: 318 | return i 319 | 320 | return len(lines) - 1 321 | 322 | def _process_block(self, lines, start_idx, rule, mappings): 323 | """Process a block of content, handling nested blocks.""" 324 | content = [] 325 | current_idx = start_idx + 1 326 | end_idx = None 327 | 328 | if rule.get('end'): 329 | end_idx = self._find_matching_end(lines, start_idx, rule['end']) 330 | else: 331 | for i in range(start_idx + 1, len(lines)): 332 | if any(re.match(r['pattern'], lines[i]) 333 | for r in mappings if r.get('hierarchy') is not None): 334 | end_idx = i - 1 335 | break 336 | if end_idx is None: 337 | end_idx = len(lines) - 1 338 | 339 | while current_idx < end_idx: 340 | line = lines[current_idx] 341 | matched = False 342 | 343 | for nested_rule in mappings: 344 | if re.match(nested_rule['pattern'], line): 345 | nested_content, next_idx = self._process_block( 346 | lines, current_idx, nested_rule, mappings 347 | ) 348 | if nested_content: 349 | content.append(nested_content) 350 | current_idx = next_idx + 1 351 | matched = True 352 | break 353 | 354 | if not matched: 355 | content.append(line) 356 | current_idx += 1 357 | 358 | if rule.get('keep_end', False) and end_idx < len(lines): 359 | content.append(lines[end_idx]) 360 | 361 | return { 362 | 'type': rule['name'], 363 | 'content': content 364 | }, end_idx 365 | 366 | def _apply_mapping_rules(self, lines): 367 | if 'mappings' not in self.rules: 368 | return {'content': lines} 369 | 370 | result = {'content': []} 371 | hierarchy_stack = [result] 372 | 373 | mappings = sorted( 374 | self.rules['mappings'], 375 | key=lambda x: x.get('hierarchy', float('inf')) 376 | ) 377 | 378 | i = 0 379 | while i < len(lines): 380 | line = lines[i] 381 | matched = False 382 | 383 | for rule in mappings: 384 | if re.match(rule['pattern'], line): 385 | if rule.get('hierarchy') is not None: 386 | new_section = { 387 | 'type': rule['name'], 388 | 'text': line, 389 | 'content': [] 390 | } 391 | 392 | while len(hierarchy_stack) > rule['hierarchy'] + 1: 393 | hierarchy_stack.pop() 394 | 395 | parent = hierarchy_stack[-1] 396 | if isinstance(parent.get('content'), list): 397 | parent['content'].append(new_section) 398 | 399 | hierarchy_stack.append(new_section) 400 | i += 1 401 | 402 | else: 403 | block, end_idx = self._process_block(lines, i, rule, mappings) 404 | parent = hierarchy_stack[-1] 405 | if isinstance(parent.get('content'), list): 406 | parent['content'].append(block) 407 | i = end_idx + 1 408 | 409 | matched = True 410 | break 411 | 412 | if not matched: 413 | parent = hierarchy_stack[-1] 414 | if isinstance(parent.get('content'), list): 415 | parent['content'].append(line) 416 | i += 1 417 | 418 | if self.rules.get('join_text') is not None: 419 | result['content'] = self._join_consecutive_strings(result['content']) 420 | 421 | return result 422 | 423 | class DocumentProcessor: 424 | def __init__(self, config): 425 | self.rules = config.get('rules', {}) 426 | self.transformations = config.get('transformations', []) 427 | self.rule_processor 
= RuleProcessor(self.rules) 428 | self.json_transformer = JSONTransformer({'transformations': self.transformations}) if self.transformations else None 429 | 430 | def process(self, lines): 431 | filtered_lines = self.rule_processor._apply_remove_rules(lines) 432 | structured_data = self.rule_processor._apply_mapping_rules(filtered_lines) 433 | 434 | if self.json_transformer: 435 | structured_data = self.json_transformer.transform(structured_data) 436 | 437 | return structured_data -------------------------------------------------------------------------------- /example_output/html/levels.txt: -------------------------------------------------------------------------------- 1 | (2, 'predicted header', '') 2 | (2, 'predicted header', '') 3 | (2, 'predicted header', '') 4 | (2, 'predicted header', '') 5 | (-1, 'text', '') 6 | (-1, 'text', '') 7 | (-2, 'textsmall', '') 8 | (2, 'predicted header', '') 9 | (-1, 'text', '') 10 | (-2, 'textsmall', '') 11 | (-2, 'textsmall', '') 12 | (-2, 'textsmall', '') 13 | (-1, 'text', '') 14 | (-1, 'text', '') 15 | (-1, 'text', '') 16 | (-1, 'text', '') 17 | (-1, 'text', '') 18 | (-1, 'text', '') 19 | (-1, 'text', '') 20 | (-1, 'text', '') 21 | (-1, 'text', '') 22 | (-1, 'text', '') 23 | (-1, 'text', '') 24 | (-1, 'text', '') 25 | (-1, 'text', '') 26 | (-2, 'textsmall', '') 27 | (-1, 'text', '') 28 | (2, 'predicted header', '') 29 | (2, 'predicted header', '') 30 | (2, 'predicted header', '') 31 | (2, 'predicted header', '') 32 | (-1, 'text', '') 33 | (-2, 'textsmall', '') 34 | (-2, 'textsmall', '') 35 | (-2, 'textsmall', '') 36 | (2, 'predicted header', '') 37 | (-1, 'text', '') 38 | (0, 'part', 'parti') 39 | (1, 'item', 'item1') 40 | (3, 'predicted header', '') 41 | (4, 'predicted header', '') 42 | (-1, 'text', '') 43 | (-1, 'text', '') 44 | (-1, 'text', '') 45 | (4, 'predicted header', '') 46 | (-1, 'text', '') 47 | (-1, 'text', '') 48 | (-1, 'text', '') 49 | (-2, 'textsmall', '') 50 | (-2, 'textsmall', '') 51 | (-2, 'textsmall', '') 52 | (4, 'predicted header', '') 53 | (-1, 'text', '') 54 | (-1, 'text', '') 55 | (-1, 'text', '') 56 | (-1, 'text', '') 57 | (5, 'predicted header', '') 58 | (-1, 'text', '') 59 | (-1, 'text', '') 60 | (-1, 'text', '') 61 | (5, 'predicted header', '') 62 | (-1, 'text', '') 63 | (-1, 'text', '') 64 | (-1, 'text', '') 65 | (-1, 'text', '') 66 | (-2, 'textsmall', '') 67 | (-2, 'textsmall', '') 68 | (-2, 'textsmall', '') 69 | (-1, 'text', '') 70 | (-1, 'text', '') 71 | (-1, 'text', '') 72 | (-1, 'text', '') 73 | (-1, 'text', '') 74 | (-1, 'text', '') 75 | (-1, 'text', '') 76 | (-1, 'text', '') 77 | (5, 'predicted header', '') 78 | (-1, 'text', '') 79 | (-1, 'text', '') 80 | (-1, 'text', '') 81 | (-2, 'textsmall', '') 82 | (-2, 'textsmall', '') 83 | (-2, 'textsmall', '') 84 | (-1, 'text', '') 85 | (5, 'predicted header', '') 86 | (-1, 'text', '') 87 | (-1, 'text', '') 88 | (-1, 'text', '') 89 | (-1, 'text', '') 90 | (-1, 'text', '') 91 | (-1, 'text', '') 92 | (-1, 'text', '') 93 | (-1, 'text', '') 94 | (4, 'predicted header', '') 95 | (5, 'predicted header', '') 96 | (-1, 'text', '') 97 | (-1, 'text', '') 98 | (-1, 'text', '') 99 | (-1, 'text', '') 100 | (-1, 'text', '') 101 | (-1, 'text', '') 102 | (-1, 'text', '') 103 | (-1, 'text', '') 104 | (-1, 'text', '') 105 | (-1, 'text', '') 106 | (-2, 'textsmall', '') 107 | (-2, 'textsmall', '') 108 | (-2, 'textsmall', '') 109 | (-1, 'text', '') 110 | (5, 'predicted header', '') 111 | (-1, 'text', '') 112 | (-1, 'text', '') 113 | (-1, 'text', '') 114 | (-1, 'text', '') 115 | (5, 'predicted 
header', '') 116 | (-1, 'text', '') 117 | (-1, 'text', '') 118 | (3, 'predicted header', '') 119 | (-1, 'text', '') 120 | (-2, 'textsmall', '') 121 | (-2, 'textsmall', '') 122 | (-2, 'textsmall', '') 123 | (-1, 'text', '') 124 | (-1, 'text', '') 125 | (-1, 'text', '') 126 | (-1, 'text', '') 127 | (-1, 'text', '') 128 | (-1, 'text', '') 129 | (3, 'predicted header', '') 130 | (-1, 'text', '') 131 | (-1, 'text', '') 132 | (-1, 'text', '') 133 | (4, 'predicted header', '') 134 | (-1, 'text', '') 135 | (-1, 'text', '') 136 | (-1, 'text', '') 137 | (-1, 'text', '') 138 | (-1, 'text', '') 139 | (-2, 'textsmall', '') 140 | (-2, 'textsmall', '') 141 | (-2, 'textsmall', '') 142 | (5, 'predicted header', '') 143 | (-1, 'text', '') 144 | (5, 'predicted header', '') 145 | (-1, 'text', '') 146 | (5, 'predicted header', '') 147 | (-1, 'text', '') 148 | (5, 'predicted header', '') 149 | (-1, 'text', '') 150 | (-2, 'textsmall', '') 151 | (-2, 'textsmall', '') 152 | (-2, 'textsmall', '') 153 | (6, 'predicted header', '') 154 | (-1, 'text', '') 155 | (-1, 'text', '') 156 | (-1, 'text', '') 157 | (4, 'predicted header', '') 158 | (-1, 'text', '') 159 | (-1, 'text', '') 160 | (-1, 'text', '') 161 | (5, 'predicted header', '') 162 | (-1, 'text', '') 163 | (-1, 'text', '') 164 | (-1, 'text', '') 165 | (-2, 'textsmall', '') 166 | (-2, 'textsmall', '') 167 | (-2, 'textsmall', '') 168 | (-1, 'text', '') 169 | (5, 'predicted header', '') 170 | (-1, 'text', '') 171 | (6, 'predicted header', '') 172 | (-1, 'text', '') 173 | (-1, 'text', '') 174 | (-1, 'text', '') 175 | (-1, 'text', '') 176 | (-1, 'text', '') 177 | (-1, 'text', '') 178 | (-2, 'textsmall', '') 179 | (-2, 'textsmall', '') 180 | (-2, 'textsmall', '') 181 | (4, 'predicted header', '') 182 | (-1, 'text', '') 183 | (-1, 'text', '') 184 | (-1, 'text', '') 185 | (-1, 'text', '') 186 | (-1, 'text', '') 187 | (5, 'predicted header', '') 188 | (-1, 'text', '') 189 | (-1, 'text', '') 190 | (-1, 'text', '') 191 | (-1, 'text', '') 192 | (-1, 'text', '') 193 | (-1, 'text', '') 194 | (-1, 'text', '') 195 | (-1, 'text', '') 196 | (-1, 'text', '') 197 | (-1, 'text', '') 198 | (-1, 'text', '') 199 | (-1, 'text', '') 200 | (-1, 'text', '') 201 | (5, 'predicted header', '') 202 | (-1, 'text', '') 203 | (-2, 'textsmall', '') 204 | (-2, 'textsmall', '') 205 | (-2, 'textsmall', '') 206 | (5, 'predicted header', '') 207 | (-1, 'text', '') 208 | (-1, 'text', '') 209 | (-1, 'text', '') 210 | (5, 'predicted header', '') 211 | (-1, 'text', '') 212 | (6, 'predicted header', '') 213 | (-1, 'text', '') 214 | (-1, 'text', '') 215 | (-1, 'text', '') 216 | (-1, 'text', '') 217 | (-2, 'textsmall', '') 218 | (-2, 'textsmall', '') 219 | (-2, 'textsmall', '') 220 | (3, 'predicted header', '') 221 | (-1, 'text', '') 222 | (-1, 'text', '') 223 | (-1, 'text', '') 224 | (3, 'predicted header', '') 225 | (4, 'predicted header', '') 226 | (-1, 'text', '') 227 | (-1, 'text', '') 228 | (-1, 'text', '') 229 | (-1, 'text', '') 230 | (-1, 'text', '') 231 | (-1, 'text', '') 232 | (-1, 'text', '') 233 | (-1, 'text', '') 234 | (-1, 'text', '') 235 | (-1, 'text', '') 236 | (-2, 'textsmall', '') 237 | (-2, 'textsmall', '') 238 | (-2, 'textsmall', '') 239 | (-1, 'text', '') 240 | (-1, 'text', '') 241 | (4, 'predicted header', '') 242 | (-1, 'text', '') 243 | (-1, 'text', '') 244 | (-1, 'text', '') 245 | (3, 'predicted header', '') 246 | (-1, 'text', '') 247 | (4, 'predicted header', '') 248 | (-1, 'text', '') 249 | (-1, 'text', '') 250 | (-2, 'textsmall', '') 251 | (-2, 'textsmall', '') 252 | (-2, 
'textsmall', '') 253 | (4, 'predicted header', '') 254 | (-1, 'text', '') 255 | (-1, 'text', '') 256 | (4, 'predicted header', '') 257 | (-1, 'text', '') 258 | (-1, 'text', '') 259 | (-1, 'text', '') 260 | (3, 'predicted header', '') 261 | (-1, 'text', '') 262 | (-1, 'text', '') 263 | (-2, 'textsmall', '') 264 | (-2, 'textsmall', '') 265 | (-2, 'textsmall', '') 266 | (4, 'predicted header', '') 267 | (5, 'predicted header', '') 268 | (-1, 'text', '') 269 | (5, 'predicted header', '') 270 | (-1, 'text', '') 271 | (5, 'predicted header', '') 272 | (-1, 'text', '') 273 | (5, 'predicted header', '') 274 | (-1, 'text', '') 275 | (5, 'predicted header', '') 276 | (-1, 'text', '') 277 | (5, 'predicted header', '') 278 | (-1, 'text', '') 279 | (4, 'predicted header', '') 280 | (-1, 'text', '') 281 | (-1, 'text', '') 282 | (-1, 'text', '') 283 | (-2, 'textsmall', '') 284 | (-2, 'textsmall', '') 285 | (-2, 'textsmall', '') 286 | (3, 'predicted header', '') 287 | (-1, 'text', '') 288 | (3, 'predicted header', '') 289 | (-1, 'text', '') 290 | (-1, 'text', '') 291 | (3, 'predicted header', '') 292 | (-1, 'text', '') 293 | (-1, 'text', '') 294 | (-1, 'text', '') 295 | (-1, 'text', '') 296 | (-1, 'text', '') 297 | (-2, 'textsmall', '') 298 | (-2, 'textsmall', '') 299 | (-2, 'textsmall', '') 300 | (-1, 'text', '') 301 | (-1, 'text', '') 302 | (-1, 'text', '') 303 | (-1, 'text', '') 304 | (3, 'predicted header', '') 305 | (-1, 'text', '') 306 | (-1, 'text', '') 307 | (-1, 'text', '') 308 | (-1, 'text', '') 309 | (-1, 'text', '') 310 | (-1, 'text', '') 311 | (-1, 'text', '') 312 | (-1, 'text', '') 313 | (-1, 'text', '') 314 | (-1, 'text', '') 315 | (-2, 'textsmall', '') 316 | (-2, 'textsmall', '') 317 | (-2, 'textsmall', '') 318 | (1, 'item', 'item1a') 319 | (-1, 'text', '') 320 | (2, 'predicted header', '') 321 | (3, 'predicted header', '') 322 | (4, 'predicted header', '') 323 | (-1, 'text', '') 324 | (4, 'predicted header', '') 325 | (-1, 'text', '') 326 | (-1, 'text', '') 327 | (-1, 'text', '') 328 | (-1, 'text', '') 329 | (-2, 'textsmall', '') 330 | (-2, 'textsmall', '') 331 | (-2, 'textsmall', '') 332 | (-1, 'text', '') 333 | (4, 'predicted header', '') 334 | (-1, 'text', '') 335 | (-1, 'text', '') 336 | (-1, 'text', '') 337 | (-1, 'text', '') 338 | (-1, 'text', '') 339 | (-1, 'text', '') 340 | (-1, 'text', '') 341 | (3, 'predicted header', '') 342 | (-1, 'text', '') 343 | (-1, 'text', '') 344 | (-1, 'text', '') 345 | (-1, 'text', '') 346 | (-1, 'text', '') 347 | (-1, 'text', '') 348 | (-2, 'textsmall', '') 349 | (-2, 'textsmall', '') 350 | (-2, 'textsmall', '') 351 | (-1, 'text', '') 352 | (2, 'predicted header', '') 353 | (3, 'predicted header', '') 354 | (3, 'predicted header', '') 355 | (3, 'predicted header', '') 356 | (-2, 'textsmall', '') 357 | (-2, 'textsmall', '') 358 | (-2, 'textsmall', '') 359 | (2, 'predicted header', '') 360 | (3, 'predicted header', '') 361 | (4, 'predicted header', '') 362 | (-1, 'text', '') 363 | (-1, 'text', '') 364 | (-1, 'text', '') 365 | (-2, 'textsmall', '') 366 | (-2, 'textsmall', '') 367 | (-2, 'textsmall', '') 368 | (-1, 'text', '') 369 | (4, 'predicted header', '') 370 | (-1, 'text', '') 371 | (-1, 'text', '') 372 | (-1, 'text', '') 373 | (-1, 'text', '') 374 | (-2, 'textsmall', '') 375 | (-2, 'textsmall', '') 376 | (-2, 'textsmall', '') 377 | (4, 'predicted header', '') 378 | (-1, 'text', '') 379 | (-1, 'text', '') 380 | (-1, 'text', '') 381 | (3, 'predicted header', '') 382 | (3, 'predicted header', '') 383 | (-2, 'textsmall', '') 384 | (-2, 'textsmall', '') 
385 | (-2, 'textsmall', '') 386 | (3, 'predicted header', '') 387 | (4, 'predicted header', '') 388 | (-1, 'text', '') 389 | (4, 'predicted header', '') 390 | (-1, 'text', '') 391 | (3, 'predicted header', '') 392 | (3, 'predicted header', '') 393 | (-2, 'textsmall', '') 394 | (-2, 'textsmall', '') 395 | (-2, 'textsmall', '') 396 | (2, 'predicted header', '') 397 | (3, 'predicted header', '') 398 | (3, 'predicted header', '') 399 | (-1, 'text', '') 400 | (-1, 'text', '') 401 | (-2, 'textsmall', '') 402 | (-2, 'textsmall', '') 403 | (-2, 'textsmall', '') 404 | (2, 'predicted header', '') 405 | (3, 'predicted header', '') 406 | (-1, 'text', '') 407 | (-1, 'text', '') 408 | (-1, 'text', '') 409 | (3, 'predicted header', '') 410 | (-2, 'textsmall', '') 411 | (-2, 'textsmall', '') 412 | (-2, 'textsmall', '') 413 | (-1, 'text', '') 414 | (3, 'predicted header', '') 415 | (-1, 'text', '') 416 | (-2, 'textsmall', '') 417 | (-2, 'textsmall', '') 418 | (-2, 'textsmall', '') 419 | (3, 'predicted header', '') 420 | (-1, 'text', '') 421 | (3, 'predicted header', '') 422 | (3, 'predicted header', '') 423 | (3, 'predicted header', '') 424 | (-2, 'textsmall', '') 425 | (-2, 'textsmall', '') 426 | (-2, 'textsmall', '') 427 | (-1, 'text', '') 428 | (-1, 'text', '') 429 | (3, 'predicted header', '') 430 | (2, 'predicted header', '') 431 | (3, 'predicted header', '') 432 | (-1, 'text', '') 433 | (-1, 'text', '') 434 | (3, 'predicted header', '') 435 | (-2, 'textsmall', '') 436 | (-2, 'textsmall', '') 437 | (-2, 'textsmall', '') 438 | (2, 'predicted header', '') 439 | (3, 'predicted header', '') 440 | (-1, 'text', '') 441 | (-1, 'text', '') 442 | (-1, 'text', '') 443 | (-1, 'text', '') 444 | (3, 'predicted header', '') 445 | (-1, 'text', '') 446 | (-1, 'text', '') 447 | (-1, 'text', '') 448 | (3, 'predicted header', '') 449 | (-2, 'textsmall', '') 450 | (-2, 'textsmall', '') 451 | (-2, 'textsmall', '') 452 | (-1, 'text', '') 453 | (-1, 'text', '') 454 | (-1, 'text', '') 455 | (3, 'predicted header', '') 456 | (3, 'predicted header', '') 457 | (-2, 'textsmall', '') 458 | (-2, 'textsmall', '') 459 | (-2, 'textsmall', '') 460 | (1, 'item', 'item1b') 461 | (-1, 'text', '') 462 | (1, 'item', 'item1c') 463 | (2, 'predicted header', '') 464 | (-1, 'text', '') 465 | (-1, 'text', '') 466 | (-1, 'text', '') 467 | (-1, 'text', '') 468 | (-1, 'text', '') 469 | (-1, 'text', '') 470 | (-1, 'text', '') 471 | (-2, 'textsmall', '') 472 | (-2, 'textsmall', '') 473 | (-2, 'textsmall', '') 474 | (-1, 'text', '') 475 | (-1, 'text', '') 476 | (-1, 'text', '') 477 | (-1, 'text', '') 478 | (2, 'predicted header', '') 479 | (-1, 'text', '') 480 | (-1, 'text', '') 481 | (-2, 'textsmall', '') 482 | (-2, 'textsmall', '') 483 | (-2, 'textsmall', '') 484 | (1, 'item', 'item2') 485 | (-1, 'text', '') 486 | (-1, 'text', '') 487 | (-1, 'text', '') 488 | (-1, 'text', '') 489 | (1, 'item', 'item3') 490 | (-1, 'text', '') 491 | (1, 'item', 'item4') 492 | (-1, 'text', '') 493 | (-2, 'textsmall', '') 494 | (-2, 'textsmall', '') 495 | (-2, 'textsmall', '') 496 | (0, 'part', 'partii') 497 | (1, 'item', 'item5') 498 | (2, 'predicted header', '') 499 | (-1, 'text', '') 500 | (2, 'predicted header', '') 501 | (-1, 'text', '') 502 | (-1, 'text', '') 503 | (-1, 'text', '') 504 | (-1, 'text', '') 505 | (-1, 'text', '') 506 | (-1, 'text', '') 507 | (-2, 'textsmall', '') 508 | (-2, 'textsmall', '') 509 | (-2, 'textsmall', '') 510 | (1, 'item', 'item6') 511 | (-2, 'textsmall', '') 512 | (-2, 'textsmall', '') 513 | (-2, 'textsmall', '') 514 | (1, 'item', 
'item7') 515 | (-1, 'text', '') 516 | (2, 'predicted header', '') 517 | (-1, 'text', '') 518 | (-1, 'text', '') 519 | (-1, 'text', '') 520 | (-1, 'text', '') 521 | (-1, 'text', '') 522 | (-1, 'text', '') 523 | (-1, 'text', '') 524 | (-1, 'text', '') 525 | (-1, 'text', '') 526 | (-1, 'text', '') 527 | (-1, 'text', '') 528 | (-1, 'text', '') 529 | (-1, 'text', '') 530 | (-1, 'text', '') 531 | (-2, 'textsmall', '') 532 | (-2, 'textsmall', '') 533 | (-2, 'textsmall', '') 534 | (3, 'predicted header', '') 535 | (-1, 'text', '') 536 | (3, 'predicted header', '') 537 | (-1, 'text', '') 538 | (-1, 'text', '') 539 | (-1, 'text', '') 540 | (-1, 'text', '') 541 | (-1, 'text', '') 542 | (3, 'predicted header', '') 543 | (-1, 'text', '') 544 | (3, 'predicted header', '') 545 | (-1, 'text', '') 546 | (-2, 'textsmall', '') 547 | (-2, 'textsmall', '') 548 | (-2, 'textsmall', '') 549 | (3, 'predicted header', '') 550 | (-1, 'text', '') 551 | (-1, 'text', '') 552 | (3, 'predicted header', '') 553 | (-1, 'text', '') 554 | (-1, 'text', '') 555 | (4, 'predicted header', '') 556 | (-1, 'text', '') 557 | (-1, 'text', '') 558 | (-2, 'textsmall', '') 559 | (-2, 'textsmall', '') 560 | (-2, 'textsmall', '') 561 | (4, 'predicted header', '') 562 | (-1, 'text', '') 563 | (-1, 'text', '') 564 | (4, 'predicted header', '') 565 | (-1, 'text', '') 566 | (-1, 'text', '') 567 | (-2, 'textsmall', '') 568 | (-2, 'textsmall', '') 569 | (-2, 'textsmall', '') 570 | (2, 'predicted header', '') 571 | (-1, 'text', '') 572 | (-1, 'text', '') 573 | (4, 'predicted header', '') 574 | (-1, 'text', '') 575 | (-1, 'text', '') 576 | (-1, 'text', '') 577 | (-1, 'text', '') 578 | (-1, 'text', '') 579 | (-1, 'text', '') 580 | (-1, 'text', '') 581 | (-1, 'text', '') 582 | (-2, 'textsmall', '') 583 | (-2, 'textsmall', '') 584 | (-2, 'textsmall', '') 585 | (2, 'predicted header', '') 586 | (-1, 'text', '') 587 | (3, 'predicted header', '') 588 | (4, 'predicted header', '') 589 | (5, 'predicted header', '') 590 | (-1, 'text', '') 591 | (-1, 'text', '') 592 | (-1, 'text', '') 593 | (-1, 'text', '') 594 | (-1, 'text', '') 595 | (-1, 'text', '') 596 | (-1, 'text', '') 597 | (-1, 'text', '') 598 | (5, 'predicted header', '') 599 | (-1, 'text', '') 600 | (-1, 'text', '') 601 | (-1, 'text', '') 602 | (-2, 'textsmall', '') 603 | (-2, 'textsmall', '') 604 | (-2, 'textsmall', '') 605 | (-1, 'text', '') 606 | (-1, 'text', '') 607 | (-1, 'text', '') 608 | (5, 'predicted header', '') 609 | (-1, 'text', '') 610 | (-1, 'text', '') 611 | (-1, 'text', '') 612 | (-1, 'text', '') 613 | (-1, 'text', '') 614 | (-1, 'text', '') 615 | (-1, 'text', '') 616 | (-1, 'text', '') 617 | (2, 'predicted header', '') 618 | (3, 'predicted header', '') 619 | (-1, 'text', '') 620 | (-1, 'text', '') 621 | (4, 'predicted header', '') 622 | (-1, 'text', '') 623 | (3, 'predicted header', '') 624 | (-1, 'text', '') 625 | (-2, 'textsmall', '') 626 | (-2, 'textsmall', '') 627 | (-2, 'textsmall', '') 628 | (-1, 'text', '') 629 | (4, 'predicted header', '') 630 | (-1, 'text', '') 631 | (3, 'predicted header', '') 632 | (-1, 'text', '') 633 | (-1, 'text', '') 634 | (4, 'predicted header', '') 635 | (-1, 'text', '') 636 | (2, 'predicted header', '') 637 | (-1, 'text', '') 638 | (-1, 'text', '') 639 | (-1, 'text', '') 640 | (4, 'predicted header', '') 641 | (-1, 'text', '') 642 | (-2, 'textsmall', '') 643 | (-2, 'textsmall', '') 644 | (-2, 'textsmall', '') 645 | (2, 'predicted header', '') 646 | (3, 'predicted header', '') 647 | (-1, 'text', '') 648 | (-1, 'text', '') 649 | (-1, 'text', '') 
650 | (-1, 'text', '') 651 | (3, 'predicted header', '') 652 | (-1, 'text', '') 653 | (-1, 'text', '') 654 | (2, 'predicted header', '') 655 | (-1, 'text', '') 656 | (-2, 'textsmall', '') 657 | (-2, 'textsmall', '') 658 | (-2, 'textsmall', '') 659 | (-1, 'text', '') 660 | (-1, 'text', '') 661 | (-1, 'text', '') 662 | (2, 'predicted header', '') 663 | (-1, 'text', '') 664 | (3, 'predicted header', '') 665 | (-1, 'text', '') 666 | (3, 'predicted header', '') 667 | (-1, 'text', '') 668 | (-2, 'textsmall', '') 669 | (-2, 'textsmall', '') 670 | (-2, 'textsmall', '') 671 | (-1, 'text', '') 672 | (3, 'predicted header', '') 673 | (-1, 'text', '') 674 | (3, 'predicted header', '') 675 | (-1, 'text', '') 676 | (3, 'predicted header', '') 677 | (-1, 'text', '') 678 | (-1, 'text', '') 679 | (-1, 'text', '') 680 | (-1, 'text', '') 681 | (-2, 'textsmall', '') 682 | (-2, 'textsmall', '') 683 | (-2, 'textsmall', '') 684 | (3, 'predicted header', '') 685 | (4, 'predicted header', '') 686 | (-1, 'text', '') 687 | (-1, 'text', '') 688 | (-1, 'text', '') 689 | (-1, 'text', '') 690 | (-1, 'text', '') 691 | (-1, 'text', '') 692 | (4, 'predicted header', '') 693 | (-1, 'text', '') 694 | (4, 'predicted header', '') 695 | (-1, 'text', '') 696 | (4, 'predicted header', '') 697 | (-1, 'text', '') 698 | (4, 'predicted header', '') 699 | (-1, 'text', '') 700 | (-2, 'textsmall', '') 701 | (-2, 'textsmall', '') 702 | (-2, 'textsmall', '') 703 | (2, 'predicted header', '') 704 | (-1, 'text', '') 705 | (2, 'predicted header', '') 706 | (-1, 'text', '') 707 | (3, 'predicted header', '') 708 | (-1, 'text', '') 709 | (-1, 'text', '') 710 | (-1, 'text', '') 711 | (-1, 'text', '') 712 | (-1, 'text', '') 713 | (3, 'predicted header', '') 714 | (-1, 'text', '') 715 | (-2, 'textsmall', '') 716 | (-2, 'textsmall', '') 717 | (-2, 'textsmall', '') 718 | (-1, 'text', '') 719 | (3, 'predicted header', '') 720 | (-1, 'text', '') 721 | (-1, 'text', '') 722 | (-1, 'text', '') 723 | (3, 'predicted header', '') 724 | (-1, 'text', '') 725 | (3, 'predicted header', '') 726 | (-1, 'text', '') 727 | (-2, 'textsmall', '') 728 | (-2, 'textsmall', '') 729 | (-2, 'textsmall', '') 730 | (3, 'predicted header', '') 731 | (-1, 'text', '') 732 | (3, 'predicted header', '') 733 | (-1, 'text', '') 734 | (-2, 'textsmall', '') 735 | (-2, 'textsmall', '') 736 | (-2, 'textsmall', '') 737 | (3, 'predicted header', '') 738 | (-1, 'text', '') 739 | (-1, 'text', '') 740 | (-1, 'text', '') 741 | (-1, 'text', '') 742 | (-1, 'text', '') 743 | (-2, 'textsmall', '') 744 | (-2, 'textsmall', '') 745 | (-2, 'textsmall', '') 746 | (1, 'item', 'item7a') 747 | (2, 'predicted header', '') 748 | (-1, 'text', '') 749 | (3, 'predicted header', '') 750 | (-1, 'text', '') 751 | (3, 'predicted header', '') 752 | (-1, 'text', '') 753 | (3, 'predicted header', '') 754 | (-1, 'text', '') 755 | (3, 'predicted header', '') 756 | (-1, 'text', '') 757 | (2, 'predicted header', '') 758 | (-1, 'text', '') 759 | (-1, 'text', '') 760 | (-2, 'textsmall', '') 761 | (-2, 'textsmall', '') 762 | (-2, 'textsmall', '') 763 | (1, 'item', 'item8') 764 | (2, 'predicted header', '') 765 | (-1, 'text', '') 766 | (-1, 'text', '') 767 | (-2, 'textsmall', '') 768 | (-2, 'textsmall', '') 769 | (-2, 'textsmall', '') 770 | (2, 'predicted header', '') 771 | (-1, 'text', '') 772 | (-1, 'text', '') 773 | (-2, 'textsmall', '') 774 | (-2, 'textsmall', '') 775 | (-2, 'textsmall', '') 776 | (2, 'predicted header', '') 777 | (-1, 'text', '') 778 | (-1, 'text', '') 779 | (-2, 'textsmall', '') 780 | (-2, 
'textsmall', '') 781 | (-2, 'textsmall', '') 782 | (2, 'predicted header', '') 783 | (-1, 'text', '') 784 | (-1, 'text', '') 785 | (-2, 'textsmall', '') 786 | (-2, 'textsmall', '') 787 | (-2, 'textsmall', '') 788 | (2, 'predicted header', '') 789 | (-1, 'text', '') 790 | (-1, 'text', '') 791 | (-2, 'textsmall', '') 792 | (-2, 'textsmall', '') 793 | (-2, 'textsmall', '') 794 | (2, 'predicted header', '') 795 | (3, 'predicted header', '') 796 | (4, 'predicted header', '') 797 | (-1, 'text', '') 798 | (-1, 'text', '') 799 | (4, 'predicted header', '') 800 | (-1, 'text', '') 801 | (4, 'predicted header', '') 802 | (-1, 'text', '') 803 | (-1, 'text', '') 804 | (4, 'predicted header', '') 805 | (-1, 'text', '') 806 | (4, 'predicted header', '') 807 | (5, 'predicted header', '') 808 | (-1, 'text', '') 809 | (-1, 'text', '') 810 | (-2, 'textsmall', '') 811 | (-2, 'textsmall', '') 812 | (-2, 'textsmall', '') 813 | (5, 'predicted header', '') 814 | (-1, 'text', '') 815 | (6, 'predicted header', '') 816 | (-1, 'text', '') 817 | (-1, 'text', '') 818 | (-1, 'text', '') 819 | (-1, 'text', '') 820 | (-1, 'text', '') 821 | (-1, 'text', '') 822 | (6, 'predicted header', '') 823 | (-1, 'text', '') 824 | (-2, 'textsmall', '') 825 | (-2, 'textsmall', '') 826 | (-2, 'textsmall', '') 827 | (-1, 'text', '') 828 | (-1, 'text', '') 829 | (-1, 'text', '') 830 | (-1, 'text', '') 831 | (5, 'predicted header', '') 832 | (-1, 'text', '') 833 | (-1, 'text', '') 834 | (-1, 'text', '') 835 | (-1, 'text', '') 836 | (-1, 'text', '') 837 | (-1, 'text', '') 838 | (-2, 'textsmall', '') 839 | (-2, 'textsmall', '') 840 | (-2, 'textsmall', '') 841 | (-1, 'text', '') 842 | (-1, 'text', '') 843 | (-1, 'text', '') 844 | (-1, 'text', '') 845 | (-1, 'text', '') 846 | (-1, 'text', '') 847 | (5, 'predicted header', '') 848 | (-1, 'text', '') 849 | (-1, 'text', '') 850 | (4, 'predicted header', '') 851 | (-1, 'text', '') 852 | (-2, 'textsmall', '') 853 | (-2, 'textsmall', '') 854 | (-2, 'textsmall', '') 855 | (4, 'predicted header', '') 856 | (-1, 'text', '') 857 | (4, 'predicted header', '') 858 | (-1, 'text', '') 859 | (4, 'predicted header', '') 860 | (-1, 'text', '') 861 | (4, 'predicted header', '') 862 | (-1, 'text', '') 863 | (-1, 'text', '') 864 | (4, 'predicted header', '') 865 | (-1, 'text', '') 866 | (4, 'predicted header', '') 867 | (5, 'predicted header', '') 868 | (-1, 'text', '') 869 | (-2, 'textsmall', '') 870 | (-2, 'textsmall', '') 871 | (-2, 'textsmall', '') 872 | (-1, 'text', '') 873 | (-1, 'text', '') 874 | (-1, 'text', '') 875 | (5, 'predicted header', '') 876 | (-1, 'text', '') 877 | (-1, 'text', '') 878 | (-1, 'text', '') 879 | (-1, 'text', '') 880 | (4, 'predicted header', '') 881 | (-1, 'text', '') 882 | (-1, 'text', '') 883 | (-2, 'textsmall', '') 884 | (-2, 'textsmall', '') 885 | (-2, 'textsmall', '') 886 | (-1, 'text', '') 887 | (-1, 'text', '') 888 | (-1, 'text', '') 889 | (-1, 'text', '') 890 | (4, 'predicted header', '') 891 | (-1, 'text', '') 892 | (4, 'predicted header', '') 893 | (-1, 'text', '') 894 | (4, 'predicted header', '') 895 | (-1, 'text', '') 896 | (-1, 'text', '') 897 | (-2, 'textsmall', '') 898 | (-2, 'textsmall', '') 899 | (-2, 'textsmall', '') 900 | (-1, 'text', '') 901 | (4, 'predicted header', '') 902 | (-1, 'text', '') 903 | (4, 'predicted header', '') 904 | (-1, 'text', '') 905 | (4, 'predicted header', '') 906 | (-1, 'text', '') 907 | (4, 'predicted header', '') 908 | (5, 'predicted header', '') 909 | (-1, 'text', '') 910 | (5, 'predicted header', '') 911 | (-1, 'text', '') 912 | (3, 
'predicted header', '') 913 | (-1, 'text', '') 914 | (-2, 'textsmall', '') 915 | (-2, 'textsmall', '') 916 | (-2, 'textsmall', '') 917 | (-1, 'text', '') 918 | (-1, 'text', '') 919 | (-1, 'text', '') 920 | (3, 'predicted header', '') 921 | (-1, 'text', '') 922 | (-1, 'text', '') 923 | (-1, 'text', '') 924 | (4, 'predicted header', '') 925 | (-1, 'text', '') 926 | (-1, 'text', '') 927 | (-1, 'text', '') 928 | (-1, 'text', '') 929 | (-2, 'textsmall', '') 930 | (-2, 'textsmall', '') 931 | (-2, 'textsmall', '') 932 | (3, 'predicted header', '') 933 | (4, 'predicted header', '') 934 | (-1, 'text', '') 935 | (-1, 'text', '') 936 | (-2, 'textsmall', '') 937 | (-2, 'textsmall', '') 938 | (-2, 'textsmall', '') 939 | (-1, 'text', '') 940 | (-1, 'text', '') 941 | (-1, 'text', '') 942 | (-2, 'textsmall', '') 943 | (-2, 'textsmall', '') 944 | (-2, 'textsmall', '') 945 | (4, 'predicted header', '') 946 | (-1, 'text', '') 947 | (-1, 'text', '') 948 | (-1, 'text', '') 949 | (-1, 'text', '') 950 | (4, 'predicted header', '') 951 | (-1, 'text', '') 952 | (-1, 'text', '') 953 | (-2, 'textsmall', '') 954 | (-2, 'textsmall', '') 955 | (-2, 'textsmall', '') 956 | (3, 'predicted header', '') 957 | (-1, 'text', '') 958 | (4, 'predicted header', '') 959 | (-1, 'text', '') 960 | (-1, 'text', '') 961 | (-1, 'text', '') 962 | (4, 'predicted header', '') 963 | (-1, 'text', '') 964 | (-1, 'text', '') 965 | (4, 'predicted header', '') 966 | (-1, 'text', '') 967 | (4, 'predicted header', '') 968 | (-1, 'text', '') 969 | (4, 'predicted header', '') 970 | (-1, 'text', '') 971 | (-2, 'textsmall', '') 972 | (-2, 'textsmall', '') 973 | (-2, 'textsmall', '') 974 | (-1, 'text', '') 975 | (-1, 'text', '') 976 | (4, 'predicted header', '') 977 | (-1, 'text', '') 978 | (-1, 'text', '') 979 | (-1, 'text', '') 980 | (-1, 'text', '') 981 | (-1, 'text', '') 982 | (-2, 'textsmall', '') 983 | (-2, 'textsmall', '') 984 | (-2, 'textsmall', '') 985 | (-1, 'text', '') 986 | (-1, 'text', '') 987 | (-1, 'text', '') 988 | (-1, 'text', '') 989 | (3, 'predicted header', '') 990 | (-1, 'text', '') 991 | (-1, 'text', '') 992 | (-2, 'textsmall', '') 993 | (-2, 'textsmall', '') 994 | (-2, 'textsmall', '') 995 | (3, 'predicted header', '') 996 | (-1, 'text', '') 997 | (-1, 'text', '') 998 | (-1, 'text', '') 999 | (-1, 'text', '') 1000 | (3, 'predicted header', '') 1001 | (4, 'predicted header', '') 1002 | (-1, 'text', '') 1003 | (-1, 'text', '') 1004 | (-1, 'text', '') 1005 | (-1, 'text', '') 1006 | (-1, 'text', '') 1007 | (-2, 'textsmall', '') 1008 | (-2, 'textsmall', '') 1009 | (-2, 'textsmall', '') 1010 | (-1, 'text', '') 1011 | (-1, 'text', '') 1012 | (-1, 'text', '') 1013 | (-1, 'text', '') 1014 | (-1, 'text', '') 1015 | (-1, 'text', '') 1016 | (-1, 'text', '') 1017 | (-1, 'text', '') 1018 | (4, 'predicted header', '') 1019 | (-1, 'text', '') 1020 | (-1, 'text', '') 1021 | (-1, 'text', '') 1022 | (-1, 'text', '') 1023 | (-1, 'text', '') 1024 | (-2, 'textsmall', '') 1025 | (-2, 'textsmall', '') 1026 | (-2, 'textsmall', '') 1027 | (-1, 'text', '') 1028 | (-1, 'text', '') 1029 | (3, 'predicted header', '') 1030 | (-1, 'text', '') 1031 | (-1, 'text', '') 1032 | (-1, 'text', '') 1033 | (-1, 'text', '') 1034 | (-1, 'text', '') 1035 | (4, 'predicted header', '') 1036 | (-1, 'text', '') 1037 | (-1, 'text', '') 1038 | (-2, 'textsmall', '') 1039 | (-2, 'textsmall', '') 1040 | (-2, 'textsmall', '') 1041 | (3, 'predicted header', '') 1042 | (-1, 'text', '') 1043 | (-1, 'text', '') 1044 | (-1, 'text', '') 1045 | (-1, 'text', '') 1046 | (-1, 'text', '') 1047 
| (-1, 'text', '') 1048 | (-1, 'text', '') 1049 | (-1, 'text', '') 1050 | (-1, 'text', '') 1051 | (-2, 'textsmall', '') 1052 | (-2, 'textsmall', '') 1053 | (-2, 'textsmall', '') 1054 | (3, 'predicted header', '') 1055 | (4, 'predicted header', '') 1056 | (-1, 'text', '') 1057 | (4, 'predicted header', '') 1058 | (-1, 'text', '') 1059 | (-1, 'text', '') 1060 | (-1, 'text', '') 1061 | (-1, 'text', '') 1062 | (-1, 'text', '') 1063 | (-1, 'text', '') 1064 | (-2, 'textsmall', '') 1065 | (-2, 'textsmall', '') 1066 | (-2, 'textsmall', '') 1067 | (-1, 'text', '') 1068 | (-1, 'text', '') 1069 | (3, 'predicted header', '') 1070 | (4, 'predicted header', '') 1071 | (-1, 'text', '') 1072 | (-1, 'text', '') 1073 | (-1, 'text', '') 1074 | (-1, 'text', '') 1075 | (-2, 'textsmall', '') 1076 | (-2, 'textsmall', '') 1077 | (-2, 'textsmall', '') 1078 | (4, 'predicted header', '') 1079 | (-1, 'text', '') 1080 | (-1, 'text', '') 1081 | (-1, 'text', '') 1082 | (-1, 'text', '') 1083 | (-1, 'text', '') 1084 | (-2, 'textsmall', '') 1085 | (-2, 'textsmall', '') 1086 | (-2, 'textsmall', '') 1087 | (-1, 'text', '') 1088 | (-1, 'text', '') 1089 | (-1, 'text', '') 1090 | (-1, 'text', '') 1091 | (-1, 'text', '') 1092 | (-1, 'text', '') 1093 | (4, 'predicted header', '') 1094 | (-1, 'text', '') 1095 | (-2, 'textsmall', '') 1096 | (-2, 'textsmall', '') 1097 | (-2, 'textsmall', '') 1098 | (-1, 'text', '') 1099 | (-1, 'text', '') 1100 | (-1, 'text', '') 1101 | (-1, 'text', '') 1102 | (-1, 'text', '') 1103 | (-1, 'text', '') 1104 | (3, 'predicted header', '') 1105 | (-1, 'text', '') 1106 | (-1, 'text', '') 1107 | (-1, 'text', '') 1108 | (-1, 'text', '') 1109 | (-1, 'text', '') 1110 | (-2, 'textsmall', '') 1111 | (-2, 'textsmall', '') 1112 | (-2, 'textsmall', '') 1113 | (3, 'predicted header', '') 1114 | (-1, 'text', '') 1115 | (-1, 'text', '') 1116 | (-1, 'text', '') 1117 | (-1, 'text', '') 1118 | (-1, 'text', '') 1119 | (-1, 'text', '') 1120 | (-1, 'text', '') 1121 | (-2, 'textsmall', '') 1122 | (-2, 'textsmall', '') 1123 | (-2, 'textsmall', '') 1124 | (-1, 'text', '') 1125 | (-1, 'text', '') 1126 | (-1, 'text', '') 1127 | (3, 'predicted header', '') 1128 | (4, 'predicted header', '') 1129 | (-1, 'text', '') 1130 | (-1, 'text', '') 1131 | (-2, 'textsmall', '') 1132 | (-2, 'textsmall', '') 1133 | (-2, 'textsmall', '') 1134 | (4, 'predicted header', '') 1135 | (-1, 'text', '') 1136 | (4, 'predicted header', '') 1137 | (-1, 'text', '') 1138 | (-1, 'text', '') 1139 | (3, 'predicted header', '') 1140 | (4, 'predicted header', '') 1141 | (-1, 'text', '') 1142 | (-1, 'text', '') 1143 | (4, 'predicted header', '') 1144 | (-1, 'text', '') 1145 | (-1, 'text', '') 1146 | (-1, 'text', '') 1147 | (-1, 'text', '') 1148 | (-2, 'textsmall', '') 1149 | (-2, 'textsmall', '') 1150 | (-2, 'textsmall', '') 1151 | (-1, 'text', '') 1152 | (4, 'predicted header', '') 1153 | (-1, 'text', '') 1154 | (-1, 'text', '') 1155 | (-1, 'text', '') 1156 | (-2, 'textsmall', '') 1157 | (-2, 'textsmall', '') 1158 | (-2, 'textsmall', '') 1159 | (3, 'predicted header', '') 1160 | (-1, 'text', '') 1161 | (-1, 'text', '') 1162 | (3, 'predicted header', '') 1163 | (-1, 'text', '') 1164 | (-1, 'text', '') 1165 | (-1, 'text', '') 1166 | (4, 'predicted header', '') 1167 | (-1, 'text', '') 1168 | (-2, 'textsmall', '') 1169 | (-2, 'textsmall', '') 1170 | (-2, 'textsmall', '') 1171 | (5, 'predicted header', '') 1172 | (-1, 'text', '') 1173 | (5, 'predicted header', '') 1174 | (-1, 'text', '') 1175 | (-1, 'text', '') 1176 | (-1, 'text', '') 1177 | (-1, 'text', '') 1178 | 
(-1, 'text', '') 1179 | (-1, 'text', '') 1180 | (4, 'predicted header', '') 1181 | (-1, 'text', '') 1182 | (-1, 'text', '') 1183 | (-1, 'text', '') 1184 | (-1, 'text', '') 1185 | (-2, 'textsmall', '') 1186 | (-2, 'textsmall', '') 1187 | (-2, 'textsmall', '') 1188 | (4, 'predicted header', '') 1189 | (-1, 'text', '') 1190 | (3, 'predicted header', '') 1191 | (-1, 'text', '') 1192 | (-1, 'text', '') 1193 | (4, 'predicted header', '') 1194 | (-1, 'text', '') 1195 | (-1, 'text', '') 1196 | (-1, 'text', '') 1197 | (-1, 'text', '') 1198 | (-1, 'text', '') 1199 | (4, 'predicted header', '') 1200 | (-1, 'text', '') 1201 | (-1, 'text', '') 1202 | (-1, 'text', '') 1203 | (4, 'predicted header', '') 1204 | (-1, 'text', '') 1205 | (-1, 'text', '') 1206 | (-1, 'text', '') 1207 | (-2, 'textsmall', '') 1208 | (-2, 'textsmall', '') 1209 | (-2, 'textsmall', '') 1210 | (-1, 'text', '') 1211 | (-1, 'text', '') 1212 | (-1, 'text', '') 1213 | (-1, 'text', '') 1214 | (-1, 'text', '') 1215 | (-1, 'text', '') 1216 | (-1, 'text', '') 1217 | (-1, 'text', '') 1218 | (-1, 'text', '') 1219 | (-2, 'textsmall', '') 1220 | (-2, 'textsmall', '') 1221 | (-2, 'textsmall', '') 1222 | (-1, 'text', '') 1223 | (-1, 'text', '') 1224 | (-1, 'text', '') 1225 | (-1, 'text', '') 1226 | (-1, 'text', '') 1227 | (-1, 'text', '') 1228 | (-1, 'text', '') 1229 | (-2, 'textsmall', '') 1230 | (-2, 'textsmall', '') 1231 | (-2, 'textsmall', '') 1232 | (4, 'predicted header', '') 1233 | (-1, 'text', '') 1234 | (4, 'predicted header', '') 1235 | (-1, 'text', '') 1236 | (-1, 'text', '') 1237 | (4, 'predicted header', '') 1238 | (-1, 'text', '') 1239 | (-1, 'text', '') 1240 | (4, 'predicted header', '') 1241 | (-1, 'text', '') 1242 | (5, 'predicted header', '') 1243 | (6, 'predicted header', '') 1244 | (-1, 'text', '') 1245 | (-2, 'textsmall', '') 1246 | (-2, 'textsmall', '') 1247 | (-2, 'textsmall', '') 1248 | (-1, 'text', '') 1249 | (-1, 'text', '') 1250 | (-1, 'text', '') 1251 | (-1, 'text', '') 1252 | (-1, 'text', '') 1253 | (-1, 'text', '') 1254 | (6, 'predicted header', '') 1255 | (-1, 'text', '') 1256 | (-1, 'text', '') 1257 | (-1, 'text', '') 1258 | (-1, 'text', '') 1259 | (-1, 'text', '') 1260 | (-1, 'text', '') 1261 | (-1, 'text', '') 1262 | (-1, 'text', '') 1263 | (-1, 'text', '') 1264 | (5, 'predicted header', '') 1265 | (6, 'predicted header', '') 1266 | (-1, 'text', '') 1267 | (-2, 'textsmall', '') 1268 | (-2, 'textsmall', '') 1269 | (-2, 'textsmall', '') 1270 | (-1, 'text', '') 1271 | (6, 'predicted header', '') 1272 | (-1, 'text', '') 1273 | (-1, 'text', '') 1274 | (-1, 'text', '') 1275 | (-1, 'text', '') 1276 | (-1, 'text', '') 1277 | (-1, 'text', '') 1278 | (5, 'predicted header', '') 1279 | (6, 'predicted header', '') 1280 | (-1, 'text', '') 1281 | (-1, 'text', '') 1282 | (6, 'predicted header', '') 1283 | (-1, 'text', '') 1284 | (-1, 'text', '') 1285 | (-1, 'text', '') 1286 | (-2, 'textsmall', '') 1287 | (-2, 'textsmall', '') 1288 | (-2, 'textsmall', '') 1289 | (-1, 'text', '') 1290 | (-1, 'text', '') 1291 | (-1, 'text', '') 1292 | (-1, 'text', '') 1293 | (-1, 'text', '') 1294 | (-1, 'text', '') 1295 | (-1, 'text', '') 1296 | (-1, 'text', '') 1297 | (-2, 'textsmall', '') 1298 | (-2, 'textsmall', '') 1299 | (-2, 'textsmall', '') 1300 | (1, 'item', 'item9') 1301 | (-1, 'text', '') 1302 | (1, 'item', 'item9a') 1303 | (-1, 'text', '') 1304 | (2, 'predicted header', '') 1305 | (-1, 'text', '') 1306 | (-1, 'text', '') 1307 | (-2, 'textsmall', '') 1308 | (-2, 'textsmall', '') 1309 | (-2, 'textsmall', '') 1310 | (2, 'predicted 
header', '') 1311 | (-1, 'text', '') 1312 | (2, 'predicted header', '') 1313 | (-1, 'text', '') 1314 | (-1, 'text', '') 1315 | (-1, 'text', '') 1316 | (2, 'predicted header', '') 1317 | (-1, 'text', '') 1318 | (-1, 'text', '') 1319 | (2, 'predicted header', '') 1320 | (-1, 'text', '') 1321 | (-2, 'textsmall', '') 1322 | (-2, 'textsmall', '') 1323 | (-2, 'textsmall', '') 1324 | (-1, 'text', '') 1325 | (-1, 'text', '') 1326 | (-1, 'text', '') 1327 | (-1, 'text', '') 1328 | (-2, 'textsmall', '') 1329 | (-2, 'textsmall', '') 1330 | (-2, 'textsmall', '') 1331 | (1, 'item', 'item9b') 1332 | (2, 'predicted header', '') 1333 | (-1, 'text', '') 1334 | (1, 'item', 'item9c') 1335 | (-1, 'text', '') 1336 | (0, 'part', 'partiii') 1337 | (1, 'item', 'item10') 1338 | (-1, 'text', '') 1339 | (-1, 'text', '') 1340 | (-1, 'text', '') 1341 | (-1, 'text', '') 1342 | (-1, 'text', '') 1343 | (-1, 'text', '') 1344 | (-2, 'textsmall', '') 1345 | (-2, 'textsmall', '') 1346 | (-2, 'textsmall', '') 1347 | (1, 'item', 'item11') 1348 | (-1, 'text', '') 1349 | (1, 'item', 'item12') 1350 | (-1, 'text', '') 1351 | (1, 'item', 'item13') 1352 | (-1, 'text', '') 1353 | (1, 'item', 'item14') 1354 | (-1, 'text', '') 1355 | (-2, 'textsmall', '') 1356 | (-2, 'textsmall', '') 1357 | (-2, 'textsmall', '') 1358 | (0, 'part', 'partiv') 1359 | (1, 'item', 'item15') 1360 | (2, 'predicted header', '') 1361 | (-1, 'text', '') 1362 | (-1, 'text', '') 1363 | (2, 'predicted header', '') 1364 | (-1, 'text', '') 1365 | (-2, 'textsmall', '') 1366 | (-2, 'textsmall', '') 1367 | (-2, 'textsmall', '') 1368 | (-1, 'text', '') 1369 | (-2, 'textsmall', '') 1370 | (-2, 'textsmall', '') 1371 | (-2, 'textsmall', '') 1372 | (-1, 'text', '') 1373 | (-2, 'textsmall', '') 1374 | (-2, 'textsmall', '') 1375 | (-2, 'textsmall', '') 1376 | (-1, 'text', '') 1377 | (-2, 'textsmall', '') 1378 | (-2, 'textsmall', '') 1379 | (-2, 'textsmall', '') 1380 | (-1, 'text', '') 1381 | (-2, 'textsmall', '') 1382 | (-2, 'textsmall', '') 1383 | (-2, 'textsmall', '') 1384 | (-1, 'text', '') 1385 | (-2, 'textsmall', '') 1386 | (-2, 'textsmall', '') 1387 | (-2, 'textsmall', '') 1388 | (-1, 'text', '') 1389 | (-1, 'text', '') 1390 | (-1, 'text', '') 1391 | (-2, 'textsmall', '') 1392 | (-2, 'textsmall', '') 1393 | (-2, 'textsmall', '') 1394 | (1, 'item', 'item16') 1395 | (-1, 'text', '') 1396 | (-2, 'textsmall', '') 1397 | (0, 'signatures', 'signatures') 1398 | (-1, 'text', '') 1399 | (-1, 'text', '') 1400 | (-1, 'text', '') 1401 | (-1, 'text', '') 1402 | (-2, 'textsmall', '') 1403 | -------------------------------------------------------------------------------- /doc2dict/doc2dict/html/convert_html_to_instructions.py: -------------------------------------------------------------------------------- 1 | from ..utils.strings import check_string_style 2 | # params 3 | tag_groups = { 4 | "bold": ["b", "strong"], 5 | "italic": ["i", "em"], 6 | "underline": ["u", "ins"], 7 | } 8 | 9 | EMPTY_CHARS = ' \t\n\r\xa0' 10 | EMPTY_TABLE_CHARS = ['', '–', '-'] 11 | LEFT_TABLE_CHARS = ['$','('] 12 | RIGHT_TABLE_CHARS = [')','%'] 13 | 14 | def remove_leading_empty_instructions(instructions): 15 | """Remove leading empty/whitespace-only instructions from the list""" 16 | if not instructions: 17 | return instructions 18 | 19 | # Find the first non-empty instruction 20 | first_meaningful_index = 0 21 | for i, instruction in enumerate(instructions): 22 | # Skip non-text instructions (tables, images are meaningful content) 23 | if 'image' in instruction or 'table' in instruction: 24 | 
first_meaningful_index = i 25 | break 26 | 27 | # Check if text instruction has meaningful content 28 | if 'text' in instruction: 29 | text = instruction['text'].strip(EMPTY_CHARS) 30 | if text: # Non-empty after stripping 31 | first_meaningful_index = i 32 | break 33 | else: 34 | # If we get here, all instructions were empty text or whitespace-only 35 | return [] 36 | 37 | # Return sliced list starting from first meaningful instruction 38 | return instructions[first_meaningful_index:] 39 | 40 | def is_empty_instructions(instructions): 41 | """Check if an instruction block contains only whitespace/empty content""" 42 | if not instructions: 43 | return True 44 | 45 | for instruction in instructions: 46 | # Skip non-text instructions (tables, images are meaningful content) 47 | if 'image' in instruction or 'table' in instruction: 48 | return False 49 | 50 | # Check if text instruction has meaningful content 51 | if 'text' in instruction: 52 | text = instruction['text'].strip(EMPTY_CHARS) 53 | if text: # Non-empty after stripping 54 | return False 55 | 56 | # All instructions were either empty text or whitespace-only 57 | return True 58 | 59 | # utils 60 | def walk(node): 61 | yield ("start",node) 62 | for child in node.iter(include_text=True): 63 | yield from walk(child) 64 | 65 | yield ("end",node) 66 | 67 | 68 | def style_to_dict(style_string): 69 | result = {} 70 | if not style_string: 71 | return result 72 | # send to lower case 73 | style_string = style_string.lower() 74 | style_list = [attr.strip(EMPTY_CHARS) for attr in style_string.split(';') if attr.strip(EMPTY_CHARS)] 75 | 76 | for item in style_list: 77 | if ':' in item: 78 | key, value = item.split(':', 1) 79 | result[key.strip(EMPTY_CHARS)] = value.strip(EMPTY_CHARS) 80 | return result 81 | 82 | 83 | def parse_font_shorthand(font_value): 84 | """ 85 | Parse CSS font shorthand property into individual components. 
86 | 87 | Font shorthand syntax: [font-style] [font-variant] [font-weight] font-size [/line-height] font-family 88 | Required: font-size and font-family 89 | Optional (in order): font-style, font-variant, font-weight, line-height 90 | 91 | Examples: 92 | - "bold 10pt Times New Roman" -> {'font-weight': 'bold', 'font-size': '10pt', 'font-family': 'Times New Roman'} 93 | - "italic bold 12px Arial" -> {'font-style': 'italic', 'font-weight': 'bold', 'font-size': '12px', 'font-family': 'Arial'} 94 | """ 95 | if not font_value: 96 | return {} 97 | 98 | # Clean and split the font value 99 | parts = font_value.strip().split() 100 | if len(parts) < 2: # Must have at least font-size and font-family 101 | return {} 102 | 103 | result = {} 104 | i = 0 105 | 106 | # Parse optional properties in order: font-style, font-variant, font-weight 107 | 108 | # Check for font-style (italic, oblique, normal) 109 | if i < len(parts) and parts[i].lower() in ['italic', 'oblique', 'normal']: 110 | if parts[i].lower() == 'italic': 111 | result['font-style'] = 'italic' 112 | i += 1 113 | 114 | # Check for font-variant (small-caps, normal) - we'll skip this for now 115 | if i < len(parts) and parts[i].lower() in ['small-caps', 'normal']: 116 | # Skip font-variant for now since we don't handle it 117 | i += 1 118 | 119 | # Check for font-weight (bold, normal, 100-900, lighter, bolder) 120 | if i < len(parts): 121 | weight = parts[i].lower() 122 | if weight in ['bold', '700']: 123 | result['font-weight'] = 'bold' 124 | i += 1 125 | elif weight in ['normal', '400']: 126 | result['font-weight'] = 'normal' 127 | i += 1 128 | elif weight in ['100', '200', '300', '500', '600', '800', '900', 'lighter', 'bolder']: 129 | result['font-weight'] = weight 130 | i += 1 131 | 132 | # Next must be font-size (required) 133 | if i < len(parts): 134 | size_part = parts[i] 135 | # Handle font-size/line-height format (e.g., "12px/1.5") 136 | if '/' in size_part: 137 | size, line_height = size_part.split('/', 1) 138 | result['font-size'] = size 139 | result['line-height'] = line_height 140 | else: 141 | result['font-size'] = size_part 142 | i += 1 143 | 144 | # Remaining parts are font-family (required) 145 | if i < len(parts): 146 | # Join remaining parts for font family (handles "Times New Roman" etc.) 
147 | font_family = ' '.join(parts[i:]) 148 | # Remove quotes if present 149 | font_family = font_family.strip('\'"') 150 | result['font-family'] = font_family 151 | 152 | return result 153 | 154 | def get_style(node): 155 | increments = [] 156 | stacks = [] 157 | style = node.attributes.get('style', '') 158 | style_dict = style_to_dict(style) 159 | 160 | # Parse font shorthand if present 161 | if 'font' in style_dict: 162 | font_properties = parse_font_shorthand(style_dict['font']) 163 | # Merge parsed properties into style_dict 164 | style_dict.update(font_properties) 165 | 166 | if 'font-weight' in style_dict: 167 | if style_dict['font-weight'] == 'bold': 168 | increments.append('bold') 169 | elif style_dict['font-weight'] == '700': 170 | increments.append('bold') 171 | 172 | if 'font-style' in style_dict: 173 | if style_dict['font-style'] == 'italic': 174 | increments.append('italic') 175 | 176 | if 'text-decoration' in style_dict: 177 | if style_dict['text-decoration'] == 'underline': 178 | increments.append('underline') 179 | 180 | if 'text-align' in style_dict: 181 | if style_dict['text-align'] == 'center': 182 | increments.append('text-center') 183 | 184 | 185 | left_indent = 0 186 | 187 | if 'font-size' in style_dict: 188 | font_size = style_dict['font-size'] 189 | font_size = normalize_to_px(font_size) 190 | stacks.append({'font-size': font_size}) 191 | 192 | if 'text-indent' in style_dict: 193 | indent = style_dict['text-indent'] 194 | indent = normalize_to_px(indent) 195 | left_indent += indent 196 | 197 | if 'padding' in style_dict: 198 | padding_value = style_dict['padding'] 199 | # Handle four-value format: top right bottom left 200 | if padding_value.count(' ') == 3: 201 | _, _, _, left = padding_value.split(' ') 202 | left = normalize_to_px(left) 203 | left_indent += left 204 | # Handle three-value format: top right/left bottom 205 | elif padding_value.count(' ') == 2: 206 | _, right_left, _ = padding_value.split(' ') 207 | right_left = normalize_to_px(right_left) 208 | left_indent += right_left 209 | # Handle two-value format: top/bottom right/left 210 | elif padding_value.count(' ') == 1: 211 | _, right_left = padding_value.split(' ') 212 | right_left = normalize_to_px(right_left) 213 | left_indent += right_left 214 | # Handle single-value format: all sides 215 | else: 216 | padding_value = normalize_to_px(padding_value) 217 | left_indent += padding_value 218 | 219 | # Also handle direct padding-left if specified 220 | if 'padding-left' in style_dict: 221 | padding_left = style_dict['padding-left'] 222 | padding_left = normalize_to_px(padding_left) 223 | left_indent += padding_left 224 | 225 | # Handle margin with the same logic as padding 226 | if 'margin' in style_dict: 227 | margin_value = style_dict['margin'] 228 | # Handle four-value format: top right bottom left 229 | if margin_value.count(' ') == 3: 230 | _, _, _, left = margin_value.split(' ') 231 | left = normalize_to_px(left) 232 | left_indent += left 233 | # Handle three-value format: top right/left bottom 234 | elif margin_value.count(' ') == 2: 235 | _, right_left, _ = margin_value.split(' ') 236 | right_left = normalize_to_px(right_left) 237 | left_indent += right_left 238 | # Handle two-value format: top/bottom right/left 239 | elif margin_value.count(' ') == 1: 240 | _, right_left = margin_value.split(' ') 241 | right_left = normalize_to_px(right_left) 242 | left_indent += right_left 243 | # Handle single-value format: all sides 244 | else: 245 | margin_value = normalize_to_px(margin_value) 246 | 
left_indent += margin_value 247 | 248 | # Handle direct margin-left if specified 249 | if 'margin-left' in style_dict: 250 | margin_left = style_dict['margin-left'] 251 | margin_left = normalize_to_px(margin_left) 252 | left_indent += margin_left 253 | 254 | if 'display' in style_dict: 255 | if style_dict['display'] == 'none': 256 | increments.append('display-none') 257 | 258 | if left_indent != 0: 259 | stacks.append({'left-indent': str(left_indent)}) 260 | return increments, stacks 261 | 262 | def parse_css_value(value_str): 263 | """Extract numeric value and unit from CSS value string""" 264 | if not value_str or not isinstance(value_str, str): 265 | return 0, 'px' 266 | 267 | value_str = value_str.strip(EMPTY_CHARS) 268 | 269 | # Handle non-numeric values 270 | if value_str in ['auto', 'inherit', 'initial']: 271 | return 0, value_str 272 | 273 | # Find where the number ends 274 | numeric_part = '' 275 | for i, char in enumerate(value_str): 276 | if char.isdigit() or char == '.': 277 | numeric_part += char 278 | elif char == '-' and i == 0: # Handle negative values 279 | numeric_part += char 280 | else: 281 | unit = value_str[i:].strip(EMPTY_CHARS) 282 | break 283 | else: 284 | unit = 'px' # Default if no unit specified 285 | 286 | # Convert numeric part to float 287 | try: 288 | value = float(numeric_part) if numeric_part else 0 289 | except ValueError: 290 | value = 0 291 | 292 | return value, unit 293 | 294 | 295 | def normalize_to_px(value_str, font_context=None): 296 | """Convert any CSS measurement to pixels based on context""" 297 | if not value_str: 298 | return 0 299 | 300 | # Parse the value 301 | value, unit = parse_css_value(value_str) 302 | 303 | # Early return for non-numeric values 304 | if unit in ['auto', 'inherit', 'initial']: 305 | return 0 306 | 307 | # Get font context in pixels 308 | current_font_size = 16 # Default 309 | if font_context: 310 | font_value, font_unit = parse_css_value(font_context) 311 | if font_unit == 'px': 312 | current_font_size = font_value 313 | elif font_unit == 'pt': 314 | current_font_size = font_value * 1.333 315 | else: 316 | # For simplicity, treat all other units as approximately 16px 317 | current_font_size = font_value * 16 if font_value else 16 318 | 319 | # Convert to pixels 320 | if unit == 'px': 321 | return value 322 | elif unit == 'pt': 323 | return value * 1.333 324 | elif unit == 'em': 325 | return value * current_font_size 326 | elif unit == 'rem': 327 | return value * 16 # Root em always based on root font size 328 | elif unit == '%': 329 | return value * current_font_size / 100 # % of font size 330 | elif unit == 'ex': 331 | return value * current_font_size / 2 # Roughly half the font size 332 | elif unit == 'ch': 333 | return value * current_font_size * 0.5 # Approximate character width 334 | elif unit in ['vh', 'vw', 'vmin', 'vmax']: 335 | return value # Cannot accurately convert viewport units without screen size 336 | elif unit == 'cm': 337 | return value * 37.8 # Approximate for screen (96dpi) 338 | elif unit == 'mm': 339 | return value * 3.78 # 1/10th of cm 340 | elif unit == 'in': 341 | return value * 96 # Standard 96dpi 342 | elif unit == 'pc': 343 | return value * 16 # 1pc = 12pt 344 | else: 345 | return value # Unknown unit, return as is 346 | 347 | def safe_increment(dct,key): 348 | if key not in dct: 349 | dct[key] = 0 350 | 351 | dct[key] += 1 352 | 353 | def safe_decrement(dct,key): 354 | if key not in dct: 355 | dct[key] = 0 356 | 357 | dct[key] -= 1 358 | if dct[key] < 0: 359 | dct[key] = 0 360 | 361 | def 
safe_stack(dct,key,val): 362 | if key not in dct: 363 | dct[key] = [] 364 | 365 | dct[key].append(val) 366 | 367 | def safe_unstack(dct,key): 368 | if key not in dct: 369 | dct[key] = [] 370 | 371 | if len(dct[key]) > 0: 372 | dct[key].pop() 373 | else: 374 | dct[key] = [] 375 | 376 | def parse_start_style(current_attributes,node): 377 | increments, stacks = get_style(node) 378 | if 'display-none' in increments: 379 | return 'skip' 380 | 381 | for key in increments: 382 | safe_increment(current_attributes,key) 383 | 384 | for stack in stacks: 385 | for key in stack: 386 | safe_stack(current_attributes,key,stack[key]) 387 | 388 | return '' 389 | def parse_end_style(current_attributes,node): 390 | increments,stacks = get_style(node) 391 | if 'display-none' in increments: 392 | return 'skip' 393 | 394 | for key in increments: 395 | safe_decrement(current_attributes,key) 396 | 397 | for stack in stacks: 398 | for key in stack: 399 | safe_unstack(current_attributes,key) 400 | 401 | return '' 402 | 403 | def parse_start_tag(current_attributes,node): 404 | tag = node.tag 405 | 406 | if tag == 'table': 407 | return 'table' 408 | elif tag == '-text': 409 | return 'text' 410 | elif tag == 'a': 411 | href = node.attributes.get('href', '') 412 | safe_stack(current_attributes, 'href', href) 413 | return '' 414 | elif tag == 'img': 415 | return 'image' 416 | 417 | for tag in tag_groups: 418 | if node.tag in tag_groups[tag]: 419 | safe_increment(current_attributes,tag) 420 | return '' 421 | 422 | def parse_end_tag(current_attributes,node): 423 | tag = node.tag 424 | 425 | if tag == 'table': 426 | return 'table' 427 | elif tag in ['p', 'div', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'li','br']: 428 | return 'newline' 429 | elif tag == 'a': 430 | safe_unstack(current_attributes, 'href') 431 | return '' 432 | 433 | for tag in tag_groups: 434 | if node.tag in tag_groups[tag]: 435 | safe_decrement(current_attributes,tag) 436 | return '' 437 | 438 | # USED AI BC LAZY # 439 | def merge_instructions(instructions): 440 | if not instructions or len(instructions) <= 1: 441 | return instructions 442 | 443 | result = [instructions[0]] 444 | 445 | for i in range(1, len(instructions)): 446 | current = instructions[i] 447 | prev = result[-1] 448 | 449 | # Skip merging if either instruction is an image 450 | if 'image' in current or 'image' in prev: 451 | result.append(current) 452 | continue 453 | 454 | # Case 1: Empty string after strip 455 | if current.get('text', '').strip(EMPTY_CHARS) == '': 456 | prev['text'] += current.get('text', '') 457 | continue 458 | 459 | # Case 2: Attributes match with previous 460 | attrs_to_check = ['bold', 'text-center', 'italic', 'underline', 'font-size'] 461 | attrs_match = all(current.get(attr) == prev.get(attr) for attr in attrs_to_check) 462 | 463 | if attrs_match: 464 | prev['text'] += current.get('text', '') 465 | continue 466 | 467 | # Case 3: Check if attributes match with any earlier instruction 468 | # This handles the case where instructions a and c match but b doesn't 469 | merged = False 470 | for j in range(len(result) - 1, -1, -1): # Check all previous instructions 471 | earlier = result[j] 472 | if 'image' not in earlier and all(current.get(attr) == earlier.get(attr) for attr in attrs_to_check): 473 | # Combine all instructions from j to the current one 474 | combined_text = earlier['text'] 475 | for k in range(j + 1, len(result)): 476 | if 'text' in result[k]: 477 | combined_text += result[k].get('text', '') 478 | combined_text += current.get('text', '') 479 | 480 | 
earlier['text'] = combined_text 481 | # Remove the instructions that were merged 482 | result = result[:j+1] 483 | merged = True 484 | break 485 | 486 | if not merged: 487 | result.append(current) 488 | 489 | return result 490 | # USED AI BC LAZY # 491 | 492 | def is_subset(items1, items2, empty_chars): 493 | """returns true if items1 is a subset of items2""" 494 | return all(item1.get('text', '') in empty_chars or item1.get('text', '') == item2.get('text', '') for item1, item2 in zip(items1, items2)) 495 | 496 | def remove_subset_rows(table, empty_chars, direction="bottom_to_top"): 497 | """ 498 | Remove subset rows from the table. 499 | direction: "bottom_to_top" or "top_to_bottom" 500 | """ 501 | if not table: 502 | return table 503 | 504 | keep_rows = [True] * len(table) 505 | 506 | if direction == "bottom_to_top": 507 | # Compare each row with the row above it 508 | for i in range(len(table)-1, 0, -1): 509 | if is_subset(table[i], table[i-1], empty_chars): 510 | keep_rows[i] = False 511 | else: # top_to_bottom 512 | # Compare each row with the row below it 513 | for i in range(len(table)-1): 514 | if is_subset(table[i], table[i+1], empty_chars): 515 | keep_rows[i] = False 516 | 517 | return [table[i] for i in range(len(table)) if keep_rows[i]] 518 | 519 | def remove_subset_columns(table, empty_chars, direction="left_to_right"): 520 | """ 521 | Remove subset columns from the table. 522 | direction: "left_to_right" or "right_to_left" 523 | """ 524 | if not table or not table[0]: 525 | return table 526 | 527 | num_cols = len(table[0]) 528 | keep_cols = [True] * num_cols 529 | 530 | if direction == "left_to_right": 531 | # Compare each column with the column to its right 532 | for j in range(num_cols-1): 533 | col1 = [row[j] for row in table] 534 | col2 = [row[j+1] for row in table] 535 | if is_subset(col1, col2, empty_chars): 536 | keep_cols[j] = False 537 | else: # right_to_left 538 | # Compare each column with the column to its left 539 | for j in range(num_cols-1, 0, -1): 540 | col1 = [row[j] for row in table] 541 | col2 = [row[j-1] for row in table] 542 | if is_subset(col1, col2, empty_chars): 543 | keep_cols[j] = False 544 | 545 | return [[row[j] for j in range(num_cols) if keep_cols[j]] for row in table] 546 | 547 | 548 | 549 | def is_left_char_cell(cell): 550 | """Check if cell contains only LEFT_TABLE_CHARS + EMPTY_CHARS""" 551 | if 'image' in cell: 552 | return False 553 | text = cell.get('text', '') 554 | if not text: 555 | return False 556 | # Check if all characters in text are either left chars or empty chars 557 | return all(char in LEFT_TABLE_CHARS + EMPTY_TABLE_CHARS for char in text) 558 | 559 | def is_right_char_cell(cell): 560 | """Check if cell contains only RIGHT_TABLE_CHARS + EMPTY_CHARS""" 561 | if 'image' in cell: 562 | return False 563 | text = cell.get('text', '') 564 | if not text: 565 | return False 566 | # Check if all characters in text are either right chars or empty chars 567 | return all(char in RIGHT_TABLE_CHARS + EMPTY_TABLE_CHARS for char in text) 568 | 569 | def is_content_cell(cell): 570 | """Check if cell has meaningful content (not just formatting chars)""" 571 | if 'image' in cell: 572 | return True 573 | text = cell.get('text', '') 574 | if not text: 575 | return False 576 | # Content cell if it has chars that aren't formatting or empty 577 | all_formatting_chars = LEFT_TABLE_CHARS + RIGHT_TABLE_CHARS + EMPTY_TABLE_CHARS 578 | return any(char not in all_formatting_chars for char in text) 579 | 580 | def find_next_content_cell(row, start_col): 581 
| """Find next cell with content to the right""" 582 | for col in range(start_col + 1, len(row)): 583 | if is_content_cell(row[col]): 584 | return col 585 | return None 586 | 587 | def find_prev_content_cell(row, start_col): 588 | """Find previous cell with content to the left""" 589 | for col in range(start_col - 1, -1, -1): 590 | if is_content_cell(row[col]): 591 | return col 592 | return None 593 | 594 | def merge_cell_content(source_cell, target_cell, direction): 595 | """Merge source cell text into target cell""" 596 | source_text = source_cell.get('text', '') 597 | target_text = target_cell.get('text', '') 598 | 599 | # Create a copy of target cell to preserve its attributes 600 | merged_cell = target_cell.copy() 601 | 602 | if direction == 'left': 603 | # Source goes to the left of target 604 | merged_cell['text'] = source_text + target_text 605 | else: # direction == 'right' 606 | # Source goes to the right of target 607 | merged_cell['text'] = target_text + source_text 608 | 609 | return merged_cell 610 | 611 | 612 | def merge_cell_instructions(instructions): 613 | """ 614 | Merge all text from cell instructions into a single instruction. 615 | Discard images, concatenate all text, collect ALL attributes from ALL instructions. 616 | For boolean attributes (bold, italic, etc.), if ANY instruction has it, the result has it. 617 | For list attributes (font-size, href, etc.), use the last non-empty value. 618 | """ 619 | if not instructions: 620 | return {'text': ''} 621 | 622 | # Collect all text and all attributes 623 | combined_text = '' 624 | all_attributes = {} 625 | 626 | for instruction in instructions: 627 | # Skip images completely 628 | if 'image' in instruction: 629 | continue 630 | 631 | # Add any text content 632 | if 'text' in instruction: 633 | combined_text += instruction['text'] 634 | 635 | # Collect all attributes except 'text' 636 | for key, value in instruction.items(): 637 | if key == 'text': 638 | continue 639 | 640 | if key not in all_attributes: 641 | all_attributes[key] = [] 642 | all_attributes[key].append(value) 643 | 644 | # Create final cell instruction 645 | result = {'text': combined_text} 646 | 647 | # Process collected attributes 648 | for key, values in all_attributes.items(): 649 | # Remove None/empty values 650 | non_empty_values = [v for v in values if v is not None and v != ''] 651 | 652 | if not non_empty_values: 653 | continue 654 | 655 | # For boolean attributes (True/False), if ANY instruction has True, result is True 656 | if all(isinstance(v, bool) for v in non_empty_values): 657 | result[key] = any(non_empty_values) 658 | 659 | # For numeric attributes, use the last value 660 | elif all(isinstance(v, (int, float)) for v in non_empty_values): 661 | result[key] = non_empty_values[-1] 662 | 663 | # For string attributes, use the last non-empty value 664 | else: 665 | result[key] = non_empty_values[-1] 666 | 667 | return result 668 | 669 | def merge_table_formatting(table): 670 | """Merge formatting characters with adjacent content""" 671 | if not table or not table[0]: 672 | return table 673 | 674 | # Create a working copy 675 | result_table = [row[:] for row in table] 676 | 677 | # Left merging pass - merge LEFT_TABLE_CHARS with content to their right 678 | for row_idx, row in enumerate(result_table): 679 | for col_idx, cell in enumerate(row): 680 | if is_left_char_cell(cell): 681 | # Find next content cell to the right 682 | target_col = find_next_content_cell(row, col_idx) 683 | if target_col is not None: 684 | # Merge this cell's content 
def merge_table_formatting(table):
    """Merge formatting characters with adjacent content"""
    if not table or not table[0]:
        return table

    # Create a working copy
    result_table = [row[:] for row in table]

    # Left merging pass - merge LEFT_TABLE_CHARS with content to their right
    for row_idx, row in enumerate(result_table):
        for col_idx, cell in enumerate(row):
            if is_left_char_cell(cell):
                # Find next content cell to the right
                target_col = find_next_content_cell(row, col_idx)
                if target_col is not None:
                    # Merge this cell's content with the target cell
                    merged_cell = merge_cell_content(cell, row[target_col], 'left')
                    result_table[row_idx][target_col] = merged_cell
                    # Mark source cell as empty
                    result_table[row_idx][col_idx] = {'text': ''}

    # Right merging pass - merge RIGHT_TABLE_CHARS with content to their left
    for row_idx, row in enumerate(result_table):
        for col_idx, cell in enumerate(row):
            if is_right_char_cell(cell):
                # Find previous content cell to the left
                target_col = find_prev_content_cell(row, col_idx)
                if target_col is not None:
                    # Merge this cell's content with the target cell
                    merged_cell = merge_cell_content(cell, row[target_col], 'right')
                    result_table[row_idx][target_col] = merged_cell
                    # Mark source cell as empty
                    result_table[row_idx][col_idx] = {'text': ''}

    return result_table

def clean_table(table):
    if len(table) == 0:
        return table, "dirty"

    # First check if table has same number of columns
    same_length = all([len(row) == len(table[0]) for row in table])
    if not same_length:
        return table, "dirty"

    # NEW: Table detection - single row tables are likely formatting, not data
    if len(table) == 1:
        return table, "not_table"

    # Merge formatting characters with adjacent content
    table = merge_table_formatting(table)

    # Convert image cells to text cells with [IMAGE: {src}] format
    for row_idx, row in enumerate(table):
        for col_idx, cell in enumerate(row):
            if 'image' in cell:
                src = cell['image'].get('src', '')
                # Create new text cell preserving other attributes
                new_cell = {k: v for k, v in cell.items() if k != 'image'}
                new_cell['text'] = f'[IMAGE: {src}]'
                table[row_idx][col_idx] = new_cell

    empty_chars = EMPTY_TABLE_CHARS

    # Remove empty rows - now only need to check text since all images are converted
    table = [row for row in table if any(
        (cell.get('text', '') not in empty_chars)
        for cell in row
    )]

    # Remove empty columns - now only need to check text since all images are converted
    if table and table[0]:
        keep_cols = [j for j in range(len(table[0])) if any(
            (table[i][j].get('text', '') not in empty_chars)
            for i in range(len(table))
        )]
        table = [[row[j] for j in keep_cols] for row in table]

    # Remove subset rows and columns
    table = remove_subset_rows(table, empty_chars, "bottom_to_top")
    table = remove_subset_rows(table, empty_chars, "top_to_bottom")
    table = remove_subset_columns(table, empty_chars, "left_to_right")
    table = remove_subset_columns(table, empty_chars, "right_to_left")

    return table, "cleaned"

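# Illustrative sketch of clean_table's return statuses (made-up inputs):
#
#   clean_table([])                                         # -> ([], "dirty")
#   clean_table([[{'text': 'a'}],
#                [{'text': 'b'}, {'text': 'c'}]])           # ragged rows -> "dirty"
#   clean_table([[{'text': 'Item 1.'}, {'text': 'Business'}]])
#                                                           # single row -> "not_table"
#
# A rectangular table with two or more rows is merged, stripped of empty and
# subset rows/columns, and returned with status "cleaned".
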
# TODO, not sure how it handles ragged tables... e.g. td are not same length in rows
def convert_html_to_instructions(root):
    skip_node = False
    in_table = False
    in_cell = False

    instructions_list = []
    instructions = []
    current_attributes = {}

    # Dictionary-based approach for table cells
    table_cells = {}
    max_row = -1
    max_col = -1
    occupied_positions = set()
    current_cell_instructions = []

    # table
    row_id = 0
    col_id = 0
    rowspan = 1
    colspan = 1

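    # Overview of the traversal below: walk() emits ("start"/"end", node) events.
    # Outside tables, text and image nodes accumulate in `instructions` and are
    # flushed to `instructions_list` on block-level breaks ('newline' / 'table').
    # Inside tables, content accumulates per <td>/<th>, is merged with
    # merge_cell_instructions(), and is stored in `table_cells` keyed by
    # (row, col); rowspan/colspan are handled by writing the same cell data to
    # every grid position the cell covers. On </table> the cells are laid out
    # into a matrix, cleaned with clean_table(), and appended to the output.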
    for signal, node in walk(root):
        if signal == "start":
            # skip invisible elements
            if skip_node:
                continue
            elif in_table and node.tag in ['td', 'th']:
                in_cell = True
                colspan = int(node.attributes.get('colspan', 1))
                rowspan = int(node.attributes.get('rowspan', 1))
                current_cell_instructions = []
            elif in_table and node.tag == 'tr':
                pass

            style_command = parse_start_style(current_attributes, node)
            if style_command == 'skip':
                skip_node = True
                continue

            tag_command = parse_start_tag(current_attributes, node)
            if tag_command == 'table':
                in_table = True
                # Reset table variables
                table_cells = {}
                max_row = -1
                max_col = -1
                occupied_positions = set()
                row_id = 0
                col_id = 0
                if len(instructions) > 0:
                    if not is_empty_instructions(instructions):
                        instructions_list.append(instructions)
                    instructions = []
                continue
            elif tag_command == 'text':
                text = node.text_content

                # check not leading whitespace
                if len(instructions) == 0:
                    text = text
                if len(text) == 0:
                    continue

                instruction = {'text': text}

                text_styles = check_string_style(text)
                instruction.update(text_styles)

                for key in current_attributes:
                    val = current_attributes[key]
                    if isinstance(val, list):
                        if len(val) > 0:
                            instruction[key] = val[-1]
                    elif isinstance(val, int):
                        if val > 0:
                            instruction[key] = True

                # Redirect instruction output based on context
                if in_cell:
                    current_cell_instructions.append(instruction)
                else:
                    instructions.append(instruction)
            elif tag_command == 'image':
                src = node.attributes.get('src', '')
                alt = node.attributes.get('alt', '')

                instruction = {'image': {'src': src, 'alt': alt}}

                for key in current_attributes:
                    val = current_attributes[key]
                    if isinstance(val, list):
                        if len(val) > 0:
                            instruction[key] = val[-1]
                    elif isinstance(val, int):
                        if val > 0:
                            instruction[key] = True

                # Redirect instruction output based on context
                if in_cell:
                    current_cell_instructions.append(instruction)
                else:
                    instructions.append(instruction)

        elif signal == "end":
            style_command = parse_end_style(current_attributes, node)
            if style_command == 'skip':
                skip_node = False
                continue

            tag_command = parse_end_tag(current_attributes, node)
            if tag_command == 'table':

                # Create a properly sized matrix from the collected data
                if max_row >= 0 and max_col >= 0: # Only if we have cells
                    matrix = [[{'text': ''} for _ in range(max_col + 1)] for _ in range(max_row + 1)]

                    # Fill in the cells
                    for (r, c), cell_data in table_cells.items():
                        if 'text' in cell_data:
                            # Create a copy and strip the text
                            cleaned_cell = cell_data.copy()
                            cleaned_cell['text'] = cell_data['text'].strip(EMPTY_CHARS)
                            matrix[r][c] = cleaned_cell
                        else:
                            matrix[r][c] = cell_data

                    # clean the matrix
                    matrix,cleaning_status = clean_table(matrix)
                    if cleaning_status == "not_table":
                        # Combine all cells into one instruction block (same line)
                        all_cells = []
                        for cell in matrix[0]:
                            if 'text' in cell and cell['text'].strip(EMPTY_CHARS):
                                all_cells.append(cell)
                        if all_cells:
                            instructions_list.append(all_cells) # One block = One line
                    elif len(matrix) == 1:
                        # Fallback for other single-row cases that somehow didn't get caught
                        cell_texts = []
                        for cell in matrix[0]:
                            if 'image' in cell:
                                cell_texts.append(f"[Image: {cell['image'].get('alt', 'No alt text')}]")
                            else:
                                cell_texts.append(cell.get('text', ''))
                        matrix_text = ' '.join(cell_texts)
                        instructions_list.append([{'text': matrix_text, 'fake_table': True}])
                    else:
                        # Multi-row table (cleaned or dirty)
                        instructions_list.append([{'table': matrix, 'cleaned': cleaning_status == "cleaned"}])

                # Reset table state
                table_cells = {}
                occupied_positions = set()
                current_cell_instructions = []
                in_table = False
                continue
            elif in_table:
                if node.tag in ['p', 'div', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'li', 'br']:
                    # Add newline to current cell if we're in a cell
                    if in_cell:
                        if current_cell_instructions:
                            last_instruction = current_cell_instructions[-1]
                            if 'text' in last_instruction:
                                last_instruction['text'] += '\n'
                elif node.tag == 'tr':
                    row_id += 1
                    col_id = 0
                elif node.tag in ['td', 'th']:
                    # Process accumulated cell instructions
                    if current_cell_instructions:
                        cell_data = merge_cell_instructions(current_cell_instructions)
                    else:
                        cell_data = {'text': ''}

                    # Find next available position if current is occupied
                    while (row_id, col_id) in occupied_positions:
                        col_id += 1

                    # Store the cell_data at EVERY position this cell occupies
                    for y in range(rowspan):
                        for x in range(colspan):
                            # Store cell data at this position
                            table_cells[(row_id + y, col_id + x)] = cell_data
                            # Mark position as occupied
                            occupied_positions.add((row_id + y, col_id + x))

                    # Update maximum dimensions
                    max_row = max(max_row, row_id + rowspan - 1)
                    max_col = max(max_col, col_id + colspan - 1)

                    # Move to next position
                    col_id += colspan
                    current_cell_instructions = []
                    in_cell = False

            elif tag_command == 'newline':
                if len(instructions) > 0:
                    instructions = remove_leading_empty_instructions(instructions)
                    instructions = merge_instructions(instructions)
                    if len(instructions) == 1:
                        # strip text if it's a text instruction
                        if 'text' in instructions[0]:
                            instructions[0]['text'] = instructions[0]['text'].strip(EMPTY_CHARS)
                    if not is_empty_instructions(instructions):
                        instructions_list.append(instructions)
                    instructions = []
                continue

    # add any remaining instructions
    if instructions:
        if len(instructions) > 0:
            instructions = remove_leading_empty_instructions(instructions)
            if len(instructions) == 1:
                # strip text if it's a text instruction
                if 'text' in instructions[0]:
                    instructions[0]['text'] = instructions[0]['text'].strip(EMPTY_CHARS)
            if not is_empty_instructions(instructions):
                instructions_list.append(instructions)
    return instructions_list

--------------------------------------------------------------------------------