├── .github └── workflows │ └── pypi-project.yml ├── .gitignore ├── .pre-commit-config.yaml ├── CHANGELOG.md ├── LICENSE.txt ├── README.md ├── README_DOCX_FILE_STRUCTURE.md ├── docx2python ├── __init__.py ├── attribute_register.py ├── bullets_and_numbering.py ├── depth_collector.py ├── docx_context.py ├── docx_output.py ├── docx_reader.py ├── docx_text.py ├── forms.py ├── iterators.py ├── main.py ├── merge_runs.py ├── namespace.py ├── numbering_formats.py ├── py.typed ├── text_runs.py └── utilities.py ├── pyproject.toml └── tests ├── __init__.py ├── conftest.py ├── do_not_test_missing_imagedata_rid.py ├── do_not_test_problem_files.py ├── helpers └── utils.py ├── resources ├── 240-DOP-1013A Lay Down Tubulars.docx ├── ControlTest.docx ├── apples_and_pears.docx ├── ascii_printable.docx ├── basic.docx ├── check_drop_my.docx ├── checked-true-false.docx ├── checked_boxes.docx ├── checked_drop1.docx ├── comments.docx ├── created-in-pages-bulleted-lists.docx ├── created-in-pages-paragraphs-only.docx ├── equations.docx ├── example.docx ├── example_numbering.docx ├── has_pict.docx ├── hyperlink.docx ├── imagedata_without_rid.docx ├── invalid_tag_name.docx ├── libreoffice_conversion.docx ├── list_index_a.docx ├── long_hyperlink.docx ├── merged_cells.docx ├── merged_links.docx ├── multiple_runs_per_paragraph.docx ├── nested_paragraphs.docx ├── nested_paragraphs_in_header.docx ├── nested_paragraphs_in_header3b.docx ├── paragraphs_and_tables.docx ├── pic_alt_text.docx ├── renamed_document_xml.docx ├── run_styles.docx ├── slanted_quotes.docx ├── soft_line_breaks.docx ├── strict.docx ├── symbols.docx ├── test-docx2python-conversion-google_docs.docx ├── test_file_with_comments.docx ├── unchecked_drop0.docx └── zen_of_python.docx ├── test_ascii_printable.py ├── test_check_drop.py ├── test_checked_boxes.py ├── test_close.py ├── test_comments.py ├── test_content_control_block_properties.py ├── test_created_in_pages.py ├── test_document2_xml.py ├── test_docx2python.py ├── test_docx_context.py ├── test_docx_output.py ├── test_dropdown_selector_in_table.py ├── test_equations.py ├── test_file_object.py ├── test_from_bytes.py ├── test_get_text.py ├── test_google_docs.py ├── test_hyperlinks.py ├── test_import.py ├── test_invalid_tag_name.py ├── test_iterators.py ├── test_libreoffice_conversion.py ├── test_lineage.py ├── test_linebreak_replace_text.py ├── test_list_position.py ├── test_long_hyperlink.py ├── test_merge_runs.py ├── test_merged_cells.py ├── test_more_html.py ├── test_numbering_formats.py ├── test_par_styles.py ├── test_pict.py ├── test_run_styles.py ├── test_slanted_quotes.py ├── test_soft_line_breaks.py ├── test_strict.py ├── test_symbols.py ├── test_tables_to_markdown.py ├── test_text_runs.py ├── test_toc_support.py └── test_utilities.py /.github/workflows/pypi-project.yml: -------------------------------------------------------------------------------- 1 | # Run tests then upload to Pypi on version bumps. 2 | # Run tests on each push. 3 | # Try to bump version 4 | # If version is bumped, upload to pypi or test.pypi depending on branch name. 5 | 6 | name: pypi project 7 | 8 | on: 9 | push: 10 | branches: [dev, master] 11 | pull_request: 12 | branches: [master] 13 | 14 | jobs: 15 | tests: 16 | runs-on: ubuntu-latest 17 | strategy: 18 | fail-fast: false 19 | matrix: 20 | python-version: ["3.9", "3.10", "3.11", "3.12", "3.13"] 21 | os: [ubuntu-latest, macos-latest, windows-latest] 22 | # if: startsWith(github.event.head_commit.message, 'bump:') == false 23 | steps: 24 | - uses: actions/checkout@v4 25 | - name: Set up Python ${{ matrix.python-version }} 26 | uses: actions/setup-python@v5 27 | with: 28 | python-version: ${{ matrix.python-version }} 29 | - name: Install dependencies 30 | run: | 31 | python -m pip install --upgrade pip 32 | if [ -f requirements.txt ]; then pip install -r requirements.txt; fi 33 | python -m pip install pytest 34 | python -m pip install commitizen 35 | python -m pip install . 36 | 37 | - name: Test with pytest 38 | run: | 39 | pytest 40 | 41 | # # If the tests pass, try to bump the version number. If no bump is warranted, 42 | # # pass silently. 43 | # bump_version: 44 | # runs-on: ubuntu-latest 45 | # name: "Bump version and create changelog with commitizen" 46 | # continue-on-error: false 47 | # needs: [tests] 48 | # if: github.ref == 'refs/heads/dev' 49 | # steps: 50 | # - name: Check out 51 | # uses: actions/checkout@v4 52 | # with: 53 | # fetch-depth: 0 54 | # token: "${{ secrets.COMMITIZEN_BUMP }}" 55 | # - id: cz 56 | # name: Create bump and changelog 57 | # uses: commitizen-tools/commitizen-action@master 58 | # with: 59 | # github_token: ${{ secrets.COMMITIZEN_BUMP }} 60 | # - name: Print Version 61 | # run: echo "Bumped to version ${{ steps.cz.outputs.version }}" 62 | 63 | # Deploy on test.pypi when branch is dev and commit message starts with 'bump' 64 | deploy-on-testpypi: 65 | runs-on: ubuntu-latest 66 | continue-on-error: true 67 | needs: [tests] 68 | if: github.ref_name == 'dev' && startsWith(github.event.head_commit.message, 'bump:') 69 | steps: 70 | - uses: actions/checkout@v4 71 | - name: Set up Python 72 | uses: actions/setup-python@v5 73 | with: 74 | python-version: '3.x' 75 | - name: Install dependencies 76 | run: | 77 | python -m pip install --upgrade pip 78 | pip install build 79 | - name: Build package 80 | run: python -m build 81 | - name: Publish package 82 | uses: pypa/gh-action-pypi-publish@27b31702a0e7fc50959f5ad993c78deac1bdfc29 83 | with: 84 | repository_url: https://test.pypi.org/legacy/ 85 | user: __token__ 86 | password: ${{ secrets.TEST_PYPI_API_TOKEN }} 87 | 88 | # Deploy on pypi when branch is master and commit message starts with 'bump' 89 | deploy-on-pypi: 90 | runs-on: ubuntu-latest 91 | continue-on-error: true 92 | needs: [tests] 93 | if: github.ref_name == 'master' && startsWith(github.event.head_commit.message, 'bump:') 94 | steps: 95 | - uses: actions/checkout@v4 96 | - name: Set up Python 97 | uses: actions/setup-python@v5 98 | with: 99 | python-version: '3.x' 100 | - name: Install dependencies 101 | run: | 102 | python -m pip install --upgrade pip 103 | pip install build 104 | - name: Build package 105 | run: python -m build 106 | - name: Publish package 107 | uses: pypa/gh-action-pypi-publish@27b31702a0e7fc50959f5ad993c78deac1bdfc29 108 | with: 109 | user: __token__ 110 | password: ${{ secrets.PYPI_API_TOKEN }} 111 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .idea/ 2 | __pycache__/ 3 | **/~* 4 | requirements.txt 5 | dev-requirements.txt 6 | Update-PythonVenv.ps1 7 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | 2 | ci: 3 | skip: [pyright] 4 | 5 | # exclude: 'scripts/.*|tests/.*' 6 | exclude: 'scripts/.*' 7 | 8 | repos: 9 | 10 | - repo: https://github.com/pre-commit/pre-commit-hooks 11 | rev: v5.0.0 12 | hooks: 13 | - id: check-added-large-files 14 | - id: check-ast 15 | - id: check-case-conflict 16 | - id: check-docstring-first 17 | - id: check-executables-have-shebangs 18 | - id: check-json 19 | - id: check-merge-conflict 20 | args: 21 | - --assume-in-merge 22 | - id: check-shebang-scripts-are-executable 23 | - id: check-symlinks 24 | - id: check-toml 25 | - id: check-vcs-permalinks 26 | - id: check-xml 27 | - id: check-yaml 28 | - id: debug-statements 29 | - id: destroyed-symlinks 30 | - id: detect-private-key 31 | - id: end-of-file-fixer 32 | - id: mixed-line-ending 33 | - id: requirements-txt-fixer 34 | - id: trailing-whitespace 35 | - id: fix-encoding-pragma 36 | args: 37 | - --remove 38 | # - id: name-tests-test 39 | # args: 40 | # - --pytest-test-first 41 | - id: no-commit-to-branch 42 | - id: pretty-format-json 43 | args: ['--autofix'] 44 | # - id: sort-simple-yaml 45 | # files: .pre-commit-config.yaml 46 | 47 | - repo: https://github.com/pre-commit/mirrors-mypy 48 | rev: v1.15.0 49 | hooks: 50 | - id: mypy 51 | name: mypy 52 | language: python 53 | language_version: python3.12 54 | types: [python] 55 | require_serial: true 56 | verbose: true 57 | additional_dependencies: ['types-requests'] 58 | # exclude: "tests" 59 | # args: 60 | # - --ignore-missing-imports 61 | # files: ^(src/|tests/) 62 | 63 | - repo: https://github.com/PyCQA/isort 64 | rev: 6.0.1 65 | hooks: 66 | - id: isort 67 | args: ["--profile", "black", "--filter-files", "--combine-as", "honor--noqa"] 68 | 69 | - repo: https://github.com/psf/black 70 | rev: 25.1.0 71 | hooks: 72 | - id: black 73 | language_version: python3.9 74 | args: ["--skip-magic-trailing-comma"] 75 | 76 | - repo: https://github.com/asottile/pyupgrade 77 | rev: v3.19.1 78 | hooks: 79 | - args: 80 | - --py39-plus 81 | id: pyupgrade 82 | 83 | - repo: https://github.com/Lucas-C/pre-commit-hooks 84 | rev: v1.5.5 85 | hooks: 86 | - id: remove-tabs 87 | 88 | # - repo: https://github.com/commitizen-tools/commitizen 89 | # rev: v2.40.0 90 | # hooks: 91 | # - id: commitizen 92 | 93 | # pylint still broken in python 3.12 94 | # - repo: https://github.com/pre-commit/mirrors-pylint 95 | # rev: v3.0.0a5 96 | # hooks: 97 | # - id: pylint 98 | # exclude: "tests" 99 | # name: pylint 100 | # args: 101 | # - --good-names=i,j,_,f 102 | # - --disable=protected-access 103 | # - --disable=no-member 104 | # - --disable=import-error 105 | # - --disable=no-name-in-module 106 | # - --load-plugins=pylint.extensions.docparams 107 | # - --accept-no-param-doc=n 108 | # - --accept-no-raise-doc=n 109 | # - --accept-no-return-doc=n 110 | # - --accept-no-yields-doc=n 111 | 112 | - repo: https://github.com/astral-sh/ruff-pre-commit 113 | # ignores 114 | # ANN201 Missing return type annotation for public function 115 | # ANN202 Missing return type annotation for private function (wants -> None everywhere) 116 | # B905 zip() without an explicit strict= parameter 117 | # COM812 Trailing comma missing 118 | # D203 1 blank line required before class docstring 119 | # D213 multi line summary second line 120 | # D400 first line should end with a period 121 | # I001 Import block is un-sorted or un-formatted 122 | # ISC003 Explicitly concatenated string should be implicitly concatenated 123 | # N802 Function name should be lowercase 124 | # N806 Variable in function should be lowercase 125 | # PERF401 Use a list comprehension to create a transformed list 126 | # PGH003 Use specific rule codes when ignoring type issues 127 | # PLR0913 Too many arguments to function call 128 | # 129 | # ERA001 Found commented-out code 130 | # N803 Argument name should be lowercase 131 | # S320 Using `lxml` to parse untrusted data is known to be ... XML attacks 132 | # PLR2004 Magic values 133 | # C901 function is too complex # for iter_at_depth 134 | # PLR0912 too many branches # for iter_at_depth 135 | # 136 | rev: 'v0.11.9' 137 | hooks: 138 | - id: ruff 139 | exclude: "tests" 140 | args: 141 | - --target-version=py38 142 | - --select=ALL 143 | - --ignore=ANN201,ANN202,B905,COM812,D203,D213,D400,I001,ISC003,N802,N806,PERF401,PGH003,PLR0913,ERA001,N803,S320,PLR2004,C901,PLR0912 144 | # # - --fix 145 | 146 | # reads pyproject.toml for additional config 147 | - repo: https://github.com/RobertCraigie/pyright-python 148 | rev: v1.1.400 149 | hooks: 150 | - id: pyright 151 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | 2 | ## 3.5.0 (2025-02-03) 3 | 4 | ### Feat 5 | 6 | - Remove Python 3.8 support. 7 | - Refactor File.path inference to support rare files with rels in a 8 | `word/glossary` directory. 9 | - Test Python 3.13 support. 10 | 11 | ## 3.4.0 (2025-02-01) 12 | 13 | ### Feat 14 | 15 | - edit and save rels files. You can now access the `rels_element` attribute of 16 | File instances to update hyperlink urls and other values. These will be saves 17 | on DocxReader.save(). This is an advanced feature and will not change text 18 | extraction. 19 | 20 | ## 3.3.0 (2024-12-05) 21 | 22 | ### Feat 23 | 24 | - skip elements with invalid tags. Issue a warning. These are usually the 25 | result of faulty conversion software. 26 | 27 | ## 3.2.1 (2024-11-17) 28 | 29 | ### Feat 30 | 31 | - add an `elem` attribute to `Par` instances, returning the xml element from 32 | which the paragraph was generated 33 | 34 | ## 3.0.0 (2024-07-27) 35 | 36 | ### BREAKING CHANGE 37 | 38 | - The html and duplicate_merged_cells arguments to docx2python are now keyword 39 | only. 40 | - Inserts empty cells and whitespace into exported 41 | tables. 42 | - Removed IndexedItem class which was *probably* only used internally, but it 43 | was a part of the public interface. 44 | - Function get_text was a public function. It mirrored the identical 45 | flatten_text from the docx_text module. 46 | - This change breaks the way paragraph styles (internally pStyle) were handled. 47 | The input argument `do_pStyle` will no now raise an error. 48 | - This doesn't change the interface and doesn't break any of my tests, but it 49 | took a lot of refactoring to make this change and it may break some 50 | unofficial patches I've made for clients. 51 | 52 | ### Feat 53 | 54 | - improve type hints for DocxContent properties 55 | - insert blank cells to match gridSpan 56 | - add list_position attribute for Par instances 57 | - explicate return types in iterators 58 | - use input file namespace 59 | 60 | ### Fix 61 | 62 | - eliminate double html tags for paragraph styles 63 | 64 | ### Refactor 65 | 66 | - make boolean args keyword only 67 | - use pathlib in lieu of os.path 68 | - remove Any types from DocxContent close method 69 | - convert HtmlFormatter lambdas to defs 70 | - specialize join_leaves into join_runs 71 | - insert html when extracting text 72 | - make queuing text outside paragraphs explicit 73 | - make _open_pars private 74 | - stop accepting extract_image bool argument 75 | - default duplicate_merged_cells to True 76 | - remove unused helper functions 77 | - use pathlib in conftest 78 | - expose numPr, ilvl, and number in BulletGenerator 79 | - remove redundant functions 80 | - remove do_pStyle argument from flatten_text 81 | - remove function get_text from iterators module 82 | - store content table as nested list of Par instances 83 | - move xml2html_format attrib from TagRunner to DepthCollector 84 | - factor out DepthCollector.item_depth param 85 | - make set_caret recursive 86 | - remove unused `styled` param from insert_text_as_new_run 87 | - remove relative imports in src modules 88 | 89 | ## 2.10.2 (2024-06-30) 90 | 91 | ### Refactor 92 | 93 | - remove relative imports in src modules 94 | 95 | ## 2.10.1 (2024-04-03) 96 | 97 | ### Fix 98 | 99 | - move paragraphs to main dependencies 100 | 101 | ## 2.10.0 (2024-04-03) 102 | 103 | ### Feat 104 | 105 | - support checkox "true"/"false" values 106 | 107 | ## 2.9.2 (2024-04-03) 108 | 109 | ### Fix 110 | 111 | - extract hyperlinks in comments 112 | - remove open_par limit in DepthCollector 113 | - return empty list when comments fails 114 | 115 | ## 2.9.1 (2024-04-02) 116 | 117 | ### Refactor 118 | 119 | - comb full-text and line-text formatting 120 | - refactor element text extractors into methods 121 | 122 | ## 2.9.0 (2024-03-30) 123 | 124 | ### Feat 125 | 126 | - extract comments from docx files 127 | - capture comment ranges 128 | 129 | ### Refactor 130 | 131 | - expose DepthCollector instance for File object 132 | - expose DepthCollector instance when get_text 133 | 134 | ## 2.8.0 (2024-01-21) 135 | 136 | ### Feat 137 | 138 | - capture hyperlink anchors 139 | 140 | ## 2.7.3 (2023-06-17) 141 | 142 | ### Fix 143 | 144 | - sync commitizen and poetry version numbers 145 | 146 | ## 2.7.2 (2023-06-16) 147 | 148 | ### Fix 149 | 150 | - update poetry lock file 151 | 152 | ## 2.7.1 (2023-05-02) 153 | 154 | ### Refactor 155 | 156 | - update and pass pre-commit hooks 157 | 158 | ## 2.7.0 (2023-04-27) 159 | 160 | ### Feat 161 | 162 | - preserve newlines in replace_docx_text 163 | - add py.typed for typecheckers 164 | - add argument duplicate_merged_cells for docx tables 165 | - add context manager protocol 166 | - allow type IOBytes for filename arguments 167 | - add and mostly pass pre-commit hooks 168 | - remove Python 3.7 support 169 | 170 | ### Fix 171 | 172 | - move pre-commit to dev requirement 173 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2019 Shay Hill 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in 13 | all copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | THE SOFTWARE. 22 | -------------------------------------------------------------------------------- /README_DOCX_FILE_STRUCTURE.md: -------------------------------------------------------------------------------- 1 | ## typical docx file format 2 | 3 | To assist with reading the project documentation or extending `docx2python`. 4 | 5 | There are four basic types of files: 6 | 7 | 1. _rels/.rels - A list of docx content files (e.g., ``document.xml``) 8 | 9 | 2. content files - files that contain the text displayed in the docx. (e.g., ``document.xml``, ``header1.xml``). 10 | These files reference non-content files (images and formatting specifications) through relId numbers, which are 11 | defined in content-file rels. 12 | 13 | 3. content-file rels - (e.g., ``document.xml.rels``) this is where relId numbers are defined. The relId numbers 14 | used in ``document.xml`` will be defined in ``document.xml.rels``. 15 | 16 | 4. display files - (e.g., ``numbering.xml``) that tell the content files how to display text. These are linked from 17 | the content files through content-file rels. 18 | 19 | ### Docx file structure 20 | 21 | + _rels # named references to data (links, values, etc. for entire document) 22 | - .rels # map to locations of major files (e.g., document.xml) 23 | 24 | + customXml # all ignored by docx2python 25 | - item1.xml 26 | - item2.xml 27 | - item3.xml 28 | - itemProps1.xml 29 | - itemProps2.xml 30 | - itemProps2.xml 31 | _ _rels 32 | - item1.xml.rels 33 | - item2.xml.rels 34 | - item3.xml.rels 35 | 36 | + docProps 37 | - app.xml # ignored by docx2python 38 | - core.xml # author, modification date, etc. 39 | - custom.xml # ignored by docx2python 40 | 41 | + word # content of docx 42 | + _rels # images, numbering formats, etc. for content xml files 43 | - document.xml.rels 44 | - header1.xml.rels 45 | - header2.xml.rels 46 | - header3.xml.rels 47 | + media # folder holding all pictures attached in the docx file 48 | - image1.jpg 49 | - image2.jpg 50 | + theme # ignored by docx2python 51 | - theme1.xml 52 | - document.xml # main body text 53 | - header1.xml # header 1 content 54 | - footer1.xml 55 | - footnotes.xml 56 | - fontTable.xml # "long-hand" font descriptions. Ignored by docx2python 57 | - numbering.xml # required data to auto number paragraphs. doxc2python reads this 58 | - settings.xml # global file specifications. Ignored by docx2python 59 | - styles.xml # table styles, etc. Ignored by docx2python 60 | - webSettings.xml # ignored by docx2python 61 | 62 | A ``*.docx`` file is just a zipped up file structure (the structure defined above). You can unzip a docx file, make changes, then zip it back up and everything will work (provided your changes are valid xml). 63 | -------------------------------------------------------------------------------- /docx2python/__init__.py: -------------------------------------------------------------------------------- 1 | """Import function docx2python into the docx2python namespace. 2 | 3 | :author: Shay Hill 4 | :created: 2023-01-09 5 | """ 6 | 7 | from docx2python.main import docx2python 8 | 9 | __all__ = ["docx2python"] 10 | -------------------------------------------------------------------------------- /docx2python/bullets_and_numbering.py: -------------------------------------------------------------------------------- 1 | """Generate bullet and numbered-list strings. 2 | 3 | :author: Shay Hill 4 | :created: 11/15/2021 5 | 6 | Docx xml files do not track explicit numbering values. Each numbered paragraph has :: 7 | 8 | # indentation level 9 | # index to a list [by ilvl] of numbered-list formats 10 | 11 | Docx2Python keeps track of current numbering value, and increments these values as 12 | numbered paragraphs are encountered. If extracting partial text, the numbers may be 13 | incorrect, because all paragraphs in a numbered-list format may not be encountered 14 | during the extraction. 15 | """ 16 | 17 | from __future__ import annotations 18 | 19 | import warnings 20 | from collections import defaultdict 21 | from contextlib import suppress 22 | from typing import TYPE_CHECKING, Callable 23 | 24 | from docx2python import numbering_formats as nums 25 | from docx2python.namespace import get_attrib_by_qn, iterfind_by_qn 26 | 27 | if TYPE_CHECKING: 28 | from lxml.etree import _Element as EtreeElement # type: ignore 29 | 30 | from docx2python.docx_context import NumIdAttrs 31 | 32 | 33 | def _get_bullet_function(numFmt: str) -> Callable[[int], str]: 34 | """Select a bullet or numbering format function from xml numFmt. 35 | 36 | :param numFmt: xml numFmt (e.g., decimal, lowerLetter) 37 | :return: a function that takes an int and returns a string. If numFmt is not 38 | recognized, treat numbers as bullets. 39 | """ 40 | numFmt2bullet_function: dict[str, Callable[[int], str]] = { 41 | "decimal": nums.decimal, 42 | "lowerLetter": nums.lower_letter, 43 | "upperLetter": nums.upper_letter, 44 | "lowerRoman": nums.lower_roman, 45 | "upperRoman": nums.upper_roman, 46 | "bullet": nums.bullet, 47 | } 48 | try: 49 | retval_: Callable[[int], str] = numFmt2bullet_function[numFmt] 50 | except KeyError: 51 | warnings.warn( 52 | f"{numFmt} numbering format not implemented, " 53 | + f"substituting '{nums.bullet()}'", 54 | stacklevel=2, 55 | ) 56 | return nums.bullet 57 | else: 58 | return retval_ 59 | 60 | 61 | def _new_list_counter() -> defaultdict[str, defaultdict[str, int]]: 62 | """Return a counter, starting at zero, for each numId. 63 | 64 | :return: { 65 | a_numId: 0, 66 | b_numId: 0 67 | } 68 | 69 | This is what you need to keep track of where every nested list is at. 70 | """ 71 | return defaultdict(lambda: defaultdict(int)) 72 | 73 | 74 | def _increment_list_counter(ilvl2count: defaultdict[str, int], ilvl: str) -> int: 75 | """Increase counter at ilvl, reset counter at deeper levels. 76 | 77 | :param ilvl2count: context['numId2count'] 78 | :param ilvl: string representing an integer 79 | :return: updated count at ilvl. 80 | updates context['numId2count'] by reference 81 | 82 | On a numbered list, the count for sub-lists should reset when a parent list 83 | increases, e.g., 84 | 85 | 1. top-level list 86 | a. sublist 87 | b. sublist continues 88 | 2. back to top-level list 89 | a. sublist counter has been reset 90 | 91 | List counters are defaultdicts, so we can reset sublist counters by deleting 92 | them. 93 | """ 94 | ilvl2count[ilvl] += 1 95 | deeper_levels = [k for k in ilvl2count if k > ilvl] 96 | for level in deeper_levels: 97 | del ilvl2count[level] 98 | return ilvl2count[ilvl] 99 | 100 | 101 | class BulletGenerator: 102 | """Keep track of list counters and generate bullet strings. 103 | 104 | 105 | 106 | 107 | # indentation level 108 | # index to (multi-level) list format 109 | 110 | 111 | 112 | this text in numbered or bulleted list 113 | 114 | 115 | 116 | """ 117 | 118 | def __init__(self, numId2Attrs: dict[str, list[NumIdAttrs]]) -> None: 119 | """Set numId2numFmts. Initiate counters.""" 120 | self.numId2Attrs = numId2Attrs 121 | self.numId2count = _new_list_counter() 122 | 123 | # Only increment the number of a paragraph if that paragraph has not been 124 | # seen. See docstring for self._get_par_number. 125 | self._par2par_number: dict[EtreeElement, int | None] = {} 126 | 127 | def _get_numPr(self, paragraph: EtreeElement) -> EtreeElement | None: 128 | """Get the parent element of the numId and ilvl elements. 129 | 130 | :param paragraph: xml element 131 | :return: xml element or None if this fails. 132 | """ 133 | try: 134 | pPr = next(iterfind_by_qn(paragraph, "w:pPr")) 135 | return next(iterfind_by_qn(pPr, "w:numPr")) 136 | except (StopIteration, KeyError): 137 | return None 138 | 139 | def _get_numId(self, numPr: EtreeElement) -> str | None: 140 | """Get the numId for the paragraph. 141 | 142 | :param numPr: xml element (see class docstring) 143 | :return: numId as a string or None if this fails. 144 | 145 | The numId is an integer (string of an integer) index to a list of multi-level 146 | list formats. For each numId, there is a list of formats for each indentation 147 | level. 148 | """ 149 | try: 150 | numId_element = next(iterfind_by_qn(numPr, "w:numId")) 151 | return get_attrib_by_qn(numId_element, "w:val") 152 | except (StopIteration, KeyError): 153 | return None 154 | 155 | def _get_ilvl(self, numPr: EtreeElement) -> str | None: 156 | """Get the ilvl for the paragraph. 157 | 158 | :param numPr: xml element (see class docstring) 159 | :return: ilvl as a string or None if this fails. 160 | 161 | The ilvl is an integer (string of an integer) index of a multi-level list 162 | formats. For each ilvl, there is a format. 163 | """ 164 | try: 165 | ilvl_element = next(iterfind_by_qn(numPr, "w:ilvl")) 166 | return get_attrib_by_qn(ilvl_element, "w:val") 167 | except (StopIteration, KeyError): 168 | return None 169 | 170 | def get_bullet_fmt(self, paragraph: EtreeElement) -> tuple[str | None, str | None]: 171 | """Expose the numId and ilvl of a numbered paragraph. 172 | 173 | :param paragraph: xml element 174 | :return: numId (which list), ilvl (indentation level) 175 | 176 | This will return None, None, None if the paragraph is not numbered. 177 | """ 178 | numPr = self._get_numPr(paragraph) 179 | if numPr is None: 180 | return None, None 181 | numId = self._get_numId(numPr) 182 | ilvl = self._get_ilvl(numPr) 183 | if numId is None or ilvl is None: 184 | return numId, ilvl 185 | return numId, ilvl 186 | 187 | def get_par_number(self, paragraph: EtreeElement) -> int | None: 188 | """Get the number (at the current indentation level) of a paragraph. 189 | 190 | :param paragraph: xml element 191 | :return: number of the paragraph 192 | :effects: increment self.numId2count[numId][ilvl] if the paragraph has not 193 | been seen before. 194 | 195 | E.g., 196 | 197 | 1. paragraph # called here, return 1 198 | a. paragraph # called here, return 1 199 | b. paragraph # called here, return 2 200 | 2. paragraph # called here, return 2 201 | a. paragraph # called here, return 1 202 | 1. paragraph # called here, return 1 203 | 204 | numId and ilvl should both be defined for a numbered paragraph, but I'm 205 | testing both here to fail silently if that assumption is wrong. 206 | """ 207 | with suppress(KeyError): 208 | return self._par2par_number[paragraph] 209 | numId, ilvl = self.get_bullet_fmt(paragraph) 210 | if numId is None or ilvl is None: 211 | par_number = None 212 | else: 213 | counter = _increment_list_counter(self.numId2count[numId], ilvl) 214 | par_number = counter + self.get_start_value_zero_based(numId, ilvl) 215 | self._par2par_number[paragraph] = par_number 216 | return par_number 217 | 218 | def get_start_value_zero_based(self, numId: str | None, ilvl: str | None) -> int: 219 | """Get the start value, 0-based, for numbering sequence at particular level. 220 | 221 | :return: start index if present for a particular numId and ilvl, 0 otherwise 222 | """ 223 | attrs = self.__get_num_fmt_attributes(numId, ilvl) 224 | if not attrs or not attrs.start: 225 | return 0 226 | return attrs.start - 1 # subtract 1 to have 0-based result 227 | 228 | def get_list_position( 229 | self, paragraph: EtreeElement 230 | ) -> tuple[str | None, list[int]]: 231 | """Get the current numbering values. 232 | 233 | :return: numbering values as a tuple of integers 234 | 235 | E.g., 236 | 237 | Not in a list # called here, return () 238 | 239 | 1. paragraph # called here, return (numPr, 1) 240 | a. paragraph # called here, return (numPr, 1, 1) 241 | b. paragraph # called here, return (numPr, 1, 2) 242 | 2. paragraph # called here, return (numPr, 2) 243 | a. paragraph # called here, return (numPr, 2, 1) 244 | 1. paragraph # called here, return (numPr, 2, 1, 1) 245 | 246 | The numbering values are the current count at each indentation level. 247 | """ 248 | numPr, _ = self.get_bullet_fmt(paragraph) 249 | if numPr is None: 250 | return (numPr, []) 251 | # ensure the paragraph counter has been incremented 252 | _ = self.get_par_number(paragraph) 253 | return numPr, list(self.numId2count[numPr].values()) 254 | 255 | def get_bullet(self, paragraph: EtreeElement) -> str: 256 | """Get bullet string if paragraph is numbered. (e.g, '-- ' or '1) '). 257 | 258 | :param paragraph: xml element 259 | :return: specified 'bullet' string or '' if paragraph is not numbered 260 | 261 | Get an index to a multi-level list format (numId) and the indentation level 262 | (ilvl). If no numId or ilvl are defined, assume this is not a numbered list. 263 | If these values to exist, look up a list format with 264 | numId2numFmts[numId][ilvl]. If this fails, silently give up and use a bullet. 265 | 266 | bullet preceded by one tab for every indentation level. 267 | """ 268 | numId, ilvl = self.get_bullet_fmt(paragraph) 269 | number = self.get_par_number(paragraph) 270 | if numId is None: 271 | return "" 272 | if ilvl is None: 273 | return "" 274 | if number is None: 275 | return "" 276 | attrs = self.__get_num_fmt_attributes(numId, ilvl) 277 | numFmt = attrs.fmt if attrs and attrs.fmt else "bullet" 278 | 279 | def format_bullet(bullet: str) -> str: 280 | """Indent, format and pad the bullet or number string. 281 | 282 | :param bullet: any kind of list-item string (bullet, number, Roman, ...) 283 | :return: formatted bullet string 284 | """ 285 | if bullet != nums.bullet(): 286 | bullet += ")" 287 | return "\t" * int(ilvl) + bullet + "\t" 288 | 289 | get_unformatted_bullet_str = _get_bullet_function(numFmt) 290 | return format_bullet(get_unformatted_bullet_str(number)) 291 | 292 | def __get_num_fmt_attributes( 293 | self, numId: str | None, ilvl: str | None 294 | ) -> NumIdAttrs | None: 295 | if numId is None: 296 | return None 297 | if ilvl is None: 298 | return None 299 | try: 300 | return self.numId2Attrs[str(numId)][int(ilvl)] 301 | except (KeyError, IndexError, ValueError): 302 | return None 303 | -------------------------------------------------------------------------------- /docx2python/docx_context.py: -------------------------------------------------------------------------------- 1 | """Content from files that aren't ``word/document.xml``. 2 | 3 | :author: Shay Hill 4 | :created: 6/26/2019 5 | 6 | Most of the "meat" in a docx file is in ``word/document.xml``. These functions retrieve 7 | numbering formats, images, and font styles from *other* files in a decompressed docx. 8 | """ 9 | 10 | from __future__ import annotations 11 | 12 | import dataclasses 13 | from typing import TYPE_CHECKING 14 | 15 | from lxml import etree 16 | 17 | from docx2python.attribute_register import get_localname 18 | from docx2python.namespace import find_by_qn, findall_by_qn, get_attrib_by_qn 19 | 20 | if TYPE_CHECKING: 21 | import zipfile 22 | 23 | from lxml.etree import _Element as EtreeElement # type: ignore 24 | 25 | 26 | @dataclasses.dataclass 27 | class NumIdAttrs: 28 | """NumIdAttrs represents numbering attributes, such as format and start index.""" 29 | 30 | fmt: str | None 31 | start: int | None 32 | 33 | 34 | def collect_numAttrs(numFmts_root: EtreeElement) -> dict[str, list[NumIdAttrs]]: 35 | """Collect abstractNum bullet attributes into a dictionary. 36 | 37 | :param numFmts_root: Root element of ``word/numbering.xml``. 38 | :return: numId mapped to numFmts (by ilvl) 39 | 40 | :background: 41 | 42 | ``word/numbering.xml`` will have two sections. 43 | 44 | **SECTION 1** - Some abstractNum elements defining numbering formats for multiple 45 | indentation levels:: 46 | 47 | 48 | 49 | 50 | ... 51 | 52 | 53 | **SECTION 2** - Some num elements, each referencing an abstractNum. Multiple nums 54 | may reference the same abstractNum, but each will maintain a separate count (i.e., 55 | each numbered paragraph will start from 1, even if it shares a style with another 56 | paragraph.):: 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | **E.g, Given**: *above* 66 | 67 | **E.g., Returns**:: 68 | 69 | { 70 | # -----ilvl=0------ilvl=1------ilvl=2--- 71 | "1": [ NumIdAttrs(fmt:"decimal",start:2), 72 | NumIdAttrs(fmt:"lowerLetter",start:1), ...], 73 | "2": ... 74 | } 75 | """ 76 | abstractNumId2Attrs: dict[str, list[NumIdAttrs]] = {} 77 | 78 | for abstractNum in findall_by_qn(numFmts_root, "w:abstractNum"): 79 | id_ = str(get_attrib_by_qn(abstractNum, "w:abstractNumId")) 80 | 81 | abstractNumId2Attrs[id_] = [] 82 | for lvl in findall_by_qn(abstractNum, "w:lvl"): 83 | numFmtEl = find_by_qn(lvl, "w:numFmt") 84 | fmt = None 85 | if numFmtEl is not None: 86 | fmt = str(get_attrib_by_qn(numFmtEl, "w:val")) 87 | startEl = find_by_qn(lvl, "w:start") 88 | start = None 89 | if startEl is not None: 90 | qn = get_attrib_by_qn(startEl, "w:val") 91 | start = int(qn) 92 | abstractNumId2Attrs[id_].append(NumIdAttrs(fmt=fmt, start=start)) 93 | 94 | numId2attrs: dict[str, list[NumIdAttrs]] = {} 95 | num: EtreeElement 96 | for num in findall_by_qn(numFmts_root, "w:num"): 97 | numId = get_attrib_by_qn(num, "w:numId") 98 | abstractNumId = find_by_qn(num, "w:abstractNumId") 99 | if abstractNumId is None: 100 | continue 101 | abstractNumIdval = get_attrib_by_qn(abstractNumId, "w:val") 102 | numId2attrs[str(numId)] = abstractNumId2Attrs[str(abstractNumIdval)] 103 | 104 | return numId2attrs 105 | 106 | 107 | def collect_rels(zipf: zipfile.ZipFile) -> dict[str, list[dict[str, str]]]: 108 | """Map file to relId to attrib. 109 | 110 | :param zipf: created by ``zipfile.ZipFile("docx_filename")`` 111 | :return: a deep dictionary ``{filename: list of Relationships`` 112 | 113 | Each rel in list of Relationships is:: 114 | 115 | { 116 | "Id": "rId1", 117 | "Type": "http...", 118 | "Target": "path to file in docx" 119 | } 120 | 121 | There are several rels files: 122 | 123 | ``_rels/.rels``: rels related to entire structure. The identity of 124 | ``word/document.xml`` is here. (It might be called ``word/document2.xml`` or 125 | something else. Checking here is the best way to make sure.) 126 | 127 | ``word/_rels/document.xml.rels``: images, headers, etc. referenced by 128 | ``word/document.xml`` 129 | 130 | ``word/_rels/header1.xml.rels``: images, etc. for ``header1.xml`` 131 | 132 | ... 133 | 134 | Get everything from everywhere. Map ``_rels/.rels`` to ``'rels'`` and everything 135 | else to e.g., ``'document'`` or ``'header'``. RelIds are **not** unique between 136 | these files. 137 | 138 | **E.g, Given**:: 139 | 140 | # one of several files 141 | 142 | 143 | 144 | 146 | 148 | 150 | 152 | 153 | 154 | **Returns**:: 155 | 156 | { 157 | "filename": [ 158 | { 159 | "Id": "rId3", 160 | "Type": "http://schemas.../extended-properties", 161 | "Target": "docProps/app.xml", 162 | }, 163 | { 164 | "Id": "rId2", 165 | "Type": "http://schemas.../core-properties", 166 | "Target": "docProps/core.xml", 167 | }, 168 | ] 169 | } 170 | """ 171 | path2rels: dict[str, list[dict[str, str]]] = {} 172 | for rels in (x for x in zipf.namelist() if x[-5:] == ".rels"): 173 | rels_elem = etree.fromstring(zipf.read(rels)) 174 | path2rels[rels] = [ 175 | {str(y): str(z) for y, z in x.attrib.items()} for x in rels_elem 176 | ] 177 | tag = rels_elem.tag 178 | if isinstance(tag, bytearray): # for type checkers 179 | tag = tag.decode("utf-8") 180 | path2rels[rels].append( 181 | {"Id": "none", "Type": etree.QName(tag).namespace or "", "Target": rels} 182 | ) 183 | 184 | return path2rels 185 | 186 | 187 | def collect_docProps(root: EtreeElement) -> dict[str, str | None]: 188 | """Get author, modified, etc. from core-properties (should be docProps/core.xml). 189 | 190 | :param root: root of the XML tree 191 | :return: document property names mapped to values 192 | 193 | **E.g., Given**:: 194 | 195 | 196 | SG-DOP-5009 - Operate ROMAR swarf unit 197 | 198 | Shay Hill 199 | 200 | Shay Hill 201 | 202 | 6 203 | 204 | 2017-11-17T15:47:00Z 205 | 206 | 2019-01-10T07:21:00Z 207 | 208 | 2019-01-11T11:41:00Z 209 | 210 | 211 | 212 | **E.g., Returns**:: 213 | 214 | { 215 | "title": "SG-DOP-5009 - Operate ROMAR swarf unit", 216 | "creator": "Shay Hill", 217 | "lastModifiedBy": "Shay Hill", 218 | "revision": "6", 219 | ... 220 | } 221 | """ 222 | return {get_localname(x): x.text for x in root} 223 | -------------------------------------------------------------------------------- /docx2python/forms.py: -------------------------------------------------------------------------------- 1 | """Form checkboxes, dropdowns, and other non-text elements visible in Word. 2 | 3 | :author: Shay Hill 4 | :created: 6/17/2020 5 | 6 | Word represents some special characters as non-text elements (e.g., checkBox). These 7 | functions examine these elements to infer suitable text replacements. 8 | 9 | This file references "\u2610" and "\u2612" a few times. These are open and 10 | crossed-out checkboxes. Pypi doesn't like them in my file, so I have to reference 11 | them by their escape sequences. 12 | """ 13 | 14 | from __future__ import annotations 15 | 16 | from contextlib import suppress 17 | from typing import TYPE_CHECKING 18 | 19 | from docx2python.namespace import get_attrib_by_qn, iterfind_by_qn, qn 20 | 21 | if TYPE_CHECKING: 22 | from lxml.etree import _Element as EtreeElement # type: ignore 23 | 24 | 25 | def get_checkBox_entry(checkBox: EtreeElement) -> str: 26 | """Create text representation for a checkBox element. 27 | 28 | :param checkBox: a checkBox xml element 29 | :return: 30 | 1. attempt to get ``checked.w:val`` and return "\u2610" or "\u2612" 31 | 2. attempt to get ``default.w:val`` and return "\u2610" or "\u2612" 32 | 3. return ``--checkbox failed--`` 33 | 34 | Docx xml has at least two types of checkbox elements:: 35 | 36 | 1. ``checkBox`` can only be checked when the form is locked. These do not 37 | contain a text element, so this function is needed to select one from the 38 | ``w:checked`` or ``w:default`` sub-elements. 39 | 40 | 2. ``checkbox`` can be checked any time. Prints text as "\u2610" or "\u2612". 41 | Docx2Python can safely ignore this second type, as there will be a 42 | element inside with a checkbox character. 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | If the ``checked`` attribute is absent, return the default 51 | If the ``checked`` attribute is present, but not w:val is given, return unchecked 52 | """ 53 | 54 | def get_wval() -> str | None: 55 | """Get the value of the ``w:val`` attribute of the ``checked`` element. 56 | 57 | :return: the value of the ``w:val`` attribute of the ``checked`` element 58 | """ 59 | with suppress(StopIteration): 60 | checked = next(iterfind_by_qn(checkBox, "w:checked")) 61 | return str(checked.attrib.get(qn(checked, "w:val")) or "1") 62 | with suppress(StopIteration, KeyError): 63 | default = next(iterfind_by_qn(checkBox, "w:default")) 64 | return str(get_attrib_by_qn(default, "w:val")) 65 | return None 66 | 67 | return { 68 | "0": "\u2610", 69 | "false": "\u2610", 70 | "1": "\u2612", 71 | "true": "\u2612", 72 | None: "----checkbox failed----", 73 | }[get_wval()] 74 | 75 | 76 | def get_ddList_entry(ddList: EtreeElement) -> str: 77 | """Get only the selected string of a dropdown list. 78 | 79 | :param ddList: a dropdown-list element 80 | :return: w:listEntry value of input element. 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | might be missing when selection is "0" 89 | """ 90 | list_entries = [ 91 | get_attrib_by_qn(x, "w:val") for x in iterfind_by_qn(ddList, "w:listEntry") 92 | ] 93 | try: 94 | result = next(iterfind_by_qn(ddList, "w:result")) 95 | list_index = int(get_attrib_by_qn(result, "w:val")) 96 | except (StopIteration, KeyError): 97 | list_index = 0 98 | return str(list_entries[list_index]) 99 | -------------------------------------------------------------------------------- /docx2python/main.py: -------------------------------------------------------------------------------- 1 | """Top-level code. 2 | 3 | :author: Shay Hill 4 | :created: 7/2/2019 5 | """ 6 | 7 | from __future__ import annotations 8 | 9 | from typing import TYPE_CHECKING 10 | 11 | from docx2python.docx_output import DocxContent 12 | from docx2python.docx_reader import DocxReader 13 | 14 | if TYPE_CHECKING: 15 | import os 16 | from io import BytesIO 17 | 18 | 19 | def docx2python( 20 | docx_filename: str | os.PathLike[str] | BytesIO, 21 | image_folder: str | os.PathLike[str] | None = None, 22 | *, 23 | html: bool = False, 24 | duplicate_merged_cells: bool = True, 25 | ) -> DocxContent: 26 | """Unzip a docx file and extract contents. 27 | 28 | :param docx_filename: path to a docx file 29 | :param image_folder: optionally specify an image folder 30 | (images in docx will be copied to this folder) 31 | :param html: bool, extract some formatting as html 32 | :param duplicate_merged_cells: bool, duplicate merged cells to return a mxn 33 | nested list for each table (default True) 34 | :return: DocxContent object 35 | """ 36 | docx_context = DocxReader( 37 | docx_filename, html=html, duplicate_merged_cells=duplicate_merged_cells 38 | ) 39 | docx_content = DocxContent(docx_context, image_folder) 40 | if image_folder: 41 | _ = docx_content.images 42 | return docx_content 43 | -------------------------------------------------------------------------------- /docx2python/merge_runs.py: -------------------------------------------------------------------------------- 1 | """Merge runs with identical formatting. 2 | 3 | :author: Shay Hill 4 | :created: 12/13/2021 5 | 6 | Join consecutive xml runs with identical formatting. See docstring for ``merge_elems``. 7 | """ 8 | 9 | from __future__ import annotations 10 | 11 | import functools 12 | from itertools import groupby 13 | from typing import TYPE_CHECKING 14 | 15 | from docx2python.attribute_register import Tags, get_prefixed_tag, has_content 16 | from docx2python.text_runs import get_html_formatting 17 | 18 | if TYPE_CHECKING: 19 | from lxml.etree import _Element as EtreeElement # type: ignore 20 | 21 | from docx2python.docx_reader import File 22 | 23 | # identify tags that will be merged together (if formatting is equivalent) 24 | _MERGEABLE_TAGS = {Tags.RUN, Tags.HYPERLINK, Tags.TEXT, Tags.TEXT_MATH} 25 | 26 | 27 | def _is_mergeable(elem: EtreeElement) -> bool: 28 | """Can a run be merged with another run?""" 29 | return elem.tag in _MERGEABLE_TAGS or get_prefixed_tag(elem) in _MERGEABLE_TAGS 30 | 31 | 32 | def _elem_key(file: File, elem: EtreeElement) -> tuple[str, str, list[str]]: 33 | """Return enough info to tell if two elements are closely formatted. 34 | 35 | :param elem: any element in an xml file. 36 | :return: A summary of attributes (if two adjacent elements return the same key, 37 | they are considered mergeable). Only used to merge elements, so returns None 38 | if elements are not mergeable. 39 | 40 | Ignore text formatting differences if consecutive link elements point to the same 41 | address. Always join these. 42 | 43 | Docx2Text joins consecutive runs and links of the same style. Comparing two 44 | elem_key return values will tell you if 45 | * elements are the same type 46 | * link rels ids reference the same link 47 | * run styles are the same (as far as docx2python understands them) 48 | 49 | Elem rId attributes are replaced with rId['Target'] because different rIds can 50 | point to identical targets. This is important for hyperlinks, which can look 51 | different but point to the same address. 52 | 53 | """ 54 | tag = str(elem.tag) 55 | if not _is_mergeable(elem): 56 | return tag, "", [] 57 | 58 | # always join links pointing to the same address 59 | # elem.attrib key for relationship ids. These can find the information they 60 | # reference by ``file_instance.rels[elem.attrib[RELS_ID]]`` 61 | rels_id_key = f"{{{elem.nsmap['r']}}}id" 62 | rels_id = elem.attrib.get(rels_id_key) 63 | if rels_id: 64 | return tag, str(file.rels[str(rels_id)]), [] 65 | 66 | return tag, "", get_html_formatting(elem, file.context.xml2html_format) 67 | 68 | 69 | def _is_text_or_text_math(elem: EtreeElement) -> bool: 70 | """Can an element be treated as text?""" 71 | text_or_text_math = {Tags.TEXT, Tags.TEXT_MATH} 72 | return elem.tag in text_or_text_math or get_prefixed_tag(elem) in text_or_text_math 73 | 74 | 75 | def merge_elems(file: File, tree: EtreeElement) -> None: 76 | """Recursively merge duplicate (as far as docx2python is concerned) elements. 77 | 78 | :param file: File instancce 79 | :param tree: root_element from an xml in File instance 80 | :effects: Merges consecutive elements if tag, attrib, and style are the same 81 | 82 | There are a few ways consecutive elements can be "identical": 83 | * same link 84 | * same style 85 | 86 | Often, consecutive, "identical" elements are written as separate elements, 87 | because they aren't identical to Word. Word keeps track of revision history, 88 | spelling errors, etc., which are meaningless to docx2python. 89 | 90 | 91 | 92 | 93 | hy 94 | 95 | 96 | 97 | 98 | 99 | per 100 | 101 | 102 | 103 | 104 | link 105 | 106 | 107 | 108 | 109 | Docx2python condenses the above to (by merging links) 110 | 111 | 112 | 113 | 114 | hy 115 | 116 | 117 | per 118 | 119 | 120 | link 121 | 122 | 123 | 124 | 125 | Then to (by merging runs) 126 | 127 | 128 | 129 | 130 | hy 131 | per 132 | link 133 | 134 | 135 | 136 | 137 | Then finally to (by merging text) 138 | 139 | 140 | 141 | 142 | hyperlink 143 | 144 | 145 | 146 | 147 | This function only merges runs, text, and hyperlinks, because merging paragraphs 148 | or larger elements would ignore information docx2python DOES want to preserve. 149 | 150 | Filter out non-content items so runs can be joined even 151 | """ 152 | file_elem_key = functools.partial(_elem_key, file) 153 | 154 | elems = [x for x in tree if has_content(x)] 155 | runs = [list(y) for _, y in groupby(elems, key=file_elem_key)] 156 | 157 | for run in (x for x in runs if len(x) > 1 and _is_mergeable(x[0])): 158 | if _is_text_or_text_math(run[0]): 159 | run[0].text = "".join(x.text or "" for x in run) 160 | for elem in run[1:]: 161 | for e in elem: 162 | run[0].append(e) 163 | tree.remove(elem) 164 | 165 | for branch in tree: 166 | merge_elems(file, branch) 167 | -------------------------------------------------------------------------------- /docx2python/namespace.py: -------------------------------------------------------------------------------- 1 | """Register namespace entries in xml ``document`` elements. 2 | 3 | :author: Shay Hill 4 | :created: 7/5/2019 5 | 6 | A ```` element at the top of each xml file defines a namespace:: 7 | 8 | 12 | 13 | These entries can be accessed in the file by their abbreviations:: 14 | 15 | 16 | contents of paragraph 17 | 18 | 19 | ``lxml.etree`` reads ``""`` as 20 | 21 | ``"{http://schemas.openxmlformats.org/wordprocessingml/2006/main}p"`` 22 | 23 | This module defines the necessary namespaces and transforms ``"w:p"`` to 24 | ``{http://...}p``. This allows readable code like:: 25 | 26 | if element.tag == qn("w:p"): 27 | 28 | instead of:: 29 | 30 | if element.tag == "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}p": 31 | 32 | If somewhere along the line this package just stops working, it may be that the NSMAP 33 | entries have been updated for whatever docx you're working with (though that's not 34 | supposed to ever happen). *If* this happens:: 35 | 36 | 1) Unzip the docx. 37 | 2) open ``word/document.xml`` in a text editor. 38 | 3) Search for xmlns:w=[some string] 39 | 4) update NSMAP['w'] = some string 40 | 41 | Lxml allows (deceptively) easy access to a file's namespaces; however, this is 42 | problematic because ``root_element.nsmap`` may not retrieve all nsmap entries. Other 43 | entries may be buried inside sub-environments further down in the tree. It is safer 44 | to explicate namespace mapping. 45 | 46 | If you extend docx2text with other tags, additional NSMAP entries may be necessary. 47 | """ 48 | 49 | from __future__ import annotations 50 | 51 | from typing import TYPE_CHECKING 52 | 53 | from docx2python.attribute_register import get_prefixed_tag 54 | 55 | if TYPE_CHECKING: 56 | from collections.abc import Iterator 57 | 58 | from lxml.etree import _Element as EtreeElement # type: ignore 59 | 60 | 61 | def qn(elem: EtreeElement, tag: str) -> str: 62 | """Turn a namespace-prefixed tag into a Clark-notation qualified tag. 63 | 64 | :param elem: lxml.etree._Element object 65 | :param tag: namespace-prefixed tag, e.g. ``w:p`` 66 | :return: Clark-notation qualified tag, 67 | e.g. ``{http://schemas.openxmlformats.org/wordprocessingml/2006/main}p`` 68 | IN THE NAMESPACES DEFINED IN THE ``elem`` ELEMENT 69 | 70 | Most lxml elements contain the entire namespace of their parent elements. Create 71 | a tag within this namespace. 72 | 73 | Stands for 'qualified name', a utility function to turn a namespace prefixed tag 74 | name into a Clark-notation qualified tag name for lxml. 75 | 76 | >>> qn('w:cSld') 77 | '{http://schemas.../main}cSld' 78 | 79 | Source: https://github.com/python-openxml/python-docx/ 80 | """ 81 | prefix, localname = tag.split(":") 82 | uri = elem.nsmap[prefix] 83 | return f"{{{uri}}}{localname}" 84 | 85 | 86 | def get_attrib_by_qn(elem: EtreeElement, tag: str) -> str: 87 | """Get the attribute of an element by a namespace-prefixed tag. 88 | 89 | :param elem: lxml.etree._Element object 90 | :param tag: namespace-prefixed tag, e.g. ``w:p`` 91 | :return: attribute of the element with the namespace-prefixed tag 92 | """ 93 | return elem.attrib[qn(elem, tag)] 94 | 95 | 96 | def find_by_qn(elem: EtreeElement, tag: str) -> EtreeElement | None: 97 | """Find next element in the tree with a namespace-prefixed tag. 98 | 99 | :param elem: lxml.etree._Element object 100 | :param tag: namespace-prefixed tag, e.g. ``w:p`` 101 | :return: next element with the namespace-prefixed tag 102 | """ 103 | return elem.find(qn(elem, tag)) 104 | 105 | 106 | def findall_by_qn(elem: EtreeElement, tag: str) -> list[EtreeElement]: 107 | """Find all elements in the tree with a namespace-prefixed tag. 108 | 109 | :param elem: lxml.etree._Element object 110 | :param tag: namespace-prefixed tag, e.g. ``w:p`` 111 | :return: list of elements with the namespace-prefixed tag 112 | """ 113 | return elem.findall(qn(elem, tag)) 114 | 115 | 116 | def find_parent_by_qn(elem: EtreeElement | None, tag: str) -> EtreeElement | None: 117 | """Find the parent element in the tree with a namespace-prefixed tag. 118 | 119 | :param elem: lxml.etree._Element object 120 | :param tag: namespace-prefixed tag, e.g. ``w:p`` 121 | :return: parent element with the namespace-prefixed tag 122 | """ 123 | if elem is None: 124 | return None 125 | if get_prefixed_tag(elem) == tag: 126 | return elem 127 | return find_parent_by_qn(elem.getparent(), tag) 128 | 129 | 130 | def iterfind_by_qn(elem: EtreeElement, tag: str) -> Iterator[EtreeElement]: 131 | """Iterate over all elements in the tree with a namespace-prefixed tag. 132 | 133 | :param elem: lxml.etree._Element object 134 | :param tag: namespace-prefixed tag, e.g. ``w:p`` 135 | :return: iterator over elements with the namespace-prefixed tag 136 | """ 137 | yield from elem.iterfind(qn(elem, tag)) 138 | -------------------------------------------------------------------------------- /docx2python/numbering_formats.py: -------------------------------------------------------------------------------- 1 | """Numbering formats for converted XML lists. 2 | 3 | :author: Shay Hill 4 | :created: 6/26/2019 5 | 6 | I don't want to add non-ascii text to a potentially ascii-only file, so all bullets 7 | are '--' and Roman numerals stop at 3999. 8 | 9 | Doesn't capture formatting like 1.1.1 or b) or (ii). Only the six basic formats are 10 | covered:: 11 | 12 | -- bullet 13 | 1 decimal 14 | a lowerLetter 15 | A upperLetter 16 | i lowerRoman 17 | I upperRoman 18 | """ 19 | 20 | from string import ascii_lowercase 21 | 22 | # Subs to convert any number of i's to a proper Roman numeral 23 | # fmt=off 24 | ROMAN_SUBS = [ 25 | ("iiiii", "v"), # 1+1+1+1+1 -> 5 26 | ("vv", "x"), # 5+5 -> 10 27 | ("xxxxx", "l"), # 10+10+10+10 -> 50 28 | ("ll", "c"), # 50+50 -> 100 29 | ("ccccc", "d"), # 100+100+100+100+100 -> 500 30 | ("dd", "m"), # 500+500 -> 1000 31 | ("iiii", "iv"), # 1+1+1+1 -> 4 32 | ("viv", "ix"), # 5+4 -> 9 33 | ("xxxx", "xl"), # 10+10+10+10 -> 40 34 | ("lxl", "xc"), # 50+40 -> 90 35 | ("cccc", "cd"), # 100+100+100+100 -> 40 36 | ("dcd", "cm"), # 500+400 -> 900 37 | ] 38 | # fmt=on 39 | 40 | 41 | def lower_letter(n: int) -> str: 42 | """Convert a positive integer to a string of letters representing base 26. 43 | 44 | :param n: any positive integer 45 | :return: the kind of "numbering" used for numbered lists and excel columns. 46 | (a, b, c ... aa, ab ...) Zero is undefined. 47 | :raise ValueError: if n is not a positive integer 48 | 49 | >>> lower_letter(1) 50 | 'a' 51 | >>> lower_letter(26) 52 | 'z' 53 | >>> lower_letter(27) 54 | 'aa' 55 | """ 56 | if n < 1: 57 | msg = f"0 and <1 are not defined for this numbering: {n}" 58 | raise ValueError(msg) 59 | result = "" 60 | while n: 61 | n, remainder = divmod(n - 1, 26) 62 | result = ascii_lowercase[remainder] + result 63 | return result 64 | 65 | 66 | def upper_letter(n: int) -> str: 67 | """Get int as an upprecase letter. 68 | 69 | :param n: any positive integer 70 | :return: the kind of "numbering" used for numbered lists and excel columns. 71 | """ 72 | return lower_letter(n).upper() 73 | 74 | 75 | def lower_roman(n: int) -> str: 76 | """Convert a positive integer to a lowercase Roman numeral. 77 | 78 | :param n: any positive integer 79 | :return: Roman number equivalent of n 80 | :raise ValueError: if n is not a positive integer 81 | 82 | >>> lower_roman(1) 83 | 'i' 84 | >>> lower_roman(9) 85 | 'ix' 86 | >>> lower_roman(44) 87 | 'xliv' 88 | 89 | Numbers greater than 3999 can be expressed with a bar over the number. The bar 90 | means "times 1000" (e.g., iv with a bar over it would be 4000). 91 | 92 | It'll never happen in this project, and I don't want to add non-ascii to what 93 | might be a pure ascii file, so this function will keep adding 'm' to as many 94 | thousand as you'd like. 95 | 96 | >>> lower_roman(10000) 97 | 'mmmmmmmmmm' 98 | """ 99 | if n < 1: 100 | msg = f"the Romans hadn't figured out {n}" 101 | raise ValueError(msg) 102 | result = "i" * n 103 | for pattern, replacement in ROMAN_SUBS: 104 | result = result.replace(pattern, replacement) 105 | return result 106 | 107 | 108 | def upper_roman(n: int) -> str: 109 | """Get int as an uppercase Roman numeral. 110 | 111 | :param n: any positive integer 112 | :return: Roman number equivalent of n 113 | """ 114 | return lower_roman(n).upper() 115 | 116 | 117 | def decimal(n: int) -> str: 118 | """Get int as a decimal number string. 119 | 120 | :param n: any integer 121 | :return: string such that int(decimal(n)) == n 122 | """ 123 | return str(n) 124 | 125 | 126 | def bullet(_: int = 0) -> str: 127 | """Get the string we're using to replace bullets. 128 | 129 | :return: the string we're using to replace bullets. 130 | """ 131 | return "--" 132 | -------------------------------------------------------------------------------- /docx2python/py.typed: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShayHill/docx2python/7ba456ea2fdbe62daa97aacb5fa7cb413221ff21/docx2python/py.typed -------------------------------------------------------------------------------- /docx2python/text_runs.py: -------------------------------------------------------------------------------- 1 | """Get text run formatting. 2 | 3 | :author: Shay Hill 4 | :created: 7/4/2019 5 | 6 | Text runs are formatted inline in the ``trash/document.xml`` or header files. Read 7 | those elements to extract formatting information. 8 | """ 9 | 10 | from __future__ import annotations 11 | 12 | from collections import defaultdict 13 | from contextlib import suppress 14 | from typing import TYPE_CHECKING 15 | 16 | from docx2python.attribute_register import ( 17 | HtmlFormatter, 18 | Tags, 19 | get_localname, 20 | get_prefixed_tag, 21 | ) 22 | from docx2python.namespace import find_parent_by_qn, qn 23 | 24 | if TYPE_CHECKING: 25 | from collections.abc import Sequence 26 | 27 | from lxml.etree import _Element as EtreeElement # type: ignore 28 | 29 | 30 | def _gather_sub_vals(element: EtreeElement, qname: str) -> dict[str, str | None]: 31 | """Gather formatting elements for a paragraph or text run. 32 | 33 | :param element: a ```` or ```` xml element. Maybe others 34 | :param qname: qualified name for child element. 35 | 36 | create with:: 37 | 38 | document = etree.fromstring('bytes string') 39 | # recursively search document for elements. 40 | 41 | :return: Style names ('b/', 'sz', etc.) mapped to values. 42 | 43 | To keep things more homogeneous, I've given tags like ```` (bold) a value of 44 | None, even though they don't take a value in xml. 45 | 46 | Each element of rPr will be either present (returned tag: None) or have a value 47 | (returned tag: val). 48 | 49 | **E.g., given**:: 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | text styled with rPaa 60 | 61 | 62 | 63 | **E.g., returns**:: 64 | 65 | { 66 | "rFonts": True, 67 | "b": None, 68 | "u": "single", 69 | "i": None, 70 | "sz": "32", 71 | "color": "red", 72 | "szCs": "32", 73 | } 74 | """ 75 | sub_vals: dict[str, str | None] = {} 76 | with suppress(StopIteration): 77 | for sub_element in next(element.iterfind(qname)): 78 | sub_val = sub_element.attrib.get(qn(sub_element, "w:val")) 79 | 80 | if sub_val: 81 | sub_vals[get_localname(sub_element)] = str(sub_val) 82 | else: 83 | sub_vals[get_localname(sub_element)] = None 84 | return sub_vals 85 | 86 | 87 | def gather_Pr(element: EtreeElement, tag: str | None = None) -> dict[str, str | None]: 88 | """Gather style values for a , , or element (maybe others). 89 | 90 | :param element: any xml element. r and p elems typically have Pr values. 91 | :param tag: optionally specify a tag to search for, e.g., 'w:sdt' 92 | :return: Style names ('b/', 'sz', etc.) mapped to values. 93 | 94 | These elements often have a subelement ```` or ```` which contains 95 | formatting instructions. This includes colspan, rowspan, and other table-cell 96 | properties. 97 | 98 | Will infer a style element qualified name: p -> pPr; r -> rPr 99 | 100 | Call this with any element. Runs and Paragraphs may have a Pr element. Most 101 | elements will not, but the function will will quietly return an empty dict. 102 | 103 | **Optional tag argument** 104 | 105 | The properties element is a child of the element it describes. With the default 106 | tag=None argument, this function will return that child. Given a tag, the 107 | function will first search up for a matching tag, then return the properties 108 | element of that tag. This allows simple access to, for example, the pPr element 109 | from a descendent `w:t` or `w:r` element. 110 | 111 | ``` 112 | 113 | 114 | 115 | 116 | 117 | 118 | ``` 119 | """ 120 | parent = element if tag is None else find_parent_by_qn(element, tag) 121 | if parent is None: 122 | return {} 123 | return _gather_sub_vals(parent, str(parent.tag) + "Pr") 124 | 125 | 126 | def get_pStyle(paragraph_element: EtreeElement) -> str: 127 | """Collect and format paragraph -> pPr -> pStyle value. 128 | 129 | :param paragraph_element: a ```` xml element 130 | 131 | :return: ``[(pStyle value, '')]`` 132 | 133 | Also see docstring for ``gather_pPr`` 134 | """ 135 | return gather_Pr(paragraph_element).get("pStyle", "") or "" 136 | 137 | 138 | def get_run_formatting( 139 | run_element: EtreeElement, xml2html: dict[str, HtmlFormatter] 140 | ) -> list[str]: 141 | """Get run-element formatting converted into html. 142 | 143 | :param run_element: a ```` xml element 144 | create with:: 145 | 146 | document = etree.fromstring('bytes string') 147 | # recursively search document for elements. 148 | 149 | :param xml2html: mapping to convert xml styles to html styles 150 | e.g., { 151 | 'b': ( at 0x0000026BC7875A60>,), 152 | 'smallCaps': ( at 0x0000026BC7896DC0>, 'font', 'style') 153 | } 154 | 155 | :return: ``['b', 'i', ...]`` 156 | 157 | Lists are always returned in order: 158 | 159 | ``"span"`` first then any other styles in alphabetical order. 160 | 161 | Also see docstring for ``gather_rPr`` 162 | """ 163 | return _format_Pr_into_html(gather_Pr(run_element), xml2html) 164 | 165 | 166 | def get_paragraph_formatting( 167 | paragraph_element: EtreeElement, xml2html: dict[str, HtmlFormatter] 168 | ) -> list[str]: 169 | """Get paragraph-element formatting converted into html. 170 | 171 | :param paragraph_element: a ```` xml element 172 | create with:: 173 | 174 | document = etree.fromstring('bytes string') 175 | # recursively search document for elements. 176 | 177 | :param xml2html: mapping to convert xml styles to html styles 178 | e.g., { 179 | 'b': ( at 0x0000026BC7875A60>,), 180 | 'smallCaps': ( at 0x0000026BC7896DC0>, 'font', 'style') 181 | } 182 | 183 | :return: ``['b', 'i', ...]`` 184 | 185 | Tuples are always returned in order: 186 | 187 | ``"font"`` first then any other styles in alphabetical order. 188 | 189 | Also see docstring for ``gather_rPr`` 190 | """ 191 | return _format_Pr_into_html({get_pStyle(paragraph_element): None}, xml2html) 192 | 193 | 194 | def _format_Pr_into_html( 195 | Pr2val: dict[str, str | None], xml2html: dict[str, HtmlFormatter] 196 | ) -> list[str]: 197 | """Format tags and values into html strings. 198 | 199 | :param Pr2val: tags mapped to values (extracted from xml) 200 | e.g., {'b': None, 'bCs': None} 201 | :param xml2html: mapping to convert xml styles to html styles 202 | e.g., { 203 | 'b': ( at 0x0000026BC7875A60>,), 204 | 'smallCaps': ( at 0x0000026BC7896DC0>, 'span', 'style') 205 | } 206 | :return: the interior part of html opening tags, eg, ['span style="..."', 'b', 'i'] 207 | 208 | Types of styles supported: 209 | (None, None, formatter -> tag, None) 210 | -> outside any containers, no value set, e.g., `` 211 | ('span', 'style', formatter -> tag, val) 212 | -> inside a span, inside a style property, e.g., `` 213 | 214 | Other formats would probably work, but they aren't necessary to support the tags 215 | supported (see README). 216 | """ 217 | style: list[str] = [] 218 | 219 | # group together supported formats with the same container and property_ 220 | # e.g., group together everything that goes into `` 221 | # con_pro2for[(con, pro)] = string created from for 222 | con_pro2for: defaultdict[tuple[None | str, None | str], list[str]] 223 | con_pro2for = defaultdict(list) 224 | for tag, val in ((k, v) for k, v in Pr2val.items() if k in xml2html): 225 | formatter, container, property_ = xml2html[tag] 226 | con_pro2for[(container, property_)].append(formatter(tag, val or "")) 227 | 228 | # group together supported formats with the same container 229 | # e.g., group together everything that goes into `` 230 | # con2pro_for[(con,)] = string created from pro and for 231 | con2pro_for: defaultdict[str, list[str]] = defaultdict(list) 232 | for k, v in sorted((k, v) for k, v in con_pro2for.items() if k[1] is not None): 233 | con2pro_for[k[0] or ""].append(f'{k[1]}="{";".join(sorted(v))}"') 234 | 235 | # incorporate container type into string 236 | # style.append(string created from con, pro, and for) 237 | for k_, v_ in sorted((k, v) for k, v in con2pro_for.items() if k): 238 | style.append(f"{k_} {' '.join(v_)}") 239 | 240 | # add back in formats with no container or property_ 241 | style += sorted(con_pro2for[(None, None)]) 242 | return style 243 | 244 | 245 | def get_html_formatting( 246 | elem: EtreeElement, xml2html: dict[str, HtmlFormatter] 247 | ) -> list[str]: 248 | """Get style for an element (if available). 249 | 250 | :param elem: a run or paragraph element. 251 | :param xml2html: mapping to convert xml styles to html styles 252 | e.g., { 253 | 'b': ( at 0x0000026BC7875A60>,), 254 | 'smallCaps': ( at 0x0000026BC7896DC0>, 'font', 'style') 255 | } 256 | :return: ``[(rPr, val), (rPr, val) ...]`` 257 | """ 258 | if get_prefixed_tag(elem) == Tags.RUN: 259 | return get_run_formatting(elem, xml2html) 260 | if get_prefixed_tag(elem) == Tags.PARAGRAPH: 261 | return get_paragraph_formatting(elem, xml2html) 262 | return [] 263 | 264 | 265 | def html_open(style: Sequence[str]) -> str: 266 | """HTML tags to open a style. 267 | 268 | :param style: sequence of html tags without the '<' and '>' 269 | :return: opening html tags joined into a single string 270 | 271 | >>> style = ['font color="red" size="32"', 'b', 'i', 'u'] 272 | >>> html_open(style) 273 | '' 274 | """ 275 | return "".join(f"<{x}>" for x in style) 276 | 277 | 278 | def html_close(style: list[str]) -> str: 279 | """HTML tags to close a style. 280 | 281 | :param style: sequence of html tags without the '<' and '>' 282 | :return: closing html tags joined into a single string 283 | 284 | >>> style = ['font color="red" size="32"', 'b', 'i', 'u'] 285 | >>> html_close(style) 286 | '' 287 | 288 | Tags will always be in reverse (of open) order, so open - close will look like:: 289 | 290 | text 291 | """ 292 | return "".join(f"" for x in reversed(style)) 293 | -------------------------------------------------------------------------------- /docx2python/utilities.py: -------------------------------------------------------------------------------- 1 | """Utility / example functions using new (as of 2.0.0 Docx2Python features). 2 | 3 | :author: Shay Hill 4 | :created: 2021-12-21 5 | 6 | Docx2Python version two exposes extracted xml in the DocxReader object and has a new 7 | paragraph_styles argument. These functions use these new features as utilities / 8 | examples. 9 | """ 10 | 11 | from __future__ import annotations 12 | 13 | import copy 14 | import re 15 | from typing import TYPE_CHECKING 16 | 17 | from lxml import etree 18 | 19 | from docx2python.iterators import iter_at_depth 20 | from docx2python.main import docx2python 21 | 22 | if TYPE_CHECKING: 23 | import os 24 | from collections.abc import Iterator 25 | 26 | from lxml.etree import _Element as EtreeElement # type: ignore 27 | 28 | 29 | def _copy_new_text(elem: EtreeElement, new_text: str) -> EtreeElement: 30 | """Copy a text element and replace text. 31 | 32 | :param elem: an etree element with tag w:t 33 | :param new_text: text to replace elem.text 34 | :return: a new etree element with tag w:t and text new_text 35 | """ 36 | new_elem = copy.deepcopy(elem) 37 | new_elem.text = new_text 38 | return new_elem 39 | 40 | 41 | def _new_br_element(elem: EtreeElement) -> EtreeElement: 42 | """Return a break element with a representative elements namespace. 43 | 44 | :param elem: xml element 45 | :return: a new br element 46 | """ 47 | prefix = elem.nsmap["w"] 48 | return etree.Element(f"{{{prefix}}}br") 49 | 50 | 51 | def replace_root_text(root: EtreeElement, old: str, new: str) -> None: 52 | """Replace :old: with :new: in all descendants of :root:. 53 | 54 | :param root: an etree element presumably containing descendant text elements 55 | :param old: text to be replaced 56 | :param new: replacement text 57 | 58 | Will use softbreaks
to preserve line breaks in replacement text. 59 | """ 60 | 61 | def recursive_text_replace(branch: EtreeElement): 62 | """Replace any text element contining old with one or more elements. 63 | 64 | :param branch: an etree element 65 | """ 66 | for elem in tuple(branch): 67 | if not elem.text or old not in elem.text: 68 | recursive_text_replace(elem) 69 | continue 70 | 71 | # create a new text element for each line in replacement text 72 | text = elem.text.replace(old, new) 73 | new_elems = [_copy_new_text(elem, line) for line in text.splitlines()] 74 | 75 | # insert breakpoints where line breaks were 76 | breaks = [_new_br_element(elem) for _ in new_elems] 77 | new_elems = [x for pair in zip(new_elems, breaks) for x in pair][:-1] 78 | 79 | # replace the original element with the new elements 80 | parent = elem.getparent() 81 | if parent is not None: 82 | index = parent.index(elem) 83 | parent[index : index + 1] = new_elems 84 | 85 | recursive_text_replace(root) 86 | 87 | 88 | def replace_docx_text( 89 | path_in: str | os.PathLike[str], 90 | path_out: str | os.PathLike[str], 91 | *replacements: tuple[str, str], 92 | html: bool = False, 93 | ) -> None: 94 | """Replace text in a docx file. 95 | 96 | :param path_in: path to input docx 97 | :param path_out: path to output docx with text replaced 98 | :param replacements: tuples of strings (a, b) replace a with b for each in docx. 99 | :param html: respect formatting (as far as docx2python can see formatting) 100 | """ 101 | reader = docx2python(path_in, html=html).docx_reader 102 | for file in reader.content_files(): 103 | root = file.root_element 104 | for replacement in replacements: 105 | replace_root_text(root, *replacement) 106 | reader.save(path_out) 107 | reader.close() 108 | 109 | 110 | def get_links(path_in: str | os.PathLike[str]) -> Iterator[tuple[str, str]]: 111 | """Yield links inside a docx file as (href, text). 112 | 113 | :param path_in: path to input docx 114 | :yield: every link in the file as a tuple of (href, text) 115 | :return: None 116 | """ 117 | link_pattern = re.compile('(?P[^<]+)') 118 | extraction = docx2python(path_in) 119 | for run in iter_at_depth(extraction.document_runs, 5): 120 | match = re.match(link_pattern, run) 121 | if match: 122 | href, text = match.groups() 123 | yield href, text 124 | extraction.close() 125 | 126 | 127 | def get_headings(path_in: str | os.PathLike[str]) -> Iterator[list[str]]: 128 | """Yield paragraphs with 'Heading' patagraph_style. 129 | 130 | :param path_in: path to input docx 131 | :yield: every paragraph with 'Heading' paragraph_style as a list of strings 132 | :return: None 133 | 134 | When docx2python paragraph_styles parameter is set to True, the first run in 135 | every paragraph will be a paragraph style extracted from the xml, if present. 136 | Else, paragraphs style will be "". 137 | """ 138 | heading_pattern = re.compile(r"Heading\d") 139 | with docx2python(path_in, html=True) as extraction: 140 | for par in iter_at_depth(extraction.document_pars, 4): 141 | if re.match(heading_pattern, par.style): 142 | yield par.run_strings 143 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | 2 | [project] 3 | name = "docx2python" 4 | version = "3.5.0" 5 | description = "Extract content from docx files" 6 | authors = [{ name = "Shay Hill", email = "shay_public@hotmail.com" }] 7 | license = {text = "MIT"} 8 | readme = "README.md" 9 | requires-python = ">=3.9.0" 10 | dependencies = ["lxml", "paragraphs", "typing_extensions", "types-lxml"] 11 | 12 | [project.optional-dependencies] 13 | dev = ["commitizen", "pre-commit", "pytest", "tox", "types-lxml"] 14 | 15 | [build-system] 16 | requires = ["setuptools", "setuptools-scm"] 17 | build-backend = "setuptools.build_meta" 18 | 19 | 20 | [tool.commitizen] 21 | name = "cz_conventional_commits" 22 | version = "3.5.0" 23 | tag_format = "$version" 24 | major-version-zero = true 25 | version_files = ["pyproject.toml:^version"] 26 | 27 | 28 | [tool.isort] 29 | profile = "black" 30 | 31 | 32 | [tool.tox] 33 | legacy_tox_ini = """ 34 | [tox] 35 | envlist = py{313,312,311,310,39} 36 | 37 | [testenv] 38 | deps = pytest 39 | commands = pytest 40 | """ 41 | 42 | 43 | [tool.pytest.ini_options] 44 | pythonpath = ["tests"] 45 | log_cli = 1 46 | 47 | 48 | [tool.pyright] 49 | include = ["src"] 50 | exclude = ["**/__pycache__.py"] 51 | 52 | pythonVersion = "3.9" 53 | pythonPlatform = "Any" 54 | 55 | typeCheckingMode = "strict" 56 | reportShadowedImports = true 57 | reportCallInDefaultInitializer = true 58 | reportImplicitStringConcatenation = true 59 | # reportMissingSuperCall = true 60 | reportPropertyTypeMismatch = true 61 | reportUninitializedInstanceVariable = true 62 | reportUnnecessaryTypeIgnoreComment = true 63 | reportUnusedCallResult = true 64 | reportUnknownArgumentType = false 65 | reportUnknownLambdaType = false 66 | reportUnknownMemberType = false 67 | reportUnknownParameterType = false 68 | reportUnknownVariableType = false 69 | reportUntypedFunctionDecorator = false 70 | 71 | venvPath = "." 72 | venv = "./venv" 73 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShayHill/docx2python/7ba456ea2fdbe62daa97aacb5fa7cb413221ff21/tests/__init__.py -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | """ 2 | 3 | :author: Shay Hill 4 | :created: 7/2/2019 5 | 6 | """ 7 | 8 | from __future__ import annotations 9 | 10 | from pathlib import Path 11 | from typing import Any 12 | 13 | _PROJECT = Path(__file__).parent.parent 14 | 15 | 16 | def pytest_assertrepr_compare(config: Any, op: str, left: str, right: str) -> list[str]: 17 | """See full error diffs""" 18 | del config 19 | if op in ("==", "!="): 20 | return [f"{left} {op} {right}"] 21 | return [] 22 | 23 | 24 | RESOURCES = Path(_PROJECT, "tests", "resources") 25 | -------------------------------------------------------------------------------- /tests/do_not_test_missing_imagedata_rid.py: -------------------------------------------------------------------------------- 1 | """Skip image element when imagedata r:id cannot be found. 2 | 3 | :author: Shay Hill 4 | :created: 11/15/2020 5 | 6 | User forky2 sent a docx file with an empty imagedata element: 7 | 8 | `` 9 | 10 | Docx2python expects to encounter 11 | 12 | `` 13 | 14 | Where `r:id="rId689"` is mapped to an image filename in one of the `rels` files. 15 | 16 | The missing `r:id` raises a KeyError in docx2python v1.27 17 | 18 | ``` 19 | Traceback (most recent call last): 20 | File "./process.py", line 99, in 21 | process_zip("Specs/2020-06/Rel-16/25_series/25101-g10.zip") 22 | File "./process.py", line 70, in process_zip 23 | doc_data = docx2python(docx_file) 24 | File "/home/forky2/projects/docx2python/docx2python/main.py", line 61, in docx2python 25 | body = file_text(context["officeDocument"]) 26 | File "/home/forky2/projects/docx2python/docx2python/main.py", line 56, in file_text 27 | return get_text(unzipped, context) 28 | File "/home/forky2/projects/docx2python/docx2python/docx_text.py", line 264, in get_text 29 | branches(ElementTree.fromstring(xml)) 30 | File "/home/forky2/projects/docx2python/docx2python/docx_text.py", line 248, in branches 31 | branches(child) 32 | File "/home/forky2/projects/docx2python/docx2python/docx_text.py", line 248, in branches 33 | branches(child) 34 | File "/home/forky2/projects/docx2python/docx2python/docx_text.py", line 248, in branches 35 | branches(child) 36 | [Previous line repeated 2 more times] 37 | File "/home/forky2/projects/docx2python/docx2python/docx_text.py", line 239, in branches 38 | rId = child.attrib[qn("r:id")] 39 | KeyError: '{http://schemas.openxmlformats.org/officeDocument/2006/relationships}id' 40 | ``` 41 | 42 | Solution: skip silently when an `r:id` cannot be found for an `imagedata` element. 43 | """ 44 | 45 | # from docx2python import docx2python 46 | 47 | 48 | # class TestMissingRIdInImagedata: 49 | # def test_skips_missing_rid(self) -> None: 50 | # """Silently skip over imagedata element if r:id not found""" 51 | # pars = docx2python("resources/imagedata_without_rid.docx") 52 | -------------------------------------------------------------------------------- /tests/do_not_test_problem_files.py: -------------------------------------------------------------------------------- 1 | """Run problem files I come across. 2 | 3 | :author: Shay Hill 4 | :created: 7/17/2019 5 | """ 6 | 7 | from docx2python.main import docx2python 8 | 9 | 10 | def test_dop_1013a() -> None: 11 | """Misidentifies ``word/document.xml`` as ``word/word/document.xml``""" 12 | with docx2python("resources/example.docx") as _: 13 | pass 14 | with docx2python("resources/240-DOP-1013A Lay Down Tubulars.docx") as _: 15 | pass 16 | -------------------------------------------------------------------------------- /tests/resources/240-DOP-1013A Lay Down Tubulars.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShayHill/docx2python/7ba456ea2fdbe62daa97aacb5fa7cb413221ff21/tests/resources/240-DOP-1013A Lay Down Tubulars.docx -------------------------------------------------------------------------------- /tests/resources/ControlTest.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShayHill/docx2python/7ba456ea2fdbe62daa97aacb5fa7cb413221ff21/tests/resources/ControlTest.docx -------------------------------------------------------------------------------- /tests/resources/apples_and_pears.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShayHill/docx2python/7ba456ea2fdbe62daa97aacb5fa7cb413221ff21/tests/resources/apples_and_pears.docx -------------------------------------------------------------------------------- /tests/resources/ascii_printable.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShayHill/docx2python/7ba456ea2fdbe62daa97aacb5fa7cb413221ff21/tests/resources/ascii_printable.docx -------------------------------------------------------------------------------- /tests/resources/basic.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShayHill/docx2python/7ba456ea2fdbe62daa97aacb5fa7cb413221ff21/tests/resources/basic.docx -------------------------------------------------------------------------------- /tests/resources/check_drop_my.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShayHill/docx2python/7ba456ea2fdbe62daa97aacb5fa7cb413221ff21/tests/resources/check_drop_my.docx -------------------------------------------------------------------------------- /tests/resources/checked-true-false.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShayHill/docx2python/7ba456ea2fdbe62daa97aacb5fa7cb413221ff21/tests/resources/checked-true-false.docx -------------------------------------------------------------------------------- /tests/resources/checked_boxes.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShayHill/docx2python/7ba456ea2fdbe62daa97aacb5fa7cb413221ff21/tests/resources/checked_boxes.docx -------------------------------------------------------------------------------- /tests/resources/checked_drop1.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShayHill/docx2python/7ba456ea2fdbe62daa97aacb5fa7cb413221ff21/tests/resources/checked_drop1.docx -------------------------------------------------------------------------------- /tests/resources/comments.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShayHill/docx2python/7ba456ea2fdbe62daa97aacb5fa7cb413221ff21/tests/resources/comments.docx -------------------------------------------------------------------------------- /tests/resources/created-in-pages-bulleted-lists.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShayHill/docx2python/7ba456ea2fdbe62daa97aacb5fa7cb413221ff21/tests/resources/created-in-pages-bulleted-lists.docx -------------------------------------------------------------------------------- /tests/resources/created-in-pages-paragraphs-only.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShayHill/docx2python/7ba456ea2fdbe62daa97aacb5fa7cb413221ff21/tests/resources/created-in-pages-paragraphs-only.docx -------------------------------------------------------------------------------- /tests/resources/equations.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShayHill/docx2python/7ba456ea2fdbe62daa97aacb5fa7cb413221ff21/tests/resources/equations.docx -------------------------------------------------------------------------------- /tests/resources/example.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShayHill/docx2python/7ba456ea2fdbe62daa97aacb5fa7cb413221ff21/tests/resources/example.docx -------------------------------------------------------------------------------- /tests/resources/example_numbering.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShayHill/docx2python/7ba456ea2fdbe62daa97aacb5fa7cb413221ff21/tests/resources/example_numbering.docx -------------------------------------------------------------------------------- /tests/resources/has_pict.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShayHill/docx2python/7ba456ea2fdbe62daa97aacb5fa7cb413221ff21/tests/resources/has_pict.docx -------------------------------------------------------------------------------- /tests/resources/hyperlink.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShayHill/docx2python/7ba456ea2fdbe62daa97aacb5fa7cb413221ff21/tests/resources/hyperlink.docx -------------------------------------------------------------------------------- /tests/resources/imagedata_without_rid.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShayHill/docx2python/7ba456ea2fdbe62daa97aacb5fa7cb413221ff21/tests/resources/imagedata_without_rid.docx -------------------------------------------------------------------------------- /tests/resources/invalid_tag_name.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShayHill/docx2python/7ba456ea2fdbe62daa97aacb5fa7cb413221ff21/tests/resources/invalid_tag_name.docx -------------------------------------------------------------------------------- /tests/resources/libreoffice_conversion.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShayHill/docx2python/7ba456ea2fdbe62daa97aacb5fa7cb413221ff21/tests/resources/libreoffice_conversion.docx -------------------------------------------------------------------------------- /tests/resources/list_index_a.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShayHill/docx2python/7ba456ea2fdbe62daa97aacb5fa7cb413221ff21/tests/resources/list_index_a.docx -------------------------------------------------------------------------------- /tests/resources/long_hyperlink.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShayHill/docx2python/7ba456ea2fdbe62daa97aacb5fa7cb413221ff21/tests/resources/long_hyperlink.docx -------------------------------------------------------------------------------- /tests/resources/merged_cells.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShayHill/docx2python/7ba456ea2fdbe62daa97aacb5fa7cb413221ff21/tests/resources/merged_cells.docx -------------------------------------------------------------------------------- /tests/resources/merged_links.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShayHill/docx2python/7ba456ea2fdbe62daa97aacb5fa7cb413221ff21/tests/resources/merged_links.docx -------------------------------------------------------------------------------- /tests/resources/multiple_runs_per_paragraph.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShayHill/docx2python/7ba456ea2fdbe62daa97aacb5fa7cb413221ff21/tests/resources/multiple_runs_per_paragraph.docx -------------------------------------------------------------------------------- /tests/resources/nested_paragraphs.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShayHill/docx2python/7ba456ea2fdbe62daa97aacb5fa7cb413221ff21/tests/resources/nested_paragraphs.docx -------------------------------------------------------------------------------- /tests/resources/nested_paragraphs_in_header.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShayHill/docx2python/7ba456ea2fdbe62daa97aacb5fa7cb413221ff21/tests/resources/nested_paragraphs_in_header.docx -------------------------------------------------------------------------------- /tests/resources/nested_paragraphs_in_header3b.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShayHill/docx2python/7ba456ea2fdbe62daa97aacb5fa7cb413221ff21/tests/resources/nested_paragraphs_in_header3b.docx -------------------------------------------------------------------------------- /tests/resources/paragraphs_and_tables.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShayHill/docx2python/7ba456ea2fdbe62daa97aacb5fa7cb413221ff21/tests/resources/paragraphs_and_tables.docx -------------------------------------------------------------------------------- /tests/resources/pic_alt_text.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShayHill/docx2python/7ba456ea2fdbe62daa97aacb5fa7cb413221ff21/tests/resources/pic_alt_text.docx -------------------------------------------------------------------------------- /tests/resources/renamed_document_xml.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShayHill/docx2python/7ba456ea2fdbe62daa97aacb5fa7cb413221ff21/tests/resources/renamed_document_xml.docx -------------------------------------------------------------------------------- /tests/resources/run_styles.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShayHill/docx2python/7ba456ea2fdbe62daa97aacb5fa7cb413221ff21/tests/resources/run_styles.docx -------------------------------------------------------------------------------- /tests/resources/slanted_quotes.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShayHill/docx2python/7ba456ea2fdbe62daa97aacb5fa7cb413221ff21/tests/resources/slanted_quotes.docx -------------------------------------------------------------------------------- /tests/resources/soft_line_breaks.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShayHill/docx2python/7ba456ea2fdbe62daa97aacb5fa7cb413221ff21/tests/resources/soft_line_breaks.docx -------------------------------------------------------------------------------- /tests/resources/strict.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShayHill/docx2python/7ba456ea2fdbe62daa97aacb5fa7cb413221ff21/tests/resources/strict.docx -------------------------------------------------------------------------------- /tests/resources/symbols.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShayHill/docx2python/7ba456ea2fdbe62daa97aacb5fa7cb413221ff21/tests/resources/symbols.docx -------------------------------------------------------------------------------- /tests/resources/test-docx2python-conversion-google_docs.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShayHill/docx2python/7ba456ea2fdbe62daa97aacb5fa7cb413221ff21/tests/resources/test-docx2python-conversion-google_docs.docx -------------------------------------------------------------------------------- /tests/resources/test_file_with_comments.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShayHill/docx2python/7ba456ea2fdbe62daa97aacb5fa7cb413221ff21/tests/resources/test_file_with_comments.docx -------------------------------------------------------------------------------- /tests/resources/unchecked_drop0.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShayHill/docx2python/7ba456ea2fdbe62daa97aacb5fa7cb413221ff21/tests/resources/unchecked_drop0.docx -------------------------------------------------------------------------------- /tests/resources/zen_of_python.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShayHill/docx2python/7ba456ea2fdbe62daa97aacb5fa7cb413221ff21/tests/resources/zen_of_python.docx -------------------------------------------------------------------------------- /tests/test_ascii_printable.py: -------------------------------------------------------------------------------- 1 | """Test that most characters in string.printable can are represented 2 | 3 | (some are altered) in Docx2Python output. 4 | """ 5 | 6 | import string 7 | 8 | from docx2python.main import docx2python 9 | from tests.conftest import RESOURCES 10 | 11 | 12 | class TestAsciiPrintable: 13 | """Confirming this works with v1.25""" 14 | 15 | def test_exact_representation(self) -> None: 16 | """Most characters are represented exactly 17 | The last seven characters are 18 | \n\r\x0b\b0cEND 19 | \n \r \x0b and \x0c are ignored by word when typed. 20 | END is there (added by hand to docx file) to let me know I'm past any 21 | trailing characters 22 | """ 23 | with docx2python(RESOURCES / "ascii_printable.docx") as pars: 24 | assert pars.text[:-7] == string.printable[:-4] 25 | 26 | def test_html_true(self) -> None: 27 | """Most characters are represented exactly. &, <, and > are escaped. 28 | 29 | The last seven characters are 30 | \n\r\x0b\b0cEND 31 | \n \r \x0b and \x0c are ignored by word when typed. 32 | END is there (added by hand to docx file) to let me know I'm past any 33 | trailing characters 34 | """ 35 | pars = docx2python(RESOURCES / "ascii_printable.docx", html=True) 36 | assert pars.text[:-7] == ( 37 | '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&' 38 | ";'()*+,-./:;<=>?@[\\]^_`{|}~ \t" 39 | ) 40 | pars.close() 41 | -------------------------------------------------------------------------------- /tests/test_check_drop.py: -------------------------------------------------------------------------------- 1 | """Test checkbox exports from a user-submitted and my own checkbox files. 2 | 3 | :author: Shay Hill 4 | :created: 6/17/2020 5 | 6 | List items from the user-submitted docx were listed B then A. Confusing for the test, 7 | but I didn't want to alter it in my version of Word. 8 | """ 9 | 10 | from docx2python.main import docx2python 11 | from tests.conftest import RESOURCES 12 | 13 | 14 | class TestCheckboxToHtml: 15 | def test_user_checked_dropdown0(self) -> None: 16 | """Get checked-out box glyph and second dd entry""" 17 | extraction = docx2python(RESOURCES / "checked_drop1.docx") 18 | assert extraction.body_runs == [[[[["☒", " "], ["PIlihan A"]]]]] 19 | extraction.close() 20 | 21 | def test_user_unchecked_dropdown1(self) -> None: 22 | """Get unchecked box glyph and first dd entry""" 23 | extraction = docx2python(RESOURCES / "unchecked_drop0.docx") 24 | assert extraction.text == "\u2610 \n\nPiihan B" 25 | extraction.close() 26 | 27 | def test_my_checkbox(self) -> None: 28 | """A good selection of checked and unchecked boxes, and several dropdowns""" 29 | extraction = docx2python(RESOURCES / "check_drop_my.docx") 30 | assert extraction.body_runs == [ 31 | [ 32 | [ 33 | [ 34 | ["[user unchecked]", "☐", "[user unchecked]"], 35 | [], 36 | ["[user checked]", "☒", "[user checked]"], 37 | [], 38 | ["[my unchecked]", "☐", "[my unchecked]"], 39 | [], 40 | ["[my checked]", "☒", "[my checked]"], 41 | [], 42 | ["User dropdown (Piihan B)"], 43 | ["Piihan B"], 44 | [], 45 | ["My dropdown (no choice)"], 46 | ] 47 | ], 48 | [[["Choose an item."]]], 49 | [[[], ["My dropdown (chose A)"]]], 50 | [[["my_item_A"]]], 51 | [[[], ["My dropdown (chose B)"]]], 52 | [[["my_item_B"]]], 53 | ] 54 | ] 55 | extraction.close() 56 | -------------------------------------------------------------------------------- /tests/test_checked_boxes.py: -------------------------------------------------------------------------------- 1 | """Identify checked boxes in user-submitted file 2 | 3 | :author: Shay Hill 4 | :created: 2021-12-17 5 | 6 | From user PandaJones: 7 | 8 | ''' 9 | Word docx's xml (i believe this is cause the docx version is pretty old) deletes 10 | w:val when the checkbox is checked and has w:val = 0 when the checkbox isn't checked. 11 | 12 | This causes a problems that the library defaults to 0 when w:val isn't found in 13 | w:checked. To fix this, I just checked if there is anything attributes in w:check and 14 | return a 1 if there isn't anything there. 15 | 16 | I can probably edit the code to check if w:val exist instead as I don't know if 17 | w:checked can have other attributes. 18 | 19 | Thank for have this library be able to display checkboxes, it is super useful when 20 | parsing through forms that have all of their stuff in tables. 21 | ''' 22 | """ 23 | 24 | from docx2python import docx2python 25 | from docx2python.iterators import iter_at_depth 26 | from tests.conftest import RESOURCES 27 | 28 | 29 | def test_checked_boxes_explicit() -> None: 30 | """ 31 | The following text boxes are checked. Remaining checkboxes are unchecked. 32 | 33 | Adult Protective Services 34 | Older Adult Mental Health 35 | ProsecutorΓÇÖs Office 36 | Regional Center 37 | 38 | Coroner/Medical Examiner 39 | Law Enforcement 40 | Civil Attorney/Legal Services 41 | Psychologist 42 | 43 | Medical Practitioner 44 | LTC Ombudsman 45 | Public Guardian 46 | Other (describe): 47 | 48 | """ 49 | pars = docx2python(RESOURCES / "checked_boxes.docx", duplicate_merged_cells=False) 50 | expect: list[list[list[list[str]]]] = [ 51 | [ 52 | [["\u2612", " Adult Protective Services"]], 53 | [[]], 54 | [["\u2612", " Older Adult Mental Health"]], 55 | [[]], 56 | [[]], 57 | [[]], 58 | [["\u2612", " Prosecutor’s Office"]], 59 | [[]], 60 | [[]], 61 | [[]], 62 | [["\u2612", " Regional Center"]], 63 | [[]], 64 | ], 65 | [ 66 | [["\u2612", " Coroner/Medical Examiner"]], 67 | [[]], 68 | [["\u2612", " Law Enforcement"]], 69 | [[]], 70 | [[]], 71 | [[]], 72 | [["\u2612", " Civil Attorney/Legal Services"]], 73 | [[]], 74 | [[]], 75 | [[]], 76 | [["\u2612", " Psychologist"]], 77 | [[]], 78 | ], 79 | [ 80 | [["\u2612", " Medical Practitioner"]], 81 | [[]], 82 | [["\u2612", " LTC Ombudsman"]], 83 | [[]], 84 | [[]], 85 | [[]], 86 | [["\u2612", " Public Guardian"]], 87 | [[]], 88 | [[]], 89 | [[]], 90 | [["\u2612", " Other (describe):\u2002\u2002\u2002\u2002\u2002"]], 91 | [[]], 92 | ], 93 | ] 94 | 95 | assert pars.body_runs[0][3:6] == expect 96 | pars.close() 97 | 98 | 99 | def test_unchecked_boxes() -> None: 100 | """ 101 | The following text boxes are checked. Remaining checkboxes are unchecked. 102 | 103 | Adult Protective Services 104 | Older Adult Mental Health 105 | ProsecutorΓÇÖs Office 106 | Regional Center 107 | 108 | Coroner/Medical Examiner 109 | Law Enforcement 110 | Civil Attorney/Legal Services 111 | Psychologist 112 | 113 | Medical Practitioner 114 | LTC Ombudsman 115 | Public Guardian 116 | Other (describe): 117 | 118 | All other checkboxes are unchecked 119 | 120 | """ 121 | pars = docx2python(RESOURCES / "checked_boxes.docx", duplicate_merged_cells=False) 122 | all_text = "".join(iter_at_depth(pars.text, 5)) 123 | assert all_text.count("\u2612") == 12 124 | assert all_text.count("\u2610") == 32 125 | pars.close() 126 | 127 | 128 | def test_checkboxes_true_false() -> None: 129 | """ 130 | Checkboxes with "true" and "false" instead of "1" and "0" values. 131 | """ 132 | with docx2python(RESOURCES / "checked-true-false.docx") as pars: 133 | all_text = "".join(iter_at_depth(pars.text, 5)) 134 | assert all_text.count("\u2612") == 4 135 | assert all_text.count("\u2610") == 4 136 | -------------------------------------------------------------------------------- /tests/test_close.py: -------------------------------------------------------------------------------- 1 | """Test opening docx reader and closing it. 2 | 3 | Closing a DocxReader or DocxContent instance will close the zipfile openend when the 4 | DocxReader instance was created. 5 | 6 | :author: Shay Hill 7 | :created: 7/5/2019 8 | """ 9 | 10 | import pytest 11 | 12 | from docx2python.attribute_register import Tags, get_prefixed_tag 13 | from docx2python.docx_reader import DocxReader 14 | from docx2python.main import docx2python 15 | from tests.conftest import RESOURCES 16 | 17 | example_docx = RESOURCES / "example.docx" 18 | example_copy_docx = RESOURCES / "example_copy.docx" 19 | 20 | 21 | class TestCloseDocxReader: 22 | def test_explicit_close(self) -> None: 23 | """Closing DocxReader closes the zipfile.""" 24 | input_context = DocxReader(example_docx) 25 | _ = input_context.file_of_type("officeDocument").root_element 26 | # assert DocxReader zipfile is open 27 | assert input_context._DocxReader__zipf.fp # type: ignore 28 | 29 | input_context.close() 30 | # assert DocxReader zipfile is closed 31 | assert not input_context._DocxReader__zipf.fp # type: ignore 32 | 33 | def test_no_access_after_explicit_close(self) -> None: 34 | """The zipfile will not automatically reopen after explicit close.""" 35 | input_context = DocxReader(example_docx) 36 | input_context.close() 37 | # assert zipfile cannot be accessed 38 | with pytest.raises(ValueError): 39 | _ = input_context.zipf 40 | 41 | 42 | class TestDocxReaderContext: 43 | def test_context_manager_enter(self): 44 | """DocxReader can be used as a context manager.""" 45 | with DocxReader(example_docx) as input_context: 46 | input_xml = input_context.file_of_type("officeDocument").root_element 47 | assert get_prefixed_tag(input_xml) == Tags.DOCUMENT 48 | 49 | def test_context_manager_close(self): 50 | """DocxReader can be used as a context manager.""" 51 | with DocxReader(example_docx) as input_context: 52 | _ = input_context.file_of_type("officeDocument").root_element 53 | with pytest.raises(ValueError): 54 | _ = input_context.zipf 55 | 56 | 57 | class TestCloseDocxContent: 58 | def test_explicit_close(self) -> None: 59 | """Closing DocxReader closes the zipfile.""" 60 | content = docx2python(example_docx) 61 | _ = content.header_runs 62 | assert content.docx_reader._DocxReader__zipf.fp # type: ignore 63 | 64 | content.close() 65 | # assert DocxReader zipfile is closed 66 | assert not content.docx_reader._DocxReader__zipf.fp # type: ignore 67 | 68 | def test_no_access_after_explicit_close(self) -> None: 69 | """The zipfile will not automatically reopen after explicit close.""" 70 | content = docx2python(example_docx) 71 | content.close() 72 | # assert zipfile cannot be accessed 73 | with pytest.raises(ValueError): 74 | _ = content.docx_reader.zipf 75 | 76 | 77 | class TestDocxContentContext: 78 | def test_context_manager_enter(self): 79 | """DocxReader can be used as a context manager.""" 80 | with docx2python(example_docx) as content: 81 | _ = content.header_runs 82 | 83 | def test_context_manager_close(self): 84 | """DocxReader can be used as a context manager.""" 85 | with docx2python(example_docx) as content: 86 | pass 87 | _ = content.header_runs 88 | with pytest.raises(ValueError): 89 | _ = content.docx_reader.zipf 90 | -------------------------------------------------------------------------------- /tests/test_comments.py: -------------------------------------------------------------------------------- 1 | """Test extracting comments. 2 | 3 | User flyguy62n requested comment extraction. Extract comments as tuples (text, 4 | author, date, comment). 5 | 6 | :author: Shay Hill 7 | :created: 2024-03-29 8 | """ 9 | 10 | import os 11 | import sys 12 | 13 | import pytest 14 | 15 | project = os.path.abspath(os.path.join(__file__, "..", "..")) 16 | sys.path.append(project) 17 | 18 | 19 | from paragraphs import par 20 | 21 | from docx2python import docx2python 22 | from tests.conftest import RESOURCES 23 | 24 | 25 | def test_comments() -> None: 26 | """Extract comments and some comment metadata.""" 27 | pars = docx2python(RESOURCES / "comments.docx") 28 | comments = pars.comments 29 | 30 | pars.close() 31 | assert comments == [ 32 | ( 33 | par( 34 | """Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do 35 | eiusmod tempor incididunt ut labore et dolore magna aliqua.""" 36 | ), 37 | "Randy Bartels", 38 | "2024-03-28T17:22:00Z", 39 | "COMMENT", 40 | ), 41 | ( 42 | par( 43 | """Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do 44 | eiusmod tempor incididunt ut labore et dolore magna aliqua.""" 45 | ), 46 | "Randy Bartels", 47 | "2024-03-28T17:22:00Z", 48 | "RESPONSE", 49 | ), 50 | ( 51 | par( 52 | """Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do 53 | eiusmod tempor incididunt ut labore et dolore magna aliqua.""" 54 | ), 55 | "Shay Hill", 56 | "2024-03-29T12:10:00Z", 57 | "Response from Shay Hill", 58 | ), 59 | ( 60 | "tempor incididunt ut labore et dolore magna aliqua.", 61 | "Shay Hill", 62 | "2024-03-29T12:28:00Z", 63 | "Comment on subset starting with tempor", 64 | ), 65 | ( 66 | par( 67 | """Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do 68 | eiusmod tempor incididunt ut labore et dolore magna aliqua.""" 69 | ), 70 | "Randy Bartels", 71 | "2024-03-28T17:22:00Z", 72 | "COMMENT on par 5", 73 | ), 74 | ( 75 | par( 76 | """Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do 77 | eiusmod tempor incididunt ut labore et dolore magna aliqua.""" 78 | ), 79 | "Randy Bartels", 80 | "2024-03-28T17:22:00Z", 81 | "RESPONSE to comment on par 5", 82 | ), 83 | ( 84 | par( 85 | """Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do 86 | eiusmod tempor incididunt ut labore et dolore magna aliqua.""" 87 | ), 88 | "Shay Hill", 89 | "2024-03-29T12:10:00Z", 90 | "Response from Shay Hill on par 5", 91 | ), 92 | ( 93 | "tempor incididunt ut labore et dolore magna aliqua.", 94 | "Shay Hill", 95 | "2024-03-29T12:28:00Z", 96 | "Comment on subset starting with tempor on par 5", 97 | ), 98 | ] 99 | 100 | 101 | @pytest.fixture(scope="module") 102 | def test_file_with_comments(): 103 | test_file = RESOURCES / "test_file_with_comments.docx" 104 | pars = docx2python(test_file) 105 | yield pars.comments 106 | pars.close() 107 | 108 | 109 | class TestAdditionalComments: 110 | test_file = RESOURCES / "test_file_with_comments.docx" 111 | 112 | def test_comment_1( 113 | self, test_file_with_comments: "list[tuple[str, str, str, str]]" 114 | ) -> None: 115 | """Extract the first comment.""" 116 | comment = test_file_with_comments[0] 117 | assert comment == ( 118 | "magna ", 119 | "Randy Bartels", 120 | "2024-04-02T16:57:00Z", 121 | "Comment 1", 122 | ) 123 | 124 | def test_comment_2( 125 | self, test_file_with_comments: "list[tuple[str, str, str, str]]" 126 | ) -> None: 127 | """Extract the first comment.""" 128 | comment = test_file_with_comments[1] 129 | assert comment == ( 130 | "quis ", 131 | "Randy Bartels", 132 | "2024-04-02T16:58:00Z", 133 | "Comment 2", 134 | ) 135 | 136 | def test_comment_3( 137 | self, test_file_with_comments: "list[tuple[str, str, str, str]]" 138 | ) -> None: 139 | """Extract the first comment.""" 140 | comment = test_file_with_comments[2] 141 | assert comment == ( 142 | "Bibendum", 143 | "Randy Bartels", 144 | "2024-04-02T16:58:00Z", 145 | "Comment 3", 146 | ) 147 | 148 | def test_comment_with_hyperlink( 149 | self, test_file_with_comments: "list[tuple[str, str, str, str]]" 150 | ) -> None: 151 | """Extract the first comment.""" 152 | comment = test_file_with_comments[3] 153 | assert comment == ( 154 | "dolor ", 155 | "Randy Bartels", 156 | "2024-04-02T16:58:00Z", 157 | 'Comment 4 with hyperlink', 158 | ) 159 | 160 | def test_comment_5( 161 | self, test_file_with_comments: "list[tuple[str, str, str, str]]" 162 | ) -> None: 163 | """Extract the first comment.""" 164 | comment = test_file_with_comments[4] 165 | assert comment == ( 166 | "suspendisse ", 167 | "Randy Bartels", 168 | "2024-04-02T16:59:00Z", 169 | "Comment 5", 170 | ) 171 | 172 | def test_comment_with_a_response( 173 | self, test_file_with_comments: "list[tuple[str, str, str, str]]" 174 | ) -> None: 175 | """Extract the first comment.""" 176 | comment = test_file_with_comments[5] 177 | assert comment == ( 178 | "suspendisse ", 179 | "Randy Bartels", 180 | "2024-04-02T16:59:00Z", 181 | "With a response", 182 | ) 183 | 184 | def test_long_comment( 185 | self, test_file_with_comments: "list[tuple[str, str, str, str]]" 186 | ) -> None: 187 | """Extract the first comment.""" 188 | comment = test_file_with_comments[6] 189 | assert comment == ( 190 | "Amet ", 191 | "Randy Bartels", 192 | "2024-04-02T17:00:00Z", 193 | par( 194 | """Comment 6 with a long comment.\n\nmagna fringilla urna porttitor 195 | rhoncus dolor purus non enim praesent elementum facilisis leo vel 196 | fringilla est ullamcorper eget nulla facilisi etiam dignissim diam 197 | quis enim lobortis scelerisque fermentum dui faucibus in ornare quam 198 | viverra orci sagittis eu volutpat odio facilisis mauris\n\nsit amet 199 | massa vitae tortor condimentum lacinia quis vel eros donec ac odio 200 | tempor orci dapibus ultrices in iaculis nunc sed augue lacus viverra 201 | vitae congue eu consequat ac felis donec et odio pellentesque diam 202 | volutpat commodo sed egestas egestas fringilla phasellus faucibus 203 | scelerisque eleifend donec pretium vulputate sapien nec sagittis 204 | aliquam malesuada bibendum""" 205 | ), 206 | ) 207 | 208 | def test_comment_7( 209 | self, test_file_with_comments: "list[tuple[str, str, str, str]]" 210 | ) -> None: 211 | """Extract the first comment.""" 212 | comment = test_file_with_comments[7] 213 | assert comment == ( 214 | "suspendisse ", 215 | "Randy Bartels", 216 | "2024-04-02T17:00:00Z", 217 | "Comment 7 with a long response", 218 | ) 219 | 220 | def test_long_response( 221 | self, test_file_with_comments: "list[tuple[str, str, str, str]]" 222 | ) -> None: 223 | """Extract the first comment.""" 224 | comment = test_file_with_comments[8] 225 | assert comment == ( 226 | "suspendisse ", 227 | "Randy Bartels", 228 | "2024-04-02T17:00:00Z", 229 | par( 230 | """Long response: magna fringilla urna porttitor rhoncus dolor purus 231 | non enim praesent elementum facilisis leo vel fringilla est 232 | ullamcorper eget nulla facilisi etiam dignissim diam quis enim 233 | lobortis scelerisque fermentum dui faucibus in ornare quam viverra 234 | orci sagittis eu volutpat odio facilisis mauris\n\nsit amet massa 235 | vitae tortor condimentum lacinia quis vel eros donec ac odio tempor 236 | orci dapibus ultrices in iaculis nunc sed augue lacus viverra vitae 237 | congue eu consequat ac felis donec et odio pellentesque diam volutpat 238 | commodo sed egestas egestas fringilla phasellus faucibus scelerisque 239 | eleifend donec pretium vulputate sapien nec sagittis aliquam 240 | malesuada bibendum""" 241 | ), 242 | ) 243 | 244 | def comment_8( 245 | self, test_file_with_comments: "list[tuple[str, str, str, str]]" 246 | ) -> None: 247 | """Extract the first comment.""" 248 | comment = test_file_with_comments[9] 249 | assert comment == ( 250 | "Magnis ", 251 | "Randy Bartels", 252 | "2024-04-02T17:04:00Z", 253 | "Comment 8 - marked Resolved", 254 | ) 255 | 256 | def comment_in_a_table( 257 | self, test_file_with_comments: "list[tuple[str, str, str, str]]" 258 | ) -> None: 259 | """Extract the first comment.""" 260 | comment = test_file_with_comments[10] 261 | assert comment == ( 262 | "R1C1", 263 | "Randy Bartels", 264 | "2024-04-02T17:07:00Z", 265 | "Comment in a table", 266 | ) 267 | 268 | def comment_on_a_picture( 269 | self, test_file_with_comments: "list[tuple[str, str, str, str]]" 270 | ) -> None: 271 | """Extract the first comment.""" 272 | comment = test_file_with_comments[11] 273 | assert comment == ( 274 | "", 275 | "Randy Bartels", 276 | "2024-04-02T17:08:00Z", 277 | "Comment on a picture", 278 | ) 279 | 280 | 281 | def test_no_comments() -> None: 282 | """Return an empty list when no comments are present.""" 283 | pars = docx2python(RESOURCES / "apples_and_pears.docx") 284 | comments = pars.comments 285 | pars.close() 286 | assert comments == [] 287 | -------------------------------------------------------------------------------- /tests/test_content_control_block_properties.py: -------------------------------------------------------------------------------- 1 | """Test accessing SDT properties above a paragraph. 2 | 3 | issue #81 4 | 5 | User YashasviMantha requested a way to access Content Control Block properties. In 6 | the xml, these are called Structured Document Tags (SDT). To allow this, I added two 7 | features: 8 | 9 | 1. Each Par instance now contains a pointer to the XML element from which it was 10 | created. 11 | 2. Add a `tag` argument to `gather_Pr` that allows the caller to search up for 12 | the Pr of a parent element. 13 | 14 | This is a simple test and an example. See `get_sdt_tag` example function for a 15 | description of the sdt context in xml and how to access it. 16 | 17 | :author: Shay Hill 18 | :created: 2024-11-17 19 | """ 20 | 21 | from __future__ import annotations 22 | 23 | from lxml.etree import _Element as EtreeElement # type: ignore 24 | 25 | from docx2python.attribute_register import Tags 26 | from docx2python.iterators import iter_paragraphs 27 | from docx2python.main import docx2python 28 | from docx2python.text_runs import gather_Pr 29 | from tests.conftest import RESOURCES 30 | 31 | _DOCX = RESOURCES / "ControlTest.docx" 32 | 33 | 34 | def get_sdt_tag(elem: EtreeElement) -> str | None: 35 | """If elem is or is inside a element, try to find the sdt props tag value. 36 | 37 | :param elem: lxml.etree._Element object 38 | :return: tag value of sibling or parent sdtPr element or None 39 | ``` 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | ``` 52 | """ 53 | properties_dict = gather_Pr(elem, Tags.SDT) 54 | return properties_dict.get("tag") 55 | 56 | 57 | class TestStructuredDocumentTags: 58 | 59 | def test_paragraphs_in_sdt_elements(self) -> None: 60 | """Get the SDT tag above a paragraph.""" 61 | with docx2python(_DOCX) as extraction: 62 | pars = extraction.document_pars 63 | 64 | text_paragraphs: list[str] = [] 65 | 66 | for paragraph in iter_paragraphs(pars): 67 | if paragraph.elem is None: 68 | par_tag = None 69 | else: 70 | par_tag = get_sdt_tag(paragraph.elem) 71 | par_text = "".join(paragraph.run_strings) 72 | text_paragraphs.append(f"[{par_tag}]: {par_text}") 73 | 74 | assert text_paragraphs == [ 75 | "[Test_Control]: This is a test", 76 | "[Test_Control]: For a content control or content container in word. ", 77 | ] 78 | -------------------------------------------------------------------------------- /tests/test_created_in_pages.py: -------------------------------------------------------------------------------- 1 | """Fix bullets in pages created in Pages 2 | 3 | :author: Shay Hill 4 | :created: 10/5/2020 5 | 6 | Issue 11: 7 | 8 | I have seen this happening for files created in Pages but not in files created in 9 | MSWord. 10 | 11 | How to reproduce: 12 | Use Pages (MacOS app) to write a document 13 | save the document as docx 14 | attempt to extract using docx2python 15 | 16 | It seems Pages is adding abstractNum nodes that don't contain w:lvl nodes. For example: 17 | 18 | 19 | 20 | 21 | collect_numFmts (from docx_context.py) then reads and stores these in the context as []. 22 | This context is then passed down to _get_bullet_string (from docx_text.py). Then the 23 | IndexError when we try to get the number format from context. 24 | 25 | User Raiyan provided two docx files created in pages: 26 | * created-in-pages-paragraphs-only.docx should work now (v 1.25) 27 | * created-in-pages-bulleted-lists.docx should fail (v 1.25) with above-described 28 | error. 29 | """ 30 | 31 | from docx2python.main import docx2python 32 | from tests.conftest import RESOURCES 33 | 34 | 35 | class TestParagraphsOnly: 36 | """Confirming this works with v1.25""" 37 | 38 | def test_paragraphs_only(self) -> None: 39 | """Run without issue""" 40 | pars = docx2python(RESOURCES / "created-in-pages-paragraphs-only.docx") 41 | assert pars.text == ( 42 | "\n\nThis is a document for testing docx2python module.\n\n\n\nThis " 43 | "document contains paragraphs.\n\n\n\nThis document does not contain any " 44 | "bulleted lists.\n\n" 45 | ) 46 | pars.close() 47 | 48 | 49 | class TestBulletedLists: 50 | """Replace numbering format with bullet (--) when format cannot be determined""" 51 | 52 | def test_bulleted_lists(self) -> None: 53 | pars = docx2python(RESOURCES / "created-in-pages-bulleted-lists.docx") 54 | assert pars.text == ( 55 | "\n\nThis is a document for testing docx2python module.\n\n\n\n" 56 | "--\tWhy did the chicken cross the road?\n\n" 57 | "\t--\tJust because\n\n" 58 | "\t--\tDon't know\n\n" 59 | "\t--\tTo get to the other side\n\n" 60 | "--\tWhat's the meaning of life, universe and everything?\n\n" 61 | "\t--\t42\n\n" 62 | "\t--\t0\n\n" 63 | "\t--\t-1\n\n" 64 | ) 65 | pars.close() 66 | -------------------------------------------------------------------------------- /tests/test_document2_xml.py: -------------------------------------------------------------------------------- 1 | """Test hyperlink functionality 2 | 3 | :author: Shay Hill 4 | :created: 4/19/2020 5 | 6 | The main content file in a docx is usually ``word/document.xml``, but this is not 7 | always the case. 8 | """ 9 | 10 | from docx2python.main import docx2python 11 | from tests.conftest import RESOURCES 12 | 13 | 14 | class TestHyperlink: 15 | def test_prints(self) -> None: 16 | """ 17 | Open a docx with ``word/document.xml`` renamed to ``word/blah_blah.xml`` 18 | and all references updated. Test that text extracts as expected.""" 19 | extraction = docx2python(RESOURCES / "renamed_document_xml.docx") 20 | assert ( 21 | 'my website.' in extraction.text 22 | ) 23 | extraction.close() 24 | -------------------------------------------------------------------------------- /tests/test_docx2python.py: -------------------------------------------------------------------------------- 1 | """Test full functionality of source_old 2 | 3 | :author: Shay Hill 4 | :created: 7/5/2019 5 | """ 6 | 7 | import os 8 | import re 9 | import shutil 10 | 11 | from paragraphs import par 12 | 13 | from docx2python.iterators import iter_at_depth 14 | from docx2python.main import docx2python 15 | from tests.conftest import RESOURCES 16 | 17 | ALT_TEXT = par( 18 | """----Image alt text---->A close up of a logo\n\n 19 | Description automatically generated<""" 20 | ) 21 | 22 | 23 | class TestFormatting: 24 | """Nested list output string formatting""" 25 | 26 | def test_header(self) -> None: 27 | """Header text in correct location""" 28 | with docx2python(RESOURCES / "example.docx") as output: 29 | header_text = "".join(iter_at_depth(output.header, 4)) 30 | assert re.match( 31 | rf"Header text{ALT_TEXT}----media/image\d+\.\w+----$", header_text 32 | ) 33 | 34 | def test_footer(self) -> None: 35 | """Footer text in correct location""" 36 | with docx2python(RESOURCES / "example.docx") as output: 37 | footer_text = "".join(iter_at_depth(output.footer, 4)) 38 | assert re.match( 39 | rf"Footer text{ALT_TEXT}----media/image\d+\.\w+----$", footer_text 40 | ) 41 | 42 | def test_footnotes(self) -> None: 43 | """Footnotes extracted.""" 44 | with docx2python(RESOURCES / "example.docx") as output: 45 | assert output.footnotes_runs == [ 46 | [ 47 | [ 48 | [[]], 49 | [[]], 50 | [["footnote1)\t", " First footnote"]], 51 | [ 52 | [ 53 | "footnote2)\t", 54 | " Second footnote", 55 | par( 56 | """----Image alt text---->A close up of a 57 | logo\n\nDescription automatically generated<""" 58 | ), 59 | "----media/image1.png----", 60 | ] 61 | ], 62 | ] 63 | ] 64 | ] 65 | 66 | def test_endnotes(self) -> None: 67 | """Endnotes extracted.""" 68 | with docx2python(RESOURCES / "example.docx") as output: 69 | assert output.endnotes_runs == [ 70 | [ 71 | [ 72 | [[]], 73 | [[]], 74 | [["endnote1)\t", " First endnote"]], 75 | [ 76 | [ 77 | "endnote2)\t", 78 | " Second endnote", 79 | par( 80 | """----Image alt text---->A close up of a 81 | logo\n\nDescription automatically generated<""" 82 | ), 83 | "----media/image1.png----", 84 | ] 85 | ], 86 | ] 87 | ] 88 | ] 89 | 90 | def test_numbered_lists(self) -> None: 91 | """Sublists reset. Expected formatting.""" 92 | with docx2python(RESOURCES / "example.docx") as output: 93 | assert output.body[0][0][0] == [ 94 | "I)\texpect I", 95 | "\tA)\texpect A", 96 | "\tB)\texpect B", 97 | "\t\t1)\texpect 1", 98 | "\t\t\ta)\texpect a", 99 | "\t\t\tb)\texpect b", 100 | "\t\t\t\t1)\texpect 1", 101 | "\t\t\t\t\ta)\texpect a", 102 | "\t\t\t\t\t\ti)\texpect i", 103 | "\t\t\t\t\t\tii)\texpect ii", 104 | "II)\tThis should be II", 105 | "\tA)\tThis should be A), not C)", 106 | ] 107 | 108 | def test_numbered_lists_with_custom_start_index(self) -> None: 109 | """Sublists start from non-default index. Expected formatting.""" 110 | with docx2python(RESOURCES / "example_numbering.docx") as output: 111 | assert output.body[0][0][0] == [ 112 | "II)\texpect II", 113 | "C)\texpect C", 114 | "D)\texpect D", 115 | "4)\texpect 4", 116 | "e)\texpect e", 117 | "f)\texpect f", 118 | "6)\texpect 6", 119 | "f)\texpect f", 120 | "viii)\texpect viii", 121 | "ix)\texpect ix", 122 | "", 123 | "", 124 | ] 125 | 126 | def test_bullets(self) -> None: 127 | """Expected bullet format and indent.""" 128 | with docx2python(RESOURCES / "example.docx") as output: 129 | assert output.body_runs[0][1][0] == [ 130 | ["--\t", "bullet no indent"], 131 | ["\t--\t", "bullet indent 1"], 132 | ["\t\t--\t", "bullet indent 2"], 133 | ] 134 | 135 | def test_ignore_formatting(self) -> None: 136 | """Text formatting is stripped.""" 137 | with docx2python(RESOURCES / "example.docx") as output: 138 | assert output.body[0][2][0] == [ 139 | "Bold", 140 | "Italics", 141 | "Underlined", 142 | "Large Font", 143 | "Colored", 144 | "Large Colored", 145 | "Large Bold", 146 | "Large Bold Italics Underlined", 147 | ] 148 | 149 | def test_nested_table(self) -> None: 150 | """Appears as a new table""" 151 | with docx2python(RESOURCES / "example.docx") as output: 152 | assert output.body[1] == [[["Nested"], ["Table"]], [["A"], ["B"]]] 153 | 154 | def test_tab_delimited(self) -> None: 155 | """Tabs converted to \t.""" 156 | with docx2python(RESOURCES / "example.docx") as output: 157 | assert output.body[2][1][0][0] == "Tab\tdelimited\ttext" 158 | 159 | def test_lt_gt(self) -> None: 160 | """> and < are not encoded.""" 161 | with docx2python(RESOURCES / "example.docx") as output: 162 | assert output.body[2][2][0][0] == "10 < 20 and 20 > 10" 163 | 164 | def test_text_outside_table(self) -> None: 165 | """Text outside table is its own table (also tests image marker)""" 166 | with docx2python(RESOURCES / "example.docx") as output: 167 | assert output.body[3] == [ 168 | [ 169 | [ 170 | "Text outside table", 171 | "Reference footnote 1----footnote1----", 172 | "Reference footnote 2----footnote2----", 173 | "Reference endnote 1----endnote1----", 174 | "Reference endnote 2----endnote2----", 175 | "Heading 1", 176 | "Heading 2", 177 | "", 178 | "----Image alt text---->A jellyfish in water\n\n" 179 | + "Description automatically generated" 180 | + "<----media/image2.jpg----", 181 | ] 182 | ] 183 | ] 184 | 185 | 186 | class TestHtmlFormatting: 187 | """Font styles exported as HTML.""" 188 | 189 | def test_lt_gt(self) -> None: 190 | """> and < encoded""" 191 | with docx2python(RESOURCES / "example.docx", html=True) as html_output: 192 | assert html_output.body[2][2][0][0] == "10 < 20 and 20 > 10" 193 | 194 | def test_formatting_captured(self) -> None: 195 | """Text formatting converted to html.""" 196 | with docx2python(RESOURCES / "example.docx", html=True) as html_output: 197 | assert html_output.body[0][2][0] == [ 198 | "Bold", 199 | "Italics", 200 | "Underlined", 201 | 'Large Font', 202 | 'Colored', 203 | 'Large Colored', 204 | 'Large Bold', 205 | par( 206 | """Large Bold Italics 207 | Underlined""" 208 | ), 209 | ] 210 | 211 | def test_paragraph_formatting(self) -> None: 212 | """Text formatting converted to html.""" 213 | with docx2python(RESOURCES / "example.docx", html=True) as html_output: 214 | expect = [ 215 | [ 216 | [ 217 | ["Text outside table"], 218 | ["Reference footnote 1", "----footnote1----"], 219 | ["Reference footnote 2", "----footnote2----"], 220 | ["Reference endnote 1", "----endnote1----"], 221 | ["Reference endnote 2", "----endnote2----"], 222 | ["

", "Heading 1", "

"], 223 | ["

", "Heading 2", "

"], 224 | [], 225 | [ 226 | par( 227 | """----Image alt text---->A jellyfish in 228 | water\n\nDescription automatically generated<""" 229 | ), 230 | "----media/image2.jpg----", 231 | ], 232 | ] 233 | ] 234 | ] 235 | result = html_output.body_runs[3] 236 | assert result == expect 237 | 238 | 239 | class TestImageDir: 240 | """Write images out to file given an image directory.""" 241 | 242 | def test_pull_image_files(self) -> None: 243 | """Copy image files to output path.""" 244 | pars = docx2python(RESOURCES / "example.docx", "delete_this/path/to/images") 245 | assert set(os.listdir("delete_this/path/to/images")) == { 246 | "image1.png", 247 | "image2.jpg", 248 | } 249 | # clean up 250 | shutil.rmtree("delete_this") 251 | pars.close() 252 | 253 | 254 | def test_header_runs() -> None: 255 | """Runs returned as separate strings. Paragraphs not joined""" 256 | pars = docx2python(RESOURCES / "multiple_runs_per_paragraph.docx", html=True) 257 | assert pars.document_runs == [ 258 | [[[["Multiple ", "Runs in the", " Header"]]]], 259 | [ 260 | [ 261 | [ 262 | [ 263 | "This document contains paragraphs with multiple runs per " 264 | + "paragraph. This ensures result.document and " 265 | + "result.document_runs return different things." 266 | ], 267 | [], 268 | ["Multiple ", "Runs in the", " Body"], 269 | ["Multiple ", "Runs in the", " Body"], 270 | ["Multiple ", "Runs in the", " Body"], 271 | ["Multiple ", "Runs in the", " Body"], 272 | [], 273 | ] 274 | ] 275 | ], 276 | [[[["Multiple ", "Runs in the", " Footer"]]]], 277 | [[[[]], [[]]]], 278 | [[[[]], [[]]]], 279 | ] 280 | pars.close() 281 | -------------------------------------------------------------------------------- /tests/test_docx_context.py: -------------------------------------------------------------------------------- 1 | """Test docx2python.docx_context.py 2 | 3 | author: Shay Hill 4 | created: 6/26/2019 5 | """ 6 | 7 | import os 8 | import tempfile 9 | import zipfile 10 | 11 | from lxml import etree 12 | 13 | from docx2python.attribute_register import Tags, get_prefixed_tag 14 | from docx2python.docx_context import collect_numAttrs 15 | from docx2python.docx_reader import DocxReader 16 | from docx2python.iterators import iter_at_depth 17 | from docx2python.main import docx2python 18 | from tests.conftest import RESOURCES 19 | 20 | example_docx = RESOURCES / "example.docx" 21 | example_numbering_docx = RESOURCES / "example_numbering.docx" 22 | 23 | 24 | class TestSaveDocx: 25 | def test_save_unchanged(self) -> None: 26 | """Creates a valid docx""" 27 | with tempfile.TemporaryDirectory() as temp_dir: 28 | example_copy_docx = os.path.join(temp_dir, "example_copy.docx") 29 | with DocxReader(example_docx) as input_context: 30 | input_xml = input_context.file_of_type("officeDocument").root_element 31 | input_context.save(example_copy_docx) 32 | with DocxReader(example_copy_docx) as output_context: 33 | output_xml = output_context.file_of_type("officeDocument").root_element 34 | assert etree.tostring(input_xml) == etree.tostring(output_xml) 35 | 36 | def test_save_changed(self) -> None: 37 | """Creates a valid docx and updates text""" 38 | input_context = DocxReader(example_docx) 39 | input_xml = input_context.file_of_type("officeDocument").root_element 40 | for elem in (x for x in input_xml.iter() if get_prefixed_tag(x) == Tags.TEXT): 41 | if not elem.text: 42 | continue 43 | elem.text = elem.text.replace("bullet", "BULLET") 44 | with tempfile.TemporaryDirectory() as temp_dir: 45 | with_text_replaced = os.path.join(temp_dir, "with_text_replaced.docx") 46 | input_context.save(with_text_replaced) 47 | with DocxReader(with_text_replaced) as output_context: 48 | output_runs = output_context.file_of_type("officeDocument").text 49 | output_text = "".join(iter_at_depth(output_runs, 5)) 50 | assert "bullet" not in output_text 51 | assert "BULLET" in output_text 52 | 53 | 54 | class TestCollectNumAttrs: 55 | """Test strip_text.collect_numFmts""" 56 | 57 | def test_gets_start_indexes(self) -> None: 58 | """Retrieves start indexes from example_numbering.docx 59 | 60 | This test files contains lists starting from non-default value: 61 | II. expect II 62 | C. expect C 63 | D. expect D 64 | 4. expect 4 65 | e. expect e 66 | f. expect f 67 | 6) expect 6 68 | f) expect f 69 | (viii) expect viii 70 | (ix) expect ix 71 | """ 72 | zipf = zipfile.ZipFile(example_numbering_docx, "r") 73 | numId2Attrs = collect_numAttrs( 74 | etree.fromstring(zipf.read("word/numbering.xml")) 75 | ) 76 | starts = {x.start for y in numId2Attrs.values() for x in y} 77 | assert starts == {1, 2, 3, 4, 5, 6, 8} 78 | 79 | def test_gets_formats(self) -> None: 80 | """Retrieves formats from example.docx 81 | 82 | This isn't a great test. There are numbered lists I've added then removed as 83 | I've edited my test docx. These still appear in the docx file. I could 84 | compare directly with the extracted numbering xml file, but even then I'd be 85 | comparing to something I don't know to be accurate. This just tests that all 86 | numbering formats are represented. 87 | """ 88 | zipf = zipfile.ZipFile(example_docx) 89 | numId2Attrs = collect_numAttrs( 90 | etree.fromstring(zipf.read("word/numbering.xml")) 91 | ) 92 | formats = {x.fmt for y in numId2Attrs.values() for x in y} 93 | assert formats == { 94 | "lowerLetter", 95 | "upperLetter", 96 | "lowerRoman", 97 | "upperRoman", 98 | "bullet", 99 | "decimal", 100 | } 101 | 102 | 103 | class TestCollectDocProps: 104 | """Test strip_text.collect_docProps""" 105 | 106 | def test_gets_properties(self) -> None: 107 | """Retrieves properties from docProps""" 108 | core_properties = docx2python(example_docx).core_properties 109 | expected = { 110 | "title": None, 111 | "subject": None, 112 | "creator": "Shay Hill", 113 | "keywords": None, 114 | "description": None, 115 | "lastModifiedBy": "Shay Hill", 116 | } 117 | for prop, value in expected.items(): 118 | assert core_properties[prop] == value 119 | 120 | 121 | class TestGetContext: 122 | """Text strip_text.get_context""" 123 | 124 | def test_numId2Attrs(self) -> None: 125 | """All targets mapped""" 126 | docx_context = DocxReader(example_docx) 127 | assert docx_context.numId2Attrs == collect_numAttrs( 128 | etree.fromstring(docx_context.zipf.read("word/numbering.xml")) 129 | ) 130 | 131 | def test_lists(self) -> None: 132 | """Pass silently when no numbered or bulleted lists.""" 133 | docx_context = DocxReader(RESOURCES / "basic.docx") 134 | assert docx_context.numId2Attrs == {} 135 | 136 | 137 | class TestPullImageFiles: 138 | """Test strip_text.pull_image_files""" 139 | 140 | def test_pull_image_files(self) -> None: 141 | """Copy image files to output path.""" 142 | docx_context = DocxReader(example_docx) 143 | with tempfile.TemporaryDirectory() as image_folder: 144 | _ = docx_context.pull_image_files(image_folder) 145 | assert set(os.listdir(image_folder)) == {"image1.png", "image2.jpg"} 146 | 147 | def test_no_image_files(self) -> None: 148 | """Pass silently when no image files.""" 149 | docx_context = DocxReader(RESOURCES / "basic.docx") 150 | with tempfile.TemporaryDirectory() as image_folder: 151 | _ = docx_context.pull_image_files(image_folder) 152 | assert os.listdir(image_folder) == [] 153 | -------------------------------------------------------------------------------- /tests/test_docx_output.py: -------------------------------------------------------------------------------- 1 | """Test features of DocxContent that weren't tested in test_docx2python. 2 | 3 | :author: Shay Hill 4 | :created: 7/6/2019 5 | """ 6 | 7 | from docx2python.iterators import iter_at_depth 8 | from docx2python.main import docx2python 9 | from tests.conftest import RESOURCES 10 | 11 | 12 | class TestDocument: 13 | def test_combine_of_header_body_footer(self) -> None: 14 | """Return all content combined as instance.document""" 15 | with docx2python(RESOURCES / "example.docx") as content: 16 | assert ( 17 | content.document 18 | == content.header 19 | + content.body 20 | + content.footer 21 | + content.footnotes 22 | + content.endnotes 23 | ) 24 | 25 | def test_read_only(self) -> None: 26 | """Document attribute is read only.""" 27 | with docx2python(RESOURCES / "example.docx") as content: 28 | doc1 = content.document 29 | doc1 = doc1[:1] 30 | assert doc1 != content.document 31 | assert ( 32 | content.document 33 | == content.header 34 | + content.body 35 | + content.footer 36 | + content.footnotes 37 | + content.endnotes 38 | ) 39 | 40 | 41 | class TestText: 42 | def test_function(self) -> None: 43 | r"""Return '\n\n'-delimited paragraphs as instance.text.""" 44 | with docx2python(RESOURCES / "example.docx") as content: 45 | assert content.text == "\n\n".join(iter_at_depth(content.document, 4)) 46 | 47 | 48 | class TestHtmlMap: 49 | def test_function(self) -> None: 50 | """Return html tables.""" 51 | with docx2python(RESOURCES / "example.docx") as content: 52 | assert ( 53 | content.html_map[:48] 54 | == '
(0, 0'
55 |             )
56 | 


--------------------------------------------------------------------------------
/tests/test_dropdown_selector_in_table.py:
--------------------------------------------------------------------------------
 1 | """Test the dropdown selector in a table.
 2 | 
 3 | Issue: [https://github.com/ShayHill/docx2python/issues/73]
 4 | 
 5 | User iamahcy reports that a ContentControl dropdown selector in a table raises an
 6 | error.
 7 | 
 8 | The issue is that dropdown selectors are a nested table, and the first row of that
 9 | table requests a vMerge. The fix was to reject any vMerge (copy the cell above)
10 | request in the first row of any table.
11 | 
12 | :author: Shay Hill
13 | :created: 2024-09-26
14 | """
15 | 
16 | from docx2python import docx2python
17 | from tests.conftest import RESOURCES
18 | 
19 | test_file = RESOURCES / "list_index_a.docx"
20 | 
21 | 
22 | class TestContentControlDropdownSelectorInTable:
23 |     def test_content_control_dropdown_selector_in_table(self):
24 |         """Test the dropdown selector in a table."""
25 |         with docx2python(test_file) as docx_content:
26 |             content_runs = docx_content.document
27 | 
28 |         # fmt: off
29 |         assert content_runs == [
30 |             [
31 |                 [
32 |                     [""], [""], [""], [""], ["", ""]
33 |                 ],
34 |                 [
35 |                     [""], [""], [""], [""], ["", ""]
36 |                 ],
37 |                 [
38 |                     [""], [""], [""], [""], ["", ""]
39 |                 ],
40 |                 [
41 |                     [""], [""], [""], [""], ["", ""]
42 |                 ],
43 |                 [
44 |                     [""], [""], [""], [""], ["", ""]
45 |                 ],
46 |                 [
47 |                     [""]
48 |                 ],
49 |             ],
50 |             [
51 |                 [
52 |                     ["Silver"]
53 |                 ]
54 |             ],
55 |             [
56 |                 [
57 |                     [""], [""], [""]
58 |                 ],
59 |                 [
60 |                     ["", ""], ["", ""], ["", ""], ["", ""], ["", ""]
61 |                 ]
62 |             ],
63 |             [
64 |                 [
65 |                     [""]
66 |                 ]
67 |             ],
68 |             [
69 |                 [
70 |                     [""], [""]
71 |                 ]
72 |             ],
73 |             [
74 |                 [
75 |                     [""], [""]
76 |                 ]
77 |             ],
78 |         ]
79 |         # fmt: on
80 | 


--------------------------------------------------------------------------------
/tests/test_equations.py:
--------------------------------------------------------------------------------
 1 | """Pull some information from equations
 2 | 
 3 | :author: Shay Hill
 4 | :created: 7/7/2021
 5 | 
 6 | User sreeroopnaidu requested equation export. Equations are made up internally of
 7 |  elements. Previous versions of Docx2Python ignored these elements. These are
 8 | now recognized.
 9 | 
10 | Equations in Word's Professional format will return garbage.
11 | Equations in Word's Inline format will return a nice string.
12 | """
13 | 
14 | from docx2python import docx2python
15 | from tests.conftest import RESOURCES
16 | 
17 | 
18 | class TestEquations:
19 |     def test_professional_format(self):
20 |         """
21 |         Start a new paragraph when a  element is found.
22 |         """
23 |         with docx2python(RESOURCES / "equations.docx") as content:
24 |             body = content.body
25 |         assert body == [
26 |             [
27 |                 [
28 |                     [
29 |                         "Professional Format",
30 |                         "01x",
31 |                         "Linear Format",
32 |                         "\\int_{0}^{1}x",
33 |                         "Linear Format with lt",
34 |                         "\\int0<1x<5",
35 |                     ]
36 |                 ]
37 |             ]
38 |         ]
39 | 


--------------------------------------------------------------------------------
/tests/test_file_object.py:
--------------------------------------------------------------------------------
 1 | """Test methods of File object that are not tested elsewhere.
 2 | 
 3 | :author: Shay Hill
 4 | :created: 4/3/2021
 5 | """
 6 | 
 7 | from docx2python.attribute_register import Tags, get_prefixed_tag
 8 | from docx2python.docx_reader import DocxReader
 9 | from docx2python.main import docx2python
10 | from tests.conftest import RESOURCES
11 | 
12 | 
13 | class TestFileObject:
14 |     """
15 |     Test methods of DocxContext object which are not tested elsewhere.
16 |     """
17 | 
18 |     def test_get_content_full(self) -> None:
19 |         """
20 |         Return full content if no root given.
21 |         """
22 |         full_extraction = docx2python(RESOURCES / "example.docx")
23 |         context = DocxReader(RESOURCES / "example.docx")
24 |         assert (
25 |             full_extraction.body_runs
26 |             == context.file_of_type("officeDocument").get_text()
27 |         )
28 |         context.close()
29 |         full_extraction.close()
30 | 
31 |     def test_get_content_partial(self) -> None:
32 |         """
33 |         Return content below root argument if given.
34 |         """
35 |         full_extraction = docx2python(RESOURCES / "example.docx")
36 |         context = DocxReader(RESOURCES / "example.docx")
37 |         document_xml = context.file_of_type("officeDocument")
38 |         first_par = next(
39 |             x
40 |             for x in document_xml.root_element.iter()
41 |             if get_prefixed_tag(x) == Tags.PARAGRAPH
42 |         )
43 |         assert [[[[full_extraction.body_runs[0][0][0][0]]]]] == document_xml.get_text(
44 |             first_par
45 |         )
46 |         context.close()
47 |         full_extraction.close()
48 | 


--------------------------------------------------------------------------------
/tests/test_from_bytes.py:
--------------------------------------------------------------------------------
 1 | """Test loading a .docx from a buffer of raw bytes.
 2 | 
 3 | :author: Shay Hill
 4 | :created: 2024-07-25
 5 | """
 6 | 
 7 | from io import BytesIO
 8 | 
 9 | from docx2python.main import docx2python
10 | from tests.conftest import RESOURCES
11 | 
12 | example_docx = RESOURCES / "example.docx"
13 | 
14 | 
15 | class TestFromBytes:
16 |     def test_from_bytes(self) -> None:
17 |         """Loads .docx from a buffer of raw bytes."""
18 |         with open(example_docx, "rb") as f:
19 |             buf = BytesIO(f.read())
20 |         with docx2python(buf) as content:
21 |             core_properties = content.core_properties
22 |             expected = {
23 |                 "title": None,
24 |                 "subject": None,
25 |                 "creator": "Shay Hill",
26 |                 "keywords": None,
27 |                 "description": None,
28 |                 "lastModifiedBy": "Shay Hill",
29 |             }
30 |             for prop, value in expected.items():
31 |                 assert core_properties[prop] == value
32 | 


--------------------------------------------------------------------------------
/tests/test_get_text.py:
--------------------------------------------------------------------------------
  1 | """Test functions in docx2python.get_text.py
  2 | 
  3 | author: Shay Hill
  4 | created: 5/20/2019
  5 | 
  6 | Does not test ``get_text``. ``get text`` is tested through source_old.
  7 | """
  8 | 
  9 | # pyright: reportPrivateUsage=false
 10 | 
 11 | from __future__ import annotations
 12 | 
 13 | from collections import defaultdict
 14 | from typing import TypedDict
 15 | 
 16 | import pytest
 17 | from lxml import etree
 18 | 
 19 | from docx2python.bullets_and_numbering import BulletGenerator, _increment_list_counter
 20 | from docx2python.docx_context import NumIdAttrs
 21 | from tests.helpers.utils import valid_xml
 22 | 
 23 | 
 24 | class NumberingContext(TypedDict):
 25 |     numId2Atts: dict[str, list[NumIdAttrs]]
 26 |     numId2count: defaultdict[str, defaultdict[str, int]]
 27 | 
 28 | 
 29 | class TestIncrementListCounter:
 30 |     """Test get_text.increment_list_counter"""
 31 | 
 32 |     def test_function(self) -> None:
 33 |         """Increments counter at ilvl, deletes deeper counters."""
 34 |         ilvl2count: defaultdict[str, int] = defaultdict(
 35 |             int, {str(x): x for x in range(1, 6)}
 36 |         )
 37 |         assert ilvl2count == {"1": 1, "2": 2, "3": 3, "4": 4, "5": 5}
 38 |         _ = _increment_list_counter(ilvl2count, "2")
 39 |         assert ilvl2count == {"1": 1, "2": 3}
 40 | 
 41 | 
 42 | @pytest.fixture()
 43 | def numbered_paragraphs() -> list[bytes]:
 44 |     """Seven numbered paragraphs, indented 0-6 ilvls."""
 45 |     paragraphs: list[str] = []
 46 |     for ilvl in range(7):
 47 |         paragraphs.append(
 48 |             ""
 49 |             + ''
 52 |             + ''
 53 |             + ""
 54 |         )
 55 |     return [valid_xml(x) for x in paragraphs]
 56 | 
 57 | 
 58 | @pytest.fixture()
 59 | def numbering_context() -> NumberingContext:
 60 |     """
 61 | 
 62 |     :return:
 63 |     """
 64 |     numId2Atts = {
 65 |         "1": [
 66 |             NumIdAttrs(fmt="bullet", start=None),
 67 |             NumIdAttrs(fmt="decimal", start=None),
 68 |             NumIdAttrs(fmt="lowerLetter", start=None),
 69 |             NumIdAttrs(fmt="upperLetter", start=None),
 70 |             NumIdAttrs(fmt="lowerRoman", start=None),
 71 |             NumIdAttrs(fmt="upperRoman", start=None),
 72 |             NumIdAttrs(fmt="undefined", start=None),
 73 |         ]
 74 |     }
 75 |     numId2count: defaultdict[str, defaultdict[str, int]] = defaultdict(
 76 |         lambda: defaultdict(int)
 77 |     )
 78 |     return {"numId2Atts": numId2Atts, "numId2count": numId2count}
 79 | 
 80 | 
 81 | class TestGetBulletString:
 82 |     """Test strip_test.get_bullet_string"""
 83 | 
 84 |     def test_bullet(
 85 |         self, numbered_paragraphs: list[bytes], numbering_context: NumberingContext
 86 |     ) -> None:
 87 |         """Returns '-- ' for 'bullet'"""
 88 | 
 89 |         paragraph = etree.fromstring(numbered_paragraphs[0])[0][0]
 90 |         bullets = BulletGenerator(numbering_context["numId2Atts"])
 91 |         assert bullets.get_bullet(paragraph) == "--\t"
 92 | 
 93 |     def test_decimal(
 94 |         self, numbered_paragraphs: list[bytes], numbering_context: NumberingContext
 95 |     ) -> None:
 96 |         """
 97 |         Returns '1) ' for 'decimal'
 98 |         indented one tab
 99 |         """
100 |         paragraph = etree.fromstring(numbered_paragraphs[1])[0][0]
101 |         bullets = BulletGenerator(numbering_context["numId2Atts"])
102 |         assert bullets.get_bullet(paragraph) == "\t1)\t"
103 | 
104 |     def test_lower_letter(
105 |         self, numbered_paragraphs: list[bytes], numbering_context: NumberingContext
106 |     ) -> None:
107 |         """
108 |         Returns 'a) ' for 'lowerLetter'
109 |         indented two tabs
110 |         """
111 |         paragraph = etree.fromstring(numbered_paragraphs[2])[0][0]
112 |         bullets = BulletGenerator(numbering_context["numId2Atts"])
113 |         assert bullets.get_bullet(paragraph) == "\t\ta)\t"
114 | 
115 |     def test_upper_letter(
116 |         self, numbered_paragraphs: list[bytes], numbering_context: NumberingContext
117 |     ) -> None:
118 |         """
119 |         Returns 'A) ' for 'upperLetter'
120 |         indented three tabs
121 |         """
122 |         paragraph = etree.fromstring(numbered_paragraphs[3])[0][0]
123 |         bullets = BulletGenerator(numbering_context["numId2Atts"])
124 |         assert bullets.get_bullet(paragraph) == "\t\t\tA)\t"
125 | 
126 |     def test_lower_roman(
127 |         self, numbered_paragraphs: list[bytes], numbering_context: NumberingContext
128 |     ) -> None:
129 |         """
130 |         Returns 'i) ' for 'lowerRoman'
131 |         indented 4 tabs
132 |         """
133 |         paragraph = etree.fromstring(numbered_paragraphs[4])[0][0]
134 |         bullets = BulletGenerator(numbering_context["numId2Atts"])
135 |         assert bullets.get_bullet(paragraph) == "\t\t\t\ti)\t"
136 | 
137 |     def test_upper_roman(
138 |         self, numbered_paragraphs: list[bytes], numbering_context: NumberingContext
139 |     ) -> None:
140 |         """
141 |         Returns 'I) ' for 'upperRoman'
142 |         indented 5 tabs
143 |         """
144 |         paragraph = etree.fromstring(numbered_paragraphs[5])[0][0]
145 |         bullets = BulletGenerator(numbering_context["numId2Atts"])
146 |         assert bullets.get_bullet(paragraph) == "\t\t\t\t\tI)\t"
147 | 
148 |     def test_undefined(
149 |         self, numbered_paragraphs: list[bytes], numbering_context: NumberingContext
150 |     ) -> None:
151 |         """
152 |         Returns '-- ' for unknown formats
153 |         indented 6 tabs
154 | 
155 |         Format "undefined" won't be defined in the function, so function will fall back
156 |         to bullet string (with a warning).
157 |         """
158 |         paragraph = etree.fromstring(numbered_paragraphs[6])[0][0]
159 |         bullets = BulletGenerator(numbering_context["numId2Atts"])
160 |         with pytest.warns(UserWarning):
161 |             _ = bullets.get_bullet(paragraph)
162 | 
163 |     def test_not_numbered(self, numbering_context: NumberingContext) -> None:
164 |         """
165 |         Returns '' when paragraph is not numbered.
166 |         """
167 |         one_par_file = valid_xml("")
168 |         paragraph = etree.fromstring(one_par_file)[0]
169 |         bullets = BulletGenerator(numbering_context["numId2Atts"])
170 |         assert bullets.get_bullet(paragraph) == ""
171 | 
172 |     def test_resets_sublists(
173 |         self, numbered_paragraphs: list[bytes], numbering_context: NumberingContext
174 |     ):
175 |         """Numbers reset when returning to shallower level
176 | 
177 |         1)  top level
178 |             a)  level 2
179 |             b)  another level 2
180 |                 A)  level 3
181 |             c)  level 2 is still counting
182 |                 A)  NEW sublist of level 2
183 |         2)  top level is still counting
184 |             a)  NEW sublist of top level
185 |         """
186 |         pars = [numbered_paragraphs[x] for x in (1, 2, 2, 3, 2, 3, 1, 2)]
187 |         bullets = BulletGenerator(numbering_context["numId2Atts"])
188 |         bullet_strings: list[str] = []
189 |         for par in pars:
190 |             paragraph = etree.fromstring(par)[0][0]
191 |             bullet_strings.append(bullets.get_bullet(paragraph).strip())
192 | 
193 |         assert bullet_strings == ["1)", "a)", "b)", "A)", "c)", "A)", "2)", "a)"]
194 | 


--------------------------------------------------------------------------------
/tests/test_google_docs.py:
--------------------------------------------------------------------------------
 1 | """Test corrections for google docs docx files
 2 | 
 3 | :author: Shay Hill
 4 | :created: 11/2/2020
 5 | 
 6 | Docx files created in MS Work have a ``docProps.xml`` file with author, etc.
 7 | Docx files created in google docs do not have a ``docProps.xml`` file.
 8 | 
 9 | File `test-docx2python-conversion-google_docs.docx` sent by a user.
10 | 
11 | Traceback (most recent call last):
12 | File "/Users/cyee/.local/share/virtualenvs/word-to-md-EFw2UvDn/bin/word2md", line 33, in
13 | sys.exit(load_entry_point('word2md', 'console_scripts', 'word2md')())
14 | File "/Users/cyee/.local/share/virtualenvs/word-to-md-EFw2UvDn/lib/python3.8/site-packages/click/core.py", line 829, in call
15 | return self.main(*args, **kwargs)
16 | File "/Users/cyee/.local/share/virtualenvs/word-to-md-EFw2UvDn/lib/python3.8/site-packages/click/core.py", line 782, in main
17 | rv = self.invoke(ctx)
18 | File "/Users/cyee/.local/share/virtualenvs/word-to-md-EFw2UvDn/lib/python3.8/site-packages/click/core.py", line 1066, in invoke
19 | return ctx.invoke(self.callback, **ctx.params)
20 | File "/Users/cyee/.local/share/virtualenvs/word-to-md-EFw2UvDn/lib/python3.8/site-packages/click/core.py", line 610, in invoke
21 | return callback(*args, **kwargs)
22 | File "/Users/cyee/projects/python/word-to-md/word2md.py", line 349, in cli
23 | make_md_from_entire_doc(path)
24 | File "/Users/cyee/projects/python/word-to-md/word2md.py", line 300, in make_md_from_entire_doc
25 | document = docx2python(input_file, html=True)
26 | File "/Users/cyee/.local/share/virtualenvs/word-to-md-EFw2UvDn/lib/python3.8/site-packages/docx2python/main.py", line 35, in docx2python
27 | context = get_context(zipf)
28 | File "/Users/cyee/.local/share/virtualenvs/word-to-md-EFw2UvDn/lib/python3.8/site-packages/docx2python/docx_context.py", line 272, in get_context
29 | "docProp2text": collect_docProps(zipf.read("docProps/core.xml")),
30 | File "/usr/local/opt/python@3.8/Frameworks/Python.framework/Versions/3.8/lib/python3.8/zipfile.py", line 1475, in read
31 | with self.open(name, "r", pwd) as fp:
32 | File "/usr/local/opt/python@3.8/Frameworks/Python.framework/Versions/3.8/lib/python3.8/zipfile.py", line 1514, in open
33 | zinfo = self.getinfo(name)
34 | File "/usr/local/opt/python@3.8/Frameworks/Python.framework/Versions/3.8/lib/python3.8/zipfile.py", line 1441, in getinfo
35 | raise KeyError(
36 | KeyError: "There is no item named 'docProps/core.xml' in the archive"
37 | """
38 | 
39 | import pytest
40 | 
41 | from docx2python import docx2python
42 | from tests.conftest import RESOURCES
43 | 
44 | FILE_WITH_DOCPROPS = RESOURCES / "example.docx"
45 | 
46 | FILE_WITHOUT_DOCPROPS = RESOURCES / "test-docx2python-conversion-google_docs.docx"
47 | 
48 | 
49 | class TestDeprecatedPropertiesProperty:
50 |     def test_deprecated_properties_property(self) -> None:
51 |         """
52 |         Raise a future warning when user requests ``result.properties``
53 |         """
54 |         with docx2python(FILE_WITH_DOCPROPS) as result:
55 |             with pytest.warns(FutureWarning):
56 |                 _ = result.properties
57 | 
58 | 
59 | class TestDocPropsFound:
60 |     def test_docprops_found(self) -> None:
61 |         """
62 |         Return docProps as a dictionary
63 |         """
64 |         with docx2python(FILE_WITH_DOCPROPS) as result:
65 |             assert result.core_properties == {
66 |                 "created": "2019-07-05T21:51:00Z",
67 |                 "creator": "Shay Hill",
68 |                 "description": None,
69 |                 "keywords": None,
70 |                 "lastModifiedBy": "Shay Hill",
71 |                 "modified": "2021-03-26T00:30:00Z",
72 |                 "revision": "7",
73 |                 "subject": None,
74 |                 "title": None,
75 |             }
76 | 
77 | 
78 | class TestGoogleDocs:
79 |     def test_empty_properties_dict_if_docProps_not_found(self) -> None:
80 |         """
81 |         It seems Google Docs docx files to not contain a document properties file:
82 |         `docProps/core.xml`. The contents of this file are returned as a dictionary.
83 |         To correct the above error, result.properties will now return an empty
84 |         dictionary (with a warning).
85 |         """
86 |         with docx2python(FILE_WITHOUT_DOCPROPS) as result:
87 |             with pytest.warns(UserWarning):
88 |                 assert result.core_properties == {}
89 | 


--------------------------------------------------------------------------------
/tests/test_hyperlinks.py:
--------------------------------------------------------------------------------
 1 | """Test that consecutive links pointing to the same address are merged.
 2 | 
 3 | :author: Shay Hill
 4 | :created: 3/17/2021
 5 | 
 6 | Such links will look like this (after removing proofErr, rsid, and other noise).
 7 | 
 8 |     
 9 |           
10 |             
11 |                 hy
12 |             
13 |         
14 |           
15 |             
16 |                 per
17 |             
18 |         
19 |           
20 |             
21 |                 link
22 |             
23 |         
24 |     
25 | 
26 | Docx2python condenses these to
27 | 
28 |     
29 |           
30 |             
31 |                 hy
32 |             
33 |             
34 |                 per
35 |             
36 |             
37 |                 link
38 |             
39 |         
40 |     
41 | 
42 | Then to
43 | 
44 |     
45 |           
46 |             
47 |                 hyperlink
48 |             
49 |         
50 |     
51 | 
52 | This module tests the final result.
53 | """
54 | 
55 | from docx2python.main import docx2python
56 | from tests.conftest import RESOURCES
57 | 
58 | 
59 | class TestHyperlink:
60 |     def test_prints(self) -> None:
61 |         """Consecutive hyperlinks referencing same target are joined"""
62 |         with docx2python(RESOURCES / "hyperlink.docx") as extraction:
63 |             assert extraction.body_runs == [
64 |                 [
65 |                     [
66 |                         [
67 |                             [
68 |                                 "This is a link to ",
69 |                                 ''
70 |                                 + "my website",
71 |                                 ".",
72 |                             ]
73 |                         ]
74 |                     ]
75 |                 ]
76 |             ]
77 | 


--------------------------------------------------------------------------------
/tests/test_import.py:
--------------------------------------------------------------------------------
 1 | """Make sure from docx2python import ... works
 2 | 
 3 | :author: Shay Hill
 4 | :created: 7/17/2019
 5 | 
 6 | """
 7 | 
 8 | from docx2python import docx2python
 9 | from tests.conftest import RESOURCES
10 | 
11 | 
12 | def test() -> None:
13 |     """Just making sure the import works."""
14 |     with docx2python(RESOURCES / "example.docx") as _:
15 |         pass
16 | 


--------------------------------------------------------------------------------
/tests/test_invalid_tag_name.py:
--------------------------------------------------------------------------------
 1 | """Issue 72: Invalid tag name.
 2 | 
 3 | User makretch found a file converted by Aspose that had an invalid tag name in a
 4 | comment. This tag name caused a ValueError when passed to `etree.QName`.
 5 | 
 6 | ValueError: Invalid tag name 'cyfunction Comment at 0x12345678abcd'
 7 | 
 8 | I addressed this by skipping elements with invalid tag names and raising a warning.
 9 | 
10 | :author: Shay Hill
11 | :created: 2024-12-05
12 | """
13 | 
14 | import pytest
15 | from conftest import RESOURCES
16 | 
17 | from docx2python import docx2python
18 | 
19 | 
20 | class TestInvalidTagName:
21 |     """Confirming this works with v1.25"""
22 | 
23 |     def test_invalid_tag_name(self) -> None:
24 |         """Pass if no ValueError is raised."""
25 |         extraction = docx2python(RESOURCES / "invalid_tag_name.docx")
26 |         with pytest.warns(UserWarning, match="skipping invalid tag name"):
27 |             _ = extraction.text
28 |         extraction.close()
29 | 


--------------------------------------------------------------------------------
/tests/test_iterators.py:
--------------------------------------------------------------------------------
  1 | """Test docx2python.iterators.py
  2 | 
  3 | author: Shay Hill
  4 | created: 6/28/2019
  5 | """
  6 | 
  7 | import itertools as it
  8 | 
  9 | import pytest
 10 | 
 11 | from docx2python.iterators import (
 12 |     enum_at_depth,
 13 |     enum_cells,
 14 |     enum_paragraphs,
 15 |     enum_rows,
 16 |     enum_tables,
 17 |     get_html_map,
 18 |     iter_cells,
 19 |     iter_paragraphs,
 20 |     iter_rows,
 21 |     iter_tables,
 22 | )
 23 | 
 24 | TABLES = [
 25 |     [
 26 |         [[["0000", "0001"], ["0010", "0011"]], [["0100", "0101"], ["0110", "0111"]]],
 27 |         [[["1000", "1001"], ["1010", "1011"]], [["1100", "1101"], ["1110", "1111"]]],
 28 |     ]
 29 | ]
 30 | 
 31 | 
 32 | class TestOutOfRange:
 33 |     def test_enum_at_depth_low(self) -> None:
 34 |         """Raise ValueError when attempting to enumerate over depth < 1."""
 35 |         with pytest.raises(ValueError) as msg:
 36 |             _ = tuple(enum_at_depth(TABLES, 0))  # type: ignore
 37 |         assert "depth argument must be 1, 2, 3, 4, or 5" in str(msg.value)
 38 | 
 39 |     def test_enum_at_depth_high(self) -> None:
 40 |         """Raise ValueError when attempting to enumerate over depth < 1."""
 41 |         with pytest.raises(ValueError) as msg:
 42 |             _ = tuple(enum_at_depth(TABLES, 6))  # type: ignore
 43 |         assert "depth argument must be 1, 2, 3, 4, or 5" in str(msg.value)
 44 | 
 45 | 
 46 | class TestIterators:
 47 |     """Test iterators.iter_*"""
 48 | 
 49 |     def test_iter_tables(self) -> None:
 50 |         """Return all tables."""
 51 |         assert list(iter_tables(TABLES)) == TABLES
 52 | 
 53 |     def test_iter_rows(self) -> None:
 54 |         """Return all rows."""
 55 |         assert list(iter_rows(TABLES)) == list(it.chain(*iter_tables(TABLES)))
 56 | 
 57 |     def test_iter_cells(self) -> None:
 58 |         """Return all cells."""
 59 |         assert list(iter_cells(TABLES)) == list(it.chain(*iter_rows(TABLES)))
 60 | 
 61 |     def test_iter_paragraphs(self) -> None:
 62 |         """Return all paragraphs."""
 63 |         assert list(iter_paragraphs(TABLES)) == list(it.chain(*iter_cells(TABLES)))
 64 | 
 65 | 
 66 | class TestEnumerators:
 67 |     """Test iterators.enum_*"""
 68 | 
 69 |     def test_enum_tables(self) -> None:
 70 |         """Return all tables."""
 71 |         assert list(enum_tables(TABLES)) == [
 72 |             (
 73 |                 (0,),
 74 |                 [
 75 |                     [
 76 |                         [["0000", "0001"], ["0010", "0011"]],
 77 |                         [["0100", "0101"], ["0110", "0111"]],
 78 |                     ],
 79 |                     [
 80 |                         [["1000", "1001"], ["1010", "1011"]],
 81 |                         [["1100", "1101"], ["1110", "1111"]],
 82 |                     ],
 83 |                 ],
 84 |             )
 85 |         ]
 86 | 
 87 |     def test_enum_rows(self) -> None:
 88 |         """Return all rows."""
 89 |         assert list(enum_rows(TABLES)) == [
 90 |             (
 91 |                 (0, 0),
 92 |                 [
 93 |                     [["0000", "0001"], ["0010", "0011"]],
 94 |                     [["0100", "0101"], ["0110", "0111"]],
 95 |                 ],
 96 |             ),
 97 |             (
 98 |                 (0, 1),
 99 |                 [
100 |                     [["1000", "1001"], ["1010", "1011"]],
101 |                     [["1100", "1101"], ["1110", "1111"]],
102 |                 ],
103 |             ),
104 |         ]
105 | 
106 |     def test_enum_cells(self) -> None:
107 |         """Return all cells."""
108 |         assert list(enum_cells(TABLES)) == [
109 |             ((0, 0, 0), [["0000", "0001"], ["0010", "0011"]]),
110 |             ((0, 0, 1), [["0100", "0101"], ["0110", "0111"]]),
111 |             ((0, 1, 0), [["1000", "1001"], ["1010", "1011"]]),
112 |             ((0, 1, 1), [["1100", "1101"], ["1110", "1111"]]),
113 |         ]
114 | 
115 |     def test_enum_paragraphs(self) -> None:
116 |         """Return all paragraphs."""
117 |         assert list(enum_paragraphs(TABLES)) == [
118 |             ((0, 0, 0, 0), ["0000", "0001"]),
119 |             ((0, 0, 0, 1), ["0010", "0011"]),
120 |             ((0, 0, 1, 0), ["0100", "0101"]),
121 |             ((0, 0, 1, 1), ["0110", "0111"]),
122 |             ((0, 1, 0, 0), ["1000", "1001"]),
123 |             ((0, 1, 0, 1), ["1010", "1011"]),
124 |             ((0, 1, 1, 0), ["1100", "1101"]),
125 |             ((0, 1, 1, 1), ["1110", "1111"]),
126 |         ]
127 | 
128 | 
129 | class TestGetHtmlMap:
130 |     """Test iterators.get_html_map"""
131 | 
132 |     def test_get_html_map(self) -> None:
133 |         """Create valid html."""
134 |         # fmt: off
135 |         assert get_html_map(TABLES) == (
136 |             ""
137 |             ""
138 |             ''
139 |             ""
140 |             ""
146 |             ""
152 |             ""
153 |             ""
154 |             ""
160 |             ""
166 |             ""
167 |             "
" 141 | "
(0, 0, 0, 0) 00000001"
142 |             "
" 143 | "
(0, 0, 0, 1) 00100011"
144 |             "
" 145 | "
" 147 | "
(0, 0, 1, 0) 01000101"
148 |             "
" 149 | "
(0, 0, 1, 1) 01100111"
150 |             "
" 151 | "
" 155 | "
(0, 1, 0, 0) 10001001"
156 |             "
" 157 | "
(0, 1, 0, 1) 10101011"
158 |             "
" 159 | "
" 161 | "
(0, 1, 1, 0) 11001101"
162 |             "
" 163 | "
(0, 1, 1, 1) 11101111"
164 |             "
" 165 | "
" 168 | "" 169 | "" 170 | ) 171 | # fmt: on 172 | -------------------------------------------------------------------------------- /tests/test_libreoffice_conversion.py: -------------------------------------------------------------------------------- 1 | """Libreoffice conversions from doc to docx raise CaretDepthError 2 | 3 | :author: Shay Hill 4 | :created: 8/11/2021 5 | 6 | Uner shadowmimosa shared a docx (libreoffice_conversion.docx), converted by libreoffice 7 | from a doc that raises a CaretDepthError. 8 | """ 9 | 10 | import pytest 11 | 12 | from docx2python.main import docx2python 13 | from tests.conftest import RESOURCES 14 | 15 | 16 | class TestLibreofficeConversion: 17 | def test_libreoffice_conversion(self) -> None: 18 | """Extracts text without a CaretDepthError 19 | 20 | This test file for a user just happens to be in Chinese and contains an 21 | unsupported Chinese numbering format, hence the ``pytest.warns`` context. 22 | """ 23 | with docx2python(RESOURCES / "libreoffice_conversion.docx") as content: 24 | with pytest.warns(UserWarning): 25 | _ = content.document 26 | -------------------------------------------------------------------------------- /tests/test_lineage.py: -------------------------------------------------------------------------------- 1 | """Test the lineage attribute of Par instances. 2 | 3 | :author: Shay Hill 4 | :created: 2024-07-14 5 | """ 6 | 7 | from docx2python.iterators import ( 8 | is_tbl, 9 | is_tc, 10 | is_tr, 11 | iter_cells, 12 | iter_paragraphs, 13 | iter_rows, 14 | iter_tables, 15 | ) 16 | from docx2python.main import docx2python 17 | 18 | from .conftest import RESOURCES 19 | 20 | 21 | class TestLineage: 22 | """Are lineage tags correct for Par instances?""" 23 | 24 | def test_explicit(self): 25 | """Output matches expected lineage.""" 26 | with docx2python(RESOURCES / "paragraphs_and_tables.docx") as extraction: 27 | pars = extraction.document_pars 28 | lineages = [par.lineage for par in iter_paragraphs(pars)] 29 | assert lineages == [ 30 | ("document", None, None, None, "p"), 31 | ("document", "tbl", "tr", "tc", "p"), 32 | ("document", "tbl", "tr", "tc", "p"), 33 | ("document", "tbl", "tr", "tc", "p"), 34 | ("document", "tbl", "tr", "tc", "p"), 35 | ("document", "tbl", "tr", "tc", "p"), 36 | ("document", "tbl", "tr", "tc", "p"), 37 | ("document", None, None, None, "p"), 38 | ("document", None, None, None, "p"), 39 | ("document", "tbl", "tr", "tc", "p"), 40 | ("document", "tbl", "tr", "tc", "p"), 41 | ("document", "tbl", "tr", "tc", "p"), 42 | ("document", "tbl", "tr", "tc", "p"), 43 | ("document", None, None, None, "p"), 44 | ] 45 | 46 | 47 | class TestTableIdentification: 48 | """Are tables identified correctly?""" 49 | 50 | def test_is_tbl(self): 51 | """Tables are identified correctly.""" 52 | with docx2python(RESOURCES / "paragraphs_and_tables.docx") as extraction: 53 | pars = extraction.document_pars 54 | assert [is_tbl(tbl) for tbl in iter_tables(pars)] == [ 55 | False, 56 | True, 57 | False, 58 | True, 59 | False, 60 | ] 61 | 62 | def test_is_tr(self): 63 | """Tables are identified correctly.""" 64 | with docx2python(RESOURCES / "paragraphs_and_tables.docx") as extraction: 65 | pars = extraction.document_pars 66 | assert [is_tr(tr) for tr in iter_rows(pars)] == [ 67 | False, 68 | True, 69 | True, 70 | True, 71 | False, 72 | True, 73 | True, 74 | True, 75 | True, 76 | False, 77 | ] 78 | 79 | def test_is_tc(self): 80 | """Tables are identified correctly.""" 81 | with docx2python(RESOURCES / "paragraphs_and_tables.docx") as extraction: 82 | pars = extraction.document_pars 83 | assert [is_tc(tc) for tc in iter_cells(pars)] == [ 84 | False, 85 | True, 86 | True, 87 | True, 88 | True, 89 | True, 90 | True, 91 | False, 92 | True, 93 | True, 94 | True, 95 | True, 96 | False, 97 | ] 98 | -------------------------------------------------------------------------------- /tests/test_linebreak_replace_text.py: -------------------------------------------------------------------------------- 1 | """Try to use replace text with a linebreak. 2 | 3 | :author: Shay Hill 4 | :created: 2023-04-26 5 | """ 6 | 7 | from docx2python.main import docx2python 8 | from tests.conftest import RESOURCES 9 | 10 | 11 | class TestText: 12 | def test_user_checked_dropdown0(self) -> None: 13 | """Get checked-out box glyph and second dd entry""" 14 | extraction = docx2python(RESOURCES / "checked_drop1.docx") 15 | assert extraction.body_runs == [[[[["☒", " "], ["PIlihan A"]]]]] 16 | extraction.close() 17 | -------------------------------------------------------------------------------- /tests/test_list_position.py: -------------------------------------------------------------------------------- 1 | """Test list_position attribute of list paragraphs. 2 | 3 | :author: Shay Hill 4 | :created: 2024-07-17 5 | """ 6 | 7 | from docx2python.iterators import iter_at_depth 8 | from docx2python.main import docx2python 9 | from tests.conftest import RESOURCES 10 | 11 | 12 | class TestListPosition: 13 | def test_explicit(self): 14 | # """List paragraphs match hand-counted list_position.""" 15 | with docx2python(RESOURCES / "example.docx") as content: 16 | pars = iter_at_depth(content.officeDocument_pars, 4) 17 | positions = [p.list_position for p in pars] 18 | assert positions == [ 19 | ("2", [1]), 20 | ("2", [1, 1]), 21 | ("2", [1, 2]), 22 | ("2", [1, 2, 1]), 23 | ("2", [1, 2, 1, 1]), 24 | ("2", [1, 2, 1, 2]), 25 | ("2", [1, 2, 1, 2, 1]), 26 | ("2", [1, 2, 1, 2, 1, 1]), 27 | ("2", [1, 2, 1, 2, 1, 1, 1]), 28 | ("2", [1, 2, 1, 2, 1, 1, 2]), 29 | ("2", [2]), 30 | ("2", [2, 1]), 31 | ("1", [1]), 32 | ("1", [1, 1]), 33 | ("1", [1, 1, 1]), 34 | (None, []), 35 | (None, []), 36 | (None, []), 37 | (None, []), 38 | (None, []), 39 | (None, []), 40 | (None, []), 41 | (None, []), 42 | (None, []), 43 | (None, []), 44 | (None, []), 45 | (None, []), 46 | (None, []), 47 | (None, []), 48 | (None, []), 49 | (None, []), 50 | (None, []), 51 | (None, []), 52 | (None, []), 53 | (None, []), 54 | (None, []), 55 | (None, []), 56 | (None, []), 57 | (None, []), 58 | ] 59 | -------------------------------------------------------------------------------- /tests/test_long_hyperlink.py: -------------------------------------------------------------------------------- 1 | """User K Ravikiran had trouble with long hyperlinks. 2 | 3 | The sample file here has a hyperlink he was not able to export correctly. 4 | 5 | :author: Shay Hill 6 | :created: 2024-01-20 7 | """ 8 | 9 | from docx2python.main import docx2python 10 | from tests.conftest import RESOURCES 11 | 12 | long_hyperlink = RESOURCES / "long_hyperlink.docx" 13 | 14 | 15 | class TestLongHyperlink: 16 | def test_non_html(self) -> None: 17 | """Exports full hyperlink without html flag.""" 18 | with docx2python(long_hyperlink) as docx_content: 19 | extracted_text = docx_content.text 20 | long_url = ( 21 | "https://connect.asdfg.com/wikis/home?lang-en-us" 22 | + "#!/wiki/asdfasdf_asdfasdf/page/EOL%20support%20-%20MDGI" 23 | ) 24 | assert long_url in extracted_text 25 | 26 | def test_html(self) -> None: 27 | """Exports full hyperlink with html flag.""" 28 | with docx2python(long_hyperlink, html=True) as docx_content: 29 | extracted_text = docx_content.text 30 | long_url = ( 31 | "https://connect.asdfg.com/wikis/home?lang-en-us" 32 | + "#!/wiki/asdfasdf_asdfasdf/page/EOL%20support%20-%20MDGI" 33 | ) 34 | assert long_url in extracted_text 35 | -------------------------------------------------------------------------------- /tests/test_merge_runs.py: -------------------------------------------------------------------------------- 1 | """Test that consecutive links pointing to the same address are merged. 2 | 3 | :author: Shay Hill 4 | :created: 3/17/2021 5 | 6 | There are a few ways consecutive elements can be "identical": 7 | * same link 8 | * same style 9 | 10 | Often, consecutive, "identical" elements are written as separate elements, 11 | because they aren't identical to Word. Work keeps track of revision history, 12 | spelling errors, etc., which are meaningless to docx2python. 13 | 14 | 15 | 16 | 17 | hy 18 | 19 | 20 | 21 | 22 | 23 | per 24 | 25 | 26 | 27 | 28 | link 29 | 30 | 31 | 32 | 33 | Docx2python condenses the above to (by merging links) 34 | 35 | 36 | 37 | 38 | hy 39 | 40 | 41 | per 42 | 43 | 44 | link 45 | 46 | 47 | 48 | 49 | Then to (by merging runs) 50 | 51 | 52 | 53 | 54 | hy 55 | per 56 | link 57 | 58 | 59 | 60 | 61 | Then finally to (by merging text) 62 | 63 | 64 | 65 | 66 | hyperlink 67 | 68 | 69 | 70 | """ 71 | 72 | from docx2python.main import docx2python 73 | from tests.conftest import RESOURCES 74 | 75 | 76 | def test_merge_runs(): 77 | """ 78 | Merge duplicate, consecutive hyperlinks 79 | 80 | The output text would look the same whether run and text elements were merged. 81 | This test only verifies that hyperlink elements have been merged, else the output 82 | text would contain something closer to ``hyperlink`` 83 | """ 84 | extraction = docx2python(RESOURCES / "merged_links.docx") 85 | assert extraction.body_runs == [ 86 | [ 87 | [ 88 | [ 89 | [ 90 | "This page created by putting three links to the same address " 91 | + "in three different paragraphs (as below) …" 92 | ], 93 | ['hy'], 94 | ['per'], 95 | ['link'], 96 | ["Then removing the endlines to create a single link."], 97 | ['hyperlink'], 98 | [ 99 | "Internally, the XML records the joined paragraphs as " 100 | + "three consecutive links, each with a different r:id, " 101 | + "all r:ids referencing the same address. Docx2python v2+ " 102 | + "should re-join these consecutive links." 103 | ], 104 | [], 105 | [], 106 | ] 107 | ] 108 | ] 109 | ] 110 | extraction.close() 111 | -------------------------------------------------------------------------------- /tests/test_merged_cells.py: -------------------------------------------------------------------------------- 1 | """Attempt to properly handle merged table cells. 2 | 3 | :author: Shay Hill 4 | :created: 2023-01-23 5 | """ 6 | 7 | from docx2python import docx2python 8 | from tests.conftest import RESOURCES 9 | 10 | 11 | class TestMergedCells: 12 | def test_duplicate_merged_cells_false(self): 13 | """By default, duplicate merged cells.""" 14 | with docx2python( 15 | RESOURCES / "merged_cells.docx", duplicate_merged_cells=False 16 | ) as content: 17 | # fmt: off 18 | assert content.body == [ 19 | [ 20 | [["0-0"], ["0-12"], [""], ["0-3"]], 21 | [["12-0"], ["1-1"], ["1-2"], ["1-3"]], 22 | [[""], ["2-1"], ["2-2"], ["2-3"]], 23 | [["3-0"], ["34-123"], [""], [""]], 24 | [["4-0"], [""], [""], [""]], 25 | ], 26 | [[[""]]], 27 | ] 28 | # fmt: on 29 | 30 | def test_duplicate_merged_cells_true(self): 31 | """Duplicate contents in merged cells for an mxn table list.""" 32 | with docx2python(RESOURCES / "merged_cells.docx") as content: 33 | # fmt: off 34 | assert content.body == [ 35 | [ 36 | [["0-0"], ["0-12"], ["0-12"], ["0-3"]], 37 | [["12-0"], ["1-1"], ["1-2"], ["1-3"]], 38 | [["12-0"], ["2-1"], ["2-2"], ["2-3"]], 39 | [["3-0"], ["34-123"], ["34-123"], ["34-123"]], 40 | [["4-0"], ["34-123"], ["34-123"], ["34-123"]], 41 | ], 42 | [[[""]]], 43 | ] 44 | # fmt: on 45 | -------------------------------------------------------------------------------- /tests/test_more_html.py: -------------------------------------------------------------------------------- 1 | """Test that passing `more_html = True` collects paragraph styles 2 | 3 | :author: Shay Hill 4 | :created: 11/5/2020 5 | 6 | Paragraphs and runs can end up nested with text boxes. Docx2python 7 | un-nests these paragraphs. 8 | 9 | 10 | 11 | 12 | 13 | 14 | EHS Manual 15 | 16 | 17 | 18 | 19 | EHS Manual 20 | 21 | 22 | 24 | 25 | EHS Manual 26 | 27 | 28 | 29 | 30 | EHS Manual 31 | 32 | 33 | ``` 34 | 35 | par 1 text 36 | 37 | par 2 text 38 | 39 | more par 1 text 40 | 41 | ``` 42 | 43 | gets flattened to 44 | 45 | ``` 46 | `par 2 text` 47 | 'par 1 text` 48 | `more par 1 text` 49 | ``` 50 | Paragraphs are returned in by the order in which they *close*. 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | EHS Manual 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | """ 80 | 81 | from paragraphs import par 82 | 83 | from docx2python.iterators import iter_at_depth 84 | from docx2python.main import docx2python 85 | from tests.conftest import RESOURCES 86 | 87 | 88 | def test_paragraphs_only() -> None: 89 | """Html tags inserted into text""" 90 | with docx2python(RESOURCES / "nested_paragraphs.docx", html=True) as extraction: 91 | document_pars = extraction.document_pars 92 | styled = [(p.style, p.run_strings) for p in iter_at_depth(document_pars, 4)] 93 | expect = [ 94 | ( 95 | "", 96 | [ 97 | par( 98 | """[Grab your reader’s attention with a great quote from the 99 | document or use this space to emphasize a key point. To place 100 | this text box anywhere on the page, just drag it.]""" 101 | ) 102 | ], 103 | ), 104 | ( 105 | "", 106 | [ 107 | par( 108 | """[Grab your reader’s attention with a great quote from the 109 | document or use this space to emphasize a key point. To place 110 | this text box anywhere on the page, just drag it.]""" 111 | ) 112 | ], 113 | ), 114 | ( 115 | "Heading1", 116 | [ 117 | "

", 118 | par( 119 | """aaa aab aac aad aae aaf aag aah aai aaj aak aal aam aan aao 120 | aap aaq aar aas aat aau aav aaw aax aay aaz aba abb abc abd abe 121 | abf abg abh abi abj abk abl abm abn abo abp abq abr abs abt abu 122 | abv abw abx aby abz aca acb acc acd ace acf acg ach aci acj ack 123 | acl acm acn aco acp acq acr acs act acu acv acw acx acy acz ada 124 | adb adc add ade adf adg adh adi adj adk adl adm adn ado adp adq 125 | adr ads adt adu adv adw adx ady adz aea aeb aec aed aee aef aeg 126 | aeh aei aej aek ael aem aen aeo aep aeq aer aes aet aeu aev aew 127 | aex aey aez afa afb afc afd afe aff afg afh afi afj afk afl afm 128 | afn afo afp afq afr afs aft afu afv afw afx afy afz aga agb agc 129 | agd age agf agg agh agi agj agk agl agm agn ago agp agq agr ags 130 | agt agu agv agw agx agy agz aha ahb ahc ahd ahe ahf ahg ahh ahi 131 | ahj ahk ahl ahm ahn aho ahp ahq ahr ahs aht ahu ahv ahw ahx ahy 132 | ahz aia aib aic aid aie aif aig aih aii aij aik ail aim ain aio 133 | aip aiq air ais ait aiu aiv aiw aix aiy aiz aja ajb ajc ajd aje 134 | ajf ajg ajh aji ajj ajk ajl ajm ajn ajo ajp ajq ajr ajs ajt aju 135 | ajv ajw ajx ajy ajz aka akb akc akd ake akf akg akh aki akj akk 136 | akl akm akn ako akp akq akr aks akt aku akv akw akx aky akz ala 137 | alb alc ald ale alf alg alh ali alj alk all alm aln alo alp alq 138 | alr als alt alu alv alw alx aly alz ama amb amc amd ame amf amg 139 | amh ami amj amk aml amm amn amo amp amq amr ams amt amu amv amw 140 | amx amy amz ana anb anc and ane anf ang anh ani anj ank anl anm 141 | ann ano anp anq anr ans ant anu anv anw anx any anz aoa aob aoc 142 | aod aoe aof aog aoh aoi aoj aok aol aom aon aoo aop aoq aor aos 143 | aot aou aov aow aox aoy aoz apa apb apc apd ape apf apg aph api 144 | apj apk apl apm apn apo app apq apr aps apt apu apv apw apx apy 145 | apz aqa aqb aqc aqd aqe aqf aqg aqh aqi aqj aqk aql aqm aqn aqo 146 | aqp aqq aqr aqs aqt aqu aqv aqw aqx aqy aqz ara arb arc ard are 147 | arf arg arh ari arj ark arl arm arn aro arp arq arr ars art aru 148 | arv arw arx ary arz asa asb asc asd ase asf asg ash asi asj ask 149 | asl asm asn aso asp asq asr ass ast asu asv asw asx asy asz ata 150 | atb atc atd ate atf atg ath ati atj atk atl atm atn ato atp atq 151 | atr ats att atu atv atw atx aty atz aua aub auc aud aue auf aug 152 | auh aui auj auk aul aum aun auo aup auq aur aus aut auu auv auw 153 | aux auy auz ava avb avc avd ave avf avg avh avi avj avk avl avm 154 | avn avo avp avq avr avs avt avu avv avw avx avy avz awa awb awc 155 | awd awe awf awg awh awi awj awk awl awm awn awo awp awq awr aws 156 | awt awu awv aww awx awy awz axa axb axc axd axe axf axg axh axi 157 | axj axk axl axm axn axo axp axq axr axs axt axu axv axw axx axy 158 | axz aya ayb ayc ayd aye ayf ayg ayh ayi ayj ayk ayl aym ayn ayo 159 | ayp ayq ayr ays ayt ayu ayv ayw ayx ayy ayz aza azb azc azd aze 160 | azf azg azh azi azj azk azl azm azn azo azp azq azr azs azt azu 161 | azv azw azx azy azz""" 162 | ), 163 | "

", 164 | ], 165 | ), 166 | ] 167 | assert styled == expect 168 | 169 | 170 | def test_par_styles_not_in_text() -> None: 171 | """Par styles skipped in pure text export""" 172 | pars = docx2python(RESOURCES / "nested_paragraphs.docx", html=True) 173 | assert pars.text == par( 174 | """[Grab your reader’s attention with a great quote from the document or use 175 | this space to emphasize a key point. To place this text box anywhere on the 176 | page, just drag it.] 177 | 178 | [Grab your reader’s attention with a great quote from the document or use 179 | this space to emphasize a key point. To place this text box anywhere on the 180 | page, just drag it.] 181 | 182 |

aaa aab aac aad aae aaf aag aah aai aaj aak aal aam aan aao aap aaq aar 183 | aas aat aau aav aaw aax aay aaz aba abb abc abd abe abf abg abh abi abj abk 184 | abl abm abn abo abp abq abr abs abt abu abv abw abx aby abz aca acb acc acd 185 | ace acf acg ach aci acj ack acl acm acn aco acp acq acr acs act acu acv acw 186 | acx acy acz ada adb adc add ade adf adg adh adi adj adk adl adm adn ado adp 187 | adq adr ads adt adu adv adw adx ady adz aea aeb aec aed aee aef aeg aeh aei 188 | aej aek ael aem aen aeo aep aeq aer aes aet aeu aev aew aex aey aez afa afb 189 | afc afd afe aff afg afh afi afj afk afl afm afn afo afp afq afr afs aft afu 190 | afv afw afx afy afz aga agb agc agd age agf agg agh agi agj agk agl agm agn 191 | ago agp agq agr ags agt agu agv agw agx agy agz aha ahb ahc ahd ahe ahf ahg 192 | ahh ahi ahj ahk ahl ahm ahn aho ahp ahq ahr ahs aht ahu ahv ahw ahx ahy ahz 193 | aia aib aic aid aie aif aig aih aii aij aik ail aim ain aio aip aiq air ais 194 | ait aiu aiv aiw aix aiy aiz aja ajb ajc ajd aje ajf ajg ajh aji ajj ajk ajl 195 | ajm ajn ajo ajp ajq ajr ajs ajt aju ajv ajw ajx ajy ajz aka akb akc akd ake 196 | akf akg akh aki akj akk akl akm akn ako akp akq akr aks akt aku akv akw akx 197 | aky akz ala alb alc ald ale alf alg alh ali alj alk all alm aln alo alp alq 198 | alr als alt alu alv alw alx aly alz ama amb amc amd ame amf amg amh ami amj 199 | amk aml amm amn amo amp amq amr ams amt amu amv amw amx amy amz ana anb anc 200 | and ane anf ang anh ani anj ank anl anm ann ano anp anq anr ans ant anu anv 201 | anw anx any anz aoa aob aoc aod aoe aof aog aoh aoi aoj aok aol aom aon aoo 202 | aop aoq aor aos aot aou aov aow aox aoy aoz apa apb apc apd ape apf apg aph 203 | api apj apk apl apm apn apo app apq apr aps apt apu apv apw apx apy apz aqa 204 | aqb aqc aqd aqe aqf aqg aqh aqi aqj aqk aql aqm aqn aqo aqp aqq aqr aqs aqt 205 | aqu aqv aqw aqx aqy aqz ara arb arc ard are arf arg arh ari arj ark arl arm 206 | arn aro arp arq arr ars art aru arv arw arx ary arz asa asb asc asd ase asf 207 | asg ash asi asj ask asl asm asn aso asp asq asr ass ast asu asv asw asx asy 208 | asz ata atb atc atd ate atf atg ath ati atj atk atl atm atn ato atp atq atr 209 | ats att atu atv atw atx aty atz aua aub auc aud aue auf aug auh aui auj auk 210 | aul aum aun auo aup auq aur aus aut auu auv auw aux auy auz ava avb avc avd 211 | ave avf avg avh avi avj avk avl avm avn avo avp avq avr avs avt avu avv avw 212 | avx avy avz awa awb awc awd awe awf awg awh awi awj awk awl awm awn awo awp 213 | awq awr aws awt awu awv aww awx awy awz axa axb axc axd axe axf axg axh axi 214 | axj axk axl axm axn axo axp axq axr axs axt axu axv axw axx axy axz aya ayb 215 | ayc ayd aye ayf ayg ayh ayi ayj ayk ayl aym ayn ayo ayp ayq ayr ays ayt ayu 216 | ayv ayw ayx ayy ayz aza azb azc azd aze azf azg azh azi azj azk azl azm azn 217 | azo azp azq azr azs azt azu azv azw azx azy azz

""" 218 | ) 219 | pars.close() 220 | 221 | 222 | class TestBulletedLists: 223 | """Replace numbering format with bullet (--) when format cannot be determined""" 224 | 225 | def test_bulleted_lists(self) -> None: 226 | pars = docx2python(RESOURCES / "created-in-pages-bulleted-lists.docx") 227 | assert pars.text == ( 228 | "\n\nThis is a document for testing docx2python module.\n\n\n\n--\tWhy " 229 | "did the chicken cross the road?\n\n\t--\tJust because\n\n\t--\tDon't " 230 | "know\n\n\t--\tTo get to the other side\n\n--\tWhat's the meaning of life, " 231 | "universe and everything?\n\n\t--\t42\n\n\t--\t0\n\n\t--\t-1\n\n" 232 | ) 233 | pars.close() 234 | -------------------------------------------------------------------------------- /tests/test_numbering_formats.py: -------------------------------------------------------------------------------- 1 | """Test functions in docx2python.numbering_formats.py 2 | 3 | :author: Shay Hill 4 | :created: 6/26/2019 5 | """ 6 | 7 | from random import randint 8 | 9 | import pytest 10 | 11 | from docx2python.numbering_formats import ( 12 | bullet, 13 | decimal, 14 | lower_letter, 15 | lower_roman, 16 | upper_letter, 17 | upper_roman, 18 | ) 19 | from tests.helpers.utils import ARABIC_2_ROMAN 20 | 21 | 22 | class TestLowerLetter: 23 | """Test numbering_formats.lower_letter""" 24 | 25 | def test_convert_positive_int(self) -> None: 26 | """Convert a positive integer to a string of letters""" 27 | assert lower_letter(1) == "a" 28 | assert lower_letter(26) == "z" 29 | assert lower_letter(27) == "aa" 30 | 31 | def test_zero(self) -> None: 32 | """Raise a value error for < 1""" 33 | with pytest.raises(ValueError) as msg: 34 | _ = lower_letter(0) 35 | assert "0 and <1 are not defined" in str(msg.value) 36 | 37 | def test_neg(self) -> None: 38 | """Raise a value error for < 1""" 39 | with pytest.raises(ValueError) as msg: 40 | _ = lower_letter(-1) 41 | assert "0 and <1 are not defined" in str(msg.value) 42 | 43 | 44 | def test_upper_letter() -> None: 45 | """Same as lower_letter, but upper""" 46 | for _ in range(100): 47 | n = randint(1, 10000) 48 | assert upper_letter(n) == lower_letter(n).upper() 49 | 50 | 51 | class TestLowerRoman: 52 | """Test numbering_formats.lower_roman""" 53 | 54 | def test_convert_positive_int(self) -> None: 55 | """Convert a positive integer to a string of letters""" 56 | for arabic, roman in ARABIC_2_ROMAN.items(): 57 | assert lower_roman(arabic) == roman 58 | 59 | def test_zero(self) -> None: 60 | """Raise a value error for < 1""" 61 | with pytest.raises(ValueError) as msg: 62 | _ = lower_roman(0) 63 | assert "Roman" in str(msg.value) 64 | 65 | def test_neg(self) -> None: 66 | """Raise a value error for < 1""" 67 | with pytest.raises(ValueError) as msg: 68 | _ = lower_roman(-1) 69 | assert "Roman" in str(msg.value) 70 | 71 | 72 | def test_upper_roman() -> None: 73 | """Same as lower_roman, but upper""" 74 | for _ in range(100): 75 | n = randint(1, 10000) 76 | assert upper_roman(n) == lower_roman(n).upper() 77 | 78 | 79 | def test_decimal() -> None: 80 | """Return string representation of input""" 81 | for i in range(10): 82 | assert decimal(i) == str(i) 83 | 84 | 85 | def test_bullet() -> None: 86 | """Return same string for every input.""" 87 | for i in range(10): 88 | assert bullet(i) == bullet(i * 10) 89 | -------------------------------------------------------------------------------- /tests/test_par_styles.py: -------------------------------------------------------------------------------- 1 | """Par styles converted to flags 2 | 3 | :author: Shay Hill 4 | :created: 3/18/2021 5 | 6 | """ 7 | 8 | from docx2python.iterators import iter_at_depth 9 | from docx2python.main import docx2python 10 | from tests.conftest import RESOURCES 11 | 12 | 13 | class TestParStyles: 14 | def test_par_styles(self) -> None: 15 | """ 16 | If do_html, paragraphs style is the first element of every paragraph 17 | 18 | If no paragraph style, empty string is first element of evert paragraph 19 | 20 | :return: 21 | """ 22 | with docx2python(RESOURCES / "example.docx") as extraction: 23 | document_pars = extraction.document_pars 24 | styled = [(p.style, p.run_strings) for p in iter_at_depth(document_pars, 4)] 25 | styled = [x for x in styled if x[1]] 26 | expect = [ 27 | ( 28 | "Header", 29 | [ 30 | "Header text", 31 | "----Image alt text---->A close up of a logo\n\n" 32 | + "Description automatically generated<", 33 | "----media/image1.png----", 34 | ], 35 | ), 36 | ("ListParagraph", ["I)\t", "expect I"]), 37 | ("ListParagraph", ["\tA)\t", "expect A"]), 38 | ("ListParagraph", ["\tB)\t", "expect B"]), 39 | ("ListParagraph", ["\t\t1)\t", "expect 1"]), 40 | ("ListParagraph", ["\t\t\ta)\t", "expect a"]), 41 | ("ListParagraph", ["\t\t\tb)\t", "expect b"]), 42 | ("ListParagraph", ["\t\t\t\t1)\t", "expect 1"]), 43 | ("ListParagraph", ["\t\t\t\t\ta)\t", "expect a"]), 44 | ("ListParagraph", ["\t\t\t\t\t\ti)\t", "expect i"]), 45 | ("ListParagraph", ["\t\t\t\t\t\tii)\t", "expect ii"]), 46 | ("ListParagraph", ["II)\t", "This should be II"]), 47 | ("ListParagraph", ["\tA)\t", "This should be A), not C)"]), 48 | ("ListParagraph", ["--\t", "bullet no indent"]), 49 | ("ListParagraph", ["\t--\t", "bullet indent 1"]), 50 | ("ListParagraph", ["\t\t--\t", "bullet indent 2"]), 51 | ("", ["Bold"]), 52 | ("", ["Italics"]), 53 | ("", ["Underlined"]), 54 | ("", ["Large Font"]), 55 | ("", ["Colored"]), 56 | ("", ["Large Colored"]), 57 | ("", ["Large Bold"]), 58 | ("", ["Large Bold Italics Underlined"]), 59 | ("", ["Nested"]), 60 | ("", ["Table"]), 61 | ("", ["A"]), 62 | ("", ["B"]), 63 | ("", ["Tab", "\t", "delimited", "\t", "text"]), 64 | ("", ["10 < 20 and 20 > 10"]), 65 | ("", ["Text outside table"]), 66 | ("", ["Reference footnote 1", "----footnote1----"]), 67 | ("", ["Reference footnote 2", "----footnote2----"]), 68 | ("", ["Reference endnote 1", "----endnote1----"]), 69 | ("", ["Reference endnote 2", "----endnote2----"]), 70 | ("Heading1", ["Heading 1"]), 71 | ("Heading2", ["Heading 2"]), 72 | ( 73 | "", 74 | [ 75 | "----Image alt text---->A jellyfish in water\n\n" 76 | + "Description automatically generated<", 77 | "----media/image2.jpg----", 78 | ], 79 | ), 80 | ( 81 | "Footer", 82 | [ 83 | "Footer text", 84 | "----Image alt text---->A close up of a logo\n\n" 85 | + "Description automatically generated<", 86 | "----media/image1.png----", 87 | ], 88 | ), 89 | ("FootnoteText", ["footnote1)\t", " First footnote"]), 90 | ( 91 | "FootnoteText", 92 | [ 93 | "footnote2)\t", 94 | " Second footnote", 95 | "----Image alt text---->A close up of a logo\n\n" 96 | + "Description automatically generated<", 97 | "----media/image1.png----", 98 | ], 99 | ), 100 | ("EndnoteText", ["endnote1)\t", " First endnote"]), 101 | ( 102 | "EndnoteText", 103 | [ 104 | "endnote2)\t", 105 | " Second endnote", 106 | "----Image alt text---->A close up of a logo\n\n" 107 | + "Description automatically generated<", 108 | "----media/image1.png----", 109 | ], 110 | ), 111 | ] 112 | assert styled == expect 113 | -------------------------------------------------------------------------------- /tests/test_pict.py: -------------------------------------------------------------------------------- 1 | """Test functionality with pict elements. 2 | 3 | :author: Shay Hill 4 | :created: 1/29/2020 5 | 6 | Such file was sent to me by stefan-hock20 on github. Images are referenced in 7 | document.html as 8 | 9 | ``` 10 | 11 | 13 | 14 | 15 | ``` 16 | 17 | docx2text 1.19 would get the image, but not mark the image location in the output text. 18 | """ 19 | 20 | from docx2python.main import docx2python 21 | from tests.conftest import RESOURCES 22 | 23 | 24 | class TestPictElement: 25 | def test_extraction(self) -> None: 26 | """Image placeholder inserted into extracted text.""" 27 | extraction = docx2python(RESOURCES / "has_pict.docx") 28 | assert "image1.png" in extraction.images 29 | assert "----media/image1.png----" in extraction.text 30 | extraction.close() 31 | 32 | 33 | class TestPictWithAltText: 34 | def test_extraction(self) -> None: 35 | """Image placeholder inserted into extracted text.""" 36 | extraction = docx2python(RESOURCES / "pic_alt_text.docx") 37 | text = extraction.text 38 | assert "Alt description" in text 39 | extraction.close() 40 | -------------------------------------------------------------------------------- /tests/test_run_styles.py: -------------------------------------------------------------------------------- 1 | """Run styles converted to html 2 | 3 | :author: Shay Hill 4 | :created: 3/18/2021 5 | 6 | 16 point 7 | Red 8 | 9 | Courier new 10 | Italic 11 | Bold 12 | Underline 13 | Strikethrough 14 | Double Strikethrough 15 | Superscript 16 | Subscript 17 | Small Caps 18 | All Caps 19 | Highlighted yellow 20 | Highlighted green 21 | 22 | italic 23 | bold 24 | underline 25 | strike 26 | double strike 27 | superscript 28 | subscript 29 | small caps 30 | all caps 31 | highlighted yellow 32 | highlighted green 33 | """ 34 | 35 | from docx2python.main import docx2python 36 | from tests.conftest import RESOURCES 37 | 38 | 39 | class TestParStyles: 40 | def test_par_styles(self) -> None: 41 | """ 42 | If do_html, paragraphs style is the first element of every run 43 | 44 | :return: 45 | """ 46 | content = docx2python(RESOURCES / "run_styles.docx", html=True) 47 | assert content.document_runs == [ 48 | [ 49 | [ 50 | [ 51 | ["Normal"], 52 | ['16 point'], 53 | ['Red'], 54 | ["Courier new"], 55 | ["Italic"], 56 | ["Bold"], 57 | ["Underline"], 58 | ["Strikethrough"], 59 | ["Double Strikethough"], 60 | ["Superscript"], 61 | ["Subscript"], 62 | ['Small Caps'], 63 | ['All Caps'], 64 | [ 65 | '' 66 | + "Highlighted yellow" 67 | ], 68 | [ 69 | '' 70 | + "Highlighted green" 71 | ], 72 | ["Consecutive"], 73 | ["Bold"], 74 | ["Paragraphs"], 75 | [ 76 | 'Subscript ', 77 | '' 78 | + "Small Caps ", 79 | '' 80 | + "All Caps ", 81 | '' 82 | + "Highlighted yellow ", 83 | '' 84 | + "Highlighted green", 85 | ], 86 | [], 87 | [], 88 | ] 89 | ] 90 | ] 91 | ] 92 | content.close() 93 | -------------------------------------------------------------------------------- /tests/test_slanted_quotes.py: -------------------------------------------------------------------------------- 1 | """Test that Word's tilted quotes and double quotes extract Docx2Python.""" 2 | 3 | from docx2python.main import docx2python 4 | from tests.conftest import RESOURCES 5 | 6 | 7 | class TestTiltedQuotes: 8 | """Confirming this works with v1.25""" 9 | 10 | def test_exact_representation(self) -> None: 11 | """Most characters are represented exactly""" 12 | with docx2python(RESOURCES / "slanted_quotes.docx") as pars: 13 | assert pars.text == "“double quote”\n\n‘single quote’\n\nApostrophe’s" 14 | -------------------------------------------------------------------------------- /tests/test_soft_line_breaks.py: -------------------------------------------------------------------------------- 1 | """Start a new paragraph at a soft line break ```` 2 | 3 | :author: Shay Hill 4 | :created: 7/7/2021 5 | 6 | Docx2Python previously ignored elements: 7 | 8 | ``` 9 | pars = docx2python('soft_line_breaks.docx') 10 | [[[[['Line1Line2Line3'], ['Line4'], []]], [[[]]]], [[[[]]]]] 11 | ``` 12 | """ 13 | 14 | from docx2python import docx2python 15 | from docx2python.iterators import iter_paragraphs 16 | from tests.conftest import RESOURCES 17 | 18 | 19 | class TestSoftLineBreaks: 20 | def test_separate_pars(self): 21 | """ 22 | Start a new paragraph when a element is found. 23 | """ 24 | with docx2python(RESOURCES / "soft_line_breaks.docx") as content: 25 | body = content.body 26 | pars = [x for x in iter_paragraphs(body) if x] 27 | assert pars == ["Line1\nLine2\nLine3", "Line4"] 28 | -------------------------------------------------------------------------------- /tests/test_strict.py: -------------------------------------------------------------------------------- 1 | """A simple test for docx files saved with the strict menu option. 2 | 3 | :author: Shay Hill 4 | :created: 2024-07-02 5 | """ 6 | 7 | from docx2python.main import docx2python 8 | from tests.conftest import RESOURCES 9 | 10 | 11 | class TestParagraphsOnly: 12 | """Confirming this works with v1.25""" 13 | 14 | def test_paragraphs_only(self) -> None: 15 | """Run without issue""" 16 | pars = docx2python(RESOURCES / "strict.docx") 17 | assert pars.document == [ 18 | [[["--\tBullet1", "--\tBullet2", "1)\tNumber1", "2)\tNumber2"]]], 19 | [[["Cellaa"], ["Cellab"]], [["Cellba"], ["Cellbb"]]], 20 | [[[""]]], 21 | ] 22 | -------------------------------------------------------------------------------- /tests/test_symbols.py: -------------------------------------------------------------------------------- 1 | """Test symbol extraction. 2 | 3 | :author: Shay Hill 4 | :created: 11/2/2021 5 | 6 | Symbols are captured in the docx content files as ```` elements. 7 | 8 | ``` 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | ``` 19 | """ 20 | 21 | from docx2python.main import docx2python 22 | from tests.conftest import RESOURCES 23 | 24 | 25 | def test_symbols() -> None: 26 | """Export symbols as span elements.""" 27 | with docx2python(RESOURCES / "symbols.docx") as pars: 28 | assert pars.text == ( 29 | "h" 30 | "≠" 31 | "ð" 32 | "∞×÷≥≤±™®©¥£€µαβπΩ∑" 33 | "J" 34 | "K" 35 | "" 36 | "æ" 37 | "Ý" 38 | ) 39 | 40 | 41 | def test_symbols_with_html_true() -> None: 42 | """Export symbols as span elements.""" 43 | with docx2python(RESOURCES / "symbols.docx", html=True) as pars: 44 | assert pars.text == ( 45 | "h" 46 | "≠" 47 | "ð" 48 | "∞×÷≥≤±™®©¥£€µαβπΩ∑" 49 | "J" 50 | "K" 51 | "" 52 | "æ" 53 | "Ý" 54 | ) 55 | -------------------------------------------------------------------------------- /tests/test_tables_to_markdown.py: -------------------------------------------------------------------------------- 1 | """Test converting tables to markdown. 2 | 3 | This is more of an example that an actual test, because I've had multiple requests 4 | for tables as markdown. The new features in docx2python v3 make this straightforward. 5 | 6 | :author: Shay Hill 7 | :created: 2024-07-14 8 | """ 9 | 10 | from __future__ import annotations 11 | 12 | from conftest import RESOURCES 13 | 14 | from docx2python import docx2python 15 | from docx2python.depth_collector import Par 16 | from docx2python.iterators import is_tbl, iter_at_depth, iter_tables 17 | 18 | 19 | def _print_tc(cell: list[Par]) -> str: 20 | """Print a table cell as a string on one line.""" 21 | ps = ["".join(p.run_strings).replace("\n", " ") for p in cell] 22 | return "\n\n".join(ps) 23 | 24 | 25 | def _join_and_enclose_with_pipes(strings: list[str]) -> str: 26 | """Join strings with pipes and enclose with pipes.""" 27 | return "|" + "|".join(strings) + "|" 28 | 29 | 30 | def _print_text(tbl: list[list[list[Par]]]) -> str: 31 | """Text in this list [[[Par]]] is not a table. It's just text.""" 32 | all_cells = iter_at_depth(tbl, 2) 33 | return "\n\n".join(_print_tc(tc) for tc in all_cells) 34 | 35 | 36 | def _print_tbl(tbl: list[list[list[Par]]]) -> str: 37 | """Text in this list [[[Par]]] is a table.""" 38 | rows_as_string_lists = [[_print_tc(tc) for tc in tr] for tr in tbl] 39 | rows_as_string_lists.insert(1, ["---"] * len(rows_as_string_lists[0])) 40 | rows_as_strings = [ 41 | _join_and_enclose_with_pipes(row) for row in rows_as_string_lists 42 | ] 43 | return "\n".join(rows_as_strings) 44 | 45 | 46 | EXPECT = """This document has paragraphs. 47 | 48 | |This|Document| 49 | |---|---| 50 | |Also|Has| 51 | |Tables|| 52 | 53 | There are paragraphs between tables. These are used to check the .lineage attribute of Par instances. 54 | 55 | Here is another paragraph between the first and second tables. 56 | 57 | |One More Table| 58 | |---| 59 | |One| 60 | |More| 61 | |Table| 62 | 63 | """ 64 | 65 | 66 | def test_tables_to_markdown() -> None: 67 | with docx2python(RESOURCES / "paragraphs_and_tables.docx") as extraction: 68 | tables = extraction.document_pars 69 | 70 | as_text: list[str] = [] 71 | 72 | for possible_table in iter_tables(tables): 73 | if is_tbl(possible_table): 74 | as_text.append(_print_tbl(possible_table)) 75 | else: 76 | as_text.append(_print_text(possible_table)) 77 | 78 | assert "\n\n".join(as_text) == EXPECT 79 | -------------------------------------------------------------------------------- /tests/test_text_runs.py: -------------------------------------------------------------------------------- 1 | """Test functions in docx2python.text_runs.py 2 | 3 | :author: Shay Hill 4 | :created: 7/4/2019 5 | """ 6 | 7 | from lxml import etree 8 | 9 | from docx2python.attribute_register import XML2HTML_FORMATTER 10 | from docx2python.text_runs import gather_Pr, get_run_formatting, html_close, html_open 11 | from tests.helpers.utils import valid_xml 12 | 13 | ONE_TEXT_RUN = valid_xml( 14 | '' 15 | + "" 16 | + '' 17 | + "" 18 | + "" 19 | + "" 20 | + '' 21 | + '' 22 | + '' 23 | + '' 24 | + "" 25 | + "text styled with rPr" 26 | + "" 27 | + "" 28 | ) 29 | 30 | NO_STYLE_RUN = valid_xml( 31 | '' + "no styles applies" + "" + "" 32 | ) 33 | 34 | 35 | class TestGatherRpr: 36 | """Test text_runs.gather_rPr""" 37 | 38 | def test_get_styles(self): 39 | """Map styles to values.""" 40 | document = etree.fromstring(ONE_TEXT_RUN) 41 | assert gather_Pr(document[0][0][0]) == { 42 | "rFonts": None, 43 | "b": None, 44 | "u": "single", 45 | "i": None, 46 | "sz": "32", 47 | "color": "red", 48 | "szCs": "32", 49 | } 50 | 51 | def test_no_styles(self): 52 | """Return empty dict when no rPr for text run.""" 53 | document = etree.fromstring(NO_STYLE_RUN) 54 | assert gather_Pr(document[0][0][0]) == {} 55 | 56 | 57 | class TestGetRunStyle: 58 | """Test text_runs.get_run_style""" 59 | 60 | def test_font_and_others(self) -> None: 61 | """Return font first, then other styles.""" 62 | document = etree.fromstring(ONE_TEXT_RUN) 63 | assert get_run_formatting(document[0][0][0], XML2HTML_FORMATTER) == [ 64 | 'span style="color:red;font-size:32pt"', 65 | "b", 66 | "i", 67 | "u", 68 | ] 69 | 70 | 71 | class TestStyleStrings: 72 | """Test text_runs.style_open and text_runs.style_close""" 73 | 74 | def test_style_open(self) -> None: 75 | """Produce valid html for all defined styles.""" 76 | style = ['span style="color:red"', "b", "i", "u"] 77 | assert html_open(style) == '' 78 | 79 | def test_style_close(self) -> None: 80 | """Produce valid html for all defined styles.""" 81 | style = ['span style="color:red"', "b", "i", "u"] 82 | assert html_close(style) == "" 83 | -------------------------------------------------------------------------------- /tests/test_toc_support.py: -------------------------------------------------------------------------------- 1 | """Testing Table of Contents support as requested by user leboni 2 | 3 | :author: Shay Hill 4 | :created: 8/19/2020 5 | 6 | User leboni forwarded a docx file, `zen_of_python.docx` with Table of Contents. 7 | Addressing issue 8 | 9 | `KeyError: '{http://schemas.openxmlformats.org/officeDocument/2006/relationships}id'` 10 | 11 | When attempting to extract content from such documents. 12 | 13 | Two types of links in docx files. Internal links look like actual hyperlinks without 14 | an href. 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | Beautiful is better than ugly. 23 | 24 | 25 | """ 26 | 27 | from paragraphs import par 28 | 29 | from docx2python.main import docx2python 30 | from tests.conftest import RESOURCES 31 | 32 | 33 | class TestTocText: 34 | def test_get_toc_text(self) -> None: 35 | """Extract header text from table-of-contents header.""" 36 | extraction = docx2python(RESOURCES / "zen_of_python.docx") 37 | assert extraction.document_runs == [ 38 | [ 39 | [[["Contents"], ["\t", "Beautiful is better than ugly.\t1"], []]], 40 | [ 41 | [ 42 | [], 43 | [], 44 | ["Beautiful is better than ugly."], 45 | ["Explicit is better than implicit."], 46 | ["Simple is better than complex."], 47 | ["Complex is better than complicated."], 48 | ["Flat is better than nested."], 49 | ["Sparse is better than dense."], 50 | ["Readability counts."], 51 | ["Special cases aren't special enough to break the rules."], 52 | ["Although practicality beats purity."], 53 | ["Errors should never pass silently."], 54 | ["Unless explicitly silenced."], 55 | ["In the face of ambiguity, refuse the temptation to guess."], 56 | [ 57 | par( 58 | """There should be one-- and preferably only one 59 | --obvious way to do it.""" 60 | ) 61 | ], 62 | [ 63 | par( 64 | """Although that way may not be obvious at first 65 | unless you're Dutch.""" 66 | ) 67 | ], 68 | ["Now is better than never."], 69 | ["Although never is often better than *right* now."], 70 | ["If the implementation is hard to explain, it's a bad idea."], 71 | [ 72 | par( 73 | """If the implementation is easy to explain, it may 74 | be a good idea.""" 75 | ) 76 | ], 77 | [ 78 | par( 79 | """Namespaces are one honking great idea -- let's do 80 | more of those!""" 81 | ) 82 | ], 83 | ] 84 | ], 85 | ] 86 | ] 87 | extraction.close() 88 | -------------------------------------------------------------------------------- /tests/test_utilities.py: -------------------------------------------------------------------------------- 1 | """DocxReader object is able to open a docx file, search and replace text, then save. 2 | 3 | :author: Shay Hill 4 | :created: 2021-12-20 5 | """ 6 | 7 | import os 8 | import tempfile 9 | 10 | from docx2python.main import docx2python 11 | from docx2python.utilities import get_headings, get_links, replace_docx_text 12 | from tests.conftest import RESOURCES 13 | 14 | 15 | class TestSearchReplace: 16 | def test_search_and_replace(self) -> None: 17 | """Apples -> Pears, Pears -> Apples 18 | 19 | Ignore html differences when html is False""" 20 | 21 | # assert test file is in default state 22 | html = False 23 | input_filename = RESOURCES / "apples_and_pears.docx" 24 | expect = ( 25 | "Apples and Pears\n\nPears and Apples\n\n" 26 | "Apples and Pears\n\nPears and Apples" 27 | ) 28 | with docx2python(input_filename, html=html) as input_doc: 29 | result = input_doc.text 30 | assert result == expect 31 | 32 | # attempt a search and replace 33 | with tempfile.TemporaryDirectory() as temp_dir: 34 | output_filename = os.path.join(temp_dir, "pears_and_apples.docx") 35 | replace_docx_text( 36 | input_filename, 37 | output_filename, 38 | ("Apples", "Bananas"), 39 | ("Pears", "Apples"), 40 | ("Bananas", "Pears"), 41 | html=html, 42 | ) 43 | expect = ( 44 | "Pears and Apples\n\nApples and Pears\n\n" 45 | "Pears and Apples\n\nApples and Pears" 46 | ) 47 | with docx2python(output_filename, html=html) as output_doc: 48 | result = output_doc.text 49 | 50 | assert result == expect 51 | 52 | def test_ampersand(self) -> None: 53 | """Apples -> Pears, Pears -> Apples 54 | 55 | Replace text with an ampersand""" 56 | html = False 57 | input_filename = RESOURCES / "apples_and_pears.docx" 58 | 59 | with tempfile.TemporaryDirectory() as temp_dir: 60 | output_filename = os.path.join(temp_dir, "pears_and_apples.docx") 61 | replace_docx_text( 62 | input_filename, 63 | output_filename, 64 | ("Apples", "Apples & Pears <>"), 65 | html=html, 66 | ) 67 | with docx2python(output_filename, html=html) as output_doc: 68 | assert output_doc.text == ( 69 | "Apples & Pears <> and Pears\n\nPears and Apples & Pears <>\n\n" 70 | "Apples & Pears <> and Pears\n\nPears and Apples & Pears <>" 71 | ) 72 | 73 | def test_search_and_replace_html(self) -> None: 74 | """Apples -> Pears, Pears -> Apples 75 | 76 | Exchange strings when formatting is consistent across the string. Leave 77 | alone otherwise. 78 | """ 79 | html = True 80 | input_filename = RESOURCES / "apples_and_pears.docx" 81 | 82 | with tempfile.TemporaryDirectory() as temp_dir: 83 | output_filename = os.path.join(temp_dir, "pears_and_apples.docx") 84 | replace_docx_text( 85 | input_filename, 86 | output_filename, 87 | ("Apples", "Bananas"), 88 | ("Pears", "Apples"), 89 | ("Bananas", "Pears"), 90 | html=html, 91 | ) 92 | with docx2python(output_filename, html=html) as output_doc: 93 | assert output_doc.text == ( 94 | "Pears and Apples\n\n" 95 | "Apples and Pears\n\n" 96 | 'Pears and Apples\n\n' 97 | "Pears and Pears" 98 | ) 99 | 100 | def test_search_and_replace_with_linebreaks(self) -> None: 101 | """Apples -> Pears, Pears -> Apples 102 | 103 | Exchange strings when replacement has linebreaks. 104 | """ 105 | html = True 106 | input_filename = RESOURCES / "apples_and_pears.docx" 107 | with tempfile.TemporaryDirectory() as temp_dir: 108 | output_filename = os.path.join(temp_dir, "pears_and_apples.docx") 109 | replace_docx_text( 110 | input_filename, 111 | output_filename, 112 | ("Apples", "Bananas"), 113 | ("Pears", "Apples\nPears\nGrapes"), 114 | ("Bananas", "Pears"), 115 | html=html, 116 | ) 117 | with docx2python(output_filename, html=html) as output_doc: 118 | assert output_doc.text == ( 119 | "Pears and Apples\nPears\nGrapes\n\n" 120 | "Apples\nPears\nGrapes and Pears\n\n" 121 | 'Pears and ' 122 | "Apples\nPears\nGrapes\n\n" 123 | "Pears and Pears" 124 | ) 125 | 126 | 127 | def test_get_links() -> None: 128 | """Return links as tuples""" 129 | assert [x for x in get_links(RESOURCES / "merged_links.docx")] == [ 130 | ("https://www.shayallenhill.com", "hy"), 131 | ("https://www.shayallenhill.com", "per"), 132 | ("https://www.shayallenhill.com", "link"), 133 | ("https://www.shayallenhill.com", "hyperlink"), 134 | ] 135 | 136 | 137 | def test_get_headings() -> None: 138 | """Return all headings (paragraphs with heading style) in document""" 139 | assert [x for x in get_headings(RESOURCES / "example.docx")] == [ 140 | ["

", "Heading 1", "

"], 141 | ["

", "Heading 2", "

"], 142 | ] 143 | --------------------------------------------------------------------------------