├── .github
    └── workflows
    │   └── pypi-project.yml
├── .gitignore
├── .pre-commit-config.yaml
├── CHANGELOG.md
├── LICENSE.txt
├── README.md
├── README_DOCX_FILE_STRUCTURE.md
├── docx2python
    ├── __init__.py
    ├── attribute_register.py
    ├── bullets_and_numbering.py
    ├── depth_collector.py
    ├── docx_context.py
    ├── docx_output.py
    ├── docx_reader.py
    ├── docx_text.py
    ├── forms.py
    ├── iterators.py
    ├── main.py
    ├── merge_runs.py
    ├── namespace.py
    ├── numbering_formats.py
    ├── py.typed
    ├── text_runs.py
    └── utilities.py
├── pyproject.toml
└── tests
    ├── __init__.py
    ├── conftest.py
    ├── do_not_test_missing_imagedata_rid.py
    ├── do_not_test_problem_files.py
    ├── helpers
        └── utils.py
    ├── resources
        ├── 240-DOP-1013A Lay Down Tubulars.docx
        ├── ControlTest.docx
        ├── apples_and_pears.docx
        ├── ascii_printable.docx
        ├── basic.docx
        ├── check_drop_my.docx
        ├── checked-true-false.docx
        ├── checked_boxes.docx
        ├── checked_drop1.docx
        ├── comments.docx
        ├── created-in-pages-bulleted-lists.docx
        ├── created-in-pages-paragraphs-only.docx
        ├── equations.docx
        ├── example.docx
        ├── example_numbering.docx
        ├── has_pict.docx
        ├── hyperlink.docx
        ├── imagedata_without_rid.docx
        ├── invalid_tag_name.docx
        ├── libreoffice_conversion.docx
        ├── list_index_a.docx
        ├── long_hyperlink.docx
        ├── merged_cells.docx
        ├── merged_links.docx
        ├── multiple_runs_per_paragraph.docx
        ├── nested_paragraphs.docx
        ├── nested_paragraphs_in_header.docx
        ├── nested_paragraphs_in_header3b.docx
        ├── paragraphs_and_tables.docx
        ├── pic_alt_text.docx
        ├── renamed_document_xml.docx
        ├── run_styles.docx
        ├── slanted_quotes.docx
        ├── soft_line_breaks.docx
        ├── strict.docx
        ├── symbols.docx
        ├── test-docx2python-conversion-google_docs.docx
        ├── test_file_with_comments.docx
        ├── unchecked_drop0.docx
        └── zen_of_python.docx
    ├── test_ascii_printable.py
    ├── test_check_drop.py
    ├── test_checked_boxes.py
    ├── test_close.py
    ├── test_comments.py
    ├── test_content_control_block_properties.py
    ├── test_created_in_pages.py
    ├── test_document2_xml.py
    ├── test_docx2python.py
    ├── test_docx_context.py
    ├── test_docx_output.py
    ├── test_dropdown_selector_in_table.py
    ├── test_equations.py
    ├── test_file_object.py
    ├── test_from_bytes.py
    ├── test_get_text.py
    ├── test_google_docs.py
    ├── test_hyperlinks.py
    ├── test_import.py
    ├── test_invalid_tag_name.py
    ├── test_iterators.py
    ├── test_libreoffice_conversion.py
    ├── test_lineage.py
    ├── test_linebreak_replace_text.py
    ├── test_list_position.py
    ├── test_long_hyperlink.py
    ├── test_merge_runs.py
    ├── test_merged_cells.py
    ├── test_more_html.py
    ├── test_numbering_formats.py
    ├── test_par_styles.py
    ├── test_pict.py
    ├── test_run_styles.py
    ├── test_slanted_quotes.py
    ├── test_soft_line_breaks.py
    ├── test_strict.py
    ├── test_symbols.py
    ├── test_tables_to_markdown.py
    ├── test_text_runs.py
    ├── test_toc_support.py
    └── test_utilities.py


/.github/workflows/pypi-project.yml:
--------------------------------------------------------------------------------
  1 | # Run tests then upload to Pypi on version bumps.
  2 | # Run tests on each push.
  3 | # Try to bump version
  4 | # If version is bumped, upload to pypi or test.pypi depending on branch name.
  5 | 
  6 | name: pypi project
  7 | 
  8 | on:
  9 |   push:
 10 |     branches: [dev, master]
 11 |   pull_request:
 12 |     branches: [master]
 13 | 
 14 | jobs:
 15 |   tests:
 16 |     runs-on: ${{ matrix.os }}  # one job per matrix OS; a fixed runner leaves matrix.os unused
 17 |     strategy:
 18 |       fail-fast: false
 19 |       matrix:
 20 |         python-version: ["3.9", "3.10", "3.11", "3.12", "3.13"]
 21 |         os: [ubuntu-latest, macos-latest, windows-latest]
 22 |     # if: startsWith(github.event.head_commit.message, 'bump:') == false
 23 |     steps:
 24 |     - uses: actions/checkout@v4
 25 |     - name: Set up Python ${{ matrix.python-version }}
 26 |       uses: actions/setup-python@v5
 27 |       with:
 28 |         python-version: ${{ matrix.python-version }}
 29 |     - name: Install dependencies
 30 |       shell: bash  # the `[ -f ... ]` test below is POSIX syntax; Windows runners default to pwsh
 31 |       run: |
 32 |         python -m pip install --upgrade pip
 33 |         if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
 34 |         python -m pip install pytest
 35 |         python -m pip install commitizen
 36 |         python -m pip install .
 37 | 
 38 |     - name: Test with pytest
 39 |       run: pytest
 40 | 
 41 |   # # If the tests pass, try to bump the version number. If no bump is warranted,
 42 |   # # pass silently.
 43 |   # bump_version:
 44 |   #   runs-on: ubuntu-latest
 45 |   #   name: "Bump version and create changelog with commitizen"
 46 |   #   continue-on-error: false
 47 |   #   needs: [tests]
 48 |   #   if: github.ref == 'refs/heads/dev'
 49 |   #   steps:
 50 |   #     - name: Check out
 51 |   #       uses: actions/checkout@v4
 52 |   #       with:
 53 |   #         fetch-depth: 0
 54 |   #         token: "${{ secrets.COMMITIZEN_BUMP }}"
 55 |   #     - id: cz
 56 |   #       name: Create bump and changelog
 57 |   #       uses: commitizen-tools/commitizen-action@master
 58 |   #       with:
 59 |   #         github_token: ${{ secrets.COMMITIZEN_BUMP }}
 60 |   #     - name: Print Version
 61 |   #       run: echo "Bumped to version ${{ steps.cz.outputs.version }}"
 62 | 
 63 |   # Deploy on test.pypi when branch is dev and commit message starts with 'bump'
 64 |   deploy-on-testpypi:
 65 |     runs-on: ubuntu-latest
 66 |     continue-on-error: true
 67 |     needs: [tests]
 68 |     if: github.ref_name == 'dev' && startsWith(github.event.head_commit.message, 'bump:')
 69 |     steps:
 70 |       - uses: actions/checkout@v4
 71 |       - name: Set up Python
 72 |         uses: actions/setup-python@v5
 73 |         with:
 74 |           python-version: '3.x'
 75 |       - name: Install dependencies
 76 |         run: |
 77 |           python -m pip install --upgrade pip
 78 |           pip install build
 79 |       - name: Build package
 80 |         run: python -m build
 81 |       - name: Publish package
 82 |         uses: pypa/gh-action-pypi-publish@27b31702a0e7fc50959f5ad993c78deac1bdfc29
 83 |         with:
 84 |           repository_url: https://test.pypi.org/legacy/
 85 |           user: __token__
 86 |           password: ${{ secrets.TEST_PYPI_API_TOKEN }}
 87 | 
 88 |   # Deploy on pypi when branch is master and commit message starts with 'bump'
 89 |   deploy-on-pypi:
 90 |     runs-on: ubuntu-latest
 91 |     continue-on-error: true
 92 |     needs: [tests]
 93 |     if: github.ref_name == 'master' && startsWith(github.event.head_commit.message, 'bump:')
 94 |     steps:
 95 |       - uses: actions/checkout@v4
 96 |       - name: Set up Python
 97 |         uses: actions/setup-python@v5
 98 |         with:
 99 |           python-version: '3.x'
100 |       - name: Install dependencies
101 |         run: |
102 |           python -m pip install --upgrade pip
103 |           pip install build
104 |       - name: Build package
105 |         run: python -m build
106 |       - name: Publish package
107 |         uses: pypa/gh-action-pypi-publish@27b31702a0e7fc50959f5ad993c78deac1bdfc29
108 |         with:
109 |           user: __token__
110 |           password: ${{ secrets.PYPI_API_TOKEN }}
111 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .idea/
2 | __pycache__/
3 | **/~*
4 | requirements.txt
5 | dev-requirements.txt
6 | Update-PythonVenv.ps1
7 | 


--------------------------------------------------------------------------------
/.pre-commit-config.yaml:
--------------------------------------------------------------------------------
  1 | 
  2 | ci:
  3 |   skip: [pyright]
  4 | 
  5 | # exclude: 'scripts/.*|tests/.*'
  6 | exclude: 'scripts/.*'
  7 | 
  8 | repos:
  9 | 
 10 | - repo: https://github.com/pre-commit/pre-commit-hooks
 11 |   rev: v5.0.0
 12 |   hooks:
 13 |     - id: check-added-large-files
 14 |     - id: check-ast
 15 |     - id: check-case-conflict
 16 |     - id: check-docstring-first
 17 |     - id: check-executables-have-shebangs
 18 |     - id: check-json
 19 |     - id: check-merge-conflict
 20 |       args:
 21 |         - --assume-in-merge
 22 |     - id: check-shebang-scripts-are-executable
 23 |     - id: check-symlinks
 24 |     - id: check-toml
 25 |     - id: check-vcs-permalinks
 26 |     - id: check-xml
 27 |     - id: check-yaml
 28 |     - id: debug-statements
 29 |     - id: destroyed-symlinks
 30 |     - id: detect-private-key
 31 |     - id: end-of-file-fixer
 32 |     - id: mixed-line-ending
 33 |     - id: requirements-txt-fixer
 34 |     - id: trailing-whitespace
 35 |     - id: fix-encoding-pragma
 36 |       args:
 37 |       - --remove
 38 |     # - id: name-tests-test
 39 |     #   args:
 40 |     #   - --pytest-test-first
 41 |     - id: no-commit-to-branch
 42 |     - id: pretty-format-json
 43 |       args: ['--autofix']
 44 |     # - id: sort-simple-yaml
 45 |         # files: .pre-commit-config.yaml
 46 | 
 47 | - repo: https://github.com/pre-commit/mirrors-mypy
 48 |   rev: v1.15.0
 49 |   hooks:
 50 |   - id: mypy
 51 |     name: mypy
 52 |     language: python
 53 |     language_version: python3.12
 54 |     types: [python]
 55 |     require_serial: true
 56 |     verbose: true
 57 |     additional_dependencies: ['types-requests']
 58 |     # exclude: "tests"
 59 |     # args:
 60 |     # - --ignore-missing-imports
 61 |     # files: ^(src/|tests/)
 62 | 
 63 | - repo: https://github.com/PyCQA/isort
 64 |   rev: 6.0.1
 65 |   hooks:
 66 |   - id: isort  # every entry in args must be an isort CLI flag or its value
 67 |     args: ["--profile", "black", "--filter-files", "--combine-as", "--honor-noqa"]
 68 | 
 69 | - repo: https://github.com/psf/black
 70 |   rev: 25.1.0
 71 |   hooks:
 72 |   - id: black
 73 |     language_version: python3.9
 74 |     args: ["--skip-magic-trailing-comma"]
 75 | 
 76 | - repo: https://github.com/asottile/pyupgrade
 77 |   rev: v3.19.1
 78 |   hooks:
 79 |   - args:
 80 |     - --py39-plus
 81 |     id: pyupgrade
 82 | 
 83 | - repo: https://github.com/Lucas-C/pre-commit-hooks
 84 |   rev: v1.5.5
 85 |   hooks:
 86 |   - id: remove-tabs
 87 | 
 88 | # - repo: https://github.com/commitizen-tools/commitizen
 89 | #   rev: v2.40.0
 90 | #   hooks:
 91 | #   - id: commitizen
 92 | 
 93 | # pylint still broken in python 3.12
 94 | # - repo: https://github.com/pre-commit/mirrors-pylint
 95 | #   rev: v3.0.0a5
 96 | #   hooks:
 97 | #   - id: pylint
 98 | #     exclude: "tests"
 99 | #     name: pylint
100 | #     args:
101 | #     - --good-names=i,j,_,f
102 | #     - --disable=protected-access
103 | #     - --disable=no-member
104 | #     - --disable=import-error
105 | #     - --disable=no-name-in-module
106 | #     - --load-plugins=pylint.extensions.docparams
107 | #     - --accept-no-param-doc=n
108 | #     - --accept-no-raise-doc=n
109 | #     - --accept-no-return-doc=n
110 | #     - --accept-no-yields-doc=n
111 | 
112 | - repo: https://github.com/astral-sh/ruff-pre-commit
113 |   # ignores
114 |   # ANN201 Missing return type annotation for public function
115 |   # ANN202 Missing return type annotation for private function (wants -> None everywhere)
116 |   # B905 zip() without an explicit strict= parameter
117 |   # COM812 Trailing comma missing
118 |   # D203 1 blank line required before class docstring
119 |   # D213 multi line summary second line
120 |   # D400 first line should end with a period
121 |   # I001 Import block is un-sorted or un-formatted
122 |   # ISC003 Explicitly concatenated string should be implicitly concatenated
123 |   # N802 Function name should be lowercase
124 |   # N806 Variable in function should be lowercase
125 |   # PERF401 Use a list comprehension to create a transformed list
126 |   # PGH003 Use specific rule codes when ignoring type issues
127 |   # PLR0913 Too many arguments to function call
128 |   #
129 |   # ERA001 Found commented-out code
130 |   # N803 Argument name should be lowercase
131 |   # S320 Using `lxml` to parse untrusted data is known to be ... XML attacks
132 |   # PLR2004 Magic values
133 |   # C901 function is too complex  # for iter_at_depth
134 |   # PLR0912 too many branches  # for iter_at_depth
135 |   #
136 |   rev: 'v0.11.9'
137 |   hooks:
138 |     - id: ruff
139 |       exclude: "tests"
140 |       args:
141 |       - --target-version=py39  # oldest supported Python; matches pyupgrade --py39-plus
142 |       - --select=ALL
143 |       - --ignore=ANN201,ANN202,B905,COM812,D203,D213,D400,I001,ISC003,N802,N806,PERF401,PGH003,PLR0913,ERA001,N803,S320,PLR2004,C901,PLR0912
144 |       # # - --fix
145 | 
146 | # reads pyproject.toml for additional config
147 | - repo: https://github.com/RobertCraigie/pyright-python
148 |   rev: v1.1.400
149 |   hooks:
150 |     - id: pyright
151 | 


--------------------------------------------------------------------------------
/CHANGELOG.md:
--------------------------------------------------------------------------------
  1 | 
  2 | ## 3.5.0 (2025-02-03)
  3 | 
  4 | ### Feat
  5 | 
  6 | - Remove Python 3.8 support.
  7 | - Refactor File.path inference to support rare files with rels in a
  8 |   `word/glossary` directory.
  9 | - Test Python 3.13 support.
 10 | 
 11 | ## 3.4.0 (2025-02-01)
 12 | 
 13 | ### Feat
 14 | 
 15 | - edit and save rels files. You can now access the `rels_element` attribute of
 16 |   File instances to update hyperlink urls and other values. These will be saved
 17 |   on DocxReader.save(). This is an advanced feature and will not change text
 18 |   extraction.
 19 | 
 20 | ## 3.3.0 (2024-12-05)
 21 | 
 22 | ### Feat
 23 | 
 24 | - skip elements with invalid tags. Issue a warning. These are usually the
 25 |   result of faulty conversion software.
 26 | 
 27 | ## 3.2.1 (2024-11-17)
 28 | 
 29 | ### Feat
 30 | 
 31 | - add an `elem` attribute to `Par` instances, returning the xml element from
 32 |   which the paragraph was generated
 33 | 
 34 | ## 3.0.0 (2024-07-27)
 35 | 
 36 | ### BREAKING CHANGE
 37 | 
 38 | - The html and duplicate_merged_cells arguments to docx2python are now keyword
 39 |   only.
 40 | - Inserts empty cells and whitespace into exported
 41 |   tables.
 42 | - Removed IndexedItem class which was *probably* only used internally, but it
 43 |   was a part of the public interface.
 44 | - Function get_text was a public function. It mirrored the identical
 45 |   flatten_text from the docx_text module.
 46 | - This change breaks the way paragraph styles (internally pStyle) were handled.
 47 |   The input argument `do_pStyle` will now raise an error.
 48 | - This doesn't change the interface and doesn't break any of my tests, but it
 49 |   took a lot of refactoring to make this change and it may break some
 50 |   unofficial patches I've made for clients.
 51 | 
 52 | ### Feat
 53 | 
 54 | - improve type hints for DocxContent properties
 55 | - insert blank cells to match gridSpan
 56 | - add list_position attribute for Par instances
 57 | - explicate return types in iterators
 58 | - use input file namespace
 59 | 
 60 | ### Fix
 61 | 
 62 | - eliminate double html tags for paragraph styles
 63 | 
 64 | ### Refactor
 65 | 
 66 | - make boolean args keyword only
 67 | - use pathlib in lieu of os.path
 68 | - remove Any types from DocxContent close method
 69 | - convert HtmlFormatter lambdas to defs
 70 | - specialize join_leaves into join_runs
 71 | - insert html when extracting text
 72 | - make queuing text outside paragraphs explicit
 73 | - make _open_pars private
 74 | - stop accepting extract_image bool argument
 75 | - default duplicate_merged_cells to True
 76 | - remove unused helper functions
 77 | - use pathlib in conftest
 78 | - expose numPr, ilvl, and number in BulletGenerator
 79 | - remove redundant functions
 80 | - remove do_pStyle argument from flatten_text
 81 | - remove function get_text from iterators module
 82 | - store content table as nested list of Par instances
 83 | - move xml2html_format attrib from TagRunner to DepthCollector
 84 | - factor out DepthCollector.item_depth param
 85 | - make set_caret recursive
 86 | - remove unused `styled` param from insert_text_as_new_run
 87 | - remove relative imports in src modules
 88 | 
 89 | ## 2.10.2 (2024-06-30)
 90 | 
 91 | ### Refactor
 92 | 
 93 | - remove relative imports in src modules
 94 | 
 95 | ## 2.10.1 (2024-04-03)
 96 | 
 97 | ### Fix
 98 | 
 99 | - move paragraphs to main dependencies
100 | 
101 | ## 2.10.0 (2024-04-03)
102 | 
103 | ### Feat
104 | 
105 | - support checkbox "true"/"false" values
106 | 
107 | ## 2.9.2 (2024-04-03)
108 | 
109 | ### Fix
110 | 
111 | - extract hyperlinks in comments
112 | - remove open_par limit in DepthCollector
113 | - return empty list when comments fails
114 | 
115 | ## 2.9.1 (2024-04-02)
116 | 
117 | ### Refactor
118 | 
119 | - comb full-text and line-text formatting
120 | - refactor element text extractors into methods
121 | 
122 | ## 2.9.0 (2024-03-30)
123 | 
124 | ### Feat
125 | 
126 | - extract comments from docx files
127 | - capture comment ranges
128 | 
129 | ### Refactor
130 | 
131 | - expose DepthCollector instance for File object
132 | - expose DepthCollector instance when get_text
133 | 
134 | ## 2.8.0 (2024-01-21)
135 | 
136 | ### Feat
137 | 
138 | - capture hyperlink anchors
139 | 
140 | ## 2.7.3 (2023-06-17)
141 | 
142 | ### Fix
143 | 
144 | - sync commitizen and poetry version numbers
145 | 
146 | ## 2.7.2 (2023-06-16)
147 | 
148 | ### Fix
149 | 
150 | - update poetry lock file
151 | 
152 | ## 2.7.1 (2023-05-02)
153 | 
154 | ### Refactor
155 | 
156 | - update and pass pre-commit hooks
157 | 
158 | ## 2.7.0 (2023-04-27)
159 | 
160 | ### Feat
161 | 
162 | - preserve newlines in replace_docx_text
163 | - add py.typed for typecheckers
164 | - add argument duplicate_merged_cells for docx tables
165 | - add context manager protocol
166 | - allow type IOBytes for filename arguments
167 | - add and mostly pass pre-commit hooks
168 | - remove Python 3.7 support
169 | 
170 | ### Fix
171 | 
172 | - move pre-commit to dev requirement
173 | 


--------------------------------------------------------------------------------
/LICENSE.txt:
--------------------------------------------------------------------------------
 1 | The MIT License (MIT)
 2 | 
 3 | Copyright (c) 2019 Shay Hill
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in
13 | all copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21 | THE SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/README_DOCX_FILE_STRUCTURE.md:
--------------------------------------------------------------------------------
 1 | ## typical docx file format
 2 | 
 3 | To assist with reading the project documentation or extending `docx2python`.
 4 | 
 5 | There are four basic types of files:
 6 | 
 7 |     1. _rels/.rels - A list of docx content files (e.g., ``document.xml``)
 8 | 
 9 |     2. content files - files that contain the text displayed in the docx. (e.g., ``document.xml``, ``header1.xml``).
10 |        These files reference non-content files (images and formatting specifications) through relId numbers, which are
11 |        defined in content-file rels.
12 | 
13 |     3. content-file rels - (e.g., ``document.xml.rels``) this is where relId numbers are defined. The relId numbers
14 |        used in ``document.xml`` will be defined in ``document.xml.rels``.
15 | 
16 |     4. display files - (e.g., ``numbering.xml``) that tell the content files how to display text. These are linked from
17 |        the content files through content-file rels.
18 | 
19 | ### Docx file structure
20 | 
21 |     + _rels  # named references to data (links, values, etc. for entire document)
22 |         - .rels  # map to locations of major files (e.g., document.xml)
23 | 
24 |     + customXml  # all ignored by docx2python
25 |         - item1.xml
26 |         - item2.xml
27 |         - item3.xml
28 |         - itemProps1.xml
29 |         - itemProps2.xml
30 |         - itemProps3.xml
31 |         + _rels
32 |             - item1.xml.rels
33 |             - item2.xml.rels
34 |             - item3.xml.rels
35 | 
36 |     + docProps
37 |         - app.xml  # ignored by docx2python
38 |         - core.xml  # author, modification date, etc.
39 |         - custom.xml  # ignored by docx2python
40 | 
41 |     + word  # content of docx
42 |         + _rels  # images, numbering formats, etc. for content xml files
43 |             - document.xml.rels
44 |             - header1.xml.rels
45 |             - header2.xml.rels
46 |             - header3.xml.rels
47 |         + media  # folder holding all pictures attached in the docx file
48 |             - image1.jpg
49 |             - image2.jpg
50 |         + theme  # ignored by docx2python
51 |             - theme1.xml
52 |         - document.xml  # main body text
53 |         - header1.xml  # header 1 content
54 |         - footer1.xml
55 |         - footnotes.xml
56 |         - fontTable.xml  # "long-hand" font descriptions. Ignored by docx2python
57 |         - numbering.xml  # required data to auto number paragraphs. docx2python reads this
58 |         - settings.xml  # global file specifications. Ignored by docx2python
59 |         - styles.xml # table styles, etc. Ignored by docx2python
60 |         - webSettings.xml  # ignored by docx2python
61 | 
62 | A ``*.docx`` file is just a zipped up file structure (the structure defined above). You can unzip a docx file, make changes, then zip it back up and everything will work (provided your changes are valid xml).
63 | 


--------------------------------------------------------------------------------
/docx2python/__init__.py:
--------------------------------------------------------------------------------
 1 | """Import function docx2python into the docx2python namespace.
 2 | 
 3 | :author: Shay Hill
 4 | :created: 2023-01-09
 5 | """
 6 | 
 7 | from docx2python.main import docx2python
 8 | 
 9 | __all__ = ["docx2python"]
10 | 


--------------------------------------------------------------------------------
/docx2python/bullets_and_numbering.py:
--------------------------------------------------------------------------------
  1 | """Generate bullet and numbered-list strings.
  2 | 
  3 | :author: Shay Hill
  4 | :created: 11/15/2021
  5 | 
  6 | Docx xml files do not track explicit numbering values. Each numbered paragraph has ::
  7 | 
  8 |     <w:ilvl w:val="0"/>   # indentation level
  9 |     <w:numId w:val="9"/>  # index to a list [by ilvl] of numbered-list formats
 10 | 
 11 | Docx2Python keeps track of current numbering value, and increments these values as
 12 | numbered paragraphs are encountered. If extracting partial text, the numbers may be
 13 | incorrect, because all paragraphs in a numbered-list format may not be encountered
 14 | during the extraction.
 15 | """
 16 | 
 17 | from __future__ import annotations
 18 | 
 19 | import warnings
 20 | from collections import defaultdict
 21 | from contextlib import suppress
 22 | from typing import TYPE_CHECKING, Callable
 23 | 
 24 | from docx2python import numbering_formats as nums
 25 | from docx2python.namespace import get_attrib_by_qn, iterfind_by_qn
 26 | 
 27 | if TYPE_CHECKING:
 28 |     from lxml.etree import _Element as EtreeElement  # type: ignore
 29 | 
 30 |     from docx2python.docx_context import NumIdAttrs
 31 | 
 32 | 
 33 | def _get_bullet_function(numFmt: str) -> Callable[[int], str]:
 34 |     """Select a bullet or numbering format function from xml numFmt.
 35 | 
 36 |     :param numFmt: xml numFmt (e.g., decimal, lowerLetter)
 37 |     :return: a function that takes an int and returns a string. If numFmt is not
 38 |         recognized, treat numbers as bullets.
 39 |     """
 40 |     numFmt2bullet_function: dict[str, Callable[[int], str]] = {
 41 |         "decimal": nums.decimal,
 42 |         "lowerLetter": nums.lower_letter,
 43 |         "upperLetter": nums.upper_letter,
 44 |         "lowerRoman": nums.lower_roman,
 45 |         "upperRoman": nums.upper_roman,
 46 |         "bullet": nums.bullet,
 47 |     }
 48 |     try:
 49 |         retval_: Callable[[int], str] = numFmt2bullet_function[numFmt]
 50 |     except KeyError:
 51 |         warnings.warn(
 52 |             f"{numFmt} numbering format not implemented, "
 53 |             + f"substituting '{nums.bullet()}'",
 54 |             stacklevel=2,
 55 |         )
 56 |         return nums.bullet
 57 |     else:
 58 |         return retval_
 59 | 
 60 | 
 61 | def _new_list_counter() -> defaultdict[str, defaultdict[str, int]]:
 62 |     """Return a counter, starting at zero, for each numId.
 63 | 
 64 |     :return: {
 65 |         a_numId: 0,
 66 |         b_numId: 0
 67 |     }
 68 | 
 69 |     This is what you need to keep track of where every nested list is at.
 70 |     """
 71 |     return defaultdict(lambda: defaultdict(int))
 72 | 
 73 | 
 74 | def _increment_list_counter(ilvl2count: defaultdict[str, int], ilvl: str) -> int:
 75 |     """Increase counter at ilvl, reset counter at deeper levels.
 76 | 
 77 |     :param ilvl2count: context['numId2count']
 78 |     :param ilvl: string representing an integer
 79 |     :return: updated count at ilvl.
 80 |         updates context['numId2count'] by reference
 81 | 
 82 |     On a numbered list, the count for sub-lists should reset when a parent list
 83 |     increases, e.g.,
 84 | 
 85 |     1. top-level list
 86 |         a. sublist
 87 |         b. sublist continues
 88 |     2. back to top-level list
 89 |         a. sublist counter has been reset
 90 | 
 91 |     List counters are defaultdicts, so we can reset sublist counters by deleting
 92 |     them.
 93 |     """
 94 |     ilvl2count[ilvl] += 1
 95 |     deeper_levels = [k for k in ilvl2count if k > ilvl]
 96 |     for level in deeper_levels:
 97 |         del ilvl2count[level]
 98 |     return ilvl2count[ilvl]
 99 | 
100 | 
101 | class BulletGenerator:
102 |     """Keep track of list counters and generate bullet strings.
103 | 
104 |     <w:p>
105 |         <w:pPr>
106 |             <w:numPr>
107 |                 <w:ilvl w:val="0"/>   # indentation level
108 |                 <w:numId w:val="9"/>  # index to (multi-level) list format
109 |             </w:numPr>
110 |         </w:pPr>
111 |         <w:r>
112 |             <w:t>this text in numbered or bulleted list
113 |             </w:t>
114 |         </w:r>
115 |     </w:p>
116 |     """
117 | 
118 |     def __init__(self, numId2Attrs: dict[str, list[NumIdAttrs]]) -> None:
119 |         """Store numId2Attrs and start fresh list counters."""
120 |         self.numId2Attrs = numId2Attrs
121 |         self.numId2count = _new_list_counter()
122 | 
123 |         # Cache of numbers already assigned, so a paragraph's counter is only
124 |         # incremented the first time it is seen. See self.get_par_number.
125 |         self._par2par_number: dict[EtreeElement, int | None] = {}
126 | 
127 |     def _get_numPr(self, paragraph: EtreeElement) -> EtreeElement | None:
128 |         """Get the parent element of the numId and ilvl elements.
129 | 
130 |         :param paragraph: <w:p> xml element
131 |         :return: <w:numPr> xml element or None if this fails.
132 |         """
133 |         try:
134 |             pPr = next(iterfind_by_qn(paragraph, "w:pPr"))
135 |             return next(iterfind_by_qn(pPr, "w:numPr"))
136 |         except (StopIteration, KeyError):
137 |             return None
138 | 
139 |     def _get_numId(self, numPr: EtreeElement) -> str | None:
140 |         """Get the numId for the paragraph.
141 | 
142 |         :param numPr: <w:numPr> xml element (see class docstring)
143 |         :return: numId as a string or None if this fails.
144 | 
145 |         The numId is an integer (string of an integer) index to a list of multi-level
146 |         list formats. For each numId, there is a list of formats for each indentation
147 |         level.
148 |         """
149 |         try:
150 |             numId_element = next(iterfind_by_qn(numPr, "w:numId"))
151 |             return get_attrib_by_qn(numId_element, "w:val")
152 |         except (StopIteration, KeyError):
153 |             return None
154 | 
155 |     def _get_ilvl(self, numPr: EtreeElement) -> str | None:
156 |         """Get the ilvl for the paragraph.
157 | 
158 |         :param numPr: <w:numPr> xml element (see class docstring)
159 |         :return: ilvl as a string or None if this fails.
160 | 
161 |         The ilvl is an integer (string of an integer) index of a multi-level list
162 |         formats. For each ilvl, there is a format.
163 |         """
164 |         try:
165 |             ilvl_element = next(iterfind_by_qn(numPr, "w:ilvl"))
166 |             return get_attrib_by_qn(ilvl_element, "w:val")
167 |         except (StopIteration, KeyError):
168 |             return None
169 | 
170 |     def get_bullet_fmt(self, paragraph: EtreeElement) -> tuple[str | None, str | None]:
171 |         """Expose the numId and ilvl of a numbered paragraph.
172 | 
173 |         :param paragraph: <w:p> xml element
174 |         :return: numId (which list), ilvl (indentation level)
175 | 
176 |         This will return None, None, None if the paragraph is not numbered.
177 |         """
178 |         numPr = self._get_numPr(paragraph)
179 |         if numPr is None:
180 |             return None, None
181 |         numId = self._get_numId(numPr)
182 |         ilvl = self._get_ilvl(numPr)
183 |         if numId is None or ilvl is None:
184 |             return numId, ilvl
185 |         return numId, ilvl
186 | 
187 |     def get_par_number(self, paragraph: EtreeElement) -> int | None:
188 |         """Get the number (at the current indentation level) of a paragraph.
189 | 
190 |         :param paragraph: <w:p> xml element
191 |         :return: number of the paragraph
192 |         :effects: increment self.numId2count[numId][ilvl] if the paragraph has not
193 |             been seen before.
194 | 
195 |         E.g.,
196 | 
197 |             1. paragraph  # called here, return 1
198 |                 a. paragraph  # called here, return 1
199 |                 b. paragraph  # called here, return 2
200 |             2. paragraph  # called here, return 2
201 |                 a. paragraph  # called here, return 1
202 |                     1. paragraph  # called here, return 1
203 | 
204 |         numId and ilvl should both be defined for a numbered paragraph, but I'm
205 |         testing both here to fail silently if that assumption is wrong.
206 |         """
207 |         with suppress(KeyError):
208 |             return self._par2par_number[paragraph]
209 |         numId, ilvl = self.get_bullet_fmt(paragraph)
210 |         if numId is None or ilvl is None:
211 |             par_number = None
212 |         else:
213 |             counter = _increment_list_counter(self.numId2count[numId], ilvl)
214 |             par_number = counter + self.get_start_value_zero_based(numId, ilvl)
215 |         self._par2par_number[paragraph] = par_number
216 |         return par_number
217 | 
218 |     def get_start_value_zero_based(self, numId: str | None, ilvl: str | None) -> int:
219 |         """Get the start value, 0-based, for numbering sequence at particular level.
220 | 
221 |         :return: start index if present for a particular numId and ilvl, 0 otherwise
222 |         """
223 |         attrs = self.__get_num_fmt_attributes(numId, ilvl)
224 |         if not attrs or not attrs.start:
225 |             return 0
226 |         return attrs.start - 1  # subtract 1 to have 0-based result
227 | 
228 |     def get_list_position(
229 |         self, paragraph: EtreeElement
230 |     ) -> tuple[str | None, list[int]]:
231 |         """Get the current numbering values.
232 | 
233 |         :return: numbering values as a tuple of integers
234 | 
235 |         E.g.,
236 | 
237 |             Not in a list  # called here, return ()
238 | 
239 |             1. paragraph  # called here, return (numPr, 1)
240 |                 a. paragraph  # called here, return (numPr, 1, 1)
241 |                 b. paragraph  # called here, return (numPr, 1, 2)
242 |             2. paragraph  # called here, return (numPr, 2)
243 |                 a. paragraph  # called here, return (numPr, 2, 1)
244 |                     1. paragraph  # called here, return (numPr, 2, 1, 1)
245 | 
246 |         The numbering values are the current count at each indentation level.
247 |         """
248 |         numPr, _ = self.get_bullet_fmt(paragraph)
249 |         if numPr is None:
250 |             return (numPr, [])
251 |         # ensure the paragraph counter has been incremented
252 |         _ = self.get_par_number(paragraph)
253 |         return numPr, list(self.numId2count[numPr].values())
254 | 
255 |     def get_bullet(self, paragraph: EtreeElement) -> str:
256 |         """Get bullet string if paragraph is numbered. (e.g, '--  ' or '1)  ').
257 | 
258 |         :param paragraph: <w:p> xml element
259 |         :return: specified 'bullet' string or '' if paragraph is not numbered
260 | 
261 |         Get an index to a multi-level list format (numId) and the indentation level
262 |         (ilvl). If no numId or ilvl are defined, assume this is not a numbered list.
263 |         If these values to exist, look up a list format with
264 |         numId2numFmts[numId][ilvl]. If this fails, silently give up and use a bullet.
265 | 
266 |         bullet preceded by one tab for every indentation level.
267 |         """
268 |         numId, ilvl = self.get_bullet_fmt(paragraph)
269 |         number = self.get_par_number(paragraph)
270 |         if numId is None:
271 |             return ""
272 |         if ilvl is None:
273 |             return ""
274 |         if number is None:
275 |             return ""
276 |         attrs = self.__get_num_fmt_attributes(numId, ilvl)
277 |         numFmt = attrs.fmt if attrs and attrs.fmt else "bullet"
278 | 
279 |         def format_bullet(bullet: str) -> str:
280 |             """Indent, format and pad the bullet or number string.
281 | 
282 |             :param bullet: any kind of list-item string (bullet, number, Roman, ...)
283 |             :return: formatted bullet string
284 |             """
285 |             if bullet != nums.bullet():
286 |                 bullet += ")"
287 |             return "\t" * int(ilvl) + bullet + "\t"
288 | 
289 |         get_unformatted_bullet_str = _get_bullet_function(numFmt)
290 |         return format_bullet(get_unformatted_bullet_str(number))
291 | 
292 |     def __get_num_fmt_attributes(
293 |         self, numId: str | None, ilvl: str | None
294 |     ) -> NumIdAttrs | None:
295 |         if numId is None:
296 |             return None
297 |         if ilvl is None:
298 |             return None
299 |         try:
300 |             return self.numId2Attrs[str(numId)][int(ilvl)]
301 |         except (KeyError, IndexError, ValueError):
302 |             return None
303 | 


--------------------------------------------------------------------------------
/docx2python/docx_context.py:
--------------------------------------------------------------------------------
  1 | """Content from files that aren't ``word/document.xml``.
  2 | 
  3 | :author: Shay Hill
  4 | :created: 6/26/2019
  5 | 
  6 | Most of the "meat" in a docx file is in ``word/document.xml``. These functions retrieve
  7 | numbering formats, images, and font styles from *other* files in a decompressed docx.
  8 | """
  9 | 
 10 | from __future__ import annotations
 11 | 
 12 | import dataclasses
 13 | from typing import TYPE_CHECKING
 14 | 
 15 | from lxml import etree
 16 | 
 17 | from docx2python.attribute_register import get_localname
 18 | from docx2python.namespace import find_by_qn, findall_by_qn, get_attrib_by_qn
 19 | 
 20 | if TYPE_CHECKING:
 21 |     import zipfile
 22 | 
 23 |     from lxml.etree import _Element as EtreeElement  # type: ignore
 24 | 
 25 | 
 26 | @dataclasses.dataclass
 27 | class NumIdAttrs:
 28 |     """NumIdAttrs represents numbering attributes, such as format and start index."""
 29 | 
 30 |     fmt: str | None  # numbering format name, e.g. "decimal"; None if not recorded
 31 |     start: int | None  # start value of the sequence; None if not recorded
 32 | 
 33 | 
 34 | def collect_numAttrs(numFmts_root: EtreeElement) -> dict[str, list[NumIdAttrs]]:
 35 |     """Collect abstractNum bullet attributes into a dictionary.
 36 | 
 37 |     :param numFmts_root: Root element of ``word/numbering.xml``.
 38 |     :return: numId mapped to numFmts (by ilvl)
 39 | 
 40 |     :background:
 41 | 
 42 |     ``word/numbering.xml`` will have two sections.
 43 | 
 44 |     **SECTION 1** - Some abstractNum elements defining numbering formats for multiple
 45 |     indentation levels::
 46 | 
 47 |         <w:abstractNum w:abstractNumId="0">
 48 |             <w:lvl w:ilvl="0"><w:numFmt w:val="decimal"/></w:lvl>
 49 |             <w:lvl w:ilvl="1"><w:numFmt w:val="lowerLetter"/></w:lvl>
 50 |             ...
 51 |         </w:abstractNum>
 52 | 
 53 |     **SECTION 2** - Some num elements, each referencing an abstractNum. Multiple nums
 54 |     may reference the same abstractNum, but each will maintain a separate count (i.e.,
 55 |     each numbered paragraph will start from 1, even if it shares a style with another
 56 |     paragraph.)::
 57 | 
 58 |         <w:num w:numId="1">
 59 |             <w:abstractNumId w:val="0"/>
 60 |         </w:num>
 61 |         <w:num w:numId="2">
 62 |             <w:abstractNumId w:val="5"/>
 63 |         </w:num>
 64 | 
 65 |     **E.g, Given**: *above*
 66 | 
 67 |     **E.g., Returns**::
 68 | 
 69 |         {
 70 |             # -----ilvl=0------ilvl=1------ilvl=2---
 71 |             "1": [ NumIdAttrs(fmt:"decimal",start:2),
 72 |                 NumIdAttrs(fmt:"lowerLetter",start:1), ...],
 73 |             "2": ...
 74 |         }
 75 |     """
 76 |     abstractNumId2Attrs: dict[str, list[NumIdAttrs]] = {}
 77 | 
 78 |     for abstractNum in findall_by_qn(numFmts_root, "w:abstractNum"):
 79 |         id_ = str(get_attrib_by_qn(abstractNum, "w:abstractNumId"))
 80 | 
 81 |         abstractNumId2Attrs[id_] = []
 82 |         for lvl in findall_by_qn(abstractNum, "w:lvl"):
 83 |             numFmtEl = find_by_qn(lvl, "w:numFmt")
 84 |             fmt = None
 85 |             if numFmtEl is not None:
 86 |                 fmt = str(get_attrib_by_qn(numFmtEl, "w:val"))
 87 |             startEl = find_by_qn(lvl, "w:start")
 88 |             start = None
 89 |             if startEl is not None:
 90 |                 qn = get_attrib_by_qn(startEl, "w:val")
 91 |                 start = int(qn)
 92 |             abstractNumId2Attrs[id_].append(NumIdAttrs(fmt=fmt, start=start))
 93 | 
 94 |     numId2attrs: dict[str, list[NumIdAttrs]] = {}
 95 |     num: EtreeElement
 96 |     for num in findall_by_qn(numFmts_root, "w:num"):
 97 |         numId = get_attrib_by_qn(num, "w:numId")
 98 |         abstractNumId = find_by_qn(num, "w:abstractNumId")
 99 |         if abstractNumId is None:
100 |             continue
101 |         abstractNumIdval = get_attrib_by_qn(abstractNumId, "w:val")
102 |         numId2attrs[str(numId)] = abstractNumId2Attrs[str(abstractNumIdval)]
103 | 
104 |     return numId2attrs
105 | 
106 | 
107 | def collect_rels(zipf: zipfile.ZipFile) -> dict[str, list[dict[str, str]]]:
108 |     """Map file to relId to attrib.
109 | 
110 |     :param zipf: created by ``zipfile.ZipFile("docx_filename")``
111 |     :return: a deep dictionary ``{filename: list of Relationships``
112 | 
113 |     Each rel in list of Relationships is::
114 | 
115 |         {
116 |             "Id": "rId1",
117 |             "Type": "http...",
118 |             "Target": "path to file in docx"
119 |         }
120 | 
121 |     There are several rels files:
122 | 
123 |     ``_rels/.rels``: rels related to entire structure.  The identity of
124 |         ``word/document.xml`` is here. (It might be called ``word/document2.xml`` or
125 |         something else. Checking here is the best way to make sure.)
126 | 
127 |     ``word/_rels/document.xml.rels``: images, headers, etc. referenced by
128 |         ``word/document.xml``
129 | 
130 |     ``word/_rels/header1.xml.rels``: images, etc. for ``header1.xml``
131 | 
132 |     ...
133 | 
134 |     Get everything from everywhere. Map ``_rels/.rels`` to ``'rels'`` and everything
135 |     else to e.g., ``'document'`` or ``'header'``. RelIds are **not** unique between
136 |     these files.
137 | 
138 |     **E.g, Given**::
139 | 
140 |     # one of several files
141 | 
142 |         <?xml version="1.0" encoding="UTF-8" standalone="yes"?>
143 |         <Relationships xmlns="http://schemas.../relationships">
144 |             <Relationship Id="rId3" Type="http://schemas... \
145 |                 /extended-properties" Target="docProps/app.xml"/>
146 |             <Relationship Id="rId2" Type="http://schemas... \
147 |                 /core-properties" Target="docProps/core.xml"/>
148 |             <Relationship Id="rId1" Type="http://schemas... \
149 |                 /officeDocument" Target="word/document.xml"/>
150 |             <Relationship Id="rId4" Type="http://schemas... \
151 |                 /custom-properties" Target="docProps/custom.xml"/>
152 |         </Relationships>
153 | 
154 |     **Returns**::
155 | 
156 |         {
157 |             "filename": [
158 |                 {
159 |                     "Id": "rId3",
160 |                     "Type": "http://schemas.../extended-properties",
161 |                     "Target": "docProps/app.xml",
162 |                 },
163 |                 {
164 |                     "Id": "rId2",
165 |                     "Type": "http://schemas.../core-properties",
166 |                     "Target": "docProps/core.xml",
167 |                 },
168 |             ]
169 |         }
170 |     """
171 |     path2rels: dict[str, list[dict[str, str]]] = {}
172 |     for rels in (x for x in zipf.namelist() if x[-5:] == ".rels"):
173 |         rels_elem = etree.fromstring(zipf.read(rels))
174 |         path2rels[rels] = [
175 |             {str(y): str(z) for y, z in x.attrib.items()} for x in rels_elem
176 |         ]
177 |         tag = rels_elem.tag
178 |         if isinstance(tag, bytearray):  # for type checkers
179 |             tag = tag.decode("utf-8")
180 |         path2rels[rels].append(
181 |             {"Id": "none", "Type": etree.QName(tag).namespace or "", "Target": rels}
182 |         )
183 | 
184 |     return path2rels
185 | 
186 | 
187 | def collect_docProps(root: EtreeElement) -> dict[str, str | None]:
188 |     """Get author, modified, etc. from core-properties (should be docProps/core.xml).
189 | 
190 |     :param root: root of the XML tree
191 |     :return: document property names mapped to values
192 | 
193 |     **E.g., Given**::
194 | 
195 |         <cp:coreProperties xmlns:cp="http://schemas.openxmlformats...">
196 |             <dc:title>SG-DOP-5009 - Operate ROMAR swarf unit
197 |             </dc:title>
198 |             <dc:creator>Shay Hill
199 |             </dc:creator>
200 |             <cp:lastModifiedBy>Shay Hill
201 |             </cp:lastModifiedBy>
202 |             <cp:revision>6
203 |             </cp:revision>
204 |             <cp:lastPrinted>2017-11-17T15:47:00Z
205 |             </cp:lastPrinted>
206 |             <dcterms:created xsi:type="dcterms:W3CDTF">2019-01-10T07:21:00Z
207 |             </dcterms:created>
208 |             <dcterms:modified xsi:type="dcterms:W3CDTF">2019-01-11T11:41:00Z
209 |             </dcterms:modified>
210 |         </cp:coreProperties>
211 | 
212 |     **E.g., Returns**::
213 | 
214 |         {
215 |             "title": "SG-DOP-5009 - Operate ROMAR swarf unit",
216 |             "creator": "Shay Hill",
217 |             "lastModifiedBy": "Shay Hill",
218 |             "revision": "6",
219 |             ...
220 |         }
221 |     """
222 |     return {get_localname(x): x.text for x in root}
223 | 


--------------------------------------------------------------------------------
/docx2python/forms.py:
--------------------------------------------------------------------------------
 1 | """Form checkboxes, dropdowns, and other non-text elements visible in Word.
 2 | 
 3 | :author: Shay Hill
 4 | :created: 6/17/2020
 5 | 
 6 | Word represents some special characters as non-text elements (e.g., checkBox). These
 7 | functions examine these elements to infer suitable text replacements.
 8 | 
 9 | This file references "\u2610" and "\u2612" a few times. These are open and
10 | crossed-out checkboxes. Pypi doesn't like them in my file, so I have to reference
11 | them by their escape sequences.
12 | """
13 | 
14 | from __future__ import annotations
15 | 
16 | from contextlib import suppress
17 | from typing import TYPE_CHECKING
18 | 
19 | from docx2python.namespace import get_attrib_by_qn, iterfind_by_qn, qn
20 | 
21 | if TYPE_CHECKING:
22 |     from lxml.etree import _Element as EtreeElement  # type: ignore
23 | 
24 | 
25 | def get_checkBox_entry(checkBox: EtreeElement) -> str:
26 |     """Create text representation for a checkBox element.
27 | 
28 |     :param checkBox: a checkBox xml element
29 |     :return:
30 |         1. attempt to get ``checked.w:val`` and return "\u2610" or "\u2612"
31 |         2. attempt to get ``default.w:val`` and return "\u2610" or "\u2612"
32 |         3. return ``--checkbox failed--``
33 | 
34 |     Docx xml has at least two types of checkbox elements::
35 | 
36 |         1. ``checkBox`` can only be checked when the form is locked. These do not
37 |         contain a text element, so this function is needed to select one from the
38 |         ``w:checked`` or ``w:default`` sub-elements.
39 | 
40 |         2. ``checkbox`` can be checked any time. Prints text as "\u2610" or "\u2612".
41 |         Docx2Python can safely ignore this second type, as there will be a <w:t>
42 |         element inside with a checkbox character.
43 | 
44 |     <w:checkBox>
45 |         <w:sizeAuto/>
46 |         <w:default w:val="1"/>
47 |         <w:checked w:val="0"/>
48 |     </w:checkBox>
49 | 
50 |     If the ``checked`` attribute is absent, return the default
51 |     If the ``checked`` attribute is present, but not w:val is given, return unchecked
52 |     """
53 | 
54 |     def get_wval() -> str | None:
55 |         """Get the value of the ``w:val`` attribute of the ``checked`` element.
56 | 
57 |         :return: the value of the ``w:val`` attribute of the ``checked`` element
58 |         """
59 |         with suppress(StopIteration):
60 |             checked = next(iterfind_by_qn(checkBox, "w:checked"))
61 |             return str(checked.attrib.get(qn(checked, "w:val")) or "1")
62 |         with suppress(StopIteration, KeyError):
63 |             default = next(iterfind_by_qn(checkBox, "w:default"))
64 |             return str(get_attrib_by_qn(default, "w:val"))
65 |         return None
66 | 
67 |     return {
68 |         "0": "\u2610",
69 |         "false": "\u2610",
70 |         "1": "\u2612",
71 |         "true": "\u2612",
72 |         None: "----checkbox failed----",
73 |     }[get_wval()]
74 | 
75 | 
76 | def get_ddList_entry(ddList: EtreeElement) -> str:
77 |     """Get only the selected string of a dropdown list.
78 | 
79 |     :param ddList: a dropdown-list element
80 |     :return: w:listEntry value of input element.
81 | 
82 |     <w:ddList>
83 |         <w:result w:val="1"/>
84 |         <w:listEntry w:val="selection 1"/>
85 |         <w:listEntry w:val="selection 2"/>
86 |     </w:ddList>
87 | 
88 |     <w:result w:val="0"/> might be missing when selection is "0"
89 |     """
90 |     list_entries = [
91 |         get_attrib_by_qn(x, "w:val") for x in iterfind_by_qn(ddList, "w:listEntry")
92 |     ]
93 |     try:
94 |         result = next(iterfind_by_qn(ddList, "w:result"))
95 |         list_index = int(get_attrib_by_qn(result, "w:val"))
96 |     except (StopIteration, KeyError):
97 |         list_index = 0
98 |     return str(list_entries[list_index])
99 | 


--------------------------------------------------------------------------------
/docx2python/main.py:
--------------------------------------------------------------------------------
 1 | """Top-level code.
 2 | 
 3 | :author: Shay Hill
 4 | :created: 7/2/2019
 5 | """
 6 | 
 7 | from __future__ import annotations
 8 | 
 9 | from typing import TYPE_CHECKING
10 | 
11 | from docx2python.docx_output import DocxContent
12 | from docx2python.docx_reader import DocxReader
13 | 
14 | if TYPE_CHECKING:
15 |     import os
16 |     from io import BytesIO
17 | 
18 | 
19 | def docx2python(
20 |     docx_filename: str | os.PathLike[str] | BytesIO,
21 |     image_folder: str | os.PathLike[str] | None = None,
22 |     *,
23 |     html: bool = False,
24 |     duplicate_merged_cells: bool = True,
25 | ) -> DocxContent:
26 |     """Unzip a docx file and extract contents.
27 | 
28 |     :param docx_filename: path to a docx file
29 |     :param image_folder: optionally specify an image folder
30 |         (images in docx will be copied to this folder)
31 |     :param html: bool, extract some formatting as html
32 |     :param duplicate_merged_cells: bool, duplicate merged cells to return a mxn
33 |         nested list for each table (default True)
34 |     :return: DocxContent object
35 |     """
36 |     docx_context = DocxReader(
37 |         docx_filename, html=html, duplicate_merged_cells=duplicate_merged_cells
38 |     )
39 |     docx_content = DocxContent(docx_context, image_folder)
40 |     if image_folder:
41 |         _ = docx_content.images
42 |     return docx_content
43 | 


--------------------------------------------------------------------------------
/docx2python/merge_runs.py:
--------------------------------------------------------------------------------
  1 | """Merge runs with identical formatting.
  2 | 
  3 | :author: Shay Hill
  4 | :created: 12/13/2021
  5 | 
  6 | Join consecutive xml runs with identical formatting. See docstring for ``merge_elems``.
  7 | """
  8 | 
  9 | from __future__ import annotations
 10 | 
 11 | import functools
 12 | from itertools import groupby
 13 | from typing import TYPE_CHECKING
 14 | 
 15 | from docx2python.attribute_register import Tags, get_prefixed_tag, has_content
 16 | from docx2python.text_runs import get_html_formatting
 17 | 
 18 | if TYPE_CHECKING:
 19 |     from lxml.etree import _Element as EtreeElement  # type: ignore
 20 | 
 21 |     from docx2python.docx_reader import File
 22 | 
 23 | # tags whose consecutive elements may be merged (if formatting is equivalent)
 24 | _MERGEABLE_TAGS = {Tags.RUN, Tags.HYPERLINK, Tags.TEXT, Tags.TEXT_MATH}
 25 | 
 26 | 
 27 | def _is_mergeable(elem: EtreeElement) -> bool:
 28 |     """Is the element's tag, raw or prefixed, one of the mergeable tags?"""
 29 |     return elem.tag in _MERGEABLE_TAGS or get_prefixed_tag(elem) in _MERGEABLE_TAGS
 30 | 
 31 | 
 32 | def _elem_key(file: File, elem: EtreeElement) -> tuple[str, str, list[str]]:
 33 |     """Return enough info to tell if two elements are closely formatted.
 34 | 
 35 |     :param elem: any element in an xml file.
 36 |     :return: A summary of attributes (if two adjacent elements return the same key,
 37 |         they are considered mergeable). Only used to merge elements, so returns None
 38 |         if elements are not mergeable.
 39 | 
 40 |     Ignore text formatting differences if consecutive link elements point to the same
 41 |     address. Always join these.
 42 | 
 43 |     Docx2Text joins consecutive runs and links of the same style. Comparing two
 44 |     elem_key return values will tell you if
 45 |         * elements are the same type
 46 |         * link rels ids reference the same link
 47 |         * run styles are the same (as far as docx2python understands them)
 48 | 
 49 |     Elem rId attributes are replaced with rId['Target'] because different rIds can
 50 |     point to identical targets. This is important for hyperlinks, which can look
 51 |     different but point to the same address.
 52 | 
 53 |     """
 54 |     tag = str(elem.tag)
 55 |     if not _is_mergeable(elem):
 56 |         return tag, "", []
 57 | 
 58 |     # always join links pointing to the same address
 59 |     # elem.attrib key for relationship ids. These can find the information they
 60 |     # reference by ``file_instance.rels[elem.attrib[RELS_ID]]``
 61 |     rels_id_key = f"{{{elem.nsmap['r']}}}id"
 62 |     rels_id = elem.attrib.get(rels_id_key)
 63 |     if rels_id:
 64 |         return tag, str(file.rels[str(rels_id)]), []
 65 | 
 66 |     return tag, "", get_html_formatting(elem, file.context.xml2html_format)
 67 | 
 68 | 
 69 | def _is_text_or_text_math(elem: EtreeElement) -> bool:
 70 |     """Is the element's tag, raw or prefixed, Tags.TEXT or Tags.TEXT_MATH?"""
 71 |     text_or_text_math = {Tags.TEXT, Tags.TEXT_MATH}
 72 |     return elem.tag in text_or_text_math or get_prefixed_tag(elem) in text_or_text_math
 73 | 
 74 | 
 75 | def merge_elems(file: File, tree: EtreeElement) -> None:
 76 |     """Recursively merge duplicate (as far as docx2python is concerned) elements.
 77 | 
 78 |     :param file: File instance
 79 |     :param tree: root_element from an xml in File instance
 80 |     :effects: Merges consecutive elements if tag, attrib, and style are the same
 81 | 
 82 |     There are a few ways consecutive elements can be "identical":
 83 |         * same link
 84 |         * same style
 85 | 
 86 |     Often, consecutive, "identical" elements are written as separate elements,
 87 |     because they aren't identical to Word. Word keeps track of revision history,
 88 |     spelling errors, etc., which are meaningless to docx2python.
 89 | 
 90 |     <w:p>
 91 |         <w:hyperlink r:id="rId7">  <!-- points to http://www.shayallenhill.com -->
 92 |             <w:r>
 93 |                 <w:t>hy</w:t>
 94 |             </w:r>
 95 |         </w:hyperlink>
 96 |         <w:proofErr/>  <!-- docx2python will ignore this proofErr -->
 97 |         <w:hyperlink r:id="rId8">  <!-- points to http://www.shayallenhill.com -->
 98 |             <w:r>
 99 |                 <w:t>per</w:t>
100 |             </w:r>
101 |         </w:hyperlink>
102 |         <w:hyperlink r:id="rId9">  <!-- points to http://www.shayallenhill.com -->
103 |             <w:r w:rsid="asdfas">  <!-- docx2python will ignore this rsid -->
104 |                 <w:t>link</w:t>
105 |             </w:r>
106 |         </w:hyperlink>
107 |     </w:p>
108 | 
109 |     Docx2python condenses the above to (by merging links)
110 | 
111 |     <w:p>
112 |         <w:hyperlink r:id="rId7">  <!-- points to http://www.shayallenhill.com -->
113 |             <w:r>
114 |                 <w:t>hy</w:t>
115 |             </w:r>
116 |             <w:r>
117 |                 <w:t>per</w:t>
118 |             </w:r>
119 |             <w:r w:rsid="asdfas">  <!-- docx2python will ignore this rsid -->
120 |                 <w:t>link</w:t>
121 |             </w:r>
122 |         </w:hyperlink>
123 |     </w:p>
124 | 
125 |     Then to (by merging runs)
126 | 
127 |     <w:p>
128 |         <w:hyperlink r:id="rId7">  <!-- points to http://www.shayallenhill.com -->
129 |             <w:r>
130 |                 <w:t>hy</w:t>
131 |                 <w:t>per</w:t>
132 |                 <w:t>link</w:t>
133 |             </w:r>
134 |         </w:hyperlink>
135 |     </w:p>
136 | 
137 |     Then finally to (by merging text)
138 | 
139 |     <w:p>
140 |         <w:hyperlink r:id="rId7">  <!-- points to http://www.shayallenhill.com -->
141 |             <w:r>
142 |                 <w:t>hyperlink</w:t>
143 |             </w:r>
144 |         </w:hyperlink>
145 |     </w:p>
146 | 
147 |     This function only merges runs, text, and hyperlinks, because merging paragraphs
148 |     or larger elements would ignore information docx2python DOES want to preserve.
149 | 
150 |     Non-content items are filtered out first so runs on either side can be joined.
151 |     """
152 |     file_elem_key = functools.partial(_elem_key, file)
153 | 
154 |     elems = [x for x in tree if has_content(x)]
155 |     runs = [list(y) for _, y in groupby(elems, key=file_elem_key)]  # same-key runs
156 | 
157 |     for run in (x for x in runs if len(x) > 1 and _is_mergeable(x[0])):
158 |         if _is_text_or_text_math(run[0]):
159 |             run[0].text = "".join(x.text or "" for x in run)
160 |         for elem in run[1:]:
161 |             for e in elem:
162 |                 run[0].append(e)  # move children into the first element of the run
163 |             tree.remove(elem)
164 | 
165 |     for branch in tree:
166 |         merge_elems(file, branch)
167 | 


--------------------------------------------------------------------------------
/docx2python/namespace.py:
--------------------------------------------------------------------------------
  1 | """Register namespace entries in xml ``document`` elements.
  2 | 
  3 | :author: Shay Hill
  4 | :created: 7/5/2019
  5 | 
  6 | A ``<w:document>`` element at the top of each xml file defines a namespace::
  7 | 
  8 |     <w:document
  9 |         xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships"
 10 |         xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main"
 11 |     />
 12 | 
 13 | These entries can be accessed in the file by their abbreviations::
 14 | 
 15 |     <w:p>
 16 |         contents of paragraph
 17 |     </w:p>
 18 | 
 19 | ``lxml.etree`` reads ``"<w:p>"`` as
 20 | 
 21 | ``"{http://schemas.openxmlformats.org/wordprocessingml/2006/main}p"``
 22 | 
 23 | This module defines the necessary namespaces and transforms ``"w:p"`` to
 24 | ``{http://...}p``. This allows readable code like::
 25 | 
 26 |     if element.tag == qn("w:p"):
 27 | 
 28 | instead of::
 29 | 
 30 |     if element.tag == "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}p":
 31 | 
 32 | If somewhere along the line this package just stops working, it may be that the NSMAP
 33 | entries have been updated for whatever docx you're working with (though that's not
 34 | supposed to ever happen). *If* this happens::
 35 | 
 36 |     1) Unzip the docx.
 37 |     2) open ``word/document.xml`` in a text editor.
 38 |     3) Search for xmlns:w=[some string]
 39 |     4) update NSMAP['w'] = some string
 40 | 
 41 | Lxml allows (deceptively) easy access to a file's namespaces; however, this is
 42 | problematic because ``root_element.nsmap`` may not retrieve all nsmap entries. Other
 43 | entries may be buried inside sub-environments further down in the tree. It is safer
 44 | to explicate namespace mapping.
 45 | 
 46 | If you extend docx2text with other tags, additional NSMAP entries may be necessary.
 47 | """
 48 | 
 49 | from __future__ import annotations
 50 | 
 51 | from typing import TYPE_CHECKING
 52 | 
 53 | from docx2python.attribute_register import get_prefixed_tag
 54 | 
 55 | if TYPE_CHECKING:
 56 |     from collections.abc import Iterator
 57 | 
 58 |     from lxml.etree import _Element as EtreeElement  # type: ignore
 59 | 
 60 | 
 61 | def qn(elem: EtreeElement, tag: str) -> str:
 62 |     """Turn a namespace-prefixed tag into a Clark-notation qualified tag.
 63 | 
 64 |     :param elem: lxml.etree._Element object
 65 |     :param tag: namespace-prefixed tag, e.g. ``w:p``
 66 |     :return: Clark-notation qualified tag,
 67 |         e.g. ``{http://schemas.openxmlformats.org/wordprocessingml/2006/main}p``
 68 |         IN THE NAMESPACES DEFINED IN THE ``elem`` ELEMENT
 69 | 
 70 |     Most lxml elements contain the entire namespace of their parent elements. Create
 71 |     a tag within this namespace.
 72 | 
 73 |     Stands for 'qualified name', a utility function to turn a namespace prefixed tag
 74 |     name into a Clark-notation qualified tag name for lxml.
 75 | 
 76 |         >>> qn('w:cSld')
 77 |         '{http://schemas.../main}cSld'
 78 | 
 79 |     Source: https://github.com/python-openxml/python-docx/
 80 |     """
 81 |     prefix, localname = tag.split(":")
 82 |     uri = elem.nsmap[prefix]
 83 |     return f"{{{uri}}}{localname}"
 84 | 
 85 | 
 86 | def get_attrib_by_qn(elem: EtreeElement, tag: str) -> str:
 87 |     """Get the attribute of an element by a namespace-prefixed attribute name.
 88 | 
 89 |     :param elem: lxml.etree._Element object
 90 |     :param tag: namespace-prefixed attribute name, e.g. ``w:val``
 91 |     :return: value of ``elem.attrib`` under the qualified name built by ``qn``
 92 |     """
 93 |     return elem.attrib[qn(elem, tag)]
 94 | 
 95 | 
 96 | def find_by_qn(elem: EtreeElement, tag: str) -> EtreeElement | None:
 97 |     """Find the first matching element with a namespace-prefixed tag.
 98 | 
 99 |     :param elem: lxml.etree._Element object
100 |     :param tag: namespace-prefixed tag, e.g. ``w:p``
101 |     :return: result of ``elem.find`` for the qualified tag; None when nothing matches
102 |     """
103 |     return elem.find(qn(elem, tag))
104 | 
105 | 
106 | def findall_by_qn(elem: EtreeElement, tag: str) -> list[EtreeElement]:
107 |     """Find all matching elements with a namespace-prefixed tag.
108 | 
109 |     :param elem: lxml.etree._Element object
110 |     :param tag: namespace-prefixed tag, e.g. ``w:p``
111 |     :return: list returned by ``elem.findall`` for the qualified tag
112 |     """
113 |     return elem.findall(qn(elem, tag))
114 | 
115 | 
116 | def find_parent_by_qn(elem: EtreeElement | None, tag: str) -> EtreeElement | None:
117 |     """Find the parent element in the tree with a namespace-prefixed tag.
118 | 
119 |     :param elem: lxml.etree._Element object
120 |     :param tag: namespace-prefixed tag, e.g. ``w:p``
121 |     :return: parent element with the namespace-prefixed tag
122 |     """
123 |     if elem is None:
124 |         return None
125 |     if get_prefixed_tag(elem) == tag:
126 |         return elem
127 |     return find_parent_by_qn(elem.getparent(), tag)
128 | 
129 | 
130 | def iterfind_by_qn(elem: EtreeElement, tag: str) -> Iterator[EtreeElement]:
131 |     """Iterate over matching elements with a namespace-prefixed tag.
132 | 
133 |     :param elem: lxml.etree._Element object
134 |     :param tag: namespace-prefixed tag, e.g. ``w:p``
135 |     :return: iterator over the results of ``elem.iterfind`` for the qualified tag
136 |     """
137 |     yield from elem.iterfind(qn(elem, tag))
138 | 


--------------------------------------------------------------------------------
/docx2python/numbering_formats.py:
--------------------------------------------------------------------------------
  1 | """Numbering formats for converted XML lists.
  2 | 
  3 | :author: Shay Hill
  4 | :created: 6/26/2019
  5 | 
  6 | I don't want to add non-ascii text to a potentially ascii-only file, so all bullets
  7 | are '--' and Roman numerals stop at 3999.
  8 | 
  9 | Doesn't capture formatting like 1.1.1 or b) or (ii). Only the six basic formats are
 10 | covered::
 11 | 
 12 |     -- bullet
 13 |     1  decimal
 14 |     a  lowerLetter
 15 |     A  upperLetter
 16 |     i  lowerRoman
 17 |     I  upperRoman
 18 | """
 19 | 
 20 | from string import ascii_lowercase
 21 | 
# Subs to convert any number of i's to a proper Roman numeral.
# Each pair is (pattern, replacement). The first six collapse repeated symbols into
# the next larger symbol; the last six rewrite runs of four into subtractive form.
# fmt=off
ROMAN_SUBS = [
    ("iiiii", "v"),  # 1+1+1+1+1 -> 5
    ("vv", "x"),  # 5+5 -> 10
    ("xxxxx", "l"),  # 10+10+10+10+10 -> 50
    ("ll", "c"),  # 50+50 -> 100
    ("ccccc", "d"),  # 100+100+100+100+100 -> 500
    ("dd", "m"),  # 500+500 -> 1000
    ("iiii", "iv"),  # 1+1+1+1 -> 4
    ("viv", "ix"),  # 5+4 -> 9
    ("xxxx", "xl"),  # 10+10+10+10 -> 40
    ("lxl", "xc"),  # 50+40 -> 90
    ("cccc", "cd"),  # 100+100+100+100 -> 400
    ("dcd", "cm"),  # 500+400 -> 900
]
# fmt=on
 39 | 
 40 | 
 41 | def lower_letter(n: int) -> str:
 42 |     """Convert a positive integer to a string of letters representing base 26.
 43 | 
 44 |     :param n: any positive integer
 45 |     :return: the kind of "numbering" used for numbered lists and excel columns.
 46 |         (a, b, c ... aa, ab ...) Zero is undefined.
 47 |     :raise ValueError: if n is not a positive integer
 48 | 
 49 |         >>> lower_letter(1)
 50 |         'a'
 51 |         >>> lower_letter(26)
 52 |         'z'
 53 |         >>> lower_letter(27)
 54 |         'aa'
 55 |     """
 56 |     if n < 1:
 57 |         msg = f"0 and <1 are not defined for this numbering: {n}"
 58 |         raise ValueError(msg)
 59 |     result = ""
 60 |     while n:
 61 |         n, remainder = divmod(n - 1, 26)
 62 |         result = ascii_lowercase[remainder] + result
 63 |     return result
 64 | 
 65 | 
 66 | def upper_letter(n: int) -> str:
 67 |     """Get int as an upprecase letter.
 68 | 
 69 |     :param n: any positive integer
 70 |     :return: the kind of "numbering" used for numbered lists and excel columns.
 71 |     """
 72 |     return lower_letter(n).upper()
 73 | 
 74 | 
 75 | def lower_roman(n: int) -> str:
 76 |     """Convert a positive integer to a lowercase Roman numeral.
 77 | 
 78 |     :param n: any positive integer
 79 |     :return: Roman number equivalent of n
 80 |     :raise ValueError: if n is not a positive integer
 81 | 
 82 |         >>> lower_roman(1)
 83 |         'i'
 84 |         >>> lower_roman(9)
 85 |         'ix'
 86 |         >>> lower_roman(44)
 87 |         'xliv'
 88 | 
 89 |     Numbers greater than 3999 can be expressed with a bar over the number. The bar
 90 |     means "times 1000" (e.g., iv with a bar over it would be 4000).
 91 | 
 92 |     It'll never happen in this project, and I don't want to add non-ascii to what
 93 |     might be a pure ascii file, so this function will keep adding 'm' to as many
 94 |     thousand as you'd like.
 95 | 
 96 |         >>> lower_roman(10000)
 97 |         'mmmmmmmmmm'
 98 |     """
 99 |     if n < 1:
100 |         msg = f"the Romans hadn't figured out {n}"
101 |         raise ValueError(msg)
102 |     result = "i" * n
103 |     for pattern, replacement in ROMAN_SUBS:
104 |         result = result.replace(pattern, replacement)
105 |     return result
106 | 
107 | 
def upper_roman(n: int) -> str:
    """Get int as an uppercase Roman numeral.

    :param n: any positive integer
    :return: the result of ``lower_roman(n)`` converted to uppercase
    """
    lowercase = lower_roman(n)
    return lowercase.upper()
115 | 
116 | 
def decimal(n: int) -> str:
    """Render an int as its decimal string.

    :param n: any integer
    :return: string such that int(decimal(n)) == n
    """
    as_text = str(n)
    return as_text
124 | 
125 | 
def bullet(_: int = 0) -> str:
    """Get the ascii stand-in used for every bullet.

    :param _: ignored; accepted so the call shape matches the other formatters
        in this module, which take an int
    :return: the string we're using to replace bullets.
    """
    marker = "--"
    return marker
132 | 


--------------------------------------------------------------------------------
/docx2python/py.typed:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ShayHill/docx2python/7ba456ea2fdbe62daa97aacb5fa7cb413221ff21/docx2python/py.typed


--------------------------------------------------------------------------------
/docx2python/text_runs.py:
--------------------------------------------------------------------------------
  1 | """Get text run formatting.
  2 | 
  3 | :author: Shay Hill
  4 | :created: 7/4/2019
  5 | 
  6 | Text runs are formatted inline in the ``trash/document.xml`` or header files. Read
  7 | those elements to extract formatting information.
  8 | """
  9 | 
 10 | from __future__ import annotations
 11 | 
 12 | from collections import defaultdict
 13 | from contextlib import suppress
 14 | from typing import TYPE_CHECKING
 15 | 
 16 | from docx2python.attribute_register import (
 17 |     HtmlFormatter,
 18 |     Tags,
 19 |     get_localname,
 20 |     get_prefixed_tag,
 21 | )
 22 | from docx2python.namespace import find_parent_by_qn, qn
 23 | 
 24 | if TYPE_CHECKING:
 25 |     from collections.abc import Sequence
 26 | 
 27 |     from lxml.etree import _Element as EtreeElement  # type: ignore
 28 | 
 29 | 
 30 | def _gather_sub_vals(element: EtreeElement, qname: str) -> dict[str, str | None]:
 31 |     """Gather formatting elements for a paragraph or text run.
 32 | 
 33 |     :param element: a ``<w:r>`` or ``<w:p>`` xml element. Maybe others
 34 |     :param qname: qualified name for child element.
 35 | 
 36 |     create with::
 37 | 
 38 |         document = etree.fromstring('bytes string')
 39 |         # recursively search document for <w:r> elements.
 40 | 
 41 |     :return: Style names ('b/', 'sz', etc.) mapped to values.
 42 | 
 43 |     To keep things more homogeneous, I've given tags like ``<w:b/>`` (bold) a value of
 44 |     None, even though they don't take a value in xml.
 45 | 
 46 |     Each element of rPr will be either present (returned tag: None) or have a value
 47 |     (returned tag: val).
 48 | 
 49 |     **E.g., given**::
 50 | 
 51 |          <w:r w:rsidRPr="000E1B98">
 52 |             <w:rPr>
 53 |                 <w:rFonts w:ascii="Arial"/>
 54 |                 <w:b/>
 55 |                 <w:sz w:val="32"/>
 56 |                 <w:szCs w:val="32"/>
 57 |                 <w:u w:val="single"/>
 58 |             </w:rPr>
 59 |             <w:t>text styled  with rPaa
 60 |             </w:t>
 61 |         </w:r>
 62 | 
 63 |     **E.g., returns**::
 64 | 
 65 |         {
 66 |             "rFonts": True,
 67 |             "b": None,
 68 |             "u": "single",
 69 |             "i": None,
 70 |             "sz": "32",
 71 |             "color": "red",
 72 |             "szCs": "32",
 73 |         }
 74 |     """
 75 |     sub_vals: dict[str, str | None] = {}
 76 |     with suppress(StopIteration):
 77 |         for sub_element in next(element.iterfind(qname)):
 78 |             sub_val = sub_element.attrib.get(qn(sub_element, "w:val"))
 79 | 
 80 |             if sub_val:
 81 |                 sub_vals[get_localname(sub_element)] = str(sub_val)
 82 |             else:
 83 |                 sub_vals[get_localname(sub_element)] = None
 84 |     return sub_vals
 85 | 
 86 | 
 87 | def gather_Pr(element: EtreeElement, tag: str | None = None) -> dict[str, str | None]:
 88 |     """Gather style values for a <w:r>, <w:tc>, or <w:p> element (maybe others).
 89 | 
 90 |     :param element: any xml element. r and p elems typically have Pr values.
 91 |     :param tag: optionally specify a tag to search for, e.g., 'w:sdt'
 92 |     :return: Style names ('b/', 'sz', etc.) mapped to values.
 93 | 
 94 |     These elements often have a subelement ``<w:pPr>`` or ``<w:rPr>`` which contains
 95 |     formatting instructions. This includes colspan, rowspan, and other table-cell
 96 |     properties.
 97 | 
 98 |     Will infer a style element qualified name: p -> pPr; r -> rPr
 99 | 
100 |     Call this with any element. Runs and Paragraphs may have a Pr element. Most
101 |     elements will not, but the function will will quietly return an empty dict.
102 | 
103 |     **Optional tag argument**
104 | 
105 |     The properties element is a child of the element it describes. With the default
106 |     tag=None argument, this function will return that child. Given a tag, the
107 |     function will first search up for a matching tag, then return the properties
108 |     element of that tag. This allows simple access to, for example, the pPr element
109 |     from a descendent `w:t` or `w:r` element.
110 | 
111 |     ```
112 |     <w:p>
113 |         <w:pPr> </wpPr>
114 |         <w:r>
115 |             <w:t> </w:t>
116 |         </w:r>
117 |     </w:p>
118 |     ```
119 |     """
120 |     parent = element if tag is None else find_parent_by_qn(element, tag)
121 |     if parent is None:
122 |         return {}
123 |     return _gather_sub_vals(parent, str(parent.tag) + "Pr")
124 | 
125 | 
def get_pStyle(paragraph_element: EtreeElement) -> str:
    """Get the paragraph -> pPr -> pStyle value.

    :param paragraph_element: a ``<w:p>`` xml element

    :return: the pStyle value, or ``""`` when pStyle is absent or has no value

    Also see docstring for ``gather_Pr``
    """
    p_style = gather_Pr(paragraph_element).get("pStyle", "")
    if p_style:
        return p_style
    return ""
136 | 
137 | 
def get_run_formatting(
    run_element: EtreeElement, xml2html: dict[str, HtmlFormatter]
) -> list[str]:
    """Convert a run element's formatting into html.

    :param run_element: a ``<w:r>`` xml element
        create with::

            document = etree.fromstring('bytes string')
            # recursively search document for <w:r> elements.

    :param xml2html: mapping to convert xml styles to html styles
        e.g., {
            'b': (<function <lambda> at 0x0000026BC7875A60>,),
            'smallCaps': (<function <lambda> at 0x0000026BC7896DC0>, 'font', 'style')
        }

    :return: ``['b', 'i', ...]``

    The list is always ordered the same way: container tags with properties
    (e.g., ``"span"``) first, then any other styles in alphabetical order.

    Also see docstrings for ``gather_Pr`` and ``_format_Pr_into_html``
    """
    run_properties = gather_Pr(run_element)
    return _format_Pr_into_html(run_properties, xml2html)
164 | 
165 | 
def get_paragraph_formatting(
    paragraph_element: EtreeElement, xml2html: dict[str, HtmlFormatter]
) -> list[str]:
    """Convert a paragraph element's style into html.

    :param paragraph_element: a ``<w:p>`` xml element
        create with::

            document = etree.fromstring('bytes string')
            # recursively search document for <w:p> elements.

    :param xml2html: mapping to convert xml styles to html styles
        e.g., {
            'b': (<function <lambda> at 0x0000026BC7875A60>,),
            'smallCaps': (<function <lambda> at 0x0000026BC7896DC0>, 'font', 'style')
        }

    :return: ``['b', 'i', ...]``

    Only the paragraph's pStyle value is considered: it is looked up in
    ``xml2html`` as though it were a property tag with no value.

    Also see docstrings for ``get_pStyle`` and ``_format_Pr_into_html``
    """
    style_as_property: dict[str, str | None] = {get_pStyle(paragraph_element): None}
    return _format_Pr_into_html(style_as_property, xml2html)
192 | 
193 | 
194 | def _format_Pr_into_html(
195 |     Pr2val: dict[str, str | None], xml2html: dict[str, HtmlFormatter]
196 | ) -> list[str]:
197 |     """Format tags and values into html strings.
198 | 
199 |     :param Pr2val: tags mapped to values (extracted from xml)
200 |         e.g., {'b': None, 'bCs': None}
201 |     :param xml2html: mapping to convert xml styles to html styles
202 |         e.g., {
203 |             'b': (<function <lambda> at 0x0000026BC7875A60>,),
204 |             'smallCaps': (<function <lambda> at 0x0000026BC7896DC0>, 'span', 'style')
205 |         }
206 |     :return: the interior part of html opening tags, eg, ['span style="..."', 'b', 'i']
207 | 
208 |     Types of styles supported:
209 |     (None, None, formatter -> tag, None)
210 |         -> outside any containers, no value set, e.g., `<b>`
211 |     ('span', 'style', formatter -> tag, val)
212 |         -> inside a span, inside a style property, e.g., `<span style="tag: val">`
213 | 
214 |     Other formats would probably work, but they aren't necessary to support the tags
215 |     supported (see README).
216 |     """
217 |     style: list[str] = []
218 | 
219 |     # group together supported formats with the same container and property_
220 |     # e.g., group together everything that goes into `<span style="$HERE$">`
221 |     # con_pro2for[(con, pro)] = string created from for
222 |     con_pro2for: defaultdict[tuple[None | str, None | str], list[str]]
223 |     con_pro2for = defaultdict(list)
224 |     for tag, val in ((k, v) for k, v in Pr2val.items() if k in xml2html):
225 |         formatter, container, property_ = xml2html[tag]
226 |         con_pro2for[(container, property_)].append(formatter(tag, val or ""))
227 | 
228 |     # group together supported formats with the same container
229 |     # e.g., group together everything that goes into `<span $HERE$>`
230 |     # con2pro_for[(con,)] = string created from pro and for
231 |     con2pro_for: defaultdict[str, list[str]] = defaultdict(list)
232 |     for k, v in sorted((k, v) for k, v in con_pro2for.items() if k[1] is not None):
233 |         con2pro_for[k[0] or ""].append(f'{k[1]}="{";".join(sorted(v))}"')
234 | 
235 |     # incorporate container type into string
236 |     # style.append(string created from con, pro, and for)
237 |     for k_, v_ in sorted((k, v) for k, v in con2pro_for.items() if k):
238 |         style.append(f"{k_} {' '.join(v_)}")
239 | 
240 |     # add back in formats with no container or property_
241 |     style += sorted(con_pro2for[(None, None)])
242 |     return style
243 | 
244 | 
def get_html_formatting(
    elem: EtreeElement, xml2html: dict[str, HtmlFormatter]
) -> list[str]:
    """Get html style for a run or paragraph element (if available).

    :param elem: a run or paragraph element.
    :param xml2html: mapping to convert xml styles to html styles
        e.g., {
            'b': (<function <lambda> at 0x0000026BC7875A60>,),
            'smallCaps': (<function <lambda> at 0x0000026BC7896DC0>, 'font', 'style')
        }
    :return: the interior of html opening tags, e.g., ``['b', 'i', ...]``. Empty
        for any element that is neither a run nor a paragraph.
    """
    prefixed_tag = get_prefixed_tag(elem)
    if prefixed_tag == Tags.RUN:
        return get_run_formatting(elem, xml2html)
    if prefixed_tag == Tags.PARAGRAPH:
        return get_paragraph_formatting(elem, xml2html)
    return []
263 | 
264 | 
def html_open(style: Sequence[str]) -> str:
    """Build the opening html tags for a style.

    :param style: sequence of html tags without the '<' and '>'
    :return: each item wrapped in '<' and '>', joined into a single string

    >>> style = ['font color="red" size="32"', 'b', 'i', 'u']
    >>> html_open(style)
    '<font color="red" size="32"><b><i><u>'
    """
    openers = [f"<{tag}>" for tag in style]
    return "".join(openers)
276 | 
277 | 
def html_close(style: list[str]) -> str:
    """Build the closing html tags for a style.

    :param style: sequence of html tags without the '<' and '>'
    :return: closing html tags joined into a single string

    >>> style = ['font color="red" size="32"', 'b', 'i', 'u']
    >>> html_close(style)
    '</u></i></b></font>'

    Only the first whitespace-delimited word of each item (the tag name) is kept.
    Tags are closed in the reverse of the order given, so open - close will
    look like::

        <b><i><u>text</u></i></b>
    """
    closers: list[str] = []
    for opener in reversed(style):
        tag_name = opener.split()[0]
        closers.append(f"</{tag_name}>")
    return "".join(closers)
293 | 


--------------------------------------------------------------------------------
/docx2python/utilities.py:
--------------------------------------------------------------------------------
"""Utility / example functions using new (as of 2.0.0) Docx2Python features.
  2 | 
  3 | :author: Shay Hill
  4 | :created: 2021-12-21
  5 | 
  6 | Docx2Python version two exposes extracted xml in the DocxReader object and has a new
  7 | paragraph_styles argument. These functions use these new features as utilities /
  8 | examples.
  9 | """
 10 | 
 11 | from __future__ import annotations
 12 | 
 13 | import copy
 14 | import re
 15 | from typing import TYPE_CHECKING
 16 | 
 17 | from lxml import etree
 18 | 
 19 | from docx2python.iterators import iter_at_depth
 20 | from docx2python.main import docx2python
 21 | 
 22 | if TYPE_CHECKING:
 23 |     import os
 24 |     from collections.abc import Iterator
 25 | 
 26 |     from lxml.etree import _Element as EtreeElement  # type: ignore
 27 | 
 28 | 
 29 | def _copy_new_text(elem: EtreeElement, new_text: str) -> EtreeElement:
 30 |     """Copy a text element and replace text.
 31 | 
 32 |     :param elem: an etree element with tag w:t
 33 |     :param new_text: text to replace elem.text
 34 |     :return: a new etree element with tag w:t and text new_text
 35 |     """
 36 |     new_elem = copy.deepcopy(elem)
 37 |     new_elem.text = new_text
 38 |     return new_elem
 39 | 
 40 | 
 41 | def _new_br_element(elem: EtreeElement) -> EtreeElement:
 42 |     """Return a break element with a representative elements namespace.
 43 | 
 44 |     :param elem: xml element
 45 |     :return: a new br element
 46 |     """
 47 |     prefix = elem.nsmap["w"]
 48 |     return etree.Element(f"{{{prefix}}}br")
 49 | 
 50 | 
 51 | def replace_root_text(root: EtreeElement, old: str, new: str) -> None:
 52 |     """Replace :old: with :new: in all descendants of :root:.
 53 | 
 54 |     :param root: an etree element presumably containing descendant text elements
 55 |     :param old: text to be replaced
 56 |     :param new: replacement text
 57 | 
 58 |     Will use softbreaks <br> to preserve line breaks in replacement text.
 59 |     """
 60 | 
 61 |     def recursive_text_replace(branch: EtreeElement):
 62 |         """Replace any text element contining old with one or more elements.
 63 | 
 64 |         :param branch: an etree element
 65 |         """
 66 |         for elem in tuple(branch):
 67 |             if not elem.text or old not in elem.text:
 68 |                 recursive_text_replace(elem)
 69 |                 continue
 70 | 
 71 |             # create a new text element for each line in replacement text
 72 |             text = elem.text.replace(old, new)
 73 |             new_elems = [_copy_new_text(elem, line) for line in text.splitlines()]
 74 | 
 75 |             # insert breakpoints where line breaks were
 76 |             breaks = [_new_br_element(elem) for _ in new_elems]
 77 |             new_elems = [x for pair in zip(new_elems, breaks) for x in pair][:-1]
 78 | 
 79 |             # replace the original element with the new elements
 80 |             parent = elem.getparent()
 81 |             if parent is not None:
 82 |                 index = parent.index(elem)
 83 |                 parent[index : index + 1] = new_elems
 84 | 
 85 |     recursive_text_replace(root)
 86 | 
 87 | 
 88 | def replace_docx_text(
 89 |     path_in: str | os.PathLike[str],
 90 |     path_out: str | os.PathLike[str],
 91 |     *replacements: tuple[str, str],
 92 |     html: bool = False,
 93 | ) -> None:
 94 |     """Replace text in a docx file.
 95 | 
 96 |     :param path_in: path to input docx
 97 |     :param path_out: path to output docx with text replaced
 98 |     :param replacements: tuples of strings (a, b) replace a with b for each in docx.
 99 |     :param html: respect formatting (as far as docx2python can see formatting)
100 |     """
101 |     reader = docx2python(path_in, html=html).docx_reader
102 |     for file in reader.content_files():
103 |         root = file.root_element
104 |         for replacement in replacements:
105 |             replace_root_text(root, *replacement)
106 |     reader.save(path_out)
107 |     reader.close()
108 | 
109 | 
def get_links(path_in: str | os.PathLike[str]) -> Iterator[tuple[str, str]]:
    """Yield links inside a docx file as (href, text).

    :param path_in: path to input docx
    :yield: every link in the file as a tuple of (href, text)
    :return: None

    Only runs that start with an ``<a href="...">...</a>`` tag are reported.

    The extraction is held open by a ``with`` block, so it is released when the
    generator finishes, is closed early by the caller, or raises part way
    through.
    """
    link_pattern = re.compile('<a href="(?P<href>[^"]+)">(?P<text>[^<]+)</a>')
    with docx2python(path_in) as extraction:
        for run in iter_at_depth(extraction.document_runs, 5):
            match = link_pattern.match(run)
            if match:
                href, text = match.groups()
                yield href, text
125 | 
126 | 
def get_headings(path_in: str | os.PathLike[str]) -> Iterator[list[str]]:
    """Yield paragraphs with a 'Heading' paragraph style.

    :param path_in: path to input docx
    :yield: ``run_strings`` of every paragraph whose style starts with
        'Heading' followed by a digit
    :return: None

    The file is extracted with ``html=True``. Each paragraph found at depth 4
    of ``document_pars`` is tested through its ``style`` attribute.
    """
    heading_pattern = re.compile(r"Heading\d")
    with docx2python(path_in, html=True) as extraction:
        paragraphs = iter_at_depth(extraction.document_pars, 4)
        for par in paragraphs:
            if heading_pattern.match(par.style):
                yield par.run_strings
143 | 


--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
 1 | 
 2 | [project]
 3 | name = "docx2python"
 4 | version = "3.5.0"
 5 | description = "Extract content from docx files"
 6 | authors = [{ name = "Shay Hill", email = "shay_public@hotmail.com" }]
 7 | license = {text = "MIT"}
 8 | readme = "README.md"
 9 | requires-python = ">=3.9.0"
10 | dependencies = ["lxml", "paragraphs", "typing_extensions", "types-lxml"]
11 | 
12 | [project.optional-dependencies]
13 | dev = ["commitizen", "pre-commit", "pytest", "tox", "types-lxml"]
14 | 
15 | [build-system]
16 | requires = ["setuptools", "setuptools-scm"]
17 | build-backend = "setuptools.build_meta"
18 | 
19 | 
20 | [tool.commitizen]
21 | name = "cz_conventional_commits"
22 | version = "3.5.0"
23 | tag_format = "$version"
24 | major-version-zero = true
25 | version_files = ["pyproject.toml:^version"]
26 | 
27 | 
28 | [tool.isort]
29 | profile = "black"
30 | 
31 | 
32 | [tool.tox]
33 | legacy_tox_ini = """
34 | [tox]
35 | envlist = py{313,312,311,310,39}
36 | 
37 | [testenv]
38 | deps = pytest
39 | commands = pytest
40 | """
41 | 
42 | 
43 | [tool.pytest.ini_options]
44 | pythonpath = ["tests"]
45 | log_cli = 1
46 | 
47 | 
48 | [tool.pyright]
49 | include = ["src"]
50 | exclude = ["**/__pycache__.py"]
51 | 
52 | pythonVersion = "3.9"
53 | pythonPlatform = "Any"
54 | 
55 | typeCheckingMode = "strict"
56 | reportShadowedImports = true
57 | reportCallInDefaultInitializer = true
58 | reportImplicitStringConcatenation = true
59 | # reportMissingSuperCall = true
60 | reportPropertyTypeMismatch = true
61 | reportUninitializedInstanceVariable = true
62 | reportUnnecessaryTypeIgnoreComment = true
63 | reportUnusedCallResult = true
64 | reportUnknownArgumentType = false
65 | reportUnknownLambdaType = false
66 | reportUnknownMemberType = false
67 | reportUnknownParameterType = false
68 | reportUnknownVariableType = false
69 | reportUntypedFunctionDecorator = false
70 | 
71 | venvPath = "."
72 | venv = "./venv"
73 | 


--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ShayHill/docx2python/7ba456ea2fdbe62daa97aacb5fa7cb413221ff21/tests/__init__.py


--------------------------------------------------------------------------------
/tests/conftest.py:
--------------------------------------------------------------------------------
 1 | """
 2 | 
 3 | :author: Shay Hill
 4 | :created: 7/2/2019
 5 | 
 6 | """
 7 | 
 8 | from __future__ import annotations
 9 | 
10 | from pathlib import Path
11 | from typing import Any
12 | 
# Project root: the directory two levels above this file.
_PROJECT = Path(__file__).parent.parent
14 | 
15 | 
def pytest_assertrepr_compare(config: Any, op: str, left: str, right: str) -> list[str]:
    """Show both operands in full when an == or != assertion fails.

    :param config: unused
    :param op: the comparison operator of the failed assertion
    :param left: left operand
    :param right: right operand
    :return: a single line ``"left op right"`` for ``==`` and ``!=``; an empty
        list for every other operator
    """
    del config
    is_equality_check = op in ("==", "!=")
    return [f"{left} {op} {right}"] if is_equality_check else []
22 | 
23 | 
# Directory of test resource files: <project root>/tests/resources.
RESOURCES = Path(_PROJECT, "tests", "resources")
25 | 


--------------------------------------------------------------------------------
/tests/do_not_test_missing_imagedata_rid.py:
--------------------------------------------------------------------------------
 1 | """Skip image element when imagedata r:id cannot be found.
 2 | 
 3 | :author: Shay Hill
 4 | :created: 11/15/2020
 5 | 
 6 | User forky2 sent a docx file with an empty imagedata element:
 7 | 
 8 | `<v:imagedata croptop="-65520f" cropbottom="65520f"/>`
 9 | 
10 | Docx2python expects to encounter
11 | 
12 | `<v:imagedata r:id="rId689" o:title=""/>`
13 | 
14 | Where `r:id="rId689"` is mapped to an image filename in one of the `rels` files.
15 | 
16 | The missing `r:id` raises a KeyError in docx2python v1.27
17 | 
18 | ```
19 |     Traceback (most recent call last):
20 |       File "./process.py", line 99, in <module>
21 |         process_zip("Specs/2020-06/Rel-16/25_series/25101-g10.zip")
22 |       File "./process.py", line 70, in process_zip
23 |         doc_data = docx2python(docx_file)
24 |       File "/home/forky2/projects/docx2python/docx2python/main.py", line 61, in docx2python
25 |         body = file_text(context["officeDocument"])
26 |       File "/home/forky2/projects/docx2python/docx2python/main.py", line 56, in file_text
27 |         return get_text(unzipped, context)
28 |       File "/home/forky2/projects/docx2python/docx2python/docx_text.py", line 264, in get_text
29 |         branches(ElementTree.fromstring(xml))
30 |       File "/home/forky2/projects/docx2python/docx2python/docx_text.py", line 248, in branches
31 |         branches(child)
32 |       File "/home/forky2/projects/docx2python/docx2python/docx_text.py", line 248, in branches
33 |         branches(child)
34 |       File "/home/forky2/projects/docx2python/docx2python/docx_text.py", line 248, in branches
35 |         branches(child)
36 |       [Previous line repeated 2 more times]
37 |       File "/home/forky2/projects/docx2python/docx2python/docx_text.py", line 239, in branches
38 |         rId = child.attrib[qn("r:id")]
39 |     KeyError: '{http://schemas.openxmlformats.org/officeDocument/2006/relationships}id'
40 | ```
41 | 
42 | Solution: skip silently when an `r:id` cannot be found for an `imagedata` element.
43 | """
44 | 
45 | # from docx2python import docx2python
46 | 
47 | 
48 | # class TestMissingRIdInImagedata:
49 | # def test_skips_missing_rid(self) -> None:
50 | # """Silently skip over imagedata element if r:id not found"""
51 | # pars = docx2python("resources/imagedata_without_rid.docx")
52 | 


--------------------------------------------------------------------------------
/tests/do_not_test_problem_files.py:
--------------------------------------------------------------------------------
 1 | """Run problem files I come across.
 2 | 
 3 | :author: Shay Hill
 4 | :created: 7/17/2019
 5 | """
 6 | 
 7 | from docx2python.main import docx2python
 8 | 
 9 | 
def test_dop_1013a() -> None:
    """Misidentifies ``word/document.xml`` as ``word/word/document.xml``

    Passes if both files can be opened and closed without raising.
    """
    problem_files = (
        "resources/example.docx",
        "resources/240-DOP-1013A Lay Down Tubulars.docx",
    )
    for filename in problem_files:
        with docx2python(filename) as _:
            pass
16 | 


--------------------------------------------------------------------------------
/tests/resources/240-DOP-1013A Lay Down Tubulars.docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ShayHill/docx2python/7ba456ea2fdbe62daa97aacb5fa7cb413221ff21/tests/resources/240-DOP-1013A Lay Down Tubulars.docx


--------------------------------------------------------------------------------
/tests/resources/ControlTest.docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ShayHill/docx2python/7ba456ea2fdbe62daa97aacb5fa7cb413221ff21/tests/resources/ControlTest.docx


--------------------------------------------------------------------------------
/tests/resources/apples_and_pears.docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ShayHill/docx2python/7ba456ea2fdbe62daa97aacb5fa7cb413221ff21/tests/resources/apples_and_pears.docx


--------------------------------------------------------------------------------
/tests/resources/ascii_printable.docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ShayHill/docx2python/7ba456ea2fdbe62daa97aacb5fa7cb413221ff21/tests/resources/ascii_printable.docx


--------------------------------------------------------------------------------
/tests/resources/basic.docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ShayHill/docx2python/7ba456ea2fdbe62daa97aacb5fa7cb413221ff21/tests/resources/basic.docx


--------------------------------------------------------------------------------
/tests/resources/check_drop_my.docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ShayHill/docx2python/7ba456ea2fdbe62daa97aacb5fa7cb413221ff21/tests/resources/check_drop_my.docx


--------------------------------------------------------------------------------
/tests/resources/checked-true-false.docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ShayHill/docx2python/7ba456ea2fdbe62daa97aacb5fa7cb413221ff21/tests/resources/checked-true-false.docx


--------------------------------------------------------------------------------
/tests/resources/checked_boxes.docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ShayHill/docx2python/7ba456ea2fdbe62daa97aacb5fa7cb413221ff21/tests/resources/checked_boxes.docx


--------------------------------------------------------------------------------
/tests/resources/checked_drop1.docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ShayHill/docx2python/7ba456ea2fdbe62daa97aacb5fa7cb413221ff21/tests/resources/checked_drop1.docx


--------------------------------------------------------------------------------
/tests/resources/comments.docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ShayHill/docx2python/7ba456ea2fdbe62daa97aacb5fa7cb413221ff21/tests/resources/comments.docx


--------------------------------------------------------------------------------
/tests/resources/created-in-pages-bulleted-lists.docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ShayHill/docx2python/7ba456ea2fdbe62daa97aacb5fa7cb413221ff21/tests/resources/created-in-pages-bulleted-lists.docx


--------------------------------------------------------------------------------
/tests/resources/created-in-pages-paragraphs-only.docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ShayHill/docx2python/7ba456ea2fdbe62daa97aacb5fa7cb413221ff21/tests/resources/created-in-pages-paragraphs-only.docx


--------------------------------------------------------------------------------
/tests/resources/equations.docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ShayHill/docx2python/7ba456ea2fdbe62daa97aacb5fa7cb413221ff21/tests/resources/equations.docx


--------------------------------------------------------------------------------
/tests/resources/example.docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ShayHill/docx2python/7ba456ea2fdbe62daa97aacb5fa7cb413221ff21/tests/resources/example.docx


--------------------------------------------------------------------------------
/tests/resources/example_numbering.docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ShayHill/docx2python/7ba456ea2fdbe62daa97aacb5fa7cb413221ff21/tests/resources/example_numbering.docx


--------------------------------------------------------------------------------
/tests/resources/has_pict.docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ShayHill/docx2python/7ba456ea2fdbe62daa97aacb5fa7cb413221ff21/tests/resources/has_pict.docx


--------------------------------------------------------------------------------
/tests/resources/hyperlink.docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ShayHill/docx2python/7ba456ea2fdbe62daa97aacb5fa7cb413221ff21/tests/resources/hyperlink.docx


--------------------------------------------------------------------------------
/tests/resources/imagedata_without_rid.docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ShayHill/docx2python/7ba456ea2fdbe62daa97aacb5fa7cb413221ff21/tests/resources/imagedata_without_rid.docx


--------------------------------------------------------------------------------
/tests/resources/invalid_tag_name.docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ShayHill/docx2python/7ba456ea2fdbe62daa97aacb5fa7cb413221ff21/tests/resources/invalid_tag_name.docx


--------------------------------------------------------------------------------
/tests/resources/libreoffice_conversion.docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ShayHill/docx2python/7ba456ea2fdbe62daa97aacb5fa7cb413221ff21/tests/resources/libreoffice_conversion.docx


--------------------------------------------------------------------------------
/tests/resources/list_index_a.docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ShayHill/docx2python/7ba456ea2fdbe62daa97aacb5fa7cb413221ff21/tests/resources/list_index_a.docx


--------------------------------------------------------------------------------
/tests/resources/long_hyperlink.docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ShayHill/docx2python/7ba456ea2fdbe62daa97aacb5fa7cb413221ff21/tests/resources/long_hyperlink.docx


--------------------------------------------------------------------------------
/tests/resources/merged_cells.docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ShayHill/docx2python/7ba456ea2fdbe62daa97aacb5fa7cb413221ff21/tests/resources/merged_cells.docx


--------------------------------------------------------------------------------
/tests/resources/merged_links.docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ShayHill/docx2python/7ba456ea2fdbe62daa97aacb5fa7cb413221ff21/tests/resources/merged_links.docx


--------------------------------------------------------------------------------
/tests/resources/multiple_runs_per_paragraph.docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ShayHill/docx2python/7ba456ea2fdbe62daa97aacb5fa7cb413221ff21/tests/resources/multiple_runs_per_paragraph.docx


--------------------------------------------------------------------------------
/tests/resources/nested_paragraphs.docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ShayHill/docx2python/7ba456ea2fdbe62daa97aacb5fa7cb413221ff21/tests/resources/nested_paragraphs.docx


--------------------------------------------------------------------------------
/tests/resources/nested_paragraphs_in_header.docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ShayHill/docx2python/7ba456ea2fdbe62daa97aacb5fa7cb413221ff21/tests/resources/nested_paragraphs_in_header.docx


--------------------------------------------------------------------------------
/tests/resources/nested_paragraphs_in_header3b.docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ShayHill/docx2python/7ba456ea2fdbe62daa97aacb5fa7cb413221ff21/tests/resources/nested_paragraphs_in_header3b.docx


--------------------------------------------------------------------------------
/tests/resources/paragraphs_and_tables.docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ShayHill/docx2python/7ba456ea2fdbe62daa97aacb5fa7cb413221ff21/tests/resources/paragraphs_and_tables.docx


--------------------------------------------------------------------------------
/tests/resources/pic_alt_text.docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ShayHill/docx2python/7ba456ea2fdbe62daa97aacb5fa7cb413221ff21/tests/resources/pic_alt_text.docx


--------------------------------------------------------------------------------
/tests/resources/renamed_document_xml.docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ShayHill/docx2python/7ba456ea2fdbe62daa97aacb5fa7cb413221ff21/tests/resources/renamed_document_xml.docx


--------------------------------------------------------------------------------
/tests/resources/run_styles.docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ShayHill/docx2python/7ba456ea2fdbe62daa97aacb5fa7cb413221ff21/tests/resources/run_styles.docx


--------------------------------------------------------------------------------
/tests/resources/slanted_quotes.docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ShayHill/docx2python/7ba456ea2fdbe62daa97aacb5fa7cb413221ff21/tests/resources/slanted_quotes.docx


--------------------------------------------------------------------------------
/tests/resources/soft_line_breaks.docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ShayHill/docx2python/7ba456ea2fdbe62daa97aacb5fa7cb413221ff21/tests/resources/soft_line_breaks.docx


--------------------------------------------------------------------------------
/tests/resources/strict.docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ShayHill/docx2python/7ba456ea2fdbe62daa97aacb5fa7cb413221ff21/tests/resources/strict.docx


--------------------------------------------------------------------------------
/tests/resources/symbols.docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ShayHill/docx2python/7ba456ea2fdbe62daa97aacb5fa7cb413221ff21/tests/resources/symbols.docx


--------------------------------------------------------------------------------
/tests/resources/test-docx2python-conversion-google_docs.docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ShayHill/docx2python/7ba456ea2fdbe62daa97aacb5fa7cb413221ff21/tests/resources/test-docx2python-conversion-google_docs.docx


--------------------------------------------------------------------------------
/tests/resources/test_file_with_comments.docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ShayHill/docx2python/7ba456ea2fdbe62daa97aacb5fa7cb413221ff21/tests/resources/test_file_with_comments.docx


--------------------------------------------------------------------------------
/tests/resources/unchecked_drop0.docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ShayHill/docx2python/7ba456ea2fdbe62daa97aacb5fa7cb413221ff21/tests/resources/unchecked_drop0.docx


--------------------------------------------------------------------------------
/tests/resources/zen_of_python.docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ShayHill/docx2python/7ba456ea2fdbe62daa97aacb5fa7cb413221ff21/tests/resources/zen_of_python.docx


--------------------------------------------------------------------------------
/tests/test_ascii_printable.py:
--------------------------------------------------------------------------------
 1 | """Test that most characters in string.printable are represented
 2 | 
 3 | (some are altered) in Docx2Python output.
 4 | """
 5 | 
 6 | import string
 7 | 
 8 | from docx2python.main import docx2python
 9 | from tests.conftest import RESOURCES
10 | 
11 | 
12 | class TestAsciiPrintable:
13 |     """Confirming this works with v1.25"""
14 | 
15 |     def test_exact_representation(self) -> None:
16 |         """Most characters are represented exactly
17 |         The last seven characters are
18 |         \n\r\x0b\b0cEND
19 |         \n \r \x0b and \x0c are ignored by word when typed.
20 |         END is there (added by hand to docx file) to let me know I'm past any
21 |         trailing characters
22 |         """
23 |         with docx2python(RESOURCES / "ascii_printable.docx") as pars:
24 |             assert pars.text[:-7] == string.printable[:-4]
25 | 
26 |     def test_html_true(self) -> None:
27 |         """Most characters are represented exactly. &, <, and > are escaped.
28 | 
29 |         The last seven characters are
30 |         \n\r\x0b\b0cEND
31 |         \n \r \x0b and \x0c are ignored by word when typed.
32 |         END is there (added by hand to docx file) to let me know I'm past any
33 |         trailing characters
34 |         """
35 |         pars = docx2python(RESOURCES / "ascii_printable.docx", html=True)
36 |         assert pars.text[:-7] == (
37 |             '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&amp'
38 |             ";'()*+,-./:;&lt;=&gt;?@[\\]^_`{|}~ \t"
39 |         )
40 |         pars.close()
41 | 


--------------------------------------------------------------------------------
/tests/test_check_drop.py:
--------------------------------------------------------------------------------
 1 | """Test checkbox exports from a user-submitted and my own checkbox files.
 2 | 
 3 | :author: Shay Hill
 4 | :created: 6/17/2020
 5 | 
 6 | List items from the user-submitted docx were listed B then A. Confusing for the test,
 7 | but I didn't want to alter it in my version of Word.
 8 | """
 9 | 
10 | from docx2python.main import docx2python
11 | from tests.conftest import RESOURCES
12 | 
13 | 
14 | class TestCheckboxToHtml:
15 |     def test_user_checked_dropdown0(self) -> None:
16 |         """Get checked-out box glyph and second dd entry"""
17 |         extraction = docx2python(RESOURCES / "checked_drop1.docx")
18 |         assert extraction.body_runs == [[[[["☒", " "], ["PIlihan A"]]]]]
19 |         extraction.close()
20 | 
21 |     def test_user_unchecked_dropdown1(self) -> None:
22 |         """Get unchecked box glyph and first dd entry"""
23 |         extraction = docx2python(RESOURCES / "unchecked_drop0.docx")
24 |         assert extraction.text == "\u2610 \n\nPiihan B"
25 |         extraction.close()
26 | 
27 |     def test_my_checkbox(self) -> None:
28 |         """A good selection of checked and unchecked boxes, and several dropdowns"""
29 |         extraction = docx2python(RESOURCES / "check_drop_my.docx")
30 |         assert extraction.body_runs == [
31 |             [
32 |                 [
33 |                     [
34 |                         ["[user unchecked]", "☐", "[user unchecked]"],
35 |                         [],
36 |                         ["[user checked]", "☒", "[user checked]"],
37 |                         [],
38 |                         ["[my unchecked]", "☐", "[my unchecked]"],
39 |                         [],
40 |                         ["[my checked]", "☒", "[my checked]"],
41 |                         [],
42 |                         ["User dropdown (Piihan B)"],
43 |                         ["Piihan B"],
44 |                         [],
45 |                         ["My dropdown (no choice)"],
46 |                     ]
47 |                 ],
48 |                 [[["Choose an item."]]],
49 |                 [[[], ["My dropdown (chose A)"]]],
50 |                 [[["my_item_A"]]],
51 |                 [[[], ["My dropdown (chose B)"]]],
52 |                 [[["my_item_B"]]],
53 |             ]
54 |         ]
55 |         extraction.close()
56 | 


--------------------------------------------------------------------------------
/tests/test_checked_boxes.py:
--------------------------------------------------------------------------------
  1 | """Identify checked boxes in user-submitted file
  2 | 
  3 | :author: Shay Hill
  4 | :created: 2021-12-17
  5 | 
  6 | From user PandaJones:
  7 | 
  8 | '''
  9 | Word docx's xml (i believe this is cause the docx version is pretty old) deletes
 10 | w:val when the checkbox is checked and has w:val = 0 when the checkbox isn't checked.
 11 | 
 12 | This causes a problems that the library defaults to 0 when w:val isn't found in
 13 | w:checked. To fix this, I just checked if there is anything attributes in w:check and
 14 | return a 1 if there isn't anything there.
 15 | 
 16 | I can probably edit the code to check if w:val exist instead as I don't know if
 17 | w:checked can have other attributes.
 18 | 
 19 | Thank for have this library be able to display checkboxes, it is super useful when
 20 | parsing through forms that have all of their stuff in tables.
 21 | '''
 22 | """
 23 | 
 24 | from docx2python import docx2python
 25 | from docx2python.iterators import iter_at_depth
 26 | from tests.conftest import RESOURCES
 27 | 
 28 | 
 29 | def test_checked_boxes_explicit() -> None:
 30 |     """
 31 |     The following text boxes are checked. Remaining checkboxes are unchecked.
 32 | 
 33 |     Adult Protective Services
 34 |     Older Adult Mental Health
 35 |     ProsecutorΓÇÖs Office
 36 |     Regional Center
 37 | 
 38 |     Coroner/Medical Examiner
 39 |     Law Enforcement
 40 |     Civil Attorney/Legal Services
 41 |     Psychologist
 42 | 
 43 |     Medical Practitioner
 44 |     LTC Ombudsman
 45 |     Public Guardian
 46 |     Other (describe):
 47 | 
 48 |     """
 49 |     pars = docx2python(RESOURCES / "checked_boxes.docx", duplicate_merged_cells=False)
 50 |     expect: list[list[list[list[str]]]] = [
 51 |         [
 52 |             [["\u2612", " Adult Protective Services"]],
 53 |             [[]],
 54 |             [["\u2612", " Older Adult Mental Health"]],
 55 |             [[]],
 56 |             [[]],
 57 |             [[]],
 58 |             [["\u2612", " Prosecutor’s Office"]],
 59 |             [[]],
 60 |             [[]],
 61 |             [[]],
 62 |             [["\u2612", " Regional Center"]],
 63 |             [[]],
 64 |         ],
 65 |         [
 66 |             [["\u2612", " Coroner/Medical Examiner"]],
 67 |             [[]],
 68 |             [["\u2612", " Law Enforcement"]],
 69 |             [[]],
 70 |             [[]],
 71 |             [[]],
 72 |             [["\u2612", " Civil Attorney/Legal Services"]],
 73 |             [[]],
 74 |             [[]],
 75 |             [[]],
 76 |             [["\u2612", " Psychologist"]],
 77 |             [[]],
 78 |         ],
 79 |         [
 80 |             [["\u2612", " Medical Practitioner"]],
 81 |             [[]],
 82 |             [["\u2612", " LTC Ombudsman"]],
 83 |             [[]],
 84 |             [[]],
 85 |             [[]],
 86 |             [["\u2612", " Public Guardian"]],
 87 |             [[]],
 88 |             [[]],
 89 |             [[]],
 90 |             [["\u2612", " Other (describe):\u2002\u2002\u2002\u2002\u2002"]],
 91 |             [[]],
 92 |         ],
 93 |     ]
 94 | 
 95 |     assert pars.body_runs[0][3:6] == expect
 96 |     pars.close()
 97 | 
 98 | 
 99 | def test_unchecked_boxes() -> None:
100 |     """
101 |     The following text boxes are checked. Remaining checkboxes are unchecked.
102 | 
103 |     Adult Protective Services
104 |     Older Adult Mental Health
105 |     ProsecutorΓÇÖs Office
106 |     Regional Center
107 | 
108 |     Coroner/Medical Examiner
109 |     Law Enforcement
110 |     Civil Attorney/Legal Services
111 |     Psychologist
112 | 
113 |     Medical Practitioner
114 |     LTC Ombudsman
115 |     Public Guardian
116 |     Other (describe):
117 | 
118 |     All other checkboxes are unchecked
119 | 
120 |     """
121 |     pars = docx2python(RESOURCES / "checked_boxes.docx", duplicate_merged_cells=False)
122 |     all_text = "".join(iter_at_depth(pars.text, 5))
123 |     assert all_text.count("\u2612") == 12
124 |     assert all_text.count("\u2610") == 32
125 |     pars.close()
126 | 
127 | 
128 | def test_checkboxes_true_false() -> None:
129 |     """
130 |     Checkboxes with "true" and "false" instead of "1" and "0" values.
131 |     """
132 |     with docx2python(RESOURCES / "checked-true-false.docx") as pars:
133 |         all_text = "".join(iter_at_depth(pars.text, 5))
134 |     assert all_text.count("\u2612") == 4
135 |     assert all_text.count("\u2610") == 4
136 | 


--------------------------------------------------------------------------------
/tests/test_close.py:
--------------------------------------------------------------------------------
 1 | """Test opening docx reader and closing it.
 2 | 
 3 | Closing a DocxReader or DocxContent instance will close the zipfile opened when the
 4 | DocxReader instance was created.
 5 | 
 6 | :author: Shay Hill
 7 | :created: 7/5/2019
 8 | """
 9 | 
10 | import pytest
11 | 
12 | from docx2python.attribute_register import Tags, get_prefixed_tag
13 | from docx2python.docx_reader import DocxReader
14 | from docx2python.main import docx2python
15 | from tests.conftest import RESOURCES
16 | 
17 | example_docx = RESOURCES / "example.docx"
18 | example_copy_docx = RESOURCES / "example_copy.docx"
19 | 
20 | 
21 | class TestCloseDocxReader:
22 |     def test_explicit_close(self) -> None:
23 |         """Closing DocxReader closes the zipfile."""
24 |         input_context = DocxReader(example_docx)
25 |         _ = input_context.file_of_type("officeDocument").root_element
26 |         # assert DocxReader zipfile is open
27 |         assert input_context._DocxReader__zipf.fp  # type: ignore
28 | 
29 |         input_context.close()
30 |         # assert DocxReader zipfile is closed
31 |         assert not input_context._DocxReader__zipf.fp  # type: ignore
32 | 
33 |     def test_no_access_after_explicit_close(self) -> None:
34 |         """The zipfile will not automatically reopen after explicit close."""
35 |         input_context = DocxReader(example_docx)
36 |         input_context.close()
37 |         # assert zipfile cannot be accessed
38 |         with pytest.raises(ValueError):
39 |             _ = input_context.zipf
40 | 
41 | 
42 | class TestDocxReaderContext:
43 |     def test_context_manager_enter(self):
44 |         """DocxReader can be used as a context manager."""
45 |         with DocxReader(example_docx) as input_context:
46 |             input_xml = input_context.file_of_type("officeDocument").root_element
47 |             assert get_prefixed_tag(input_xml) == Tags.DOCUMENT
48 | 
49 |     def test_context_manager_close(self):
50 |         """DocxReader can be used as a context manager."""
51 |         with DocxReader(example_docx) as input_context:
52 |             _ = input_context.file_of_type("officeDocument").root_element
53 |         with pytest.raises(ValueError):
54 |             _ = input_context.zipf
55 | 
56 | 
57 | class TestCloseDocxContent:
58 |     def test_explicit_close(self) -> None:
59 |         """Closing DocxReader closes the zipfile."""
60 |         content = docx2python(example_docx)
61 |         _ = content.header_runs
62 |         assert content.docx_reader._DocxReader__zipf.fp  # type: ignore
63 | 
64 |         content.close()
65 |         # assert DocxReader zipfile is closed
66 |         assert not content.docx_reader._DocxReader__zipf.fp  # type: ignore
67 | 
68 |     def test_no_access_after_explicit_close(self) -> None:
69 |         """The zipfile will not automatically reopen after explicit close."""
70 |         content = docx2python(example_docx)
71 |         content.close()
72 |         # assert zipfile cannot be accessed
73 |         with pytest.raises(ValueError):
74 |             _ = content.docx_reader.zipf
75 | 
76 | 
77 | class TestDocxContentContext:
78 |     def test_context_manager_enter(self):
79 |         """DocxReader can be used as a context manager."""
80 |         with docx2python(example_docx) as content:
81 |             _ = content.header_runs
82 | 
83 |     def test_context_manager_close(self):
84 |         """DocxReader can be used as a context manager."""
85 |         with docx2python(example_docx) as content:
86 |             pass
87 |             _ = content.header_runs
88 |         with pytest.raises(ValueError):
89 |             _ = content.docx_reader.zipf
90 | 


--------------------------------------------------------------------------------
/tests/test_comments.py:
--------------------------------------------------------------------------------
  1 | """Test extracting comments.
  2 | 
  3 | User flyguy62n requested comment extraction. Extract comments as tuples (text,
  4 | author, date, comment).
  5 | 
  6 | :author: Shay Hill
  7 | :created: 2024-03-29
  8 | """
  9 | 
 10 | import os
 11 | import sys
 12 | 
 13 | import pytest
 14 | 
 15 | project = os.path.abspath(os.path.join(__file__, "..", ".."))
 16 | sys.path.append(project)
 17 | 
 18 | 
 19 | from paragraphs import par
 20 | 
 21 | from docx2python import docx2python
 22 | from tests.conftest import RESOURCES
 23 | 
 24 | 
 25 | def test_comments() -> None:
 26 |     """Extract comments and some comment metadata."""
 27 |     pars = docx2python(RESOURCES / "comments.docx")
 28 |     comments = pars.comments
 29 | 
 30 |     pars.close()
 31 |     assert comments == [
 32 |         (
 33 |             par(
 34 |                 """Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do
 35 |                 eiusmod tempor incididunt ut labore et dolore magna aliqua."""
 36 |             ),
 37 |             "Randy Bartels",
 38 |             "2024-03-28T17:22:00Z",
 39 |             "COMMENT",
 40 |         ),
 41 |         (
 42 |             par(
 43 |                 """Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do
 44 |                 eiusmod tempor incididunt ut labore et dolore magna aliqua."""
 45 |             ),
 46 |             "Randy Bartels",
 47 |             "2024-03-28T17:22:00Z",
 48 |             "RESPONSE",
 49 |         ),
 50 |         (
 51 |             par(
 52 |                 """Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do
 53 |                 eiusmod tempor incididunt ut labore et dolore magna aliqua."""
 54 |             ),
 55 |             "Shay Hill",
 56 |             "2024-03-29T12:10:00Z",
 57 |             "Response from Shay Hill",
 58 |         ),
 59 |         (
 60 |             "tempor incididunt ut labore et dolore magna aliqua.",
 61 |             "Shay Hill",
 62 |             "2024-03-29T12:28:00Z",
 63 |             "Comment on subset starting with tempor",
 64 |         ),
 65 |         (
 66 |             par(
 67 |                 """Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do
 68 |                 eiusmod tempor incididunt ut labore et dolore magna aliqua."""
 69 |             ),
 70 |             "Randy Bartels",
 71 |             "2024-03-28T17:22:00Z",
 72 |             "COMMENT on par 5",
 73 |         ),
 74 |         (
 75 |             par(
 76 |                 """Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do
 77 |                 eiusmod tempor incididunt ut labore et dolore magna aliqua."""
 78 |             ),
 79 |             "Randy Bartels",
 80 |             "2024-03-28T17:22:00Z",
 81 |             "RESPONSE to comment on par 5",
 82 |         ),
 83 |         (
 84 |             par(
 85 |                 """Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do
 86 |                 eiusmod tempor incididunt ut labore et dolore magna aliqua."""
 87 |             ),
 88 |             "Shay Hill",
 89 |             "2024-03-29T12:10:00Z",
 90 |             "Response from Shay Hill on par 5",
 91 |         ),
 92 |         (
 93 |             "tempor incididunt ut labore et dolore magna aliqua.",
 94 |             "Shay Hill",
 95 |             "2024-03-29T12:28:00Z",
 96 |             "Comment on subset starting with tempor on par 5",
 97 |         ),
 98 |     ]
 99 | 
100 | 
101 | @pytest.fixture(scope="module")
102 | def test_file_with_comments():
103 |     test_file = RESOURCES / "test_file_with_comments.docx"
104 |     pars = docx2python(test_file)
105 |     yield pars.comments
106 |     pars.close()
107 | 
108 | 
109 | class TestAdditionalComments:
110 |     test_file = RESOURCES / "test_file_with_comments.docx"
111 | 
112 |     def test_comment_1(
113 |         self, test_file_with_comments: "list[tuple[str, str, str, str]]"
114 |     ) -> None:
115 |         """Extract the first comment."""
116 |         comment = test_file_with_comments[0]
117 |         assert comment == (
118 |             "magna ",
119 |             "Randy Bartels",
120 |             "2024-04-02T16:57:00Z",
121 |             "Comment 1",
122 |         )
123 | 
124 |     def test_comment_2(
125 |         self, test_file_with_comments: "list[tuple[str, str, str, str]]"
126 |     ) -> None:
127 |         """Extract the first comment."""
128 |         comment = test_file_with_comments[1]
129 |         assert comment == (
130 |             "quis ",
131 |             "Randy Bartels",
132 |             "2024-04-02T16:58:00Z",
133 |             "Comment 2",
134 |         )
135 | 
136 |     def test_comment_3(
137 |         self, test_file_with_comments: "list[tuple[str, str, str, str]]"
138 |     ) -> None:
139 |         """Extract the first comment."""
140 |         comment = test_file_with_comments[2]
141 |         assert comment == (
142 |             "Bibendum",
143 |             "Randy Bartels",
144 |             "2024-04-02T16:58:00Z",
145 |             "Comment 3",
146 |         )
147 | 
148 |     def test_comment_with_hyperlink(
149 |         self, test_file_with_comments: "list[tuple[str, str, str, str]]"
150 |     ) -> None:
151 |         """Extract the first comment."""
152 |         comment = test_file_with_comments[3]
153 |         assert comment == (
154 |             "dolor ",
155 |             "Randy Bartels",
156 |             "2024-04-02T16:58:00Z",
157 |             'Comment 4 with <a href="http://www.google.com">hyperlink</a>',
158 |         )
159 | 
160 |     def test_comment_5(
161 |         self, test_file_with_comments: "list[tuple[str, str, str, str]]"
162 |     ) -> None:
163 |         """Extract the first comment."""
164 |         comment = test_file_with_comments[4]
165 |         assert comment == (
166 |             "suspendisse ",
167 |             "Randy Bartels",
168 |             "2024-04-02T16:59:00Z",
169 |             "Comment 5",
170 |         )
171 | 
172 |     def test_comment_with_a_response(
173 |         self, test_file_with_comments: "list[tuple[str, str, str, str]]"
174 |     ) -> None:
175 |         """Extract the first comment."""
176 |         comment = test_file_with_comments[5]
177 |         assert comment == (
178 |             "suspendisse ",
179 |             "Randy Bartels",
180 |             "2024-04-02T16:59:00Z",
181 |             "With a response",
182 |         )
183 | 
184 |     def test_long_comment(
185 |         self, test_file_with_comments: "list[tuple[str, str, str, str]]"
186 |     ) -> None:
187 |         """Extract the first comment."""
188 |         comment = test_file_with_comments[6]
189 |         assert comment == (
190 |             "Amet ",
191 |             "Randy Bartels",
192 |             "2024-04-02T17:00:00Z",
193 |             par(
194 |                 """Comment 6 with a long comment.\n\nmagna fringilla urna porttitor
195 |                 rhoncus dolor purus non enim praesent elementum facilisis leo vel
196 |                 fringilla est ullamcorper eget nulla facilisi etiam dignissim diam
197 |                 quis enim lobortis scelerisque fermentum dui faucibus in ornare quam
198 |                 viverra orci sagittis eu volutpat odio facilisis mauris\n\nsit amet
199 |                 massa vitae tortor condimentum lacinia quis vel eros donec ac odio
200 |                 tempor orci dapibus ultrices in iaculis nunc sed augue lacus viverra
201 |                 vitae congue eu consequat ac felis donec et odio pellentesque diam
202 |                 volutpat commodo sed egestas egestas fringilla phasellus faucibus
203 |                 scelerisque eleifend donec pretium vulputate sapien nec sagittis
204 |                 aliquam malesuada bibendum"""
205 |             ),
206 |         )
207 | 
208 |     def test_comment_7(
209 |         self, test_file_with_comments: "list[tuple[str, str, str, str]]"
210 |     ) -> None:
211 |         """Extract the first comment."""
212 |         comment = test_file_with_comments[7]
213 |         assert comment == (
214 |             "suspendisse ",
215 |             "Randy Bartels",
216 |             "2024-04-02T17:00:00Z",
217 |             "Comment 7 with a long response",
218 |         )
219 | 
220 |     def test_long_response(
221 |         self, test_file_with_comments: "list[tuple[str, str, str, str]]"
222 |     ) -> None:
223 |         """Extract the first comment."""
224 |         comment = test_file_with_comments[8]
225 |         assert comment == (
226 |             "suspendisse ",
227 |             "Randy Bartels",
228 |             "2024-04-02T17:00:00Z",
229 |             par(
230 |                 """Long response: magna fringilla urna porttitor rhoncus dolor purus
231 |                 non enim praesent elementum facilisis leo vel fringilla est
232 |                 ullamcorper eget nulla facilisi etiam dignissim diam quis enim
233 |                 lobortis scelerisque fermentum dui faucibus in ornare quam viverra
234 |                 orci sagittis eu volutpat odio facilisis mauris\n\nsit amet massa
235 |                 vitae tortor condimentum lacinia quis vel eros donec ac odio tempor
236 |                 orci dapibus ultrices in iaculis nunc sed augue lacus viverra vitae
237 |                 congue eu consequat ac felis donec et odio pellentesque diam volutpat
238 |                 commodo sed egestas egestas fringilla phasellus faucibus scelerisque
239 |                 eleifend donec pretium vulputate sapien nec sagittis aliquam
240 |                 malesuada bibendum"""
241 |             ),
242 |         )
243 | 
244 |     def comment_8(
245 |         self, test_file_with_comments: "list[tuple[str, str, str, str]]"
246 |     ) -> None:
247 |         """Check entry 9. NOTE(review): no ``test_`` prefix; never collected?"""
248 |         comment = test_file_with_comments[9]
249 |         assert comment == (
250 |             "Magnis ",
251 |             "Randy Bartels",
252 |             "2024-04-02T17:04:00Z",
253 |             "Comment 8 - marked Resolved",
254 |         )
255 | 
256 |     def comment_in_a_table(
257 |         self, test_file_with_comments: "list[tuple[str, str, str, str]]"
258 |     ) -> None:
259 |         """Check entry 10. NOTE(review): no ``test_`` prefix; never collected?"""
260 |         comment = test_file_with_comments[10]
261 |         assert comment == (
262 |             "R1C1",
263 |             "Randy Bartels",
264 |             "2024-04-02T17:07:00Z",
265 |             "Comment in a table",
266 |         )
267 | 
268 |     def comment_on_a_picture(
269 |         self, test_file_with_comments: "list[tuple[str, str, str, str]]"
270 |     ) -> None:
271 |         """Check entry 11. NOTE(review): no ``test_`` prefix; never collected?"""
272 |         comment = test_file_with_comments[11]
273 |         assert comment == (
274 |             "",
275 |             "Randy Bartels",
276 |             "2024-04-02T17:08:00Z",
277 |             "Comment on a picture",
278 |         )
279 | 
280 | 
281 | def test_no_comments() -> None:
282 |     """Return an empty list when no comments are present."""
283 |     pars = docx2python(RESOURCES / "apples_and_pears.docx")
284 |     comments = pars.comments
285 |     pars.close()
286 |     assert comments == []
287 | 


--------------------------------------------------------------------------------
/tests/test_content_control_block_properties.py:
--------------------------------------------------------------------------------
 1 | """Test accessing SDT properties above a paragraph.
 2 | 
 3 | issue #81
 4 | 
 5 | User YashasviMantha requested a way to access Content Control Block properties. In
 6 | the xml, these are called Structured Document Tags (SDT). To allow this, I added two
 7 | features:
 8 | 
 9 |     1. Each Par instance now contains a pointer to the XML element from which it was
10 |        created.
11 |     2. Add a `tag` argument to `gather_Pr` that allows the caller to search up for
12 |         the Pr of a parent element.
13 | 
14 | This is a simple test and an example. See `get_sdt_tag` example function for a
15 | description of the sdt context in xml and how to access it.
16 | 
17 | :author: Shay Hill
18 | :created: 2024-11-17
19 | """
20 | 
21 | from __future__ import annotations
22 | 
23 | from lxml.etree import _Element as EtreeElement  # type: ignore
24 | 
25 | from docx2python.attribute_register import Tags
26 | from docx2python.iterators import iter_paragraphs
27 | from docx2python.main import docx2python
28 | from docx2python.text_runs import gather_Pr
29 | from tests.conftest import RESOURCES
30 | 
31 | _DOCX = RESOURCES / "ControlTest.docx"
32 | 
33 | 
34 | def get_sdt_tag(elem: EtreeElement) -> str | None:
35 |     """If elem is or is inside a <w:sdt> element, try to find the sdt props tag value.
36 | 
37 |     :param elem: lxml.etree._Element object
38 |     :return: tag value of sibling or parent sdtPr element or None
39 |     ```
40 |     <w:body>
41 |         <w:sdt>
42 |             <w:sdtPr>
43 |                 <w:tag w:val="my_tag"/>
44 |             </w:sdtPr>
45 |             <w:sdtContent>
46 |                 <w:p> </w:p>
47 |                 <w:p> </w:p>
48 |             </w:sdtContent>
49 |         </w:sdt>
50 |     </w:body>
51 |     ```
52 |     """
53 |     properties_dict = gather_Pr(elem, Tags.SDT)
54 |     return properties_dict.get("tag")
55 | 
56 | 
57 | class TestStructuredDocumentTags:
58 | 
59 |     def test_paragraphs_in_sdt_elements(self) -> None:
60 |         """Get the SDT tag above a paragraph."""
61 |         with docx2python(_DOCX) as extraction:
62 |             pars = extraction.document_pars
63 | 
64 |         text_paragraphs: list[str] = []
65 | 
66 |         for paragraph in iter_paragraphs(pars):
67 |             if paragraph.elem is None:
68 |                 par_tag = None
69 |             else:
70 |                 par_tag = get_sdt_tag(paragraph.elem)
71 |             par_text = "".join(paragraph.run_strings)
72 |             text_paragraphs.append(f"[{par_tag}]: {par_text}")
73 | 
74 |         assert text_paragraphs == [
75 |             "[Test_Control]: This is a test",
76 |             "[Test_Control]: For a content control or content container in word. ",
77 |         ]
78 | 


--------------------------------------------------------------------------------
/tests/test_created_in_pages.py:
--------------------------------------------------------------------------------
 1 | """Fix bullets in documents created in Pages
 2 | 
 3 | :author: Shay Hill
 4 | :created: 10/5/2020
 5 | 
 6 | Issue 11:
 7 | 
 8 | I have seen this happening for files created in Pages but not in files created in
 9 | MSWord.
10 | 
11 | How to reproduce:
12 |     Use Pages (MacOS app) to write a document
13 |     save the document as docx
14 |     attempt to extract using docx2python
15 | 
16 | It seems Pages is adding abstractNum nodes that don't contain w:lvl nodes. For example:
17 |         <w:multiLevelType w:val="hybridMultilevel"/>
18 |         <w:numStyleLink w:val="Numbered"/>
19 |     </w:abstractNum>
20 | 
21 | collect_numFmts (from docx_context.py) then reads and stores these in the context as [].
22 | This context is then passed down to _get_bullet_string (from docx_text.py). Then an
23 | IndexError is raised when we try to get the number format from the context.
24 | 
25 | User Raiyan provided two docx files created in pages:
26 |     * created-in-pages-paragraphs-only.docx should work now (v 1.25)
27 |     * created-in-pages-bulleted-lists.docx should fail (v 1.25) with above-described
28 |     error.
29 | """
30 | 
31 | from docx2python.main import docx2python
32 | from tests.conftest import RESOURCES
33 | 
34 | 
35 | class TestParagraphsOnly:
36 |     """Confirming this works with v1.25"""
37 | 
38 |     def test_paragraphs_only(self) -> None:
39 |         """Run without issue"""
40 |         pars = docx2python(RESOURCES / "created-in-pages-paragraphs-only.docx")
41 |         assert pars.text == (
42 |             "\n\nThis is a document for testing docx2python module.\n\n\n\nThis "
43 |             "document contains paragraphs.\n\n\n\nThis document does not contain any "
44 |             "bulleted lists.\n\n"
45 |         )
46 |         pars.close()
47 | 
48 | 
49 | class TestBulletedLists:
50 |     """Replace numbering format with bullet (--) when format cannot be determined"""
51 | 
52 |     def test_bulleted_lists(self) -> None:
53 |         pars = docx2python(RESOURCES / "created-in-pages-bulleted-lists.docx")
54 |         assert pars.text == (
55 |             "\n\nThis is a document for testing docx2python module.\n\n\n\n"
56 |             "--\tWhy did the chicken cross the road?\n\n"
57 |             "\t--\tJust because\n\n"
58 |             "\t--\tDon't know\n\n"
59 |             "\t--\tTo get to the other side\n\n"
60 |             "--\tWhat's the meaning of life, universe and everything?\n\n"
61 |             "\t--\t42\n\n"
62 |             "\t--\t0\n\n"
63 |             "\t--\t-1\n\n"
64 |         )
65 |         pars.close()
66 | 


--------------------------------------------------------------------------------
/tests/test_document2_xml.py:
--------------------------------------------------------------------------------
 1 | """Test extraction from a docx whose main content file was renamed
 2 | 
 3 | :author: Shay Hill
 4 | :created: 4/19/2020
 5 | 
 6 | The main content file in a docx is usually ``word/document.xml``, but this is not
 7 | always the case.
 8 | """
 9 | 
10 | from docx2python.main import docx2python
11 | from tests.conftest import RESOURCES
12 | 
13 | 
14 | class TestHyperlink:
15 |     def test_prints(self) -> None:
16 |         """
17 |         Open a docx with ``word/document.xml`` renamed to ``word/blah_blah.xml``
18 |         and all references updated. Test that text extracts as expected."""
19 |         extraction = docx2python(RESOURCES / "renamed_document_xml.docx")
20 |         assert (
21 |             '<a href="http://www.shayallenhill.com/">my website</a>.' in extraction.text
22 |         )
23 |         extraction.close()
24 | 


--------------------------------------------------------------------------------
/tests/test_docx2python.py:
--------------------------------------------------------------------------------
  1 | """Test full functionality of source_old
  2 | 
  3 | :author: Shay Hill
  4 | :created: 7/5/2019
  5 | """
  6 | 
  7 | import os
  8 | import re
  9 | import shutil
 10 | 
 11 | from paragraphs import par
 12 | 
 13 | from docx2python.iterators import iter_at_depth
 14 | from docx2python.main import docx2python
 15 | from tests.conftest import RESOURCES
 16 | 
 17 | ALT_TEXT = par(
 18 |     """----Image alt text---->A close up of a logo\n\n
 19 |         Description automatically generated<"""
 20 | )
 21 | 
 22 | 
 23 | class TestFormatting:
 24 |     """Nested list output string formatting"""
 25 | 
 26 |     def test_header(self) -> None:
 27 |         """Header text in correct location"""
 28 |         with docx2python(RESOURCES / "example.docx") as output:
 29 |             header_text = "".join(iter_at_depth(output.header, 4))
 30 |             assert re.match(
 31 |                 rf"Header text{ALT_TEXT}----media/image\d+\.\w+----$", header_text
 32 |             )
 33 | 
 34 |     def test_footer(self) -> None:
 35 |         """Footer text in correct location"""
 36 |         with docx2python(RESOURCES / "example.docx") as output:
 37 |             footer_text = "".join(iter_at_depth(output.footer, 4))
 38 |             assert re.match(
 39 |                 rf"Footer text{ALT_TEXT}----media/image\d+\.\w+----$", footer_text
 40 |             )
 41 | 
 42 |     def test_footnotes(self) -> None:
 43 |         """Footnotes extracted."""
 44 |         with docx2python(RESOURCES / "example.docx") as output:
 45 |             assert output.footnotes_runs == [
 46 |                 [
 47 |                     [
 48 |                         [[]],
 49 |                         [[]],
 50 |                         [["footnote1)\t", " First footnote"]],
 51 |                         [
 52 |                             [
 53 |                                 "footnote2)\t",
 54 |                                 " Second footnote",
 55 |                                 par(
 56 |                                     """----Image alt text---->A close up of a
 57 |                                     logo\n\nDescription automatically generated<"""
 58 |                                 ),
 59 |                                 "----media/image1.png----",
 60 |                             ]
 61 |                         ],
 62 |                     ]
 63 |                 ]
 64 |             ]
 65 | 
 66 |     def test_endnotes(self) -> None:
 67 |         """Endnotes extracted."""
 68 |         with docx2python(RESOURCES / "example.docx") as output:
 69 |             assert output.endnotes_runs == [
 70 |                 [
 71 |                     [
 72 |                         [[]],
 73 |                         [[]],
 74 |                         [["endnote1)\t", " First endnote"]],
 75 |                         [
 76 |                             [
 77 |                                 "endnote2)\t",
 78 |                                 " Second endnote",
 79 |                                 par(
 80 |                                     """----Image alt text---->A close up of a
 81 |                                     logo\n\nDescription automatically generated<"""
 82 |                                 ),
 83 |                                 "----media/image1.png----",
 84 |                             ]
 85 |                         ],
 86 |                     ]
 87 |                 ]
 88 |             ]
 89 | 
 90 |     def test_numbered_lists(self) -> None:
 91 |         """Sublists reset. Expected formatting."""
 92 |         with docx2python(RESOURCES / "example.docx") as output:
 93 |             assert output.body[0][0][0] == [
 94 |                 "I)\texpect I",
 95 |                 "\tA)\texpect A",
 96 |                 "\tB)\texpect B",
 97 |                 "\t\t1)\texpect 1",
 98 |                 "\t\t\ta)\texpect a",
 99 |                 "\t\t\tb)\texpect b",
100 |                 "\t\t\t\t1)\texpect 1",
101 |                 "\t\t\t\t\ta)\texpect a",
102 |                 "\t\t\t\t\t\ti)\texpect i",
103 |                 "\t\t\t\t\t\tii)\texpect ii",
104 |                 "II)\tThis should be II",
105 |                 "\tA)\tThis should be A), not C)",
106 |             ]
107 | 
108 |     def test_numbered_lists_with_custom_start_index(self) -> None:
109 |         """Sublists start from non-default index. Expected formatting."""
110 |         with docx2python(RESOURCES / "example_numbering.docx") as output:
111 |             assert output.body[0][0][0] == [
112 |                 "II)\texpect II",
113 |                 "C)\texpect C",
114 |                 "D)\texpect D",
115 |                 "4)\texpect 4",
116 |                 "e)\texpect e",
117 |                 "f)\texpect f",
118 |                 "6)\texpect 6",
119 |                 "f)\texpect f",
120 |                 "viii)\texpect viii",
121 |                 "ix)\texpect ix",
122 |                 "",
123 |                 "",
124 |             ]
125 | 
126 |     def test_bullets(self) -> None:
127 |         """Expected bullet format and indent."""
128 |         with docx2python(RESOURCES / "example.docx") as output:
129 |             assert output.body_runs[0][1][0] == [
130 |                 ["--\t", "bullet no indent"],
131 |                 ["\t--\t", "bullet indent 1"],
132 |                 ["\t\t--\t", "bullet indent 2"],
133 |             ]
134 | 
135 |     def test_ignore_formatting(self) -> None:
136 |         """Text formatting is stripped."""
137 |         with docx2python(RESOURCES / "example.docx") as output:
138 |             assert output.body[0][2][0] == [
139 |                 "Bold",
140 |                 "Italics",
141 |                 "Underlined",
142 |                 "Large Font",
143 |                 "Colored",
144 |                 "Large Colored",
145 |                 "Large Bold",
146 |                 "Large Bold Italics Underlined",
147 |             ]
148 | 
149 |     def test_nested_table(self) -> None:
150 |         """Appears as a new table"""
151 |         with docx2python(RESOURCES / "example.docx") as output:
152 |             assert output.body[1] == [[["Nested"], ["Table"]], [["A"], ["B"]]]
153 | 
154 |     def test_tab_delimited(self) -> None:
155 |         """Tabs converted to \t."""
156 |         with docx2python(RESOURCES / "example.docx") as output:
157 |             assert output.body[2][1][0][0] == "Tab\tdelimited\ttext"
158 | 
159 |     def test_lt_gt(self) -> None:
160 |         """> and < are not encoded."""
161 |         with docx2python(RESOURCES / "example.docx") as output:
162 |             assert output.body[2][2][0][0] == "10 < 20 and 20 > 10"
163 | 
164 |     def test_text_outside_table(self) -> None:
165 |         """Text outside table is its own table (also tests image marker)"""
166 |         with docx2python(RESOURCES / "example.docx") as output:
167 |             assert output.body[3] == [
168 |                 [
169 |                     [
170 |                         "Text outside table",
171 |                         "Reference footnote 1----footnote1----",
172 |                         "Reference footnote 2----footnote2----",
173 |                         "Reference endnote 1----endnote1----",
174 |                         "Reference endnote 2----endnote2----",
175 |                         "Heading 1",
176 |                         "Heading 2",
177 |                         "",
178 |                         "----Image alt text---->A jellyfish in water\n\n"
179 |                         + "Description automatically generated"
180 |                         + "<----media/image2.jpg----",
181 |                     ]
182 |                 ]
183 |             ]
184 | 
185 | 
186 | class TestHtmlFormatting:
187 |     """Font styles exported as HTML."""
188 | 
189 |     def test_lt_gt(self) -> None:
190 |         """> and < encoded"""
191 |         with docx2python(RESOURCES / "example.docx", html=True) as html_output:
192 |             assert html_output.body[2][2][0][0] == "10 &lt; 20 and 20 &gt; 10"
193 | 
194 |     def test_formatting_captured(self) -> None:
195 |         """Text formatting converted to html."""
196 |         with docx2python(RESOURCES / "example.docx", html=True) as html_output:
197 |             assert html_output.body[0][2][0] == [
198 |                 "<b>Bold</b>",
199 |                 "<i>Italics</i>",
200 |                 "<u>Underlined</u>",
201 |                 '<span style="font-size:40pt">Large Font</span>',
202 |                 '<span style="color:FF0000">Colored</span>',
203 |                 '<span style="color:FF0000;font-size:40pt">Large Colored</span>',
204 |                 '<span style="font-size:40pt"><b>Large Bold</b></span>',
205 |                 par(
206 |                     """<span style="font-size:40pt"><b><i><u>Large Bold Italics
207 |                     Underlined</u></i></b></span>"""
208 |                 ),
209 |             ]
210 | 
211 |     def test_paragraph_formatting(self) -> None:
212 |         """Text formatting converted to html."""
213 |         with docx2python(RESOURCES / "example.docx", html=True) as html_output:
214 |             expect = [
215 |                 [
216 |                     [
217 |                         ["Text outside table"],
218 |                         ["Reference footnote 1", "----footnote1----"],
219 |                         ["Reference footnote 2", "----footnote2----"],
220 |                         ["Reference endnote 1", "----endnote1----"],
221 |                         ["Reference endnote 2", "----endnote2----"],
222 |                         ["<h1>", "Heading 1", "</h1>"],
223 |                         ["<h2>", "Heading 2", "</h2>"],
224 |                         [],
225 |                         [
226 |                             par(
227 |                                 """----Image alt text---->A jellyfish in
228 |                                 water\n\nDescription automatically generated<"""
229 |                             ),
230 |                             "----media/image2.jpg----",
231 |                         ],
232 |                     ]
233 |                 ]
234 |             ]
235 |             result = html_output.body_runs[3]
236 |             assert result == expect
237 | 
238 | 
239 | class TestImageDir:
240 |     """Write images out to file given an image directory."""
241 | 
242 |     def test_pull_image_files(self) -> None:
243 |         """Copy image files to output path."""
244 |         pars = docx2python(RESOURCES / "example.docx", "delete_this/path/to/images")
245 |         assert set(os.listdir("delete_this/path/to/images")) == {
246 |             "image1.png",
247 |             "image2.jpg",
248 |         }
249 |         # clean up
250 |         shutil.rmtree("delete_this")
251 |         pars.close()
252 | 
253 | 
254 | def test_header_runs() -> None:
255 |     """Runs returned as separate strings. Paragraphs not joined"""
256 |     pars = docx2python(RESOURCES / "multiple_runs_per_paragraph.docx", html=True)
257 |     assert pars.document_runs == [
258 |         [[[["Multiple ", "<b>Runs in the</b>", " Header"]]]],
259 |         [
260 |             [
261 |                 [
262 |                     [
263 |                         "This document contains paragraphs with multiple runs per "
264 |                         + "paragraph. This ensures result.document and "
265 |                         + "result.document_runs return different things."
266 |                     ],
267 |                     [],
268 |                     ["Multiple ", "<b>Runs in the</b>", " Body"],
269 |                     ["Multiple ", "<b>Runs in the</b>", " Body"],
270 |                     ["Multiple ", "<b>Runs in the</b>", " Body"],
271 |                     ["Multiple ", "<b>Runs in the</b>", " Body"],
272 |                     [],
273 |                 ]
274 |             ]
275 |         ],
276 |         [[[["Multiple ", "<b>Runs in the</b>", " Footer"]]]],
277 |         [[[[]], [[]]]],
278 |         [[[[]], [[]]]],
279 |     ]
280 |     pars.close()
281 | 


--------------------------------------------------------------------------------
/tests/test_docx_context.py:
--------------------------------------------------------------------------------
  1 | """Test docx2python.docx_context.py
  2 | 
  3 | author: Shay Hill
  4 | created: 6/26/2019
  5 | """
  6 | 
  7 | import os
  8 | import tempfile
  9 | import zipfile
 10 | 
 11 | from lxml import etree
 12 | 
 13 | from docx2python.attribute_register import Tags, get_prefixed_tag
 14 | from docx2python.docx_context import collect_numAttrs
 15 | from docx2python.docx_reader import DocxReader
 16 | from docx2python.iterators import iter_at_depth
 17 | from docx2python.main import docx2python
 18 | from tests.conftest import RESOURCES
 19 | 
 20 | example_docx = RESOURCES / "example.docx"
 21 | example_numbering_docx = RESOURCES / "example_numbering.docx"
 22 | 
 23 | 
 24 | class TestSaveDocx:
 25 |     def test_save_unchanged(self) -> None:
 26 |         """Creates a valid docx"""
 27 |         with tempfile.TemporaryDirectory() as temp_dir:
 28 |             example_copy_docx = os.path.join(temp_dir, "example_copy.docx")
 29 |             with DocxReader(example_docx) as input_context:
 30 |                 input_xml = input_context.file_of_type("officeDocument").root_element
 31 |                 input_context.save(example_copy_docx)
 32 |             with DocxReader(example_copy_docx) as output_context:
 33 |                 output_xml = output_context.file_of_type("officeDocument").root_element
 34 |                 assert etree.tostring(input_xml) == etree.tostring(output_xml)
 35 | 
 36 |     def test_save_changed(self) -> None:
 37 |         """Creates a valid docx and updates text"""
 38 |         input_context = DocxReader(example_docx)
 39 |         input_xml = input_context.file_of_type("officeDocument").root_element
 40 |         for elem in (x for x in input_xml.iter() if get_prefixed_tag(x) == Tags.TEXT):
 41 |             if not elem.text:
 42 |                 continue
 43 |             elem.text = elem.text.replace("bullet", "BULLET")
 44 |         with tempfile.TemporaryDirectory() as temp_dir:
 45 |             with_text_replaced = os.path.join(temp_dir, "with_text_replaced.docx")
 46 |             input_context.save(with_text_replaced)
 47 |             with DocxReader(with_text_replaced) as output_context:
 48 |                 output_runs = output_context.file_of_type("officeDocument").text
 49 |         output_text = "".join(iter_at_depth(output_runs, 5))
 50 |         assert "bullet" not in output_text
 51 |         assert "BULLET" in output_text
 52 | 
 53 | 
 54 | class TestCollectNumAttrs:
 55 |     """Test strip_text.collect_numFmts"""
 56 | 
 57 |     def test_gets_start_indexes(self) -> None:
 58 |         """Retrieves start indexes from example_numbering.docx
 59 | 
 60 |         This test files contains lists starting from non-default value:
 61 |         II. expect II
 62 |             C. expect C
 63 |             D. expect D
 64 |                 4. expect 4
 65 |                     e. expect e
 66 |                     f. expect f
 67 |                         6) expect 6
 68 |                             f) expect f
 69 |                                 (viii) expect viii
 70 |                                 (ix) expect ix
 71 |         """
 72 |         zipf = zipfile.ZipFile(example_numbering_docx, "r")
 73 |         numId2Attrs = collect_numAttrs(
 74 |             etree.fromstring(zipf.read("word/numbering.xml"))
 75 |         )
 76 |         starts = {x.start for y in numId2Attrs.values() for x in y}
 77 |         assert starts == {1, 2, 3, 4, 5, 6, 8}
 78 | 
 79 |     def test_gets_formats(self) -> None:
 80 |         """Retrieves formats from example.docx
 81 | 
 82 |         This isn't a great test. There are numbered lists I've added then removed as
 83 |         I've edited my test docx. These still appear in the docx file. I could
 84 |         compare directly with the extracted numbering xml file, but even then I'd be
 85 |         comparing to something I don't know to be accurate. This just tests that all
 86 |         numbering formats are represented.
 87 |         """
 88 |         zipf = zipfile.ZipFile(example_docx)
 89 |         numId2Attrs = collect_numAttrs(
 90 |             etree.fromstring(zipf.read("word/numbering.xml"))
 91 |         )
 92 |         formats = {x.fmt for y in numId2Attrs.values() for x in y}
 93 |         assert formats == {
 94 |             "lowerLetter",
 95 |             "upperLetter",
 96 |             "lowerRoman",
 97 |             "upperRoman",
 98 |             "bullet",
 99 |             "decimal",
100 |         }
101 | 
102 | 
103 | class TestCollectDocProps:
104 |     """Test strip_text.collect_docProps"""
105 | 
106 |     def test_gets_properties(self) -> None:
107 |         """Retrieves properties from docProps"""
108 |         core_properties = docx2python(example_docx).core_properties
109 |         expected = {
110 |             "title": None,
111 |             "subject": None,
112 |             "creator": "Shay Hill",
113 |             "keywords": None,
114 |             "description": None,
115 |             "lastModifiedBy": "Shay Hill",
116 |         }
117 |         for prop, value in expected.items():
118 |             assert core_properties[prop] == value
119 | 
120 | 
121 | class TestGetContext:
122 |     """Text strip_text.get_context"""
123 | 
124 |     def test_numId2Attrs(self) -> None:
125 |         """All targets mapped"""
126 |         docx_context = DocxReader(example_docx)
127 |         assert docx_context.numId2Attrs == collect_numAttrs(
128 |             etree.fromstring(docx_context.zipf.read("word/numbering.xml"))
129 |         )
130 | 
131 |     def test_lists(self) -> None:
132 |         """Pass silently when no numbered or bulleted lists."""
133 |         docx_context = DocxReader(RESOURCES / "basic.docx")
134 |         assert docx_context.numId2Attrs == {}
135 | 
136 | 
137 | class TestPullImageFiles:
138 |     """Test strip_text.pull_image_files"""
139 | 
140 |     def test_pull_image_files(self) -> None:
141 |         """Copy image files to output path."""
142 |         docx_context = DocxReader(example_docx)
143 |         with tempfile.TemporaryDirectory() as image_folder:
144 |             _ = docx_context.pull_image_files(image_folder)
145 |             assert set(os.listdir(image_folder)) == {"image1.png", "image2.jpg"}
146 | 
147 |     def test_no_image_files(self) -> None:
148 |         """Pass silently when no image files."""
149 |         docx_context = DocxReader(RESOURCES / "basic.docx")
150 |         with tempfile.TemporaryDirectory() as image_folder:
151 |             _ = docx_context.pull_image_files(image_folder)
152 |             assert os.listdir(image_folder) == []
153 | 


--------------------------------------------------------------------------------
/tests/test_docx_output.py:
--------------------------------------------------------------------------------
 1 | """Test features of DocxContent that weren't tested in test_docx2python.
 2 | 
 3 | :author: Shay Hill
 4 | :created: 7/6/2019
 5 | """
 6 | 
 7 | from docx2python.iterators import iter_at_depth
 8 | from docx2python.main import docx2python
 9 | from tests.conftest import RESOURCES
10 | 
11 | 
12 | class TestDocument:
13 |     def test_combine_of_header_body_footer(self) -> None:
14 |         """Return all content combined as instance.document"""
15 |         with docx2python(RESOURCES / "example.docx") as content:
16 |             assert (
17 |                 content.document
18 |                 == content.header
19 |                 + content.body
20 |                 + content.footer
21 |                 + content.footnotes
22 |                 + content.endnotes
23 |             )
24 | 
25 |     def test_read_only(self) -> None:
26 |         """Document attribute is read only."""
27 |         with docx2python(RESOURCES / "example.docx") as content:
28 |             doc1 = content.document
29 |             doc1 = doc1[:1]
30 |             assert doc1 != content.document
31 |             assert (
32 |                 content.document
33 |                 == content.header
34 |                 + content.body
35 |                 + content.footer
36 |                 + content.footnotes
37 |                 + content.endnotes
38 |             )
39 | 
40 | 
41 | class TestText:
42 |     def test_function(self) -> None:
43 |         r"""Return '\n\n'-delimited paragraphs as instance.text."""
44 |         with docx2python(RESOURCES / "example.docx") as content:
45 |             assert content.text == "\n\n".join(iter_at_depth(content.document, 4))
46 | 
47 | 
48 | class TestHtmlMap:
49 |     def test_function(self) -> None:
50 |         """Return html tables."""
51 |         with docx2python(RESOURCES / "example.docx") as content:
52 |             assert (
53 |                 content.html_map[:48]
54 |                 == '<html><body><table border="1"><tr><td><pre>(0, 0'
55 |             )
56 | 


--------------------------------------------------------------------------------
/tests/test_dropdown_selector_in_table.py:
--------------------------------------------------------------------------------
 1 | """Test the dropdown selector in a table.
 2 | 
 3 | Issue: [https://github.com/ShayHill/docx2python/issues/73]
 4 | 
 5 | User iamahcy reports that a ContentControl dropdown selector in a table raises an
 6 | error.
 7 | 
 8 | The issue is that dropdown selectors are a nested table, and the first row of that
 9 | table requests a vMerge. The fix was to reject any vMerge (copy the cell above)
10 | request in the first row of any table.
11 | 
12 | :author: Shay Hill
13 | :created: 2024-09-26
14 | """
15 | 
16 | from docx2python import docx2python
17 | from tests.conftest import RESOURCES
18 | 
19 | test_file = RESOURCES / "list_index_a.docx"
20 | 
21 | 
22 | class TestContentControlDropdownSelectorInTable:
23 |     def test_content_control_dropdown_selector_in_table(self):
24 |         """A dropdown selector inside a table is extracted without an error."""
25 |         with docx2python(test_file) as docx_content:
26 |             content_runs = docx_content.document
27 | 
28 |         # fmt: off
29 |         assert content_runs == [
30 |             [
31 |                 [
32 |                     [""], [""], [""], [""], ["", ""]
33 |                 ],
34 |                 [
35 |                     [""], [""], [""], [""], ["", ""]
36 |                 ],
37 |                 [
38 |                     [""], [""], [""], [""], ["", ""]
39 |                 ],
40 |                 [
41 |                     [""], [""], [""], [""], ["", ""]
42 |                 ],
43 |                 [
44 |                     [""], [""], [""], [""], ["", ""]
45 |                 ],
46 |                 [
47 |                     [""]
48 |                 ],
49 |             ],
50 |             [
51 |                 [
52 |                     ["Silver"]
53 |                 ]
54 |             ],
55 |             [
56 |                 [
57 |                     [""], [""], [""]
58 |                 ],
59 |                 [
60 |                     ["", ""], ["", ""], ["", ""], ["", ""], ["", ""]
61 |                 ]
62 |             ],
63 |             [
64 |                 [
65 |                     [""]
66 |                 ]
67 |             ],
68 |             [
69 |                 [
70 |                     [""], [""]
71 |                 ]
72 |             ],
73 |             [
74 |                 [
75 |                     [""], [""]
76 |                 ]
77 |             ],
78 |         ]
79 |         # fmt: on
80 | 


--------------------------------------------------------------------------------
/tests/test_equations.py:
--------------------------------------------------------------------------------
 1 | """Pull some information from equations
 2 | 
 3 | :author: Shay Hill
 4 | :created: 7/7/2021
 5 | 
 6 | User sreeroopnaidu requested equation export. Equations are made up internally of
 7 | <w:m> elements. Previous versions of Docx2Python ignored these elements. These are
 8 | now recognized.
 9 | 
10 | Equations in Word's Professional format will return garbage.
11 | Equations in Word's Inline format will return a nice string.
12 | """
13 | 
14 | from docx2python import docx2python
15 | from tests.conftest import RESOURCES
16 | 
17 | 
18 | class TestEquations:
19 |     def test_professional_format(self):
20 |         """
21 |         Each equation comes back as one string wrapped in <latex> tags.
22 |         """
23 |         with docx2python(RESOURCES / "equations.docx") as content:
24 |             body = content.body
25 |         assert body == [
26 |             [
27 |                 [
28 |                     [
29 |                         "Professional Format",
30 |                         "<latex>01x</latex>",
31 |                         "Linear Format",
32 |                         "<latex>\\int_{0}^{1}x</latex>",
33 |                         "Linear Format with lt",
34 |                         "<latex>\\int0<1x<5</latex>",
35 |                     ]
36 |                 ]
37 |             ]
38 |         ]
39 | 


--------------------------------------------------------------------------------
/tests/test_file_object.py:
--------------------------------------------------------------------------------
 1 | """Test methods of File object that are not tested elsewhere.
 2 | 
 3 | :author: Shay Hill
 4 | :created: 4/3/2021
 5 | """
 6 | 
 7 | from docx2python.attribute_register import Tags, get_prefixed_tag
 8 | from docx2python.docx_reader import DocxReader
 9 | from docx2python.main import docx2python
10 | from tests.conftest import RESOURCES
11 | 
12 | 
13 | class TestFileObject:
14 |     """
15 |     Test methods of DocxContext object which are not tested elsewhere.
16 |     """
17 | 
18 |     def test_get_content_full(self) -> None:
19 |         """
20 |         Return full content if no root given.
21 |         """
22 |         full_extraction = docx2python(RESOURCES / "example.docx")
23 |         context = DocxReader(RESOURCES / "example.docx")
24 |         assert (
25 |             full_extraction.body_runs
26 |             == context.file_of_type("officeDocument").get_text()
27 |         )
28 |         context.close()
29 |         full_extraction.close()
30 | 
31 |     def test_get_content_partial(self) -> None:
32 |         """
33 |         Return content below root argument if given.
34 |         """
35 |         full_extraction = docx2python(RESOURCES / "example.docx")
36 |         context = DocxReader(RESOURCES / "example.docx")
37 |         document_xml = context.file_of_type("officeDocument")
38 |         first_par = next(
39 |             x
40 |             for x in document_xml.root_element.iter()
41 |             if get_prefixed_tag(x) == Tags.PARAGRAPH
42 |         )
43 |         assert [[[[full_extraction.body_runs[0][0][0][0]]]]] == document_xml.get_text(
44 |             first_par
45 |         )
46 |         context.close()
47 |         full_extraction.close()
48 | 


--------------------------------------------------------------------------------
/tests/test_from_bytes.py:
--------------------------------------------------------------------------------
 1 | """Test loading a .docx from a buffer of raw bytes.
 2 | 
 3 | :author: Shay Hill
 4 | :created: 2024-07-25
 5 | """
 6 | 
 7 | from io import BytesIO
 8 | 
 9 | from docx2python.main import docx2python
10 | from tests.conftest import RESOURCES
11 | 
12 | example_docx = RESOURCES / "example.docx"
13 | 
14 | 
15 | class TestFromBytes:
16 |     def test_from_bytes(self) -> None:
17 |         """Loads .docx from a buffer of raw bytes."""
18 |         with open(example_docx, "rb") as f:
19 |             buf = BytesIO(f.read())
20 |         with docx2python(buf) as content:
21 |             core_properties = content.core_properties
22 |             expected = {
23 |                 "title": None,
24 |                 "subject": None,
25 |                 "creator": "Shay Hill",
26 |                 "keywords": None,
27 |                 "description": None,
28 |                 "lastModifiedBy": "Shay Hill",
29 |             }
30 |             for prop, value in expected.items():
31 |                 assert core_properties[prop] == value
32 | 


--------------------------------------------------------------------------------
/tests/test_get_text.py:
--------------------------------------------------------------------------------
  1 | """Test functions in docx2python.get_text.py
  2 | 
  3 | author: Shay Hill
  4 | created: 5/20/2019
  5 | 
  6 | Does not test ``get_text``. ``get_text`` is tested through source_old.
  7 | """
  8 | 
  9 | # pyright: reportPrivateUsage=false
 10 | 
 11 | from __future__ import annotations
 12 | 
 13 | from collections import defaultdict
 14 | from typing import TypedDict
 15 | 
 16 | import pytest
 17 | from lxml import etree
 18 | 
 19 | from docx2python.bullets_and_numbering import BulletGenerator, _increment_list_counter
 20 | from docx2python.docx_context import NumIdAttrs
 21 | from tests.helpers.utils import valid_xml
 22 | 
 23 | 
 24 | class NumberingContext(TypedDict):
 25 |     numId2Atts: dict[str, list[NumIdAttrs]]  # numId -> one NumIdAttrs per ilvl
 26 |     numId2count: defaultdict[str, defaultdict[str, int]]  # numId -> ilvl -> count
 27 | 
 28 | 
 29 | class TestIncrementListCounter:
 30 |     """Test get_text.increment_list_counter"""
 31 | 
 32 |     def test_function(self) -> None:
 33 |         """Increments counter at ilvl, deletes deeper counters."""
 34 |         ilvl2count: defaultdict[str, int] = defaultdict(
 35 |             int, {str(x): x for x in range(1, 6)}
 36 |         )
 37 |         assert ilvl2count == {"1": 1, "2": 2, "3": 3, "4": 4, "5": 5}
 38 |         _ = _increment_list_counter(ilvl2count, "2")
 39 |         assert ilvl2count == {"1": 1, "2": 3}
 40 | 
 41 | 
 42 | @pytest.fixture()
 43 | def numbered_paragraphs() -> list[bytes]:
 44 |     """Seven numbered paragraphs, indented 0-6 ilvls."""
 45 |     paragraphs: list[str] = []
 46 |     for ilvl in range(7):
 47 |         paragraphs.append(
 48 |             "<w:p><w:pPr><w:numPr>"
 49 |             + '<w:ilvl w:val="'
 50 |             + str(ilvl)
 51 |             + '"/>'
 52 |             + '<w:numId w:val="1"/>'
 53 |             + "</w:numPr></w:pPr></w:p>"
 54 |         )
 55 |     return [valid_xml(x) for x in paragraphs]
 56 | 
 57 | 
 58 | @pytest.fixture()
 59 | def numbering_context() -> NumberingContext:
 60 |     """
 61 | 
 62 |     :return:
 63 |     """
 64 |     numId2Atts = {
 65 |         "1": [
 66 |             NumIdAttrs(fmt="bullet", start=None),
 67 |             NumIdAttrs(fmt="decimal", start=None),
 68 |             NumIdAttrs(fmt="lowerLetter", start=None),
 69 |             NumIdAttrs(fmt="upperLetter", start=None),
 70 |             NumIdAttrs(fmt="lowerRoman", start=None),
 71 |             NumIdAttrs(fmt="upperRoman", start=None),
 72 |             NumIdAttrs(fmt="undefined", start=None),
 73 |         ]
 74 |     }
 75 |     numId2count: defaultdict[str, defaultdict[str, int]] = defaultdict(
 76 |         lambda: defaultdict(int)
 77 |     )
 78 |     return {"numId2Atts": numId2Atts, "numId2count": numId2count}
 79 | 
 80 | 
 81 | class TestGetBulletString:
 82 |     """Test BulletGenerator.get_bullet"""
 83 | 
 84 |     def test_bullet(
 85 |         self, numbered_paragraphs: list[bytes], numbering_context: NumberingContext
 86 |     ) -> None:
 87 |         """Returns '--' plus a trailing tab for 'bullet' (ilvl 0, no indent)"""
 88 | 
 89 |         paragraph = etree.fromstring(numbered_paragraphs[0])[0][0]
 90 |         bullets = BulletGenerator(numbering_context["numId2Atts"])
 91 |         assert bullets.get_bullet(paragraph) == "--\t"
 92 | 
 93 |     def test_decimal(
 94 |         self, numbered_paragraphs: list[bytes], numbering_context: NumberingContext
 95 |     ) -> None:
 96 |         """
 97 |         Returns '1)' plus a trailing tab for 'decimal'
 98 |         indented one tab
 99 |         """
100 |         paragraph = etree.fromstring(numbered_paragraphs[1])[0][0]
101 |         bullets = BulletGenerator(numbering_context["numId2Atts"])
102 |         assert bullets.get_bullet(paragraph) == "\t1)\t"
103 | 
104 |     def test_lower_letter(
105 |         self, numbered_paragraphs: list[bytes], numbering_context: NumberingContext
106 |     ) -> None:
107 |         """
108 |         Returns 'a)' plus a trailing tab for 'lowerLetter'
109 |         indented two tabs
110 |         """
111 |         paragraph = etree.fromstring(numbered_paragraphs[2])[0][0]
112 |         bullets = BulletGenerator(numbering_context["numId2Atts"])
113 |         assert bullets.get_bullet(paragraph) == "\t\ta)\t"
114 | 
115 |     def test_upper_letter(
116 |         self, numbered_paragraphs: list[bytes], numbering_context: NumberingContext
117 |     ) -> None:
118 |         """
119 |         Returns 'A)' plus a trailing tab for 'upperLetter'
120 |         indented three tabs
121 |         """
122 |         paragraph = etree.fromstring(numbered_paragraphs[3])[0][0]
123 |         bullets = BulletGenerator(numbering_context["numId2Atts"])
124 |         assert bullets.get_bullet(paragraph) == "\t\t\tA)\t"
125 | 
126 |     def test_lower_roman(
127 |         self, numbered_paragraphs: list[bytes], numbering_context: NumberingContext
128 |     ) -> None:
129 |         """
130 |         Returns 'i)' plus a trailing tab for 'lowerRoman'
131 |         indented 4 tabs
132 |         """
133 |         paragraph = etree.fromstring(numbered_paragraphs[4])[0][0]
134 |         bullets = BulletGenerator(numbering_context["numId2Atts"])
135 |         assert bullets.get_bullet(paragraph) == "\t\t\t\ti)\t"
136 | 
137 |     def test_upper_roman(
138 |         self, numbered_paragraphs: list[bytes], numbering_context: NumberingContext
139 |     ) -> None:
140 |         """
141 |         Returns 'I)' plus a trailing tab for 'upperRoman'
142 |         indented 5 tabs
143 |         """
144 |         paragraph = etree.fromstring(numbered_paragraphs[5])[0][0]
145 |         bullets = BulletGenerator(numbering_context["numId2Atts"])
146 |         assert bullets.get_bullet(paragraph) == "\t\t\t\t\tI)\t"
147 | 
148 |     def test_undefined(
149 |         self, numbered_paragraphs: list[bytes], numbering_context: NumberingContext
150 |     ) -> None:
151 |         """
152 |         Emits a UserWarning for unknown formats
153 |         (the returned bullet string itself is not checked here)
154 | 
155 |         Format "undefined" won't be defined in the function, so function will fall back
156 |         to bullet string (with a warning).
157 |         """
158 |         paragraph = etree.fromstring(numbered_paragraphs[6])[0][0]
159 |         bullets = BulletGenerator(numbering_context["numId2Atts"])
160 |         with pytest.warns(UserWarning):
161 |             _ = bullets.get_bullet(paragraph)
162 | 
163 |     def test_not_numbered(self, numbering_context: NumberingContext) -> None:
164 |         """
165 |         Returns '' when paragraph is not numbered.
166 |         """
167 |         one_par_file = valid_xml("<w:p></w:p>")
168 |         paragraph = etree.fromstring(one_par_file)[0]
169 |         bullets = BulletGenerator(numbering_context["numId2Atts"])
170 |         assert bullets.get_bullet(paragraph) == ""
171 | 
172 |     def test_resets_sublists(
173 |         self, numbered_paragraphs: list[bytes], numbering_context: NumberingContext
174 |     ):
175 |         """Numbers reset when returning to shallower level
176 | 
177 |         1)  top level
178 |             a)  level 2
179 |             b)  another level 2
180 |                 A)  level 3
181 |             c)  level 2 is still counting
182 |                 A)  NEW sublist of level 2
183 |         2)  top level is still counting
184 |             a)  NEW sublist of top level
185 |         """
186 |         pars = [numbered_paragraphs[x] for x in (1, 2, 2, 3, 2, 3, 1, 2)]
187 |         bullets = BulletGenerator(numbering_context["numId2Atts"])
188 |         bullet_strings: list[str] = []
189 |         for par in pars:
190 |             paragraph = etree.fromstring(par)[0][0]
191 |             bullet_strings.append(bullets.get_bullet(paragraph).strip())
192 | 
193 |         assert bullet_strings == ["1)", "a)", "b)", "A)", "c)", "A)", "2)", "a)"]
194 | 


--------------------------------------------------------------------------------
/tests/test_google_docs.py:
--------------------------------------------------------------------------------
 1 | """Test corrections for google docs docx files
 2 | 
 3 | :author: Shay Hill
 4 | :created: 11/2/2020
 5 | 
 6 | Docx files created in MS Word have a ``docProps.xml`` file with author, etc.
 7 | Docx files created in google docs do not have a ``docProps.xml`` file.
 8 | 
 9 | File `test-docx2python-conversion-google_docs.docx` sent by a user.
10 | 
11 | Traceback (most recent call last):
12 | File "/Users/cyee/.local/share/virtualenvs/word-to-md-EFw2UvDn/bin/word2md", line 33, in
13 | sys.exit(load_entry_point('word2md', 'console_scripts', 'word2md')())
14 | File "/Users/cyee/.local/share/virtualenvs/word-to-md-EFw2UvDn/lib/python3.8/site-packages/click/core.py", line 829, in call
15 | return self.main(*args, **kwargs)
16 | File "/Users/cyee/.local/share/virtualenvs/word-to-md-EFw2UvDn/lib/python3.8/site-packages/click/core.py", line 782, in main
17 | rv = self.invoke(ctx)
18 | File "/Users/cyee/.local/share/virtualenvs/word-to-md-EFw2UvDn/lib/python3.8/site-packages/click/core.py", line 1066, in invoke
19 | return ctx.invoke(self.callback, **ctx.params)
20 | File "/Users/cyee/.local/share/virtualenvs/word-to-md-EFw2UvDn/lib/python3.8/site-packages/click/core.py", line 610, in invoke
21 | return callback(*args, **kwargs)
22 | File "/Users/cyee/projects/python/word-to-md/word2md.py", line 349, in cli
23 | make_md_from_entire_doc(path)
24 | File "/Users/cyee/projects/python/word-to-md/word2md.py", line 300, in make_md_from_entire_doc
25 | document = docx2python(input_file, html=True)
26 | File "/Users/cyee/.local/share/virtualenvs/word-to-md-EFw2UvDn/lib/python3.8/site-packages/docx2python/main.py", line 35, in docx2python
27 | context = get_context(zipf)
28 | File "/Users/cyee/.local/share/virtualenvs/word-to-md-EFw2UvDn/lib/python3.8/site-packages/docx2python/docx_context.py", line 272, in get_context
29 | "docProp2text": collect_docProps(zipf.read("docProps/core.xml")),
30 | File "/usr/local/opt/python@3.8/Frameworks/Python.framework/Versions/3.8/lib/python3.8/zipfile.py", line 1475, in read
31 | with self.open(name, "r", pwd) as fp:
32 | File "/usr/local/opt/python@3.8/Frameworks/Python.framework/Versions/3.8/lib/python3.8/zipfile.py", line 1514, in open
33 | zinfo = self.getinfo(name)
34 | File "/usr/local/opt/python@3.8/Frameworks/Python.framework/Versions/3.8/lib/python3.8/zipfile.py", line 1441, in getinfo
35 | raise KeyError(
36 | KeyError: "There is no item named 'docProps/core.xml' in the archive"
37 | """
38 | 
39 | import pytest
40 | 
41 | from docx2python import docx2python
42 | from tests.conftest import RESOURCES
43 | 
44 | FILE_WITH_DOCPROPS = RESOURCES / "example.docx"
45 | 
46 | FILE_WITHOUT_DOCPROPS = RESOURCES / "test-docx2python-conversion-google_docs.docx"
47 | 
48 | 
49 | class TestDeprecatedPropertiesProperty:
50 |     def test_deprecated_properties_property(self) -> None:
51 |         """
52 |         Raise a future warning when user requests ``result.properties``
53 |         """
54 |         with docx2python(FILE_WITH_DOCPROPS) as result:
55 |             with pytest.warns(FutureWarning):
56 |                 _ = result.properties
57 | 
58 | 
59 | class TestDocPropsFound:
60 |     def test_docprops_found(self) -> None:
61 |         """
62 |         Return docProps as a dictionary
63 |         """
64 |         with docx2python(FILE_WITH_DOCPROPS) as result:
65 |             assert result.core_properties == {
66 |                 "created": "2019-07-05T21:51:00Z",
67 |                 "creator": "Shay Hill",
68 |                 "description": None,
69 |                 "keywords": None,
70 |                 "lastModifiedBy": "Shay Hill",
71 |                 "modified": "2021-03-26T00:30:00Z",
72 |                 "revision": "7",
73 |                 "subject": None,
74 |                 "title": None,
75 |             }
76 | 
77 | 
78 | class TestGoogleDocs:
79 |     def test_empty_properties_dict_if_docProps_not_found(self) -> None:
80 |         """
81 |         It seems Google Docs docx files to not contain a document properties file:
82 |         `docProps/core.xml`. The contents of this file are returned as a dictionary.
83 |         To correct the above error, result.properties will now return an empty
84 |         dictionary (with a warning).
85 |         """
86 |         with docx2python(FILE_WITHOUT_DOCPROPS) as result:
87 |             with pytest.warns(UserWarning):
88 |                 assert result.core_properties == {}
89 | 


--------------------------------------------------------------------------------
/tests/test_hyperlinks.py:
--------------------------------------------------------------------------------
 1 | """Test that consecutive links pointing to the same address are merged.
 2 | 
 3 | :author: Shay Hill
 4 | :created: 3/17/2021
 5 | 
 6 | Such links will look like this (after removing proofErr, rsid, and other noise).
 7 | 
 8 |     <w:p>
 9 |         <w:hyperlink r:id="rId7">  <!-- points to http://www.shayallenhill.com -->
10 |             <w:r>
11 |                 <w:t>hy</w:t>
12 |             </w:r>
13 |         </w:hyperlink>
14 |         <w:hyperlink r:id="rId8">  <!-- points to http://www.shayallenhill.com -->
15 |             <w:r>
16 |                 <w:t>per</w:t>
17 |             </w:r>
18 |         </w:hyperlink>
19 |         <w:hyperlink r:id="rId9">  <!-- points to http://www.shayallenhill.com -->
20 |             <w:r>
21 |                 <w:t>link</w:t>
22 |             </w:r>
23 |         </w:hyperlink>
24 |     </w:p>
25 | 
26 | Docx2python condenses these to
27 | 
28 |     <w:p>
29 |         <w:hyperlink r:id="rId7">  <!-- points to http://www.shayallenhill.com -->
30 |             <w:r>
31 |                 <w:t>hy</w:t>
32 |             </w:r>
33 |             <w:r>
34 |                 <w:t>per</w:t>
35 |             </w:r>
36 |             <w:r>
37 |                 <w:t>link</w:t>
38 |             </w:r>
39 |         </w:hyperlink>
40 |     </w:p>
41 | 
42 | Then to
43 | 
44 |     <w:p>
45 |         <w:hyperlink r:id="rId7">  <!-- points to http://www.shayallenhill.com -->
46 |             <w:r>
47 |                 <w:t>hyperlink</w:t>
48 |             </w:r>
49 |         </w:hyperlink>
50 |     </w:p>
51 | 
52 | This module tests the final result.
53 | """
54 | 
55 | from docx2python.main import docx2python
56 | from tests.conftest import RESOURCES
57 | 
58 | 
59 | class TestHyperlink:
60 |     def test_prints(self) -> None:
61 |         """Consecutive hyperlinks referencing the same target become one link."""
62 |         with docx2python(RESOURCES / "hyperlink.docx") as extraction:
63 |             assert extraction.body_runs == [
64 |                 [
65 |                     [
66 |                         [
67 |                             [
68 |                                 "This is a link to ",
69 |                                 '<a href="http://www.shayallenhill.com/">'
70 |                                 + "my website</a>",
71 |                                 ".",
72 |                             ]
73 |                         ]
74 |                     ]
75 |                 ]
76 |             ]
77 | 


--------------------------------------------------------------------------------
/tests/test_import.py:
--------------------------------------------------------------------------------
 1 | """Make sure from docx2python import ... works
 2 | 
 3 | :author: Shay Hill
 4 | :created: 7/17/2019
 5 | 
 6 | """
 7 | 
 8 | from docx2python import docx2python
 9 | from tests.conftest import RESOURCES
10 | 
11 | 
12 | def test() -> None:
13 |     """Import docx2python from the package root and open a file with it."""
14 |     with docx2python(RESOURCES / "example.docx") as _:
15 |         pass
16 | 


--------------------------------------------------------------------------------
/tests/test_invalid_tag_name.py:
--------------------------------------------------------------------------------
 1 | """Issue 72: Invalid tag name.
 2 | 
 3 | User makretch found a file converted by Aspose that had an invalid tag name in a
 4 | comment. This tag name caused a ValueError when passed to `etree.QName`.
 5 | 
 6 | ValueError: Invalid tag name 'cyfunction Comment at 0x12345678abcd'
 7 | 
 8 | I addressed this by skipping elements with invalid tag names and raising a warning.
 9 | 
10 | :author: Shay Hill
11 | :created: 2024-12-05
12 | """
13 | 
14 | import pytest
15 | from conftest import RESOURCES
16 | 
17 | from docx2python import docx2python
18 | 
19 | 
20 | class TestInvalidTagName:
21 |     """Confirming this works with v1.25"""
22 | 
23 |     def test_invalid_tag_name(self) -> None:
24 |         """Pass if no ValueError is raised."""
25 |         extraction = docx2python(RESOURCES / "invalid_tag_name.docx")
26 |         with pytest.warns(UserWarning, match="skipping invalid tag name"):
27 |             _ = extraction.text
28 |         extraction.close()
29 | 


--------------------------------------------------------------------------------
/tests/test_iterators.py:
--------------------------------------------------------------------------------
  1 | """Test docx2python.iterators.py
  2 | 
  3 | author: Shay Hill
  4 | created: 6/28/2019
  5 | """
  6 | 
  7 | import itertools as it
  8 | 
  9 | import pytest
 10 | 
 11 | from docx2python.iterators import (
 12 |     enum_at_depth,
 13 |     enum_cells,
 14 |     enum_paragraphs,
 15 |     enum_rows,
 16 |     enum_tables,
 17 |     get_html_map,
 18 |     iter_cells,
 19 |     iter_paragraphs,
 20 |     iter_rows,
 21 |     iter_tables,
 22 | )
 23 | 
 24 | TABLES = [  # 1 table x 2 rows x 2 cells x 2 paragraphs x 2 strings
 25 |     [
 26 |         [[["0000", "0001"], ["0010", "0011"]], [["0100", "0101"], ["0110", "0111"]]],
 27 |         [[["1000", "1001"], ["1010", "1011"]], [["1100", "1101"], ["1110", "1111"]]],
 28 |     ]
 29 | ]
 30 | 
 31 | 
 32 | class TestOutOfRange:
 33 |     def test_enum_at_depth_low(self) -> None:
 34 |         """Raise ValueError when attempting to enumerate over depth < 1."""
 35 |         with pytest.raises(ValueError) as msg:
 36 |             _ = tuple(enum_at_depth(TABLES, 0))  # type: ignore
 37 |         assert "depth argument must be 1, 2, 3, 4, or 5" in str(msg.value)
 38 | 
 39 |     def test_enum_at_depth_high(self) -> None:
 40 |         """Raise ValueError when attempting to enumerate over depth < 1."""
 41 |         with pytest.raises(ValueError) as msg:
 42 |             _ = tuple(enum_at_depth(TABLES, 6))  # type: ignore
 43 |         assert "depth argument must be 1, 2, 3, 4, or 5" in str(msg.value)
 44 | 
 45 | 
 46 | class TestIterators:
 47 |     """Test iterators.iter_*"""
 48 | 
 49 |     def test_iter_tables(self) -> None:
 50 |         """Iterating tables reproduces the top-level list."""
 51 |         assert list(iter_tables(TABLES)) == TABLES
 52 | 
 53 |     def test_iter_rows(self) -> None:
 54 |         """Rows are the contents of every table, chained in order."""
 55 |         assert list(iter_rows(TABLES)) == list(it.chain(*iter_tables(TABLES)))
 56 | 
 57 |     def test_iter_cells(self) -> None:
 58 |         """Cells are the contents of every row, chained in order."""
 59 |         assert list(iter_cells(TABLES)) == list(it.chain(*iter_rows(TABLES)))
 60 | 
 61 |     def test_iter_paragraphs(self) -> None:
 62 |         """Paragraphs are the contents of every cell, chained in order."""
 63 |         assert list(iter_paragraphs(TABLES)) == list(it.chain(*iter_cells(TABLES)))
 64 | 
 65 | 
 66 | class TestEnumerators:
 67 |     """Test iterators.enum_*"""
 68 | 
 69 |     def test_enum_tables(self) -> None:
 70 |         """Pair each table with its (table,) index tuple."""
 71 |         assert list(enum_tables(TABLES)) == [
 72 |             (
 73 |                 (0,),
 74 |                 [
 75 |                     [
 76 |                         [["0000", "0001"], ["0010", "0011"]],
 77 |                         [["0100", "0101"], ["0110", "0111"]],
 78 |                     ],
 79 |                     [
 80 |                         [["1000", "1001"], ["1010", "1011"]],
 81 |                         [["1100", "1101"], ["1110", "1111"]],
 82 |                     ],
 83 |                 ],
 84 |             )
 85 |         ]
 86 | 
 87 |     def test_enum_rows(self) -> None:
 88 |         """Pair each row with its (table, row) index tuple."""
 89 |         assert list(enum_rows(TABLES)) == [
 90 |             (
 91 |                 (0, 0),
 92 |                 [
 93 |                     [["0000", "0001"], ["0010", "0011"]],
 94 |                     [["0100", "0101"], ["0110", "0111"]],
 95 |                 ],
 96 |             ),
 97 |             (
 98 |                 (0, 1),
 99 |                 [
100 |                     [["1000", "1001"], ["1010", "1011"]],
101 |                     [["1100", "1101"], ["1110", "1111"]],
102 |                 ],
103 |             ),
104 |         ]
105 | 
106 |     def test_enum_cells(self) -> None:
107 |         """Pair each cell with its (table, row, cell) index tuple."""
108 |         assert list(enum_cells(TABLES)) == [
109 |             ((0, 0, 0), [["0000", "0001"], ["0010", "0011"]]),
110 |             ((0, 0, 1), [["0100", "0101"], ["0110", "0111"]]),
111 |             ((0, 1, 0), [["1000", "1001"], ["1010", "1011"]]),
112 |             ((0, 1, 1), [["1100", "1101"], ["1110", "1111"]]),
113 |         ]
114 | 
115 |     def test_enum_paragraphs(self) -> None:
116 |         """Pair each paragraph with its (table, row, cell, paragraph) index tuple."""
117 |         assert list(enum_paragraphs(TABLES)) == [
118 |             ((0, 0, 0, 0), ["0000", "0001"]),
119 |             ((0, 0, 0, 1), ["0010", "0011"]),
120 |             ((0, 0, 1, 0), ["0100", "0101"]),
121 |             ((0, 0, 1, 1), ["0110", "0111"]),
122 |             ((0, 1, 0, 0), ["1000", "1001"]),
123 |             ((0, 1, 0, 1), ["1010", "1011"]),
124 |             ((0, 1, 1, 0), ["1100", "1101"]),
125 |             ((0, 1, 1, 1), ["1110", "1111"]),
126 |         ]
127 | 
128 | 
129 | class TestGetHtmlMap:
130 |     """Test iterators.get_html_map"""
131 | 
132 |     def test_get_html_map(self) -> None:
133 |         """Render one html table with an index-labelled <pre> per paragraph."""
134 |         # fmt: off
135 |         assert get_html_map(TABLES) == (
136 |             "<html>"
137 |             "<body>"
138 |             '<table border="1">'
139 |             "<tr>"
140 |             "<td>"
141 |             "<pre>(0, 0, 0, 0) 00000001"
142 |             "</pre>"
143 |             "<pre>(0, 0, 0, 1) 00100011"
144 |             "</pre>"
145 |             "</td>"
146 |             "<td>"
147 |             "<pre>(0, 0, 1, 0) 01000101"
148 |             "</pre>"
149 |             "<pre>(0, 0, 1, 1) 01100111"
150 |             "</pre>"
151 |             "</td>"
152 |             "</tr>"
153 |             "<tr>"
154 |             "<td>"
155 |             "<pre>(0, 1, 0, 0) 10001001"
156 |             "</pre>"
157 |             "<pre>(0, 1, 0, 1) 10101011"
158 |             "</pre>"
159 |             "</td>"
160 |             "<td>"
161 |             "<pre>(0, 1, 1, 0) 11001101"
162 |             "</pre>"
163 |             "<pre>(0, 1, 1, 1) 11101111"
164 |             "</pre>"
165 |             "</td>"
166 |             "</tr>"
167 |             "</table>"
168 |             "</body>"
169 |             "</html>"
170 |         )
171 |         # fmt: on
172 | 


--------------------------------------------------------------------------------
/tests/test_libreoffice_conversion.py:
--------------------------------------------------------------------------------
 1 | """Libreoffice conversions from doc to docx raise CaretDepthError
 2 | 
 3 | :author: Shay Hill
 4 | :created: 8/11/2021
 5 | 
 6 | User shadowmimosa shared a docx (libreoffice_conversion.docx), converted by libreoffice
 7 | from a doc that raises a CaretDepthError.
 8 | """
 9 | 
10 | import pytest
11 | 
12 | from docx2python.main import docx2python
13 | from tests.conftest import RESOURCES
14 | 
15 | 
16 | class TestLibreofficeConversion:
17 |     def test_libreoffice_conversion(self) -> None:
18 |         """Extracts text without a CaretDepthError
19 | 
20 |         This test file for a user just happens to be in Chinese and contains an
21 |         unsupported Chinese numbering format, hence the ``pytest.warns`` context.
22 |         """
23 |         with docx2python(RESOURCES / "libreoffice_conversion.docx") as content:
24 |             with pytest.warns(UserWarning):
25 |                 _ = content.document
26 | 


--------------------------------------------------------------------------------
/tests/test_lineage.py:
--------------------------------------------------------------------------------
 1 | """Test the lineage attribute of Par instances.
 2 | 
 3 | :author: Shay Hill
 4 | :created: 2024-07-14
 5 | """
 6 | 
 7 | from docx2python.iterators import (
 8 |     is_tbl,
 9 |     is_tc,
10 |     is_tr,
11 |     iter_cells,
12 |     iter_paragraphs,
13 |     iter_rows,
14 |     iter_tables,
15 | )
16 | from docx2python.main import docx2python
17 | 
18 | from .conftest import RESOURCES
19 | 
20 | 
21 | class TestLineage:
22 |     """Are lineage tags correct for Par instances?"""
23 | 
24 |     def test_explicit(self):
25 |         """Each paragraph's lineage matches the expected tuple, in order."""
26 |         with docx2python(RESOURCES / "paragraphs_and_tables.docx") as extraction:
27 |             pars = extraction.document_pars
28 |         lineages = [par.lineage for par in iter_paragraphs(pars)]
29 |         assert lineages == [
30 |             ("document", None, None, None, "p"),
31 |             ("document", "tbl", "tr", "tc", "p"),
32 |             ("document", "tbl", "tr", "tc", "p"),
33 |             ("document", "tbl", "tr", "tc", "p"),
34 |             ("document", "tbl", "tr", "tc", "p"),
35 |             ("document", "tbl", "tr", "tc", "p"),
36 |             ("document", "tbl", "tr", "tc", "p"),
37 |             ("document", None, None, None, "p"),
38 |             ("document", None, None, None, "p"),
39 |             ("document", "tbl", "tr", "tc", "p"),
40 |             ("document", "tbl", "tr", "tc", "p"),
41 |             ("document", "tbl", "tr", "tc", "p"),
42 |             ("document", "tbl", "tr", "tc", "p"),
43 |             ("document", None, None, None, "p"),
44 |         ]
45 | 
46 | 
47 | class TestTableIdentification:
48 |     """Are tables identified correctly?"""
49 | 
50 |     def test_is_tbl(self):
51 |         """Tables are identified correctly."""
52 |         with docx2python(RESOURCES / "paragraphs_and_tables.docx") as extraction:
53 |             pars = extraction.document_pars
54 |         assert [is_tbl(tbl) for tbl in iter_tables(pars)] == [
55 |             False,
56 |             True,
57 |             False,
58 |             True,
59 |             False,
60 |         ]
61 | 
62 |     def test_is_tr(self):
63 |         """Table rows are identified correctly."""
64 |         with docx2python(RESOURCES / "paragraphs_and_tables.docx") as extraction:
65 |             pars = extraction.document_pars
66 |         assert [is_tr(tr) for tr in iter_rows(pars)] == [
67 |             False,
68 |             True,
69 |             True,
70 |             True,
71 |             False,
72 |             True,
73 |             True,
74 |             True,
75 |             True,
76 |             False,
77 |         ]
78 | 
79 |     def test_is_tc(self):
80 |         """Table cells are identified correctly."""
81 |         with docx2python(RESOURCES / "paragraphs_and_tables.docx") as extraction:
82 |             pars = extraction.document_pars
83 |         assert [is_tc(tc) for tc in iter_cells(pars)] == [
84 |             False,
85 |             True,
86 |             True,
87 |             True,
88 |             True,
89 |             True,
90 |             True,
91 |             False,
92 |             True,
93 |             True,
94 |             True,
95 |             True,
96 |             False,
97 |         ]
98 | 


--------------------------------------------------------------------------------
/tests/test_linebreak_replace_text.py:
--------------------------------------------------------------------------------
 1 | """Try to use replace text with a linebreak.
 2 | 
 3 | :author: Shay Hill
 4 | :created: 2023-04-26
 5 | """
 6 | 
 7 | from docx2python.main import docx2python
 8 | from tests.conftest import RESOURCES
 9 | 
10 | 
11 | class TestText:
12 |     def test_user_checked_dropdown0(self) -> None:
13 |         """Get checked-out box glyph and second dd entry"""
14 |         extraction = docx2python(RESOURCES / "checked_drop1.docx")
15 |         assert extraction.body_runs == [[[[["☒", " "], ["PIlihan A"]]]]]
16 |         extraction.close()
17 | 


--------------------------------------------------------------------------------
/tests/test_list_position.py:
--------------------------------------------------------------------------------
 1 | """Test list_position attribute of list paragraphs.
 2 | 
 3 | :author: Shay Hill
 4 | :created: 2024-07-17
 5 | """
 6 | 
 7 | from docx2python.iterators import iter_at_depth
 8 | from docx2python.main import docx2python
 9 | from tests.conftest import RESOURCES
10 | 
11 | 
12 | class TestListPosition:
13 |     def test_explicit(self):
14 |         """List paragraphs report the expected list_position values."""
15 |         with docx2python(RESOURCES / "example.docx") as content:
16 |             pars = iter_at_depth(content.officeDocument_pars, 4)
17 |         positions = [p.list_position for p in pars]
18 |         assert positions == [
19 |             ("2", [1]),
20 |             ("2", [1, 1]),
21 |             ("2", [1, 2]),
22 |             ("2", [1, 2, 1]),
23 |             ("2", [1, 2, 1, 1]),
24 |             ("2", [1, 2, 1, 2]),
25 |             ("2", [1, 2, 1, 2, 1]),
26 |             ("2", [1, 2, 1, 2, 1, 1]),
27 |             ("2", [1, 2, 1, 2, 1, 1, 1]),
28 |             ("2", [1, 2, 1, 2, 1, 1, 2]),
29 |             ("2", [2]),
30 |             ("2", [2, 1]),
31 |             ("1", [1]),
32 |             ("1", [1, 1]),
33 |             ("1", [1, 1, 1]),
34 |             (None, []),
35 |             (None, []),
36 |             (None, []),
37 |             (None, []),
38 |             (None, []),
39 |             (None, []),
40 |             (None, []),
41 |             (None, []),
42 |             (None, []),
43 |             (None, []),
44 |             (None, []),
45 |             (None, []),
46 |             (None, []),
47 |             (None, []),
48 |             (None, []),
49 |             (None, []),
50 |             (None, []),
51 |             (None, []),
52 |             (None, []),
53 |             (None, []),
54 |             (None, []),
55 |             (None, []),
56 |             (None, []),
57 |             (None, []),
58 |         ]
59 | 


--------------------------------------------------------------------------------
/tests/test_long_hyperlink.py:
--------------------------------------------------------------------------------
 1 | """User K Ravikiran had trouble with long hyperlinks.
 2 | 
 3 | The sample file here has a hyperlink he was not able to export correctly.
 4 | 
 5 | :author: Shay Hill
 6 | :created: 2024-01-20
 7 | """
 8 | 
 9 | from docx2python.main import docx2python
10 | from tests.conftest import RESOURCES
11 | 
12 | long_hyperlink = RESOURCES / "long_hyperlink.docx"
13 | 
14 | 
15 | class TestLongHyperlink:
16 |     def test_non_html(self) -> None:
17 |         """Exports full hyperlink without html flag."""
18 |         with docx2python(long_hyperlink) as docx_content:
19 |             extracted_text = docx_content.text
20 |         long_url = (
21 |             "https://connect.asdfg.com/wikis/home?lang-en-us"
22 |             + "#!/wiki/asdfasdf_asdfasdf/page/EOL%20support%20-%20MDGI"
23 |         )
24 |         assert long_url in extracted_text
25 | 
26 |     def test_html(self) -> None:
27 |         """Exports full hyperlink with html flag."""
28 |         with docx2python(long_hyperlink, html=True) as docx_content:
29 |             extracted_text = docx_content.text
30 |         long_url = (
31 |             "https://connect.asdfg.com/wikis/home?lang-en-us"
32 |             + "#!/wiki/asdfasdf_asdfasdf/page/EOL%20support%20-%20MDGI"
33 |         )
34 |         assert long_url in extracted_text
35 | 


--------------------------------------------------------------------------------
/tests/test_merge_runs.py:
--------------------------------------------------------------------------------
  1 | """Test that consecutive links pointing to the same address are merged.
  2 | 
  3 | :author: Shay Hill
  4 | :created: 3/17/2021
  5 | 
  6 | There are a few ways consecutive elements can be "identical":
  7 |     * same link
  8 |     * same style
  9 | 
 10 | Often, consecutive, "identical" elements are written as separate elements,
 11 | because they aren't identical to Word. Word keeps track of revision history,
 12 | spelling errors, etc., which are meaningless to docx2python.
 13 | 
 14 | <w:p>
 15 |     <w:hyperlink r:id="rId7">  <!-- points to http://www.shayallenhill.com -->
 16 |         <w:r>
 17 |             <w:t>hy</w:t>
 18 |         </w:r>
 19 |     </w:hyperlink>
 20 |     <w:proofErr/>  <!-- docx2python will ignore this proofErr -->
 21 |     <w:hyperlink r:id="rId8">  <!-- points to http://www.shayallenhill.com -->
 22 |         <w:r>
 23 |             <w:t>per</w:t>
 24 |         </w:r>
 25 |     </w:hyperlink>
 26 |     <w:hyperlink r:id="rId9">  <!-- points to http://www.shayallenhill.com -->
 27 |         <w:r w:rsid="asdfas">  <!-- docx2python will ignore this rsid -->
 28 |             <w:t>link</w:t>
 29 |         </w:r>
 30 |     </w:hyperlink>
 31 | </w:p>
 32 | 
 33 | Docx2python condenses the above to (by merging links)
 34 | 
 35 | <w:p>
 36 |     <w:hyperlink r:id="rId7">  <!-- points to http://www.shayallenhill.com -->
 37 |         <w:r>
 38 |             <w:t>hy</w:t>
 39 |         </w:r>
 40 |         <w:r>
 41 |             <w:t>per</w:t>
 42 |         </w:r>
 43 |         <w:r w:rsid="asdfas">  <!-- docx2python will ignore this rsid -->
 44 |             <w:t>link</w:t>
 45 |         </w:r>
 46 |     </w:hyperlink>
 47 | </w:p>
 48 | 
 49 | Then to (by merging runs)
 50 | 
 51 | <w:p>
 52 |     <w:hyperlink r:id="rId7">  <!-- points to http://www.shayallenhill.com -->
 53 |         <w:r>
 54 |             <w:t>hy</w:t>
 55 |             <w:t>per</w:t>
 56 |             <w:t>link</w:t>
 57 |         </w:r>
 58 |     </w:hyperlink>
 59 | </w:p>
 60 | 
 61 | Then finally to (by merging text)
 62 | 
 63 | <w:p>
 64 |     <w:hyperlink r:id="rId7">  <!-- points to http://www.shayallenhill.com -->
 65 |         <w:r>
 66 |             <w:t>hyperlink</w:t>
 67 |         </w:r>
 68 |     </w:hyperlink>
 69 | </w:p>
 70 | """
 71 | 
 72 | from docx2python.main import docx2python
 73 | from tests.conftest import RESOURCES
 74 | 
 75 | 
 76 | def test_merge_runs():
 77 |     """
 78 |     Merge duplicate, consecutive hyperlinks
 79 | 
 80 |     The output text would look the same whether run and text elements were merged.
 81 |     This test only verifies that hyperlink elements have been merged, else the output
 82 |     text would contain something closer to ``<a>hy</a><a>per</a><a>link</a>``
 83 |     """
 84 |     extraction = docx2python(RESOURCES / "merged_links.docx")
 85 |     assert extraction.body_runs == [
 86 |         [
 87 |             [
 88 |                 [
 89 |                     [
 90 |                         "This page created by putting three links to the same address "
 91 |                         + "in three different paragraphs (as below) …"
 92 |                     ],
 93 |                     ['<a href="https://www.shayallenhill.com">hy</a>'],
 94 |                     ['<a href="https://www.shayallenhill.com">per</a>'],
 95 |                     ['<a href="https://www.shayallenhill.com">link</a>'],
 96 |                     ["Then removing the endlines to create a single link."],
 97 |                     ['<a href="https://www.shayallenhill.com">hyperlink</a>'],
 98 |                     [
 99 |                         "Internally, the XML records the joined paragraphs as "
100 |                         + "three consecutive links, each with a different r:id, "
101 |                         + "all r:ids referencing the same address. Docx2python v2+ "
102 |                         + "should re-join these consecutive links."
103 |                     ],
104 |                     [],
105 |                     [],
106 |                 ]
107 |             ]
108 |         ]
109 |     ]
110 |     extraction.close()
111 | 


--------------------------------------------------------------------------------
/tests/test_merged_cells.py:
--------------------------------------------------------------------------------
 1 | """Attempt to properly handle merged table cells.
 2 | 
 3 | :author: Shay Hill
 4 | :created: 2023-01-23
 5 | """
 6 | 
 7 | from docx2python import docx2python
 8 | from tests.conftest import RESOURCES
 9 | 
10 | 
11 | class TestMergedCells:
12 |     def test_duplicate_merged_cells_false(self):
13 |         """Without duplication, merged-away cells hold empty strings."""
14 |         with docx2python(
15 |             RESOURCES / "merged_cells.docx", duplicate_merged_cells=False
16 |         ) as content:
17 |             # fmt: off
18 |             assert content.body == [
19 |                 [
20 |                     [["0-0"],  ["0-12"],  [""],  ["0-3"]],
21 |                     [["12-0"], ["1-1"],    ["1-2"],    ["1-3"]],
22 |                     [[""],     ["2-1"],    ["2-2"],    ["2-3"]],
23 |                     [["3-0"],  ["34-123"], [""], [""]],
24 |                     [["4-0"],  [""], [""], [""]],
25 |                 ],
26 |                 [[[""]]],
27 |             ]
28 |             # fmt: on
29 | 
30 |     def test_duplicate_merged_cells_true(self):
31 |         """By default, merged-cell contents are repeated in every spanned cell."""
32 |         with docx2python(RESOURCES / "merged_cells.docx") as content:
33 |             # fmt: off
34 |             assert content.body == [
35 |                 [
36 |                     [["0-0"],  ["0-12"],   ["0-12"],   ["0-3"]],
37 |                     [["12-0"], ["1-1"],    ["1-2"],    ["1-3"]],
38 |                     [["12-0"], ["2-1"],    ["2-2"],    ["2-3"]],
39 |                     [["3-0"],  ["34-123"], ["34-123"], ["34-123"]],
40 |                     [["4-0"],  ["34-123"], ["34-123"], ["34-123"]],
41 |                 ],
42 |                 [[[""]]],
43 |             ]
44 |             # fmt: on
45 | 


--------------------------------------------------------------------------------
/tests/test_more_html.py:
--------------------------------------------------------------------------------
  1 | """Test that passing `more_html = True` collects paragraph styles
  2 | 
  3 | :author: Shay Hill
  4 | :created: 11/5/2020
  5 | 
  6 | Paragraphs and runs can end up nested with text boxes. Docx2python
  7 | un-nests these paragraphs.
  8 | 
  9 |     <w:p>
 10 |         <w:pPr>
 11 |             <w:pStyle w:val="Header"/>
 12 |         </w:pPr>
 13 |         <w:r>
 14 |                 <w:t>EHS Manual</w:t>
 15 |         </w:r>
 16 |         <w:r>
 17 |             <w:p>
 18 |                 <w:r>
 19 |                     <w:t>EHS Manual</w:t>
 20 |                 </w:r>
 21 |             </w:p>
 22 |             <w:p w14:paraId="37B5F1EE" w14:textId="1E56D065" w:rsidR="003A2388"
 23 |                 w:rsidRPr="00815EC1" w:rsidRDefault="003A2388" w:rsidP="00CA47BD">
 24 |                 <w:r>
 25 |                     <w:t>EHS Manual</w:t>
 26 |                 </w:r>
 27 |             </w:p>
 28 |         </w:r>
 29 |         <w:r>
 30 |             <w:t>EHS Manual</w:t>
 31 |         </w:r>
 32 |     </w:p>
 33 | ```
 34 |     <open par 1>
 35 |         par 1 text
 36 |         <open par 2>
 37 |             par 2 text
 38 |         <close par 2>
 39 |         more par 1 text
 40 |     <close par 1>
 41 | ```
 42 | 
 43 | gets flattened to
 44 | 
 45 | ```
 46 | `par 2 text`
 47 | `par 1 text`
 48 | `more par 1 text`
 49 | ```
 50 | Paragraphs are returned in the order in which they *close*.
 51 | 
 52 | <w:p>
 53 |     <w:pPr>
 54 |         <w:pStyle w:val="Header"/>
 55 |     </w:pPr>
 56 |     <w:r w:rsidRPr="00210F67">
 57 |         <w:rPr>
 58 |             <w:sz w:val="17"/>
 59 |             <w:szCs w:val="17"/>
 60 |         </w:rPr>
 61 |         <w:p>
 62 |             <w:r>
 63 |                 <w:rPr>
 64 |                     <w:smallCaps/>
 65 |                     <w:sz w:val="72"/>
 66 |                     <w:szCs w:val="72"/>
 67 |                 </w:rPr>
 68 |                 <w:t>EHS Manual </w:t>
 69 |             </w:r>
 70 |         </w:p>
 71 |     </w:r>
 72 |     <w:r>
 73 |         <w:rPr>
 74 |             <w:noProof/>
 75 |         </w:rPr>
 76 |     </w:r>
 77 | </w:p>
 78 | 
 79 | """
 80 | 
 81 | from paragraphs import par
 82 | 
 83 | from docx2python.iterators import iter_at_depth
 84 | from docx2python.main import docx2python
 85 | from tests.conftest import RESOURCES
 86 | 
 87 | 
 88 | def test_paragraphs_only() -> None:
 89 |     """Styles and html-tagged run strings are collected for each paragraph."""
 90 |     with docx2python(RESOURCES / "nested_paragraphs.docx", html=True) as extraction:
 91 |         document_pars = extraction.document_pars
 92 |     styled = [(p.style, p.run_strings) for p in iter_at_depth(document_pars, 4)]
 93 |     expect = [
 94 |         (
 95 |             "",
 96 |             [
 97 |                 par(
 98 |                     """[Grab your reader’s attention with a great quote from the
 99 |                     document or use this space to emphasize a key point. To place
100 |                     this text box anywhere on the page, just drag it.]"""
101 |                 )
102 |             ],
103 |         ),
104 |         (
105 |             "",
106 |             [
107 |                 par(
108 |                     """[Grab your reader’s attention with a great quote from the
109 |                     document or use this space to emphasize a key point. To place
110 |                     this text box anywhere on the page, just drag it.]"""
111 |                 )
112 |             ],
113 |         ),
114 |         (
115 |             "Heading1",
116 |             [
117 |                 "<h1>",
118 |                 par(
119 |                     """aaa aab aac aad aae aaf aag aah aai aaj aak aal aam aan aao
120 |                     aap aaq aar aas aat aau aav aaw aax aay aaz aba abb abc abd abe
121 |                     abf abg abh abi abj abk abl abm abn abo abp abq abr abs abt abu
122 |                     abv abw abx aby abz aca acb acc acd ace acf acg ach aci acj ack
123 |                     acl acm acn aco acp acq acr acs act acu acv acw acx acy acz ada
124 |                     adb adc add ade adf adg adh adi adj adk adl adm adn ado adp adq
125 |                     adr ads adt adu adv adw adx ady adz aea aeb aec aed aee aef aeg
126 |                     aeh aei aej aek ael aem aen aeo aep aeq aer aes aet aeu aev aew
127 |                     aex aey aez afa afb afc afd afe aff afg afh afi afj afk afl afm
128 |                     afn afo afp afq afr afs aft afu afv afw afx afy afz aga agb agc
129 |                     agd age agf agg agh agi agj agk agl agm agn ago agp agq agr ags
130 |                     agt agu agv agw agx agy agz aha ahb ahc ahd ahe ahf ahg ahh ahi
131 |                     ahj ahk ahl ahm ahn aho ahp ahq ahr ahs aht ahu ahv ahw ahx ahy
132 |                     ahz aia aib aic aid aie aif aig aih aii aij aik ail aim ain aio
133 |                     aip aiq air ais ait aiu aiv aiw aix aiy aiz aja ajb ajc ajd aje
134 |                     ajf ajg ajh aji ajj ajk ajl ajm ajn ajo ajp ajq ajr ajs ajt aju
135 |                     ajv ajw ajx ajy ajz aka akb akc akd ake akf akg akh aki akj akk
136 |                     akl akm akn ako akp akq akr aks akt aku akv akw akx aky akz ala
137 |                     alb alc ald ale alf alg alh ali alj alk all alm aln alo alp alq
138 |                     alr als alt alu alv alw alx aly alz ama amb amc amd ame amf amg
139 |                     amh ami amj amk aml amm amn amo amp amq amr ams amt amu amv amw
140 |                     amx amy amz ana anb anc and ane anf ang anh ani anj ank anl anm
141 |                     ann ano anp anq anr ans ant anu anv anw anx any anz aoa aob aoc
142 |                     aod aoe aof aog aoh aoi aoj aok aol aom aon aoo aop aoq aor aos
143 |                     aot aou aov aow aox aoy aoz apa apb apc apd ape apf apg aph api
144 |                     apj apk apl apm apn apo app apq apr aps apt apu apv apw apx apy
145 |                     apz aqa aqb aqc aqd aqe aqf aqg aqh aqi aqj aqk aql aqm aqn aqo
146 |                     aqp aqq aqr aqs aqt aqu aqv aqw aqx aqy aqz ara arb arc ard are
147 |                     arf arg arh ari arj ark arl arm arn aro arp arq arr ars art aru
148 |                     arv arw arx ary arz asa asb asc asd ase asf asg ash asi asj ask
149 |                     asl asm asn aso asp asq asr ass ast asu asv asw asx asy asz ata
150 |                     atb atc atd ate atf atg ath ati atj atk atl atm atn ato atp atq
151 |                     atr ats att atu atv atw atx aty atz aua aub auc aud aue auf aug
152 |                     auh aui auj auk aul aum aun auo aup auq aur aus aut auu auv auw
153 |                     aux auy auz ava avb avc avd ave avf avg avh avi avj avk avl avm
154 |                     avn avo avp avq avr avs avt avu avv avw avx avy avz awa awb awc
155 |                     awd awe awf awg awh awi awj awk awl awm awn awo awp awq awr aws
156 |                     awt awu awv aww awx awy awz axa axb axc axd axe axf axg axh axi
157 |                     axj axk axl axm axn axo axp axq axr axs axt axu axv axw axx axy
158 |                     axz aya ayb ayc ayd aye ayf ayg ayh ayi ayj ayk ayl aym ayn ayo
159 |                     ayp ayq ayr ays ayt ayu ayv ayw ayx ayy ayz aza azb azc azd aze
160 |                     azf azg azh azi azj azk azl azm azn azo azp azq azr azs azt azu
161 |                     azv azw azx azy azz"""
162 |                 ),
163 |                 "</h1>",
164 |             ],
165 |         ),
166 |     ]
167 |     assert styled == expect
168 | 
169 | 
170 | def test_par_styles_not_in_text() -> None:
171 |     """Par styles skipped in pure text export"""
172 |     pars = docx2python(RESOURCES / "nested_paragraphs.docx", html=True)
173 |     assert pars.text == par(
174 |         """[Grab your reader’s attention with a great quote from the document or use
175 |         this space to emphasize a key point. To place this text box anywhere on the
176 |         page, just drag it.]
177 | 
178 |         [Grab your reader’s attention with a great quote from the document or use
179 |         this space to emphasize a key point. To place this text box anywhere on the
180 |         page, just drag it.]
181 | 
182 |         <h1>aaa aab aac aad aae aaf aag aah aai aaj aak aal aam aan aao aap aaq aar
183 |         aas aat aau aav aaw aax aay aaz aba abb abc abd abe abf abg abh abi abj abk
184 |         abl abm abn abo abp abq abr abs abt abu abv abw abx aby abz aca acb acc acd
185 |         ace acf acg ach aci acj ack acl acm acn aco acp acq acr acs act acu acv acw
186 |         acx acy acz ada adb adc add ade adf adg adh adi adj adk adl adm adn ado adp
187 |         adq adr ads adt adu adv adw adx ady adz aea aeb aec aed aee aef aeg aeh aei
188 |         aej aek ael aem aen aeo aep aeq aer aes aet aeu aev aew aex aey aez afa afb
189 |         afc afd afe aff afg afh afi afj afk afl afm afn afo afp afq afr afs aft afu
190 |         afv afw afx afy afz aga agb agc agd age agf agg agh agi agj agk agl agm agn
191 |         ago agp agq agr ags agt agu agv agw agx agy agz aha ahb ahc ahd ahe ahf ahg
192 |         ahh ahi ahj ahk ahl ahm ahn aho ahp ahq ahr ahs aht ahu ahv ahw ahx ahy ahz
193 |         aia aib aic aid aie aif aig aih aii aij aik ail aim ain aio aip aiq air ais
194 |         ait aiu aiv aiw aix aiy aiz aja ajb ajc ajd aje ajf ajg ajh aji ajj ajk ajl
195 |         ajm ajn ajo ajp ajq ajr ajs ajt aju ajv ajw ajx ajy ajz aka akb akc akd ake
196 |         akf akg akh aki akj akk akl akm akn ako akp akq akr aks akt aku akv akw akx
197 |         aky akz ala alb alc ald ale alf alg alh ali alj alk all alm aln alo alp alq
198 |         alr als alt alu alv alw alx aly alz ama amb amc amd ame amf amg amh ami amj
199 |         amk aml amm amn amo amp amq amr ams amt amu amv amw amx amy amz ana anb anc
200 |         and ane anf ang anh ani anj ank anl anm ann ano anp anq anr ans ant anu anv
201 |         anw anx any anz aoa aob aoc aod aoe aof aog aoh aoi aoj aok aol aom aon aoo
202 |         aop aoq aor aos aot aou aov aow aox aoy aoz apa apb apc apd ape apf apg aph
203 |         api apj apk apl apm apn apo app apq apr aps apt apu apv apw apx apy apz aqa
204 |         aqb aqc aqd aqe aqf aqg aqh aqi aqj aqk aql aqm aqn aqo aqp aqq aqr aqs aqt
205 |         aqu aqv aqw aqx aqy aqz ara arb arc ard are arf arg arh ari arj ark arl arm
206 |         arn aro arp arq arr ars art aru arv arw arx ary arz asa asb asc asd ase asf
207 |         asg ash asi asj ask asl asm asn aso asp asq asr ass ast asu asv asw asx asy
208 |         asz ata atb atc atd ate atf atg ath ati atj atk atl atm atn ato atp atq atr
209 |         ats att atu atv atw atx aty atz aua aub auc aud aue auf aug auh aui auj auk
210 |         aul aum aun auo aup auq aur aus aut auu auv auw aux auy auz ava avb avc avd
211 |         ave avf avg avh avi avj avk avl avm avn avo avp avq avr avs avt avu avv avw
212 |         avx avy avz awa awb awc awd awe awf awg awh awi awj awk awl awm awn awo awp
213 |         awq awr aws awt awu awv aww awx awy awz axa axb axc axd axe axf axg axh axi
214 |         axj axk axl axm axn axo axp axq axr axs axt axu axv axw axx axy axz aya ayb
215 |         ayc ayd aye ayf ayg ayh ayi ayj ayk ayl aym ayn ayo ayp ayq ayr ays ayt ayu
216 |         ayv ayw ayx ayy ayz aza azb azc azd aze azf azg azh azi azj azk azl azm azn
217 |         azo azp azq azr azs azt azu azv azw azx azy azz</h1>"""
218 |     )
219 |     pars.close()
220 | 
221 | 
222 | class TestBulletedLists:
223 |     """Replace numbering format with bullet (--) when format cannot be determined"""
224 | 
225 |     def test_bulleted_lists(self) -> None:
226 |         pars = docx2python(RESOURCES / "created-in-pages-bulleted-lists.docx")
227 |         assert pars.text == (
228 |             "\n\nThis is a document for testing docx2python module.\n\n\n\n--\tWhy "
229 |             "did the chicken cross the road?\n\n\t--\tJust because\n\n\t--\tDon't "
230 |             "know\n\n\t--\tTo get to the other side\n\n--\tWhat's the meaning of life, "
231 |             "universe and everything?\n\n\t--\t42\n\n\t--\t0\n\n\t--\t-1\n\n"
232 |         )
233 |         pars.close()
234 | 


--------------------------------------------------------------------------------
/tests/test_numbering_formats.py:
--------------------------------------------------------------------------------
 1 | """Test functions in docx2python.numbering_formats.py
 2 | 
 3 | :author: Shay Hill
 4 | :created: 6/26/2019
 5 | """
 6 | 
 7 | from random import randint
 8 | 
 9 | import pytest
10 | 
11 | from docx2python.numbering_formats import (
12 |     bullet,
13 |     decimal,
14 |     lower_letter,
15 |     lower_roman,
16 |     upper_letter,
17 |     upper_roman,
18 | )
19 | from tests.helpers.utils import ARABIC_2_ROMAN
20 | 
21 | 
22 | class TestLowerLetter:
23 |     """Test numbering_formats.lower_letter"""
24 | 
25 |     def test_convert_positive_int(self) -> None:
26 |         """Convert a positive integer to a string of letters"""
27 |         assert lower_letter(1) == "a"
28 |         assert lower_letter(26) == "z"
29 |         assert lower_letter(27) == "aa"
30 | 
31 |     def test_zero(self) -> None:
32 |         """Raise a value error for < 1"""
33 |         with pytest.raises(ValueError) as msg:
34 |             _ = lower_letter(0)
35 |         assert "0 and <1 are not defined" in str(msg.value)
36 | 
37 |     def test_neg(self) -> None:
38 |         """Raise a value error for < 1"""
39 |         with pytest.raises(ValueError) as msg:
40 |             _ = lower_letter(-1)
41 |         assert "0 and <1 are not defined" in str(msg.value)
42 | 
43 | 
44 | def test_upper_letter() -> None:
45 |     """Same as lower_letter, but upper"""
46 |     for _ in range(100):
47 |         n = randint(1, 10000)
48 |         assert upper_letter(n) == lower_letter(n).upper()
49 | 
50 | 
51 | class TestLowerRoman:
52 |     """Test numbering_formats.lower_roman"""
53 | 
54 |     def test_convert_positive_int(self) -> None:
55 |         """Convert a positive integer to a string of letters"""
56 |         for arabic, roman in ARABIC_2_ROMAN.items():
57 |             assert lower_roman(arabic) == roman
58 | 
59 |     def test_zero(self) -> None:
60 |         """Raise a value error for < 1"""
61 |         with pytest.raises(ValueError) as msg:
62 |             _ = lower_roman(0)
63 |         assert "Roman" in str(msg.value)
64 | 
65 |     def test_neg(self) -> None:
66 |         """Raise a value error for < 1"""
67 |         with pytest.raises(ValueError) as msg:
68 |             _ = lower_roman(-1)
69 |         assert "Roman" in str(msg.value)
70 | 
71 | 
72 | def test_upper_roman() -> None:
73 |     """Same as lower_roman, but upper"""
74 |     for _ in range(100):
75 |         n = randint(1, 10000)
76 |         assert upper_roman(n) == lower_roman(n).upper()
77 | 
78 | 
79 | def test_decimal() -> None:
80 |     """Return string representation of input"""
81 |     for i in range(10):
82 |         assert decimal(i) == str(i)
83 | 
84 | 
85 | def test_bullet() -> None:
86 |     """Return same string for every input."""
87 |     for i in range(10):
88 |         assert bullet(i) == bullet(i * 10)
89 | 


--------------------------------------------------------------------------------
/tests/test_par_styles.py:
--------------------------------------------------------------------------------
  1 | """Par styles converted to flags
  2 | 
  3 | :author: Shay Hill
  4 | :created: 3/18/2021
  5 | 
  6 | """
  7 | 
  8 | from docx2python.iterators import iter_at_depth
  9 | from docx2python.main import docx2python
 10 | from tests.conftest import RESOURCES
 11 | 
 12 | 
 13 | class TestParStyles:
 14 |     def test_par_styles(self) -> None:
 15 |         """
 16 |         Every paragraph exposes its style name next to its run strings.
 17 | 
 18 |         Paragraphs with no run strings are filtered out before comparing. The
 19 |         style of several expected paragraphs is an empty string.
 20 | 
 21 |         """
 22 |         with docx2python(RESOURCES / "example.docx") as extraction:
 23 |             document_pars = extraction.document_pars
 24 |         styled = [(p.style, p.run_strings) for p in iter_at_depth(document_pars, 4)]
 25 |         styled = [x for x in styled if x[1]]
 26 |         expect = [
 27 |             (
 28 |                 "Header",
 29 |                 [
 30 |                     "Header text",
 31 |                     "----Image alt text---->A close up of a logo\n\n"
 32 |                     + "Description automatically generated<",
 33 |                     "----media/image1.png----",
 34 |                 ],
 35 |             ),
 36 |             ("ListParagraph", ["I)\t", "expect I"]),
 37 |             ("ListParagraph", ["\tA)\t", "expect A"]),
 38 |             ("ListParagraph", ["\tB)\t", "expect B"]),
 39 |             ("ListParagraph", ["\t\t1)\t", "expect 1"]),
 40 |             ("ListParagraph", ["\t\t\ta)\t", "expect a"]),
 41 |             ("ListParagraph", ["\t\t\tb)\t", "expect b"]),
 42 |             ("ListParagraph", ["\t\t\t\t1)\t", "expect 1"]),
 43 |             ("ListParagraph", ["\t\t\t\t\ta)\t", "expect a"]),
 44 |             ("ListParagraph", ["\t\t\t\t\t\ti)\t", "expect i"]),
 45 |             ("ListParagraph", ["\t\t\t\t\t\tii)\t", "expect ii"]),
 46 |             ("ListParagraph", ["II)\t", "This should be II"]),
 47 |             ("ListParagraph", ["\tA)\t", "This should be A), not C)"]),
 48 |             ("ListParagraph", ["--\t", "bullet no indent"]),
 49 |             ("ListParagraph", ["\t--\t", "bullet indent 1"]),
 50 |             ("ListParagraph", ["\t\t--\t", "bullet indent 2"]),
 51 |             ("", ["Bold"]),
 52 |             ("", ["Italics"]),
 53 |             ("", ["Underlined"]),
 54 |             ("", ["Large Font"]),
 55 |             ("", ["Colored"]),
 56 |             ("", ["Large Colored"]),
 57 |             ("", ["Large Bold"]),
 58 |             ("", ["Large Bold Italics Underlined"]),
 59 |             ("", ["Nested"]),
 60 |             ("", ["Table"]),
 61 |             ("", ["A"]),
 62 |             ("", ["B"]),
 63 |             ("", ["Tab", "\t", "delimited", "\t", "text"]),
 64 |             ("", ["10 < 20 and 20 > 10"]),
 65 |             ("", ["Text outside table"]),
 66 |             ("", ["Reference footnote 1", "----footnote1----"]),
 67 |             ("", ["Reference footnote 2", "----footnote2----"]),
 68 |             ("", ["Reference endnote 1", "----endnote1----"]),
 69 |             ("", ["Reference endnote 2", "----endnote2----"]),
 70 |             ("Heading1", ["Heading 1"]),
 71 |             ("Heading2", ["Heading 2"]),
 72 |             (
 73 |                 "",
 74 |                 [
 75 |                     "----Image alt text---->A jellyfish in water\n\n"
 76 |                     + "Description automatically generated<",
 77 |                     "----media/image2.jpg----",
 78 |                 ],
 79 |             ),
 80 |             (
 81 |                 "Footer",
 82 |                 [
 83 |                     "Footer text",
 84 |                     "----Image alt text---->A close up of a logo\n\n"
 85 |                     + "Description automatically generated<",
 86 |                     "----media/image1.png----",
 87 |                 ],
 88 |             ),
 89 |             ("FootnoteText", ["footnote1)\t", " First footnote"]),
 90 |             (
 91 |                 "FootnoteText",
 92 |                 [
 93 |                     "footnote2)\t",
 94 |                     " Second footnote",
 95 |                     "----Image alt text---->A close up of a logo\n\n"
 96 |                     + "Description automatically generated<",
 97 |                     "----media/image1.png----",
 98 |                 ],
 99 |             ),
100 |             ("EndnoteText", ["endnote1)\t", " First endnote"]),
101 |             (
102 |                 "EndnoteText",
103 |                 [
104 |                     "endnote2)\t",
105 |                     " Second endnote",
106 |                     "----Image alt text---->A close up of a logo\n\n"
107 |                     + "Description automatically generated<",
108 |                     "----media/image1.png----",
109 |                 ],
110 |             ),
111 |         ]
112 |         assert styled == expect
113 | 


--------------------------------------------------------------------------------
/tests/test_pict.py:
--------------------------------------------------------------------------------
 1 | """Test functionality with pict elements.
 2 | 
 3 | :author: Shay Hill
 4 | :created: 1/29/2020
 5 | 
 6 | Such file was sent to me by stefan-hock20 on github. Images are referenced in
 7 | document.html as
 8 | 
 9 | ```
10 | <w:pict>
11 |     <v:shape id="Figure 1" o:spid="_x0000_i1085" type="#_x0000_t75"
12 |         style="width:441.6pt;height:264.6pt;visibility:visible">
13 |     <v:imagedata r:id="rId50" o:title=""/></v:shape>
14 | </w:pict>
15 | ```
16 | 
17 | docx2python 1.19 would get the image, but not mark the image location in the output text.
18 | """
19 | 
20 | from docx2python.main import docx2python
21 | from tests.conftest import RESOURCES
22 | 
23 | 
24 | class TestPictElement:
25 |     def test_extraction(self) -> None:
26 |         """Image placeholder inserted into extracted text."""
27 |         extraction = docx2python(RESOURCES / "has_pict.docx")
28 |         assert "image1.png" in extraction.images
29 |         assert "----media/image1.png----" in extraction.text
30 |         extraction.close()
31 | 
32 | 
33 | class TestPictWithAltText:
34 |     def test_extraction(self) -> None:
35 |         """Image placeholder inserted into extracted text."""
36 |         extraction = docx2python(RESOURCES / "pic_alt_text.docx")
37 |         text = extraction.text
38 |         assert "Alt description" in text
39 |         extraction.close()
40 | 


--------------------------------------------------------------------------------
/tests/test_run_styles.py:
--------------------------------------------------------------------------------
 1 | """Run styles converted to html
 2 | 
 3 | :author: Shay Hill
 4 | :created: 3/18/2021
 5 | 
 6 | <w:r><w:rPr><w:sz w:val="32"/><w:szCs w:val="32"/></w:rPr><w:t>16 point</w:t></w:r>
 7 | <w:r><w:rPr><w:color w:val="FF0000"/></w:rPr><w:t>Red</w:t></w:r>
 8 | <w:r><w:rPr><w:rFonts w:ascii="Courier New" w:hAnsi="Courier New" w:cs="Courier New"/>
 9 |     </w:rPr><w:t>Courier new</w:t></w:r>
10 | <w:r><w:rPr><w:i/><w:iCs/></w:rPr><w:t>Italic</w:t></w:r>
11 | <w:r><w:rPr><w:b/><w:bCs/></w:rPr><w:t>Bold</w:t></w:r>
12 | <w:r><w:rPr><w:u w:val="single"/></w:rPr><w:t>Underline</w:t></w:r>
13 | <w:r><w:rPr><w:strike/></w:rPr><w:t>Strikethrough</w:t></w:r>
14 | <w:r><w:rPr><w:dstrike/></w:rPr><w:t>Double Strikethrough</w:t></w:r>
15 | <w:r><w:rPr><w:vertAlign w:val="superscript"/></w:rPr><w:t>Superscript</w:t></w:r>
16 | <w:r><w:rPr><w:vertAlign w:val="subscript"/></w:rPr><w:t>Subscript</w:t></w:r>
17 | <w:r><w:rPr><w:smallCaps/></w:rPr><w:t>Small Caps</w:t></w:r>
18 | <w:r><w:rPr><w:caps/></w:rPr><w:t>All Caps</w:t></w:r>
19 | <w:r><w:rPr><w:highlight w:val="yellow"/></w:rPr><w:t>Highlighted yellow</w:t></w:r>
20 | <w:r><w:rPr><w:highlight w:val="green"/></w:rPr><w:t>Highlighted green</w:t></w:r>
21 | 
22 | <i> italic
23 | <b> bold
24 | <u> underline
25 | <s> strike
26 | <del> double strike
27 | <sup> superscript
28 | <sub> subscript
29 | <font style="font-variant: small-caps">small caps
30 | <font style="text-transform:uppercase">all caps
31 | <span style="background-color: yellow">highlighted yellow
32 | <span style="background-color: green">highlighted green
33 | """
34 | 
35 | from docx2python.main import docx2python
36 | from tests.conftest import RESOURCES
37 | 
38 | 
39 | class TestParStyles:
40 |     def test_par_styles(self) -> None:
41 |         """
42 |         If do_html, paragraphs style is the first element of every run
43 | 
44 |         :return:
45 |         """
46 |         content = docx2python(RESOURCES / "run_styles.docx", html=True)
47 |         assert content.document_runs == [
48 |             [
49 |                 [
50 |                     [
51 |                         ["Normal"],
52 |                         ['<span style="font-size:32pt">16 point</span>'],
53 |                         ['<span style="color:FF0000">Red</span>'],
54 |                         ["Courier new"],
55 |                         ["<i>Italic</i>"],
56 |                         ["<b>Bold</b>"],
57 |                         ["<u>Underline</u>"],
58 |                         ["<s>Strikethrough</s>"],
59 |                         ["Double Strikethough"],
60 |                         ["<sup>Superscript</sup>"],
61 |                         ["<sub>Subscript</sub>"],
62 |                         ['<span style="font-variant:small-caps">Small Caps</span>'],
63 |                         ['<span style="text-transform:uppercase">All Caps</span>'],
64 |                         [
65 |                             '<span style="background-color:yellow">'
66 |                             + "Highlighted yellow</span>"
67 |                         ],
68 |                         [
69 |                             '<span style="background-color:green">'
70 |                             + "Highlighted green</span>"
71 |                         ],
72 |                         ["<b>Consecutive</b>"],
73 |                         ["<b>Bold</b>"],
74 |                         ["<b>Paragraphs</b>"],
75 |                         [
76 |                             '<span style="font-size:24pt"><sub>Subscript </sub></span>',
77 |                             '<span style="font-size:24pt;font-variant:small-caps">'
78 |                             + "Small Caps </span>",
79 |                             '<span style="font-size:24pt;text-transform:uppercase">'
80 |                             + "All Caps </span>",
81 |                             '<span style="background-color:yellow;font-size:24pt">'
82 |                             + "Highlighted yellow </span>",
83 |                             '<span style="background-color:green;font-size:24pt">'
84 |                             + "Highlighted green</span>",
85 |                         ],
86 |                         [],
87 |                         [],
88 |                     ]
89 |                 ]
90 |             ]
91 |         ]
92 |         content.close()
93 | 


--------------------------------------------------------------------------------
/tests/test_slanted_quotes.py:
--------------------------------------------------------------------------------
 1 | """Test that Docx2Python extracts Word's slanted single and double quotes."""
 2 | 
 3 | from docx2python.main import docx2python
 4 | from tests.conftest import RESOURCES
 5 | 
 6 | 
 7 | class TestTiltedQuotes:
 8 |     """Confirming this works with v1.25"""
 9 | 
10 |     def test_exact_representation(self) -> None:
11 |         """Most characters are represented exactly"""
12 |         with docx2python(RESOURCES / "slanted_quotes.docx") as pars:
13 |             assert pars.text == "“double quote”\n\n‘single quote’\n\nApostrophe’s"
14 | 


--------------------------------------------------------------------------------
/tests/test_soft_line_breaks.py:
--------------------------------------------------------------------------------
 1 | """Represent a soft line break ``<w:br>`` as a newline within its paragraph
 2 | 
 3 | :author: Shay Hill
 4 | :created: 7/7/2021
 5 | 
 6 | Docx2Python previously ignored <w:br/> elements:
 7 | 
 8 |     ```
 9 |     pars = docx2python('soft_line_breaks.docx')
10 |     [[[[['Line1Line2Line3'], ['Line4'], []]], [[[]]]], [[[[]]]]]
11 |     ```
12 | """
13 | 
14 | from docx2python import docx2python
15 | from docx2python.iterators import iter_paragraphs
16 | from tests.conftest import RESOURCES
17 | 
18 | 
19 | class TestSoftLineBreaks:
20 |     def test_separate_pars(self):
21 |         """
22 |         Start a new paragraph when a <w:br/> element is found.
23 |         """
24 |         with docx2python(RESOURCES / "soft_line_breaks.docx") as content:
25 |             body = content.body
26 |         pars = [x for x in iter_paragraphs(body) if x]
27 |         assert pars == ["Line1\nLine2\nLine3", "Line4"]
28 | 


--------------------------------------------------------------------------------
/tests/test_strict.py:
--------------------------------------------------------------------------------
 1 | """A simple test for docx files saved with the strict menu option.
 2 | 
 3 | :author: Shay Hill
 4 | :created: 2024-07-02
 5 | """
 6 | 
 7 | from docx2python.main import docx2python
 8 | from tests.conftest import RESOURCES
 9 | 
10 | 
11 | class TestParagraphsOnly:
12 |     """Confirming this works with v1.25"""
13 | 
14 |     def test_paragraphs_only(self) -> None:
15 |         """Run without issue"""
16 |         pars = docx2python(RESOURCES / "strict.docx")
17 |         assert pars.document == [
18 |             [[["--\tBullet1", "--\tBullet2", "1)\tNumber1", "2)\tNumber2"]]],
19 |             [[["Cellaa"], ["Cellab"]], [["Cellba"], ["Cellbb"]]],
20 |             [[[""]]],
21 |         ]
22 | 


--------------------------------------------------------------------------------
/tests/test_symbols.py:
--------------------------------------------------------------------------------
 1 | """Test symbol extraction.
 2 | 
 3 | :author: Shay Hill
 4 | :created: 11/2/2021
 5 | 
 6 | Symbols are captured in the docx content files as ``<sym>`` elements.
 7 | 
 8 | ```
 9 |     <w:document>
10 |         <w:body>
11 |             <w:p>
12 |                 <w:r>
13 |                     <w:sym w:font="Symbol" w:char="F0F0"/>
14 |                 </w:r>
15 |             </w:p>
16 |         </w:body>
17 |     </w:document>
18 | ```
19 | """
20 | 
21 | from docx2python.main import docx2python
22 | from tests.conftest import RESOURCES
23 | 
24 | 
25 | def test_symbols() -> None:
26 |     """Export symbols as span elements."""
27 |     with docx2python(RESOURCES / "symbols.docx") as pars:
28 |         assert pars.text == (
29 |             "<span style=font-family:Webdings>&#x0068;</span>"
30 |             "≠"
31 |             "<span style=font-family:Symbol>&#x00F0;</span>"
32 |             "∞×÷≥≤±™®©¥£€µαβπΩ∑"
33 |             "<span style=font-family:Webdings>&#x004A;</span>"
34 |             "<span style=font-family:Webdings>&#x004B;</span>"
35 |             "<span style=font-family:Webdings>&#x0084;</span>"
36 |             "<span style=font-family:Webdings>&#x00E6;</span>"
37 |             "<span style=font-family:Webdings>&#x00DD;</span>"
38 |         )
39 | 
40 | 
41 | def test_symbols_with_html_true() -> None:
42 |     """Export symbols as span elements."""
43 |     with docx2python(RESOURCES / "symbols.docx", html=True) as pars:
44 |         assert pars.text == (
45 |             "<span style=font-family:Webdings>&#x0068;</span>"
46 |             "≠"
47 |             "<span style=font-family:Symbol>&#x00F0;</span>"
48 |             "∞×÷≥≤±™®©¥£€µαβπΩ∑"
49 |             "<span style=font-family:Webdings>&#x004A;</span>"
50 |             "<span style=font-family:Webdings>&#x004B;</span>"
51 |             "<span style=font-family:Webdings>&#x0084;</span>"
52 |             "<span style=font-family:Webdings>&#x00E6;</span>"
53 |             "<span style=font-family:Webdings>&#x00DD;</span>"
54 |         )
55 | 


--------------------------------------------------------------------------------
/tests/test_tables_to_markdown.py:
--------------------------------------------------------------------------------
 1 | """Test converting tables to markdown.
 2 | 
 3 | This is more of an example than an actual test, because I've had multiple requests
 4 | for tables as markdown. The new features in docx2python v3 make this straightforward.
 5 | 
 6 | :author: Shay Hill
 7 | :created: 2024-07-14
 8 | """
 9 | 
10 | from __future__ import annotations
11 | 
12 | from conftest import RESOURCES
13 | 
14 | from docx2python import docx2python
15 | from docx2python.depth_collector import Par
16 | from docx2python.iterators import is_tbl, iter_at_depth, iter_tables
17 | 
18 | 
19 | def _print_tc(cell: list[Par]) -> str:
20 |     """Print a table cell as a string on one line."""
21 |     ps = ["".join(p.run_strings).replace("\n", " ") for p in cell]
22 |     return "\n\n".join(ps)
23 | 
24 | 
25 | def _join_and_enclose_with_pipes(strings: list[str]) -> str:
26 |     """Join strings with pipes and enclose with pipes."""
27 |     return "|" + "|".join(strings) + "|"
28 | 
29 | 
30 | def _print_text(tbl: list[list[list[Par]]]) -> str:
31 |     """Text in this list [[[Par]]] is not a table. It's just text."""
32 |     all_cells = iter_at_depth(tbl, 2)
33 |     return "\n\n".join(_print_tc(tc) for tc in all_cells)
34 | 
35 | 
36 | def _print_tbl(tbl: list[list[list[Par]]]) -> str:
37 |     """Text in this list [[[Par]]] is a table."""
38 |     rows_as_string_lists = [[_print_tc(tc) for tc in tr] for tr in tbl]
39 |     rows_as_string_lists.insert(1, ["---"] * len(rows_as_string_lists[0]))
40 |     rows_as_strings = [
41 |         _join_and_enclose_with_pipes(row) for row in rows_as_string_lists
42 |     ]
43 |     return "\n".join(rows_as_strings)
44 | 
45 | 
46 | EXPECT = """This document has paragraphs.
47 | 
48 | |This|Document|
49 | |---|---|
50 | |Also|Has|
51 | |Tables||
52 | 
53 | There are paragraphs between tables. These are used to check the .lineage attribute of Par instances.
54 | 
55 | Here is another paragraph between the first and second tables.
56 | 
57 | |One  More  Table|
58 | |---|
59 | |One|
60 | |More|
61 | |Table|
62 | 
63 | """  # markdown the test expects for paragraphs_and_tables.docx
64 | 
65 | 
66 | def test_tables_to_markdown() -> None:
67 |     with docx2python(RESOURCES / "paragraphs_and_tables.docx") as extraction:
68 |         tables = extraction.document_pars
69 | 
70 |     as_text: list[str] = []
71 | 
72 |     for possible_table in iter_tables(tables):
73 |         if is_tbl(possible_table):
74 |             as_text.append(_print_tbl(possible_table))
75 |         else:
76 |             as_text.append(_print_text(possible_table))
77 | 
78 |     assert "\n\n".join(as_text) == EXPECT
79 | 


--------------------------------------------------------------------------------
/tests/test_text_runs.py:
--------------------------------------------------------------------------------
 1 | """Test functions in docx2python.text_runs.py
 2 | 
 3 | :author: Shay Hill
 4 | :created: 7/4/2019
 5 | """
 6 | 
 7 | from lxml import etree
 8 | 
 9 | from docx2python.attribute_register import XML2HTML_FORMATTER
10 | from docx2python.text_runs import gather_Pr, get_run_formatting, html_close, html_open
11 | from tests.helpers.utils import valid_xml
12 | 
13 | ONE_TEXT_RUN = valid_xml(
14 |     '<w:r w:rsidRPr="000E1B98">'
15 |     + "<w:rPr>"
16 |     + '<w:rFonts w:ascii="Arial"/>'
17 |     + "<w:b/>"
18 |     + "<w:u/>"
19 |     + "<w:i/>"
20 |     + '<w:sz w:val="32"/>'
21 |     + '<w:color w:val="red"/>'
22 |     + '<w:szCs w:val="32"/>'
23 |     + '<w:u w:val="single"/>'
24 |     + "</w:rPr>"
25 |     + "<w:t>text styled  with rPr"
26 |     + "</w:t>"
27 |     + "</w:r>"
28 | )
29 | 
30 | NO_STYLE_RUN = valid_xml(
31 |     '<w:r w:rsidRPr="000E1B98">' + "<w:t>no styles applies" + "</w:t>" + "</w:r>"
32 | )
33 | 
34 | 
35 | class TestGatherRpr:
36 |     """Test text_runs.gather_rPr"""
37 | 
38 |     def test_get_styles(self):
39 |         """Map styles to values."""
40 |         document = etree.fromstring(ONE_TEXT_RUN)
41 |         assert gather_Pr(document[0][0][0]) == {
42 |             "rFonts": None,
43 |             "b": None,
44 |             "u": "single",
45 |             "i": None,
46 |             "sz": "32",
47 |             "color": "red",
48 |             "szCs": "32",
49 |         }
50 | 
51 |     def test_no_styles(self):
52 |         """Return empty dict when no rPr for text run."""
53 |         document = etree.fromstring(NO_STYLE_RUN)
54 |         assert gather_Pr(document[0][0][0]) == {}
55 | 
56 | 
57 | class TestGetRunStyle:
58 |     """Test text_runs.get_run_style"""
59 | 
60 |     def test_font_and_others(self) -> None:
61 |         """Return font first, then other styles."""
62 |         document = etree.fromstring(ONE_TEXT_RUN)
63 |         assert get_run_formatting(document[0][0][0], XML2HTML_FORMATTER) == [
64 |             'span style="color:red;font-size:32pt"',
65 |             "b",
66 |             "i",
67 |             "u",
68 |         ]
69 | 
70 | 
71 | class TestStyleStrings:
72 |     """Test text_runs.style_open and text_runs.style_close"""
73 | 
74 |     def test_style_open(self) -> None:
75 |         """Produce valid html for all defined styles."""
76 |         style = ['span style="color:red"', "b", "i", "u"]
77 |         assert html_open(style) == '<span style="color:red"><b><i><u>'
78 | 
79 |     def test_style_close(self) -> None:
80 |         """Produce valid html for all defined styles."""
81 |         style = ['span style="color:red"', "b", "i", "u"]
82 |         assert html_close(style) == "</u></i></b></span>"
83 | 


--------------------------------------------------------------------------------
/tests/test_toc_support.py:
--------------------------------------------------------------------------------
 1 | """Testing Table of Contents support as requested by user leboni
 2 | 
 3 | :author: Shay Hill
 4 | :created: 8/19/2020
 5 | 
 6 | User leboni forwarded a docx file, `zen_of_python.docx` with Table of Contents.
 7 | Addressing issue
 8 | 
 9 | `KeyError: '{http://schemas.openxmlformats.org/officeDocument/2006/relationships}id'`
10 | 
11 | When attempting to extract content from such documents.
12 | 
13 | Two types of links in docx files. Internal links look like actual hyperlinks without
14 | an href.
15 | 
16 |     <w:hyperlink w:anchor="_Toc48296956" w:history="1">
17 |         <w:r w:rsidRPr="00810578">
18 |             <w:rPr>
19 |                 <w:rStyle w:val="Hyperlink"/>
20 |                 <w:noProof/>
21 |             </w:rPr>
22 |             <w:t>Beautiful is better than ugly.</w:t>
23 |         </w:r>
24 |     </w:hyperlink>
25 | """
26 | 
27 | from paragraphs import par
28 | 
29 | from docx2python.main import docx2python
30 | from tests.conftest import RESOURCES
31 | 
32 | 
33 | class TestTocText:
34 |     def test_get_toc_text(self) -> None:
35 |         """Extract header text from table-of-contents header."""
36 |         extraction = docx2python(RESOURCES / "zen_of_python.docx")
37 |         assert extraction.document_runs == [
38 |             [
39 |                 [[["Contents"], ["\t", "Beautiful is better than ugly.\t1"], []]],
40 |                 [
41 |                     [
42 |                         [],
43 |                         [],
44 |                         ["Beautiful is better than ugly."],
45 |                         ["Explicit is better than implicit."],
46 |                         ["Simple is better than complex."],
47 |                         ["Complex is better than complicated."],
48 |                         ["Flat is better than nested."],
49 |                         ["Sparse is better than dense."],
50 |                         ["Readability counts."],
51 |                         ["Special cases aren't special enough to break the rules."],
52 |                         ["Although practicality beats purity."],
53 |                         ["Errors should never pass silently."],
54 |                         ["Unless explicitly silenced."],
55 |                         ["In the face of ambiguity, refuse the temptation to guess."],
56 |                         [
57 |                             par(
58 |                                 """There should be one-- and preferably only one
59 |                                 --obvious way to do it."""
60 |                             )
61 |                         ],
62 |                         [
63 |                             par(
64 |                                 """Although that way may not be obvious at first
65 |                                 unless you're Dutch."""
66 |                             )
67 |                         ],
68 |                         ["Now is better than never."],
69 |                         ["Although never is often better than *right* now."],
70 |                         ["If the implementation is hard to explain, it's a bad idea."],
71 |                         [
72 |                             par(
73 |                                 """If the implementation is easy to explain, it may
74 |                                 be a good idea."""
75 |                             )
76 |                         ],
77 |                         [
78 |                             par(
79 |                                 """Namespaces are one honking great idea -- let's do
80 |                                 more of those!"""
81 |                             )
82 |                         ],
83 |                     ]
84 |                 ],
85 |             ]
86 |         ]
87 |         extraction.close()
88 | 


--------------------------------------------------------------------------------
/tests/test_utilities.py:
--------------------------------------------------------------------------------
  1 | """DocxReader object is able to open a docx file, search and replace text, then save.
  2 | 
  3 | :author: Shay Hill
  4 | :created: 2021-12-20
  5 | """
  6 | 
  7 | import os
  8 | import tempfile
  9 | 
 10 | from docx2python.main import docx2python
 11 | from docx2python.utilities import get_headings, get_links, replace_docx_text
 12 | from tests.conftest import RESOURCES
 13 | 
 14 | 
 15 | class TestSearchReplace:
 16 |     def test_search_and_replace(self) -> None:
 17 |         """Apples -> Pears, Pears -> Apples
 18 | 
 19 |         Ignore html differences when html is False"""
 20 | 
 21 |         # assert test file is in default state
 22 |         html = False
 23 |         input_filename = RESOURCES / "apples_and_pears.docx"
 24 |         expect = (
 25 |             "Apples and Pears\n\nPears and Apples\n\n"
 26 |             "Apples and Pears\n\nPears and Apples"
 27 |         )
 28 |         with docx2python(input_filename, html=html) as input_doc:
 29 |             result = input_doc.text
 30 |         assert result == expect
 31 | 
 32 |         # attempt a search and replace
 33 |         with tempfile.TemporaryDirectory() as temp_dir:
 34 |             output_filename = os.path.join(temp_dir, "pears_and_apples.docx")
 35 |             replace_docx_text(
 36 |                 input_filename,
 37 |                 output_filename,
 38 |                 ("Apples", "Bananas"),
 39 |                 ("Pears", "Apples"),
 40 |                 ("Bananas", "Pears"),
 41 |                 html=html,
 42 |             )
 43 |             expect = (
 44 |                 "Pears and Apples\n\nApples and Pears\n\n"
 45 |                 "Pears and Apples\n\nApples and Pears"
 46 |             )
 47 |             with docx2python(output_filename, html=html) as output_doc:
 48 |                 result = output_doc.text
 49 | 
 50 |             assert result == expect
 51 | 
 52 |     def test_ampersand(self) -> None:
 53 |         """Apples -> Pears, Pears -> Apples
 54 | 
 55 |         Replace text with an ampersand"""
 56 |         html = False
 57 |         input_filename = RESOURCES / "apples_and_pears.docx"
 58 | 
 59 |         with tempfile.TemporaryDirectory() as temp_dir:
 60 |             output_filename = os.path.join(temp_dir, "pears_and_apples.docx")
 61 |             replace_docx_text(
 62 |                 input_filename,
 63 |                 output_filename,
 64 |                 ("Apples", "Apples & Pears <>"),
 65 |                 html=html,
 66 |             )
 67 |             with docx2python(output_filename, html=html) as output_doc:
 68 |                 assert output_doc.text == (
 69 |                     "Apples & Pears <> and Pears\n\nPears and Apples & Pears <>\n\n"
 70 |                     "Apples & Pears <> and Pears\n\nPears and Apples & Pears <>"
 71 |                 )
 72 | 
 73 |     def test_search_and_replace_html(self) -> None:
 74 |         """Apples -> Pears, Pears -> Apples
 75 | 
 76 |         Exchange strings when formatting is consistent across the string. Leave
 77 |         alone otherwise.
 78 |         """
 79 |         html = True
 80 |         input_filename = RESOURCES / "apples_and_pears.docx"
 81 | 
 82 |         with tempfile.TemporaryDirectory() as temp_dir:
 83 |             output_filename = os.path.join(temp_dir, "pears_and_apples.docx")
 84 |             replace_docx_text(
 85 |                 input_filename,
 86 |                 output_filename,
 87 |                 ("Apples", "Bananas"),
 88 |                 ("Pears", "Apples"),
 89 |                 ("Bananas", "Pears"),
 90 |                 html=html,
 91 |             )
 92 |             with docx2python(output_filename, html=html) as output_doc:
 93 |                 assert output_doc.text == (
 94 |                     "Pears and Apples\n\n"
 95 |                     "Apples and Pears\n\n"
 96 |                     'Pears and <span style="background-color:green">Apples</span>\n\n'
 97 |                     "Pe<b>a</b>rs and Pears"
 98 |                 )
 99 | 
100 |     def test_search_and_replace_with_linebreaks(self) -> None:
101 |         """Apples -> Pears, Pears -> Apples
102 | 
103 |         Exchange strings when replacement has linebreaks.
104 |         """
105 |         html = True
106 |         input_filename = RESOURCES / "apples_and_pears.docx"
107 |         with tempfile.TemporaryDirectory() as temp_dir:
108 |             output_filename = os.path.join(temp_dir, "pears_and_apples.docx")
109 |             replace_docx_text(
110 |                 input_filename,
111 |                 output_filename,
112 |                 ("Apples", "Bananas"),
113 |                 ("Pears", "Apples\nPears\nGrapes"),
114 |                 ("Bananas", "Pears"),
115 |                 html=html,
116 |             )
117 |             with docx2python(output_filename, html=html) as output_doc:
118 |                 assert output_doc.text == (
119 |                     "Pears and Apples\nPears\nGrapes\n\n"
120 |                     "Apples\nPears\nGrapes and Pears\n\n"
121 |                     'Pears and <span style="background-color:green">'
122 |                     "Apples\nPears\nGrapes</span>\n\n"
123 |                     "Pe<b>a</b>rs and Pears"
124 |                 )
125 | 
126 | 
127 | def test_get_links() -> None:
128 |     """Return links as tuples"""
129 |     assert [x for x in get_links(RESOURCES / "merged_links.docx")] == [
130 |         ("https://www.shayallenhill.com", "hy"),
131 |         ("https://www.shayallenhill.com", "per"),
132 |         ("https://www.shayallenhill.com", "link"),
133 |         ("https://www.shayallenhill.com", "hyperlink"),
134 |     ]
135 | 
136 | 
137 | def test_get_headings() -> None:
138 |     """Return all headings (paragraphs with heading style) in document"""
139 |     assert [x for x in get_headings(RESOURCES / "example.docx")] == [
140 |         ["<h1>", "Heading 1", "</h1>"],
141 |         ["<h2>", "Heading 2", "</h2>"],
142 |     ]
143 | 


--------------------------------------------------------------------------------