├── .github ├── ISSUE_TEMPLATE.md ├── pull_request_template.md └── workflows │ └── tests.yml ├── .gitignore ├── LICENSE ├── NEWS ├── README.md ├── makefile ├── mammoth ├── __init__.py ├── cli.py ├── conversion.py ├── document_matchers.py ├── documents.py ├── docx │ ├── __init__.py │ ├── body_xml.py │ ├── comments_xml.py │ ├── complex_fields.py │ ├── content_types_xml.py │ ├── dingbats.py │ ├── document_xml.py │ ├── files.py │ ├── notes_xml.py │ ├── numbering_xml.py │ ├── office_xml.py │ ├── relationships_xml.py │ ├── style_map.py │ ├── styles_xml.py │ ├── uris.py │ └── xmlparser.py ├── html │ ├── __init__.py │ └── nodes.py ├── html_paths.py ├── images.py ├── lists.py ├── options.py ├── raw_text.py ├── results.py ├── styles │ ├── __init__.py │ └── parser │ │ ├── __init__.py │ │ ├── document_matcher_parser.py │ │ ├── errors.py │ │ ├── html_path_parser.py │ │ ├── style_mapping_parser.py │ │ ├── token_iterator.py │ │ ├── token_parser.py │ │ └── tokeniser.py ├── transforms.py ├── underline.py ├── writers │ ├── __init__.py │ ├── abc.py │ ├── html.py │ └── markdown.py └── zips.py ├── pyproject.toml ├── recipes └── wmf_images.py ├── setup.cfg ├── setup.py ├── test-requirements.txt ├── tests ├── __init__.py ├── cli_tests.py ├── conftest.py ├── conversion_tests.py ├── docx │ ├── __init__.py │ ├── body_xml_tests.py │ ├── comments_xml_tests.py │ ├── content_types_xml_tests.py │ ├── document_matchers.py │ ├── document_xml_tests.py │ ├── docx_tests.py │ ├── files_tests.py │ ├── notes_xml_tests.py │ ├── numbering_xml_tests.py │ ├── office_xml_tests.py │ ├── relationships_xml_tests.py │ ├── style_map_tests.py │ ├── styles_xml_tests.py │ ├── uris_tests.py │ └── xmlparser_tests.py ├── html │ ├── __init__.py │ ├── collapse_tests.py │ └── strip_empty_tests.py ├── images_tests.py ├── lists_tests.py ├── mammoth_tests.py ├── options_tests.py ├── raw_text_tests.py ├── styles │ ├── __init__.py │ ├── document_matcher_tests.py │ └── parser │ │ ├── __init__.py │ │ ├── 
document_matcher_parser_tests.py │ │ ├── html_path_parser_tests.py │ │ ├── style_mapping_parser_tests.py │ │ ├── token_parser_tests.py │ │ └── tokeniser_tests.py ├── test-data │ ├── comments.docx │ ├── embedded-style-map.docx │ ├── empty.docx │ ├── endnotes.docx │ ├── external-picture.docx │ ├── footnote-hyperlink.docx │ ├── footnotes.docx │ ├── hyperlinks │ │ └── word │ │ │ ├── _rels │ │ │ └── document.xml.rels │ │ │ └── document.xml │ ├── simple-list.docx │ ├── simple │ │ └── word │ │ │ └── document.xml │ ├── single-paragraph.docx │ ├── strict-format.docx │ ├── strikethrough.docx │ ├── tables.docx │ ├── text-box.docx │ ├── tiny-picture-target-base-relative.docx │ ├── tiny-picture.docx │ ├── tiny-picture.png │ ├── underline.docx │ └── utf8-bom.docx ├── testing.py ├── transforms_tests.py ├── writers │ ├── __init__.py │ └── markdown_tests.py └── zips_tests.py └── tox.ini /.github/ISSUE_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | If you're reporting a bug or requesting a feature, please include: 2 | * a minimal example document 3 | * the HTML output that you'd expect 4 | 5 | If you're reporting a bug, it's also useful to know what platform you're 6 | running on, including: 7 | 8 | * the version of Python 9 | * the operating system and version 10 | -------------------------------------------------------------------------------- /.github/pull_request_template.md: -------------------------------------------------------------------------------- 1 | In general, pull requests are not currently accepted. 2 | 3 | Please instead submit an issue if you find a bug or would like to request a feature. 
4 | -------------------------------------------------------------------------------- /.github/workflows/tests.yml: -------------------------------------------------------------------------------- 1 | name: Tests 2 | 3 | on: [push, pull_request] 4 | 5 | jobs: 6 | build: 7 | runs-on: ubuntu-22.04 8 | 9 | strategy: 10 | matrix: 11 | python-version: ["3.7", "3.8", "3.9", "3.10", "3.11", "3.12", "pypy3.9"] 12 | 13 | steps: 14 | 15 | - uses: actions/checkout@v4 16 | 17 | - name: Use Python ${{ matrix.python-version }} 18 | uses: actions/setup-python@v5 19 | with: 20 | python-version: ${{ matrix.python-version }} 21 | 22 | - run: pip install tox 23 | 24 | - run: make README 25 | 26 | - run: tox -e py 27 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | /README 3 | /_virtualenv 4 | /*.egg-info 5 | /.tox 6 | /MANIFEST 7 | /build 8 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2013, Michael Williamson 2 | All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without 5 | modification, are permitted provided that the following conditions are met: 6 | 7 | 1. Redistributions of source code must retain the above copyright notice, this 8 | list of conditions and the following disclaimer. 9 | 2. Redistributions in binary form must reproduce the above copyright notice, 10 | this list of conditions and the following disclaimer in the documentation 11 | and/or other materials provided with the distribution. 12 | 13 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 14 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 15 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 16 | DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR 17 | ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 18 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 19 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 20 | ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 21 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 22 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 23 | -------------------------------------------------------------------------------- /NEWS: -------------------------------------------------------------------------------- 1 | # 1.10.0 2 | 3 | * Add "Heading" and "Body" styles, as found in documents created by Apple Pages, 4 | to the default style map. 5 | 6 | * Handle structured document tags representing checkboxes wrapped in other 7 | elements, such as table cells. Previously, the wrapping elements would have 8 | been ignored. 9 | 10 | * Ignore deleted table rows. 11 | 12 | # 1.9.1 13 | 14 | * Ignore AlternateContent elements when there is no Fallback element. 15 | 16 | # 1.9.0 17 | 18 | * Detect checkboxes, both as complex fields and structured document tags, and 19 | convert them to checkbox inputs. 20 | 21 | * Ignore AlternateContent elements when there is no Fallback element. 22 | 23 | # 1.8.0 24 | 25 | * Add style mapping for highlights. 26 | 27 | # 1.7.1 28 | 29 | * Switch the precedence of numbering properties in paragraph properties and the 30 | numbering in paragraph styles so that the numbering properties in paragraph 31 | properties take precedence. 32 | 33 | # 1.7.0 34 | 35 | * Support attributes in HTML paths in style mappings. 36 | 37 | * Improve error message when failing to find the body element in a document. 38 | 39 | * Drop support for Python 2.7, Python 3.5 and Python 3.6. 40 | 41 | * Add support for the strict document format. 
42 | 43 | # 1.6.0 44 | 45 | * Support merged paragraphs when revisions are tracked. 46 | 47 | # 1.5.1 48 | 49 | * Add a pyproject.toml to add an explicit build dependency on setuptools. 50 | 51 | # 1.5.0 52 | 53 | * Only use the alt text of image elements as a fallback. If an alt attribute is 54 | returned from the function passed to mammoth.images.img_element, that value 55 | will now be preferred to the alt text of the image element. 56 | 57 | # 1.4.19 58 | 59 | * Ignore w:u elements when w:val is missing. 60 | 61 | # 1.4.18 62 | 63 | * Emit warning instead of throwing exception when image file cannot be found for 64 | a:blip elements. 65 | 66 | # 1.4.17 67 | 68 | * When extracting raw text, convert tab elements to tab characters. 69 | 70 | * Handle internal hyperlinks created with complex fields. 71 | 72 | # 1.4.16 73 | 74 | * Handle w:num with invalid w:abstractNumId. 75 | 76 | # 1.4.15 77 | 78 | * Convert symbols in supported fonts to corresponding Unicode characters. 79 | 80 | # 1.4.14 81 | 82 | * Support numbering defined by paragraph style. 83 | 84 | # 1.4.13 85 | 86 | * Add style mapping for all caps. 87 | 88 | # 1.4.12 89 | 90 | * Handle underline elements where w:val is "none". 91 | 92 | # 1.4.11 93 | 94 | * Read font size for runs. 95 | * Support soft hyphens. 96 | 97 | # 1.4.10 98 | 99 | * Update supported Python versions to 2.7 and 3.4 to 3.8. 100 | 101 | # 1.4.9 102 | 103 | * Improve list support by following w:numStyleLink in w:abstractNum. 104 | 105 | # 1.4.8 106 | 107 | * Preserve empty table rows. 108 | 109 | # 1.4.7 110 | 111 | * Always write files as UTF-8 in the CLI. 112 | 113 | # 1.4.6 114 | 115 | * Fix: default style mappings caused footnotes, endnotes and comments 116 | containing multiple paragraphs to be converted into a single paragraph. 117 | 118 | # 1.4.5 119 | 120 | * Read the children of v:rect elements. 121 | 122 | # 1.4.4 123 | 124 | * Parse paragraph indents. 125 | 126 | * Read part paths using relationships. 
This improves support for documents 127 | created by Word Online. 128 | 129 | # 1.4.3 130 | 131 | * Add style mapping for small caps. 132 | 133 | * Add style mapping for tables. 134 | 135 | # 1.4.2 136 | 137 | * Read children of v:group elements. 138 | 139 | # 1.4.1 140 | 141 | * Read w:noBreakHyphen elements as non-breaking hyphen characters. 142 | 143 | # 1.4.0 144 | 145 | * Extract the default data URI image converter to the images module. 146 | 147 | * Add anchor on hyperlinks as fragment if present. 148 | 149 | * Convert target frames on hyperlinks to targets on anchors. 150 | 151 | * Detect header rows in tables and convert to thead > tr > th. 152 | 153 | # 1.3.5 154 | 155 | * Handle complex fields that do not have a "separate" fldChar. 156 | 157 | # 1.3.4 158 | 159 | * Add transforms.run. 160 | 161 | # 1.3.3 162 | 163 | * Read children of w:object elements. 164 | 165 | * Add support for document transforms. 166 | 167 | # 1.3.2 168 | 169 | * Handle hyperlinks created with complex fields. 170 | 171 | # 1.3.1 172 | 173 | * Handle absolute paths within zip files. This should fix an issue where some 174 | images within a document couldn't be found. 175 | 176 | # 1.3.0 177 | 178 | * Allow style names to be mapped by prefix. For instance: 179 | 180 | r[style-name^='Code '] => code 181 | 182 | * Add default style mappings for Heading 5 and Heading 6. 183 | 184 | * Allow escape sequences in style IDs, style names and CSS class names. 185 | 186 | * Allow a separator to be specified when HTML elements are collapsed. 187 | 188 | * Add include_embedded_style_map argument to allow embedded style maps to be 189 | disabled. 190 | 191 | * Include embedded styles when explicit style map is passed. 192 | 193 | # 1.2.2 194 | 195 | * Ignore bold, italic, underline and strikethrough elements that have a value of 196 | false or 0. 197 | 198 | # 1.2.1 199 | 200 | * Ignore v:imagedata elements without relationship ID with warning. 
201 | 202 | # 1.2.0 203 | 204 | * Use alt text title as alt text for images when the alt text description is 205 | blank or missing. 206 | 207 | # 1.1.1 208 | 209 | * Handle comments without author initials. 210 | 211 | * Change numbering of comments to be global rather than per-user to match the 212 | behaviour of Word. 213 | 214 | # 1.1.0 215 | 216 | * Add support for comments. 217 | 218 | # 1.0.4 219 | 220 | * Add support for w:sdt elements. This allows the bodies of content controls, 221 | such as bibliographies, to be converted. 222 | 223 | # 1.0.3 224 | 225 | * Add support for table cells spanning multiple rows. 226 | 227 | # 1.0.2 228 | 229 | * Add support for table cells spanning multiple columns. 230 | 231 | # 1.0.1 232 | 233 | * Improve script installation on Windows by using entry_points instead of 234 | scripts in setup.py. 235 | 236 | # 1.0.0 237 | 238 | * Remove deprecated convert_underline argument. 239 | 240 | * Officially support ID prefixes. 241 | 242 | * Generated IDs no longer insert a hyphen after the ID prefix. 243 | 244 | * The default ID prefix is now the empty string rather than a random number 245 | followed by a hyphen. 246 | 247 | * Rename mammoth.images.inline to mammoth.images.img_element to better reflect 248 | its behaviour. 249 | 250 | # 0.3.31 251 | 252 | * Improve collapsing of similar non-fresh HTML elements. 253 | 254 | # 0.3.30 255 | 256 | * Allow bold and italic style mappings to be configured. 257 | 258 | # 0.3.29 259 | 260 | * Handle references to missing styles when reading documents. 261 | 262 | # 0.3.28 263 | 264 | * Improve support for lists made in LibreOffice. Specifically, this changes the 265 | default style mapping for paragraphs with a style of "Normal" to have the 266 | lowest precedence. 267 | 268 | # 0.3.27 269 | 270 | * Handle XML where the child nodes of an element contain text nodes. 271 | 272 | # 0.3.26 273 | 274 | * Always use mc:Fallback when reading mc:AlternateContent elements. 
275 | 276 | # 0.3.25 277 | 278 | * Remove duplicate messages from results. 279 | 280 | * Read v:imagedata with r:id attribute. 281 | 282 | * Read children of v:roundrect. 283 | 284 | * Ignore office-word:wrap, v:shadow and v:shapetype. 285 | 286 | # 0.3.24 287 | 288 | * Continue with warning if external images cannot be found. 289 | 290 | * Add support for embedded style maps. 291 | 292 | # 0.3.23 293 | 294 | * Fix Python 3 support. 295 | 296 | # 0.3.22 297 | 298 | * Generate warnings for not-understood style mappings and continue, rather than 299 | stopping with an error. 300 | 301 | * Support file objects without a name attribute again (broken since 0.3.20). 302 | 303 | # 0.3.21 304 | 305 | * Ignore w:numPr elements without w:numId or w:ilvl children. 306 | 307 | # 0.3.20 308 | 309 | * Add support for linked images. 310 | 311 | # 0.3.19 312 | 313 | * Fix: cannot extract raw text from elements without children 314 | 315 | # 0.3.18 316 | 317 | * Support links and images in footnotes and endnotes. 318 | 319 | # 0.3.17 320 | 321 | * Add support for underlines in style map. 322 | 323 | * Add support for strikethrough. 324 | 325 | # 0.3.16 326 | 327 | * Add basic support for text boxes. The contents of the text box are treated as 328 | a separate paragraph that appears after the paragraph containing the text box. 329 | 330 | # 0.3.15 331 | 332 | * Support styles defined without a name 333 | 334 | # 0.3.14 335 | 336 | * Add ignore_empty_paragraphs option, which defaults to True. 337 | 338 | # 0.3.13 339 | 340 | * Always use forward slashes in ZIP paths. This should fix image handling on 341 | Windows. 342 | 343 | # 0.3.12 344 | 345 | * Make style names case-insensitive in style mappings. This should make style 346 | mappings easier to write, especially since Microsoft Word sometimes represents 347 | style names in the UI differently from in the style definition. For instance, 348 | the style displayed in Word as "Heading 1" has a style name of "heading 1". 
349 | This hopefully shouldn't cause an issue for anyone, but if you were relying 350 | on case-sensitivity, please do get in touch. 351 | 352 | # 0.3.11 353 | 354 | * Add support for hyperlinks to bookmarks in the same document. 355 | 356 | # 0.3.10 357 | 358 | * Add basic support for Markdown. Not all features are currently supported. 359 | 360 | # 0.3.9 361 | 362 | * Add default style mappings for builtin footnote and endnote styles in 363 | Microsoft Word and LibreOffice. 364 | 365 | * Allow style mappings with a zero-element HTML path. 366 | 367 | * Emit warnings when image types are unlikely to be supported by web browsers. 368 | 369 | # 0.3.8 370 | 371 | * Add support for endnotes. 372 | 373 | # 0.3.7 374 | 375 | * Add support for superscript and subscript text. 376 | 377 | # 0.3.6 378 | 379 | * Add support for footnotes. 380 | 381 | # 0.3.5 382 | 383 | * Add support for line breaks. 384 | 385 | # 0.3.4 386 | 387 | * Add optional underline conversion. 388 | 389 | # 0.3.3 390 | 391 | * Add `mammoth.images.inline`, and document custom image conversion. 392 | 393 | # 0.3.2 394 | 395 | * Add the function `mammoth.extract_raw_text`. 396 | 397 | # 0.3.1 398 | 399 | * Add support for tables 400 | 401 | # 0.3.0 402 | 403 | * Rename --styles CLI argument to --style-map. 404 | 405 | * Rename styles argument in convert_to_html to style_map. 406 | 407 | * Allow paragraphs and runs to be matched by style name. For instance, to match 408 | a paragraph with the style name `Heading 1`: 409 | 410 | p[style-name='Heading 1'] 411 | -------------------------------------------------------------------------------- /makefile: -------------------------------------------------------------------------------- 1 | .PHONY: test 2 | 3 | test: 4 | _virtualenv/bin/pyflakes mammoth tests 5 | sh -c '. 
_virtualenv/bin/activate; py.test tests' 6 | 7 | .PHONY: test-all 8 | 9 | test-all: 10 | tox 11 | 12 | .PHONY: upload 13 | 14 | upload: setup assert-converted-readme build-dist 15 | _virtualenv/bin/twine upload dist/* 16 | make clean 17 | 18 | .PHONY: build-dist 19 | 20 | build-dist: 21 | rm -rf dist 22 | _virtualenv/bin/pyproject-build 23 | 24 | README: README.md 25 | pandoc --from=markdown --to=rst README.md > README || cp README.md README 26 | 27 | .PHONY: assert-converted-readme 28 | 29 | assert-converted-readme: 30 | test "`cat README`" != "`cat README.md`" 31 | 32 | .PHONY: clean 33 | 34 | clean: 35 | rm -f README 36 | rm -f MANIFEST 37 | rm -rf dist 38 | 39 | .PHONY: bootstrap 40 | 41 | bootstrap: _virtualenv setup 42 | _virtualenv/bin/pip install -e . 43 | ifneq ($(wildcard test-requirements.txt),) 44 | _virtualenv/bin/pip install -r test-requirements.txt 45 | endif 46 | make clean 47 | 48 | .PHONY: setup 49 | 50 | setup: README 51 | 52 | _virtualenv: 53 | python3 -m venv _virtualenv 54 | _virtualenv/bin/pip install --upgrade pip 55 | _virtualenv/bin/pip install --upgrade setuptools 56 | _virtualenv/bin/pip install --upgrade wheel 57 | _virtualenv/bin/pip install --upgrade build twine 58 | -------------------------------------------------------------------------------- /mammoth/__init__.py: -------------------------------------------------------------------------------- 1 | from . 
import docx, conversion, options, images, transforms, underline 2 | from .raw_text import extract_raw_text_from_element 3 | from .docx.style_map import write_style_map, read_style_map 4 | 5 | __all__ = ["convert_to_html", "extract_raw_text", "images", "transforms", "underline"] 6 | 7 | 8 | _undefined = object() 9 | 10 | 11 | def convert_to_html(*args, **kwargs): 12 | return convert(*args, output_format="html", **kwargs) 13 | 14 | 15 | def convert_to_markdown(*args, **kwargs): 16 | return convert(*args, output_format="markdown", **kwargs) 17 | 18 | 19 | def convert(fileobj, transform_document=None, id_prefix=None, include_embedded_style_map=_undefined, **kwargs): 20 | if include_embedded_style_map is _undefined: 21 | include_embedded_style_map = True 22 | if transform_document is None: 23 | transform_document = lambda x: x 24 | if include_embedded_style_map: 25 | kwargs["embedded_style_map"] = read_style_map(fileobj) 26 | return options.read_options(kwargs).bind(lambda convert_options: 27 | docx.read(fileobj).map(transform_document).bind(lambda document: 28 | conversion.convert_document_element_to_html( 29 | document, 30 | id_prefix=id_prefix, 31 | **convert_options 32 | ) 33 | ) 34 | ) 35 | 36 | 37 | def extract_raw_text(fileobj): 38 | return docx.read(fileobj).map(extract_raw_text_from_element) 39 | 40 | 41 | def embed_style_map(fileobj, style_map): 42 | write_style_map(fileobj, style_map) 43 | 44 | def read_embedded_style_map(fileobj): 45 | return read_style_map(fileobj) 46 | -------------------------------------------------------------------------------- /mammoth/cli.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import io 3 | import os 4 | import shutil 5 | import sys 6 | 7 | import mammoth 8 | from . 
import writers 9 | 10 | 11 | def main(): 12 | args = _parse_args() 13 | 14 | if args.style_map is None: 15 | style_map = None 16 | else: 17 | with open(args.style_map) as style_map_fileobj: 18 | style_map = style_map_fileobj.read() 19 | 20 | with open(args.path, "rb") as docx_fileobj: 21 | if args.output_dir is None: 22 | convert_image = None 23 | output_path = args.output 24 | else: 25 | convert_image = mammoth.images.img_element(ImageWriter(args.output_dir)) 26 | output_filename = "{0}.html".format(os.path.basename(args.path).rpartition(".")[0]) 27 | output_path = os.path.join(args.output_dir, output_filename) 28 | 29 | result = mammoth.convert( 30 | docx_fileobj, 31 | style_map=style_map, 32 | convert_image=convert_image, 33 | output_format=args.output_format, 34 | ) 35 | for message in result.messages: 36 | sys.stderr.write(message.message) 37 | sys.stderr.write("\n") 38 | 39 | _write_output(output_path, result.value) 40 | 41 | 42 | class ImageWriter(object): 43 | def __init__(self, output_dir): 44 | self._output_dir = output_dir 45 | self._image_number = 1 46 | 47 | def __call__(self, element): 48 | extension = element.content_type.partition("/")[2] 49 | image_filename = "{0}.{1}".format(self._image_number, extension) 50 | with open(os.path.join(self._output_dir, image_filename), "wb") as image_dest: 51 | with element.open() as image_source: 52 | shutil.copyfileobj(image_source, image_dest) 53 | 54 | self._image_number += 1 55 | 56 | return {"src": image_filename} 57 | 58 | 59 | def _write_output(path, contents): 60 | if path is None: 61 | if sys.version_info[0] <= 2: 62 | stdout = sys.stdout 63 | else: 64 | stdout = sys.stdout.buffer 65 | 66 | stdout.write(contents.encode("utf-8")) 67 | stdout.flush() 68 | else: 69 | with io.open(path, "w", encoding="utf-8") as fileobj: 70 | fileobj.write(contents) 71 | 72 | 73 | def _parse_args(): 74 | parser = argparse.ArgumentParser() 75 | parser.add_argument( 76 | "path", 77 | metavar="docx-path", 78 | help="Path to the 
.docx file to convert.") 79 | 80 | output_group = parser.add_mutually_exclusive_group() 81 | output_group.add_argument( 82 | "output", 83 | nargs="?", 84 | metavar="output-path", 85 | help="Output path for the generated document. Images will be stored inline in the output document. Output is written to stdout if not set.") 86 | output_group.add_argument( 87 | "--output-dir", 88 | help="Output directory for generated HTML and images. Images will be stored in separate files. Mutually exclusive with output-path.") 89 | 90 | parser.add_argument( 91 | "--output-format", 92 | required=False, 93 | choices=writers.formats(), 94 | help="Output format.") 95 | parser.add_argument( 96 | "--style-map", 97 | required=False, 98 | help="File containg a style map.") 99 | return parser.parse_args() 100 | 101 | 102 | if __name__ == "__main__": 103 | main() 104 | 105 | -------------------------------------------------------------------------------- /mammoth/document_matchers.py: -------------------------------------------------------------------------------- 1 | import collections 2 | 3 | import cobble 4 | 5 | 6 | def paragraph(style_id=None, style_name=None, numbering=None): 7 | return ParagraphMatcher(style_id, style_name, numbering) 8 | 9 | 10 | ParagraphMatcher = collections.namedtuple("ParagraphMatcher", ["style_id", "style_name", "numbering"]) 11 | ParagraphMatcher.element_type = "paragraph" 12 | 13 | 14 | def run(style_id=None, style_name=None): 15 | return RunMatcher(style_id, style_name) 16 | 17 | 18 | RunMatcher = collections.namedtuple("RunMatcher", ["style_id", "style_name"]) 19 | RunMatcher.element_type = "run" 20 | 21 | 22 | def table(style_id=None, style_name=None): 23 | return TableMatcher(style_id, style_name) 24 | 25 | 26 | TableMatcher = collections.namedtuple("TableMatcher", ["style_id", "style_name"]) 27 | TableMatcher.element_type = "table" 28 | 29 | 30 | class bold(object): 31 | element_type = "bold" 32 | 33 | 34 | class italic(object): 35 | element_type = 
"italic" 36 | 37 | 38 | class underline(object): 39 | element_type = "underline" 40 | 41 | 42 | class strikethrough(object): 43 | element_type = "strikethrough" 44 | 45 | 46 | class all_caps(object): 47 | element_type = "all_caps" 48 | 49 | 50 | class small_caps(object): 51 | element_type = "small_caps" 52 | 53 | 54 | def highlight(color=None): 55 | return HighlightMatcher(color=color) 56 | 57 | 58 | HighlightMatcher = collections.namedtuple("HighlightMatcher", ["color"]) 59 | HighlightMatcher.element_type = "highlight" 60 | 61 | class comment_reference(object): 62 | element_type = "comment_reference" 63 | 64 | 65 | BreakMatcher = collections.namedtuple("BreakMatcher", ["break_type"]) 66 | BreakMatcher.element_type = "break" 67 | 68 | 69 | line_break = BreakMatcher("line") 70 | page_break = BreakMatcher("page") 71 | column_break = BreakMatcher("column") 72 | 73 | 74 | def equal_to(value): 75 | return StringMatcher(_operator_equal_to, value) 76 | 77 | 78 | def _operator_equal_to(first, second): 79 | return first.upper() == second.upper() 80 | 81 | 82 | def starts_with(value): 83 | return StringMatcher(_operator_starts_with, value) 84 | 85 | def _operator_starts_with(first, second): 86 | return second.upper().startswith(first.upper()) 87 | 88 | 89 | @cobble.data 90 | class StringMatcher(object): 91 | operator = cobble.field() 92 | value = cobble.field() 93 | 94 | def matches(self, other): 95 | return self.operator(self.value, other) 96 | -------------------------------------------------------------------------------- /mammoth/documents.py: -------------------------------------------------------------------------------- 1 | import cobble 2 | 3 | 4 | class Element(object): 5 | def copy(self, **kwargs): 6 | return cobble.copy(self, **kwargs) 7 | 8 | 9 | class HasChildren(Element): 10 | children = cobble.field() 11 | 12 | 13 | @cobble.data 14 | class Document(HasChildren): 15 | notes = cobble.field() 16 | comments = cobble.field() 17 | 18 | @cobble.data 19 | class 
Paragraph(HasChildren): 20 | style_id = cobble.field() 21 | style_name = cobble.field() 22 | numbering = cobble.field() 23 | alignment = cobble.field() 24 | indent = cobble.field() 25 | 26 | 27 | @cobble.data 28 | class ParagraphIndent(object): 29 | start = cobble.field() 30 | end = cobble.field() 31 | first_line = cobble.field() 32 | hanging = cobble.field() 33 | 34 | 35 | @cobble.data 36 | class Indent(object): 37 | left = cobble.field() 38 | right = cobble.field() 39 | first_line = cobble.field() 40 | hanging = cobble.field() 41 | 42 | 43 | @cobble.data 44 | class Run(HasChildren): 45 | style_id = cobble.field() 46 | style_name = cobble.field() 47 | is_bold = cobble.field() 48 | is_italic = cobble.field() 49 | is_underline = cobble.field() 50 | is_strikethrough = cobble.field() 51 | is_all_caps = cobble.field() 52 | is_small_caps = cobble.field() 53 | vertical_alignment = cobble.field() 54 | font = cobble.field() 55 | font_size = cobble.field() 56 | highlight = cobble.field() 57 | 58 | @cobble.data 59 | class Text(Element): 60 | value = cobble.field() 61 | 62 | @cobble.data 63 | class Hyperlink(HasChildren): 64 | href = cobble.field() 65 | anchor = cobble.field() 66 | target_frame = cobble.field() 67 | 68 | @cobble.data 69 | class Checkbox(Element): 70 | checked = cobble.field() 71 | 72 | checkbox = Checkbox 73 | 74 | @cobble.data 75 | class Table(HasChildren): 76 | style_id = cobble.field() 77 | style_name = cobble.field() 78 | 79 | @cobble.data 80 | class TableRow(HasChildren): 81 | is_header = cobble.field() 82 | 83 | @cobble.data 84 | class TableCell(HasChildren): 85 | colspan = cobble.field() 86 | rowspan = cobble.field() 87 | 88 | @cobble.data 89 | class TableCellUnmerged: 90 | children = cobble.field() 91 | colspan = cobble.field() 92 | rowspan = cobble.field() 93 | vmerge = cobble.field() 94 | 95 | def copy(self, **kwargs): 96 | return cobble.copy(self, **kwargs) 97 | 98 | @cobble.data 99 | class Break(Element): 100 | break_type = cobble.field() 101 | 
102 | line_break = Break("line") 103 | page_break = Break("page") 104 | column_break = Break("column") 105 | 106 | 107 | @cobble.data 108 | class Tab(Element): 109 | pass 110 | 111 | 112 | @cobble.data 113 | class Image(Element): 114 | alt_text = cobble.field() 115 | content_type = cobble.field() 116 | open = cobble.field() 117 | 118 | 119 | def document(children, notes=None, comments=None): 120 | if notes is None: 121 | notes = Notes({}) 122 | if comments is None: 123 | comments = [] 124 | return Document(children, notes, comments=comments) 125 | 126 | def paragraph(children, style_id=None, style_name=None, numbering=None, alignment=None, indent=None): 127 | if indent is None: 128 | indent = paragraph_indent() 129 | 130 | return Paragraph(children, style_id, style_name, numbering, alignment=alignment, indent=indent) 131 | 132 | def paragraph_indent(start=None, end=None, first_line=None, hanging=None): 133 | return ParagraphIndent(start=start, end=end, first_line=first_line, hanging=hanging) 134 | 135 | def run( 136 | children, 137 | style_id=None, 138 | style_name=None, 139 | is_bold=None, 140 | is_italic=None, 141 | is_underline=None, 142 | is_strikethrough=None, 143 | is_all_caps=None, 144 | is_small_caps=None, 145 | vertical_alignment=None, 146 | font=None, 147 | font_size=None, 148 | highlight=None, 149 | ): 150 | if vertical_alignment is None: 151 | vertical_alignment = VerticalAlignment.baseline 152 | return Run( 153 | children=children, 154 | style_id=style_id, 155 | style_name=style_name, 156 | is_bold=bool(is_bold), 157 | is_italic=bool(is_italic), 158 | is_underline=bool(is_underline), 159 | is_strikethrough=bool(is_strikethrough), 160 | is_all_caps=bool(is_all_caps), 161 | is_small_caps=bool(is_small_caps), 162 | vertical_alignment=vertical_alignment, 163 | font=font, 164 | font_size=font_size, 165 | highlight=highlight, 166 | ) 167 | 168 | class VerticalAlignment(object): 169 | baseline = "baseline" 170 | superscript = "superscript" 171 | subscript = 
"subscript" 172 | 173 | text = Text 174 | 175 | _tab = Tab() 176 | 177 | def tab(): 178 | return _tab 179 | 180 | 181 | image = Image 182 | 183 | def hyperlink(children, href=None, anchor=None, target_frame=None): 184 | return Hyperlink(href=href, anchor=anchor, target_frame=target_frame, children=children) 185 | 186 | 187 | @cobble.data 188 | class Bookmark(Element): 189 | name = cobble.field() 190 | 191 | bookmark = Bookmark 192 | 193 | 194 | def table(children, style_id=None, style_name=None): 195 | return Table(children=children, style_id=style_id, style_name=style_name) 196 | 197 | def table_row(children, is_header=None): 198 | return TableRow(children=children, is_header=bool(is_header)) 199 | 200 | def table_cell(children, colspan=None, rowspan=None): 201 | if colspan is None: 202 | colspan = 1 203 | if rowspan is None: 204 | rowspan = 1 205 | return TableCell(children=children, colspan=colspan, rowspan=rowspan) 206 | 207 | def table_cell_unmerged(children, colspan, rowspan, vmerge): 208 | return TableCellUnmerged(children=children, colspan=colspan, rowspan=rowspan, vmerge=vmerge) 209 | 210 | def numbering_level(level_index, is_ordered): 211 | return _NumberingLevel(str(level_index), bool(is_ordered)) 212 | 213 | @cobble.data 214 | class _NumberingLevel(object): 215 | level_index = cobble.field() 216 | is_ordered = cobble.field() 217 | 218 | @cobble.data 219 | class Note(Element): 220 | note_type = cobble.field() 221 | note_id = cobble.field() 222 | body = cobble.field() 223 | 224 | 225 | note = Note 226 | 227 | 228 | class Notes(object): 229 | def __init__(self, notes): 230 | self._notes = notes 231 | 232 | def find_note(self, note_type, note_id): 233 | return self._notes[(note_type, note_id)] 234 | 235 | def resolve(self, reference): 236 | return self.find_note(reference.note_type, reference.note_id) 237 | 238 | def __eq__(self, other): 239 | return isinstance(other, Notes) and self._notes == other._notes 240 | 241 | def __ne__(self, other): 242 | return 
not (self == other) 243 | 244 | def notes(notes_list): 245 | return Notes(dict( 246 | (_note_key(note), note) 247 | for note in notes_list 248 | )) 249 | 250 | def _note_key(note): 251 | return (note.note_type, note.note_id) 252 | 253 | @cobble.data 254 | class NoteReference(Element): 255 | note_type = cobble.field() 256 | note_id = cobble.field() 257 | 258 | note_reference = NoteReference 259 | 260 | 261 | @cobble.data 262 | class Comment(object): 263 | comment_id = cobble.field() 264 | body = cobble.field() 265 | author_name = cobble.field() 266 | author_initials = cobble.field() 267 | 268 | def comment(comment_id, body, author_name=None, author_initials=None): 269 | return Comment( 270 | comment_id=comment_id, 271 | body=body, 272 | author_name=author_name, 273 | author_initials=author_initials, 274 | ) 275 | 276 | @cobble.data 277 | class CommentReference(Element): 278 | comment_id = cobble.field() 279 | 280 | comment_reference = CommentReference 281 | 282 | def element_visitor(args): 283 | return cobble.visitor(Element, args=args) 284 | -------------------------------------------------------------------------------- /mammoth/docx/__init__.py: -------------------------------------------------------------------------------- 1 | from functools import partial 2 | import os 3 | 4 | import cobble 5 | 6 | from .. import results, lists, zips 7 | from .document_xml import read_document_xml_element 8 | from .content_types_xml import empty_content_types, read_content_types_xml_element 9 | from .relationships_xml import read_relationships_xml_element, Relationships 10 | from .numbering_xml import read_numbering_xml_element, Numbering 11 | from .styles_xml import read_styles_xml_element, Styles 12 | from .notes_xml import read_endnotes_xml_element, read_footnotes_xml_element 13 | from .comments_xml import read_comments_xml_element 14 | from .files import Files 15 | from . 
def _find_part_paths(zip_file):
    """Work out where the main document and its related parts live in the zip.

    The package relationships (``_rels/.rels``) are used to find the main
    document; the main document's own relationships file is then used to
    find each related part, falling back to ``word/<name>.xml``.

    Returns a ``_PartPaths`` value.
    """
    root_relationships = _read_relationships(zip_file, "_rels/.rels")
    main_document_path = _find_document_filename(zip_file, root_relationships)
    main_relationships_path = _find_relationships_path_for(main_document_path)
    main_relationships = _read_relationships(zip_file, main_relationships_path)

    def locate(part_name):
        # Related parts are looked up by relationship type, resolved relative
        # to the directory that contains the main document.
        relationship_type = (
            "http://schemas.openxmlformats.org/officeDocument/2006/relationships/" + part_name
        )
        return _find_part_path(
            zip_file=zip_file,
            relationships=main_relationships,
            relationship_type=relationship_type,
            base_path=zips.split_path(main_document_path)[0],
            fallback_path="word/{0}.xml".format(part_name),
        )

    related_paths = dict(
        (part_name, locate(part_name))
        for part_name in ("comments", "endnotes", "footnotes", "numbering", "styles")
    )
    return _PartPaths(main_document=main_document_path, **related_paths)
def _find_part_path(zip_file, relationships, relationship_type, base_path, fallback_path):
    """Return the zip path of the first existing part with the given relationship type.

    Each target registered under `relationship_type` is joined onto
    `base_path` and stripped of leading slashes; the first one that exists in
    `zip_file` is returned. If none exists (or there are no targets),
    `fallback_path` is returned unchanged, without checking that it exists.
    """
    # Stop at the first hit instead of resolving and probing every target:
    # only the first existing one is ever used.
    for target in relationships.find_targets_by_type(relationship_type):
        candidate = zips.join_path(base_path, target).lstrip("/")
        if zip_file.exists(candidate):
            return candidate

    return fallback_path
def _try_read_entry_or_default(zip_file, name, reader, default):
    """Read entry `name` with `reader`, or return `default` when the entry is absent.

    Only the absence of the entry is handled; errors raised while reading an
    entry that does exist are not caught here.
    """
    if not zip_file.exists(name):
        return default
    return _read_entry(zip_file, name, reader)
def read_comments_xml_element(element, body_reader):
    """Read every ``w:comment`` child of `element` into one combined result.

    Each comment's children are read with `body_reader.read_all`, and the
    resulting body is wrapped in a `documents.comment`. The per-comment
    results are merged with `results.combine`.
    """
    def optional_attribute(comment_element, name):
        # Missing, empty and whitespace-only values are all reported as None.
        value = comment_element.attributes.get(name, "").strip()
        return value or None

    def read_comment(comment_element):
        def build_comment(body):
            return documents.comment(
                comment_id=comment_element.attributes["w:id"],
                body=body,
                author_name=optional_attribute(comment_element, "w:author"),
                author_initials=optional_attribute(comment_element, "w:initials"),
            )

        return body_reader.read_all(comment_element.children).map(build_comment)

    comment_elements = element.find_children("w:comment")
    return results.combine(lists.map(read_comment, comment_elements))
read_content_types_xml_element(element): 2 | extension_defaults = dict(map( 3 | _read_default, 4 | element.find_children("content-types:Default") 5 | )) 6 | overrides = dict(map( 7 | _read_override, 8 | element.find_children("content-types:Override") 9 | )) 10 | return _ContentTypes(extension_defaults, overrides) 11 | 12 | 13 | def _read_default(element): 14 | extension = element.attributes["Extension"] 15 | content_type = element.attributes["ContentType"] 16 | return extension, content_type 17 | 18 | 19 | def _read_override(element): 20 | part_name = element.attributes["PartName"] 21 | content_type = element.attributes["ContentType"] 22 | return part_name.lstrip("/"), content_type 23 | 24 | 25 | class _ContentTypes(object): 26 | _image_content_types = { 27 | "png": "png", 28 | "gif": "gif", 29 | "jpeg": "jpeg", 30 | "jpg": "jpeg", 31 | "tif": "tiff", 32 | "tiff": "tiff", 33 | "bmp": "bmp", 34 | } 35 | 36 | def __init__(self, extension_defaults, overrides): 37 | self._extension_defaults = extension_defaults 38 | self._overrides = overrides 39 | 40 | def find_content_type(self, path): 41 | if path in self._overrides: 42 | return self._overrides[path] 43 | 44 | extension = _get_extension(path) 45 | default_type = self._extension_defaults.get(extension) 46 | if default_type is not None: 47 | return default_type 48 | 49 | image_type = self._image_content_types.get(extension.lower()) 50 | if image_type is not None: 51 | return "image/" + image_type 52 | 53 | return None 54 | 55 | empty_content_types = _ContentTypes({}, {}) 56 | 57 | def _get_extension(path): 58 | return path.rpartition(".")[2] 59 | -------------------------------------------------------------------------------- /mammoth/docx/document_xml.py: -------------------------------------------------------------------------------- 1 | from .. 
class Files(object):
    """Open files referenced by URI, resolving relative URIs against a base directory."""

    def __init__(self, base):
        # `base` may be None, in which case relative URIs cannot be opened.
        self._base = base

    def open(self, uri):
        """Open `uri` and return a context manager / file object for its bytes.

        A URI with a scheme is fetched with `urlopen`; anything else is joined
        onto the base directory and opened in binary mode.

        Raises InvalidFileReferenceError when the URI is relative and there is
        no base directory, or when opening fails with an IOError.
        """
        # NOTE(review): `uri` presumably comes from the document being
        # converted, so this can reach arbitrary URLs and paths -- confirm
        # callers only enable this for trusted input.
        try:
            if _is_absolute(uri):
                return contextlib.closing(urlopen(uri))

            if self._base is None:
                raise InvalidFileReferenceError("could not find external image '{0}', fileobj has no name".format(uri))

            return open(os.path.join(self._base, uri), "rb")
        except IOError as error:
            template = "could not open external image: '{0}' (document directory: '{1}')\n{2}"
            raise InvalidFileReferenceError(template.format(uri, self._base, str(error)))
def _read_notes(note_type, element, body_reader):
    """Read the ``w:<note_type>`` children of `element` into one combined result.

    Children whose ``w:type`` is ``separator`` or ``continuationSeparator``
    are skipped. Every remaining child has its children read with
    `body_reader.read_all` and is turned into a `documents.note`.
    """
    separator_types = ["continuationSeparator", "separator"]

    def is_note(note_element):
        return note_element.attributes.get("w:type") not in separator_types

    def read_note(note_element):
        def build_note(body):
            return documents.note(
                note_type=note_type,
                note_id=note_element.attributes["w:id"],
                body=body
            )

        return body_reader.read_all(note_element.children).map(build_note)

    candidates = element.find_children("w:" + note_type)
    note_elements = lists.filter(is_note, candidates)
    return results.combine(lists.map(read_note, note_elements))
class Numbering(object):
    """Resolve numbering levels from the ``w:num`` / ``w:abstractNum`` definitions."""

    def __init__(self, abstract_nums, nums, styles):
        """Index the numbering definitions.

        abstract_nums: mapping of abstract num id -> object with `levels`
            (mapping of level index -> level) and `num_style_link`.
        nums: mapping of num id -> object with `abstract_num_id`.
        styles: object providing `find_numbering_style_by_id`.
        """
        self._abstract_nums = abstract_nums
        # Levels that name a paragraph style can also be found via that style.
        self._levels_by_paragraph_style_id = dict(
            (level.paragraph_style_id, self._to_numbering_level(level))
            for abstract_num in abstract_nums.values()
            for level in abstract_num.levels.values()
            if level.paragraph_style_id is not None
        )
        self._nums = nums
        self._styles = styles

    def find_level(self, num_id, level):
        """Return the numbering level for (`num_id`, `level`), or None if unknown.

        When the abstract num links to a numbering style, the lookup continues
        with the num id of that style. None is returned if any link in the
        chain is missing, including a style link that names an unknown style,
        or if the style links form a cycle.
        """
        visited_num_ids = set()

        while num_id not in visited_num_ids:
            visited_num_ids.add(num_id)

            num = self._nums.get(num_id)
            if num is None:
                return None

            abstract_num = self._abstract_nums.get(num.abstract_num_id)
            if abstract_num is None:
                return None

            if abstract_num.num_style_link is None:
                return self._to_numbering_level(abstract_num.levels.get(level))

            style = self._styles.find_numbering_style_by_id(abstract_num.num_style_link)
            if style is None:
                # The linked style is not defined: there is nothing to follow.
                return None

            num_id = style.num_id

        # The chain of style links came back to a num id already seen.
        return None

    def find_level_by_paragraph_style_id(self, style_id):
        """Return the numbering level whose level names paragraph style `style_id`, or None."""
        return self._levels_by_paragraph_style_id.get(style_id)

    def _to_numbering_level(self, abstract_num_level):
        """Convert an abstract level into a `numbering_level`, passing None through."""
        if abstract_num_level is None:
            return None
        else:
            return numbering_level(
                level_index=abstract_num_level.level_index,
                is_ordered=abstract_num_level.is_ordered,
            )
class Relationships(object):
    """Index of relationship targets by relationship id and by relationship type."""

    def __init__(self, relationships):
        """Build the indexes from an iterable of relationships.

        Each relationship must expose `relationship_id`, `target` and `type`.
        The iterable is consumed in a single pass, so one-shot iterators are
        supported.
        """
        self._targets_by_id = {}
        self._targets_by_type = {}
        for relationship in relationships:
            self._targets_by_id[relationship.relationship_id] = relationship.target
            self._targets_by_type.setdefault(relationship.type, []).append(relationship.target)

    def find_target_by_relationship_id(self, key):
        """Return the target for relationship id `key`; raises KeyError if unknown."""
        return self._targets_by_id[key]

    def find_targets_by_type(self, relationship_type):
        """Return a new list of targets with the given type, in insertion order.

        An unknown type gives an empty list. Lookups never modify the index,
        and the returned list is a copy, so callers cannot alter it either.
        """
        return list(self._targets_by_type.get(relationship_type, ()))
def write_style_map(fileobj, style_map):
    """Embed `style_map` into the docx file `fileobj`.

    The style map is stored UTF-8 encoded at the style-map path, and the
    document relationships and content types are regenerated from the
    existing entries so that they reference it. All three entries are then
    written back with `update_zip`.
    """
    with open_zip(fileobj, "r") as zip_file:
        existing_relationships = zip_file.read_str(_relationships_path)
        relationships_xml = _generate_relationships_xml(existing_relationships)
        existing_content_types = zip_file.read_str(_content_types_path)
        content_types_xml = _generate_content_types_xml(existing_content_types)

    updated_entries = {
        _style_map_path: style_map.encode("utf8"),
        _relationships_path: relationships_xml,
        _content_types_path: content_types_xml,
    }
    update_zip(fileobj, updated_entries)
}) 34 | 35 | return ElementTree.tostring(relationships, "UTF-8") 36 | 37 | 38 | def _generate_content_types_xml(content_types_xml): 39 | content_types_uri = "http://schemas.openxmlformats.org/package/2006/content-types" 40 | override_name = "{" + content_types_uri + "}Override" 41 | 42 | types = ElementTree.fromstring(content_types_xml) 43 | _add_or_update_element(types, override_name, "PartName", { 44 | "PartName": _style_map_absolute_path, 45 | "ContentType": "text/prs.mammoth.style-map", 46 | }) 47 | 48 | return ElementTree.tostring(types, "UTF-8") 49 | 50 | 51 | def _add_or_update_element(parent, name, identifying_attribute, attributes): 52 | existing_child = _find_child(parent, name, identifying_attribute, attributes) 53 | if existing_child is None: 54 | ElementTree.SubElement(parent, name, attributes) 55 | else: 56 | existing_child.attrib = attributes 57 | 58 | 59 | def _find_child(parent, name, identifying_attribute, attributes): 60 | for element in parent.iter(): 61 | if element.tag == name and element.get(identifying_attribute) == attributes.get(identifying_attribute): 62 | return element 63 | 64 | 65 | def read_style_map(fileobj): 66 | with open_zip(fileobj, "r") as zip_file: 67 | if zip_file.exists(_style_map_path): 68 | return zip_file.read_str(_style_map_path) 69 | 70 | 71 | -------------------------------------------------------------------------------- /mammoth/docx/styles_xml.py: -------------------------------------------------------------------------------- 1 | import collections 2 | 3 | 4 | class Styles(object): 5 | @staticmethod 6 | def create(paragraph_styles=None, character_styles=None, table_styles=None, numbering_styles=None): 7 | if paragraph_styles is None: 8 | paragraph_styles = {} 9 | if character_styles is None: 10 | character_styles = {} 11 | if table_styles is None: 12 | table_styles = {} 13 | if numbering_styles is None: 14 | numbering_styles = {} 15 | 16 | return Styles( 17 | paragraph_styles=paragraph_styles, 18 | 
def read_styles_xml_element(element):
    """Read the ``w:style`` children of `element` into a `Styles` value.

    Styles are grouped by their ``w:type``: paragraph, character and table
    styles are stored as `Style` values, numbering styles as
    `NumberingStyle` values. Styles of any other type are ignored.
    """
    paragraph_styles = {}
    character_styles = {}
    table_styles = {}
    numbering_styles = {}
    styles = {
        "paragraph": paragraph_styles,
        "character": character_styles,
        "table": table_styles,
    }

    for style_element in element.find_children("w:style"):
        style = _read_style_element(style_element)
        # A style without an explicit w:type is a paragraph style in
        # WordprocessingML; previously a missing attribute raised KeyError
        # and aborted reading the whole document.
        element_type = style_element.attributes.get("w:type", "paragraph")
        if element_type == "numbering":
            numbering_styles[style.style_id] = _read_numbering_style_element(style_element)
        else:
            style_set = styles.get(element_type)
            if style_set is not None:
                style_set[style.style_id] = style

    return Styles(
        paragraph_styles=paragraph_styles,
        character_styles=character_styles,
        table_styles=table_styles,
        numbering_styles=numbering_styles,
    )
class XmlElementList(object):
    """A re-iterable list of XML elements that supports `find_children`."""

    def __init__(self, elements):
        # Callers may pass a lazy iterable (XmlElement.find_children passes a
        # `filter` object). Copy it into a list so the collection can be
        # iterated more than once instead of being silently empty after the
        # first pass.
        self._elements = list(elements)

    def __iter__(self):
        return iter(self._elements)

    def find_children(self, name):
        """Return the children named `name` of every element, in order, as one list."""
        return XmlElementList([
            child
            for element in self._elements
            for child in element.find_children(name)
        ])
def tag(tag_names, attributes=None, collapsible=None, separator=None):
    """Build a `Tag`.

    `tag_names` may be a single name or a list of names; anything that is not
    a list is wrapped in a one-element list. `attributes` defaults to an
    empty dict and `collapsible` is coerced to a bool.
    """
    names = tag_names if isinstance(tag_names, list) else [tag_names]
    return Tag(
        tag_names=names,
        attributes={} if attributes is None else attributes,
        collapsible=bool(collapsible),
        separator=separator,
    )
element.is_void(): 50 | return [] 51 | else: 52 | return [Element(element.tag, children)] 53 | 54 | def visit_force_write(self, node): 55 | return [node] 56 | 57 | 58 | def collapse(nodes): 59 | collapsed = [] 60 | 61 | for node in nodes: 62 | _collapsing_add(collapsed, node) 63 | 64 | return collapsed 65 | 66 | class _CollapseNode(NodeVisitor): 67 | def visit_text_node(self, node): 68 | return node 69 | 70 | def visit_element(self, element): 71 | return Element(element.tag, collapse(element.children)) 72 | 73 | def visit_force_write(self, node): 74 | return node 75 | 76 | _collapse_node = _CollapseNode().visit 77 | 78 | 79 | def _collapsing_add(collapsed, node): 80 | collapsed_node = _collapse_node(node) 81 | if not _try_collapse(collapsed, collapsed_node): 82 | collapsed.append(collapsed_node) 83 | 84 | def _try_collapse(collapsed, node): 85 | if not collapsed: 86 | return False 87 | 88 | last = collapsed[-1] 89 | if not isinstance(last, Element) or not isinstance(node, Element): 90 | return False 91 | 92 | if not node.collapsible: 93 | return False 94 | 95 | if not _is_match(last, node): 96 | return False 97 | 98 | if node.separator: 99 | last.children.append(text(node.separator)) 100 | 101 | for child in node.children: 102 | _collapsing_add(last.children, child) 103 | 104 | return True 105 | 106 | def _is_match(first, second): 107 | return first.tag_name in second.tag_names and first.attributes == second.attributes 108 | 109 | 110 | def write(writer, nodes): 111 | visitor = _NodeWriter(writer) 112 | visitor.visit_all(nodes) 113 | 114 | 115 | class _NodeWriter(NodeVisitor): 116 | def __init__(self, writer): 117 | self._writer = writer 118 | 119 | def visit_text_node(self, node): 120 | self._writer.text(node.value) 121 | 122 | def visit_element(self, element): 123 | if element.is_void(): 124 | self._writer.self_closing(element.tag_name, element.attributes) 125 | else: 126 | self._writer.start(element.tag_name, element.attributes) 127 | 
@cobble.data
class Element(Node):
    # An HTML element: a Tag plus the list of child nodes.
    tag = cobble.field()
    children = cobble.field()

    # The accessors below simply forward to the underlying tag.

    @property
    def tag_name(self):
        """Primary tag name, as reported by the tag."""
        return self.tag.tag_name

    @property
    def tag_names(self):
        """All tag names carried by the tag."""
        return self.tag.tag_names

    @property
    def attributes(self):
        """Attributes of the tag."""
        return self.tag.attributes

    @property
    def collapsible(self):
        """Whether the tag is marked as collapsible."""
        return self.tag.collapsible

    @property
    def separator(self):
        """Separator stored on the tag."""
        return self.tag.separator

    # Tag names that are written self-closing when they have no children.
    _VOID_TAG_NAMES = {"br", "hr", "img", "input"}

    def is_void(self):
        """Return True for a childless element whose tag name is a void tag."""
        if self.children:
            return False
        return self.tag_name in self._VOID_TAG_NAMES
# --- mammoth/html_paths.py ---

def path(elements):
    """Build an HtmlPath from a sequence of path elements."""
    return HtmlPath(elements)


def element(names, attributes=None, class_names=None, fresh=None, separator=None):
    """Build a single HTML path element.

    names       -- list of acceptable tag names.
    attributes  -- optional mapping of attribute name to value.
    class_names -- optional list of class names; when non-empty they are
                   joined with spaces and stored as the "class" attribute.
    fresh       -- when truthy the resulting tag is not collapsible.
    separator   -- optional separator stored on the tag.

    The caller's *attributes* mapping is never modified: a copy is made
    before the "class" attribute is added.
    """
    if attributes is None:
        attributes = {}

    if class_names:
        # Copy first so that adding "class" does not leak into the dict
        # object owned by the caller.
        attributes = dict(attributes)
        attributes["class"] = " ".join(class_names)

    return HtmlPathElement(html.tag(
        tag_names=names,
        attributes=attributes,
        # `fresh` of None and False are equivalent here.
        collapsible=not fresh,
        separator=separator,
    ))


@cobble.data
class HtmlPath(object):
    """An ordered chain of path elements, outermost first."""

    elements = cobble.field()

    def wrap(self, generate_nodes):
        """Call *generate_nodes* and nest its result inside every element.

        Elements are applied from last to first, so the first element of the
        path ends up as the outermost wrapper.
        """
        nodes = generate_nodes()

        for path_element in reversed(self.elements):
            nodes = path_element.wrap_nodes(nodes)

        return nodes


@cobble.data
class HtmlPathElement(object):
    """One step of an HTML path: a tag that wraps a list of nodes."""

    tag = cobble.field()

    def wrap(self, generate_nodes):
        return self.wrap_nodes(generate_nodes())

    def wrap_nodes(self, nodes):
        """Return a one-item list holding an element that contains *nodes*."""
        return [html.Element(self.tag, nodes)]


empty = path([])


class ignore(object):
    """Path that discards content: `wrap` never calls the node generator."""

    @staticmethod
    def wrap(generate_nodes):
        return []
# --- mammoth/images.py ---

def img_element(func):
    """Decorate *func* so that it converts an image into an <img> element.

    *func* receives the image and returns a dict of attributes. When the
    image has alt text, an "alt" attribute is set first; attributes returned
    by *func* are applied afterwards and therefore win on conflict.
    """
    def convert_image(image):
        attributes = {"alt": image.alt_text} if image.alt_text else {}
        attributes.update(func(image))
        return [html.element("img", attributes)]

    return convert_image


# Undocumented, but retained for backwards-compatibility with 0.3.x
inline = img_element


@img_element
def data_uri(image):
    """Return a "src" attribute embedding the image bytes as a data URI."""
    with image.open() as image_bytes:
        payload = image_bytes.read()

    encoded_src = base64.b64encode(payload).decode("ascii")
    return {
        "src": "data:{0};base64,{1}".format(image.content_type, encoded_src)
    }


# --- mammoth/lists.py ---

def flatten(values):
    """Concatenate an iterable of iterables into a single list."""
    return [element for value in values for element in value]


def unique(values):
    """Return the distinct values in first-seen order (values must hash)."""
    seen = set()
    output = []
    for value in values:
        if value in seen:
            continue
        seen.add(value)
        output.append(value)
    return output


def flat_map(func, values):
    """Apply *func* to every value and concatenate the resulting iterables."""
    output = []
    for value in values:
        output.extend(func(value))
    return output


def find_index(predicate, values):
    """Return the index of the first value satisfying *predicate*, or None."""
    matches = (
        index
        for index, value in enumerate(values)
        if predicate(value)
    )
    return next(matches, None)


# `map` and `filter` always return lists: on Python 2 the builtins already
# do; on Python 3 the lazy builtins are wrapped.
if sys.version_info[0] == 2:
    map = map
    filter = filter
else:
    import builtins

    def map(*args, **kwargs):
        return list(builtins.map(*args, **kwargs))

    def filter(*args, **kwargs):
        return list(builtins.filter(*args, **kwargs))
# --- mammoth/options.py ---

def read_options(options):
    """Normalise the options dict and attach the parsed style map.

    The "style_map", "embedded_style_map" and "include_default_style_map"
    entries are removed from *options* (the dict is modified in place) and
    replaced by a single parsed "style_map" list: custom mappings first, then
    embedded ones, then the defaults unless they were disabled.
    "ignore_empty_paragraphs" defaults to True.

    Returns a result whose value is the same *options* dict and whose
    messages are the warnings produced while parsing the style maps.
    """
    custom_text = options.pop("style_map", "") or ""
    embedded_text = options.pop("embedded_style_map", "") or ""
    use_defaults = options.pop("include_default_style_map", True)

    parsed = results.combine([
        _read_style_map(custom_text),
        _read_style_map(embedded_text),
    ])
    custom_style_map, embedded_style_map = parsed.value

    style_map = custom_style_map + embedded_style_map
    if use_defaults:
        style_map += _default_style_map

    options.setdefault("ignore_empty_paragraphs", True)
    options["style_map"] = style_map
    return parsed.map(lambda _: options)


def _read_style_map(style_text):
    """Parse a multi-line style map into a result holding a list of styles.

    Blank lines and comment lines are skipped; mappings that parsed to None
    are dropped from the resulting list.
    """
    candidates = (_get_line(raw_line) for raw_line in style_text.split("\n"))
    lines = [line for line in candidates if line]

    parsed_lines = results.combine(lists.map(read_style_mapping, lines))
    return parsed_lines.map(
        lambda style_mappings: lists.filter(None, style_mappings)
    )


def _get_line(line):
    """Strip *line*; return None when it is a "#" comment."""
    stripped = line.strip()
    return None if stripped.startswith("#") else stripped


_default_style_map_result = _read_style_map("""
p.Heading1 => h1:fresh
p.Heading2 => h2:fresh
p.Heading3 => h3:fresh
p.Heading4 => h4:fresh
p.Heading5 => h5:fresh
p.Heading6 => h6:fresh
p[style-name='Heading 1'] => h1:fresh
p[style-name='Heading 2'] => h2:fresh
p[style-name='Heading 3'] => h3:fresh
p[style-name='Heading 4'] => h4:fresh
p[style-name='Heading 5'] => h5:fresh
p[style-name='Heading 6'] => h6:fresh
p[style-name='heading 1'] => h1:fresh
p[style-name='heading 2'] => h2:fresh
p[style-name='heading 3'] => h3:fresh
p[style-name='heading 4'] => h4:fresh
p[style-name='heading 5'] => h5:fresh
p[style-name='heading 6'] => h6:fresh

# Apple Pages
p.Heading => h1:fresh
p[style-name='Heading'] => h1:fresh

r[style-name='Strong'] => strong

p[style-name='footnote text'] => p:fresh
r[style-name='footnote reference'] =>
p[style-name='endnote text'] => p:fresh
r[style-name='endnote reference'] =>
p[style-name='annotation text'] => p:fresh
r[style-name='annotation reference'] =>

# LibreOffice
p[style-name='Footnote'] => p:fresh
r[style-name='Footnote anchor'] =>
p[style-name='Endnote'] => p:fresh
r[style-name='Endnote anchor'] =>

p:unordered-list(1) => ul > li:fresh
p:unordered-list(2) => ul|ol > li > ul > li:fresh
p:unordered-list(3) => ul|ol > li > ul|ol > li > ul > li:fresh
p:unordered-list(4) => ul|ol > li > ul|ol > li > ul|ol > li > ul > li:fresh
p:unordered-list(5) => ul|ol > li > ul|ol > li > ul|ol > li > ul|ol > li > ul > li:fresh
p:ordered-list(1) => ol > li:fresh
p:ordered-list(2) => ul|ol > li > ol > li:fresh
p:ordered-list(3) => ul|ol > li > ul|ol > li > ol > li:fresh
p:ordered-list(4) => ul|ol > li > ul|ol > li > ul|ol > li > ol > li:fresh
p:ordered-list(5) => ul|ol > li > ul|ol > li > ul|ol > li > ul|ol > li > ol > li:fresh

r[style-name='Hyperlink'] =>

p[style-name='Normal'] => p:fresh

# Apple Pages
p.Body => p:fresh
p[style-name='Body'] => p:fresh
""")


# Import-time sanity check: the built-in style map must parse without warnings.
assert not _default_style_map_result.messages
_default_style_map = _default_style_map_result.value
# --- mammoth/raw_text.py ---

def extract_raw_text_from_element(element):
    """Return the plain text of a document element and its descendants.

    Text elements yield their value and tabs yield a tab character. Any other
    element yields the concatenated text of its `children` (if it has any),
    followed by two newlines when the element is a paragraph.
    """
    if isinstance(element, documents.Text):
        return element.value

    if isinstance(element, documents.Tab):
        return "\t"

    children = getattr(element, "children", [])
    text = "".join(extract_raw_text_from_element(child) for child in children)
    suffix = "\n\n" if isinstance(element, documents.Paragraph) else ""
    return text + suffix


# --- mammoth/results.py ---

class Result(object):
    """A value paired with the de-duplicated messages produced alongside it."""

    def __init__(self, value, messages):
        self.value = value
        self.messages = unique(messages)

    def map(self, func):
        """Return a result with *func* applied to the value; messages kept."""
        mapped_value = func(self.value)
        return Result(mapped_value, self.messages)

    def bind(self, func):
        """Chain a result-returning *func*, concatenating both message lists."""
        bound = func(self.value)
        return Result(bound.value, self.messages + bound.messages)


Message = collections.namedtuple("Message", ["type", "message"])


def warning(message):
    """Build a Message of type "warning"."""
    return Message(type="warning", message=message)


def success(value):
    """Wrap *value* in a result that carries no messages."""
    return Result(value, [])


def combine(results):
    """Merge several results into one.

    The combined value is the list of the individual values, in order; the
    combined messages are all individual messages, in order.
    """
    values = []
    messages = []
    for result in results:
        values.append(result.value)
        messages.extend(result.messages)

    return Result(values, messages)


def map(func, *args):
    """Call *func* with the values of the results *args* as positional arguments."""
    combined = combine(args)
    return combined.map(lambda values: func(*values))


# --- mammoth/styles/__init__.py ---

def style(document_matcher, html_path):
    """Build a Style from a document matcher and an HTML path."""
    return Style(document_matcher=document_matcher, html_path=html_path)


Style = collections.namedtuple("Style", ["document_matcher", "html_path"])
# --- mammoth/styles/parser/__init__.py ---

def read_style_mapping(string):
    """Parse one style-mapping line.

    Returns a successful result wrapping the parsed mapping. When the line
    cannot be parsed, returns a result whose value is None and which carries
    a single warning quoting the offending line.
    """
    try:
        token_stream = TokenIterator(tokenise(string))
        return results.success(parse_style_mapping(token_stream))
    except LineParseError:
        message = "Did not understand this style mapping, so ignored it: " + string
        return results.Result(None, [results.warning(message)])


# --- mammoth/styles/parser/document_matcher_parser.py ---

# Identifier in the style map -> name of the matcher on `document_matchers`.
_FORMATTING_MATCHERS = (
    ("b", "bold"),
    ("i", "italic"),
    ("u", "underline"),
    ("strike", "strikethrough"),
    ("all-caps", "all_caps"),
    ("small-caps", "small_caps"),
)

# Value of `br[type='...']` -> name of the matcher on `document_matchers`.
_BREAK_MATCHERS = {
    "line": "line_break",
    "page": "page_break",
    "column": "column_break",
}


def parse_document_matcher(tokens):
    """Parse the left-hand side of a style mapping into a document matcher.

    Raises LineParseError when the leading identifier is not a recognised
    document element.
    """
    if tokens.try_skip(TokenType.IDENTIFIER, "p"):
        return document_matchers.paragraph(
            style_id=try_parse_class_name(tokens),
            style_name=_parse_style_name(tokens),
            numbering=_parse_numbering(tokens),
        )

    if tokens.try_skip(TokenType.IDENTIFIER, "r"):
        return document_matchers.run(
            style_id=try_parse_class_name(tokens),
            style_name=_parse_style_name(tokens),
        )

    if tokens.try_skip(TokenType.IDENTIFIER, "table"):
        return document_matchers.table(
            style_id=try_parse_class_name(tokens),
            style_name=_parse_style_name(tokens),
        )

    for identifier, matcher_name in _FORMATTING_MATCHERS:
        if tokens.try_skip(TokenType.IDENTIFIER, identifier):
            return getattr(document_matchers, matcher_name)

    if tokens.try_skip(TokenType.IDENTIFIER, "highlight"):
        return _parse_highlight(tokens)

    if tokens.try_skip(TokenType.IDENTIFIER, "comment-reference"):
        return document_matchers.comment_reference

    if tokens.try_skip(TokenType.IDENTIFIER, "br"):
        return _parse_break(tokens)

    raise LineParseError("Unrecognised document element: {0}".format(tokens.next_value(TokenType.IDENTIFIER)))


def _parse_style_name(tokens):
    """Parse an optional `[style-name<op>'...']` clause; None when absent."""
    if not tokens.try_skip(TokenType.SYMBOL, "["):
        return None

    tokens.skip(TokenType.IDENTIFIER, "style-name")
    string_matcher = _parse_string_matcher(tokens)
    tokens.skip(TokenType.SYMBOL, "]")
    return string_matcher


def _parse_string_matcher(tokens):
    """Parse `='...'` (equality) or `^='...'` (prefix) into a string matcher."""
    if tokens.try_skip(TokenType.SYMBOL, "="):
        return document_matchers.equal_to(parse_string(tokens))

    if tokens.try_skip(TokenType.SYMBOL, "^="):
        return document_matchers.starts_with(parse_string(tokens))

    raise LineParseError("Unrecognised string matcher: {0}".format(tokens.next_value()))


def _parse_numbering(tokens):
    """Parse an optional `:ordered-list(N)` / `:unordered-list(N)` suffix.

    Returns None when no ":" follows. The level written in the style map is
    1-based and is converted to a 0-based level.
    """
    if not tokens.try_skip(TokenType.SYMBOL, ":"):
        return None

    is_ordered = _parse_list_type(tokens)
    tokens.skip(TokenType.SYMBOL, "(")
    level = int(tokens.next_value(TokenType.INTEGER)) - 1
    tokens.skip(TokenType.SYMBOL, ")")
    return documents.numbering_level(level, is_ordered=is_ordered)


def _parse_list_type(tokens):
    """Return True for "ordered-list", False for "unordered-list"."""
    is_ordered_by_name = {"ordered-list": True, "unordered-list": False}

    list_type = tokens.next_value(TokenType.IDENTIFIER)
    if list_type in is_ordered_by_name:
        return is_ordered_by_name[list_type]

    raise LineParseError("Unrecognised list type: {0}".format(list_type))


def _parse_highlight(tokens):
    """Parse the optional `[color='...']` clause of a highlight matcher."""
    color = None
    if tokens.try_skip(TokenType.SYMBOL, "["):
        tokens.skip(TokenType.IDENTIFIER, "color")
        tokens.skip(TokenType.SYMBOL, "=")
        color = parse_string(tokens)
        tokens.skip(TokenType.SYMBOL, "]")

    return document_matchers.highlight(color=color)


def _parse_break(tokens):
    """Parse the mandatory `[type='line'|'page'|'column']` clause of `br`."""
    tokens.skip(TokenType.SYMBOL, "[")
    tokens.skip(TokenType.IDENTIFIER, "type")
    tokens.skip(TokenType.SYMBOL, "=")
    type_name = parse_string(tokens)
    tokens.skip(TokenType.SYMBOL, "]")

    if type_name in _BREAK_MATCHERS:
        return getattr(document_matchers, _BREAK_MATCHERS[type_name])

    raise LineParseError("Unrecognised break type: {0}".format(type_name))


# --- mammoth/styles/parser/errors.py ---

class LineParseError(Exception):
    """Raised when a style-mapping line cannot be parsed."""
# --- mammoth/styles/parser/html_path_parser.py ---

@cobble.data
class _AttributeOrClassName(object):
    """One parsed `[name='value']` attribute or `.class-name` suffix."""

    name = cobble.field()
    value = cobble.field()
    # True for class names: the value is appended to any existing value of
    # the attribute rather than replacing it.
    append = cobble.field()


def parse_html_path(tokens):
    """Parse the right-hand side of a style mapping.

    A leading "!" yields the ignore path; otherwise a (possibly empty) path
    of elements separated by " > " is parsed.
    """
    if tokens.try_skip(TokenType.SYMBOL, "!"):
        return html_paths.ignore
    else:
        return html_paths.path(_parse_html_path_elements(tokens))


def _parse_html_path_elements(tokens):
    """Parse zero or more path elements separated by whitespace-">"-whitespace."""
    elements = []

    if tokens.peek_token_type() == TokenType.IDENTIFIER:
        elements.append(_parse_element(tokens))

        while tokens.try_skip_many(((TokenType.WHITESPACE, None), (TokenType.SYMBOL, ">"))):
            tokens.skip(TokenType.WHITESPACE)
            elements.append(_parse_element(tokens))

    return elements


def _parse_element(tokens):
    """Parse one path element: tag names, attributes/classes, flags."""
    tag_names = _parse_tag_names(tokens)
    attributes_list = _parse_attribute_or_class_names(tokens)
    is_fresh = _parse_is_fresh(tokens)
    separator = _parse_separator(tokens)

    attributes = {}
    for attribute in attributes_list:
        if attribute.append and attributes.get(attribute.name):
            # Repeated class names accumulate, space-separated.
            attributes[attribute.name] += " " + attribute.value
        else:
            attributes[attribute.name] = attribute.value

    return html_paths.element(
        tag_names,
        attributes=attributes,
        fresh=is_fresh,
        separator=separator,
    )


def _parse_tag_names(tokens):
    """Parse `name` or `name|name|...` into a list of tag names."""
    tag_names = [parse_identifier(tokens)]

    while tokens.try_skip(TokenType.SYMBOL, "|"):
        tag_names.append(parse_identifier(tokens))

    return tag_names


def _parse_attribute_or_class_names(tokens):
    """Parse every consecutive attribute or class-name suffix."""
    attribute_or_class_names = []

    while True:
        attribute_or_class_name = _try_parse_attribute_or_class_name(tokens)
        if attribute_or_class_name is None:
            break
        attribute_or_class_names.append(attribute_or_class_name)

    return attribute_or_class_names


def _try_parse_attribute_or_class_name(tokens):
    """Parse one suffix if the next token is "[" or "."; otherwise None."""
    if tokens.is_next(TokenType.SYMBOL, "["):
        return _parse_attribute(tokens)
    if tokens.is_next(TokenType.SYMBOL, "."):
        return _parse_class_name(tokens)
    return None


def _parse_attribute(tokens):
    """Parse `[name='value']`."""
    tokens.skip(TokenType.SYMBOL, "[")
    name = parse_identifier(tokens)
    tokens.skip(TokenType.SYMBOL, "=")
    value = parse_string(tokens)
    tokens.skip(TokenType.SYMBOL, "]")
    return _AttributeOrClassName(name=name, value=value, append=False)


def _parse_class_name(tokens):
    """Parse `.class-name`."""
    tokens.skip(TokenType.SYMBOL, ".")
    class_name = parse_identifier(tokens)
    return _AttributeOrClassName(name="class", value=class_name, append=True)


def _parse_is_fresh(tokens):
    """Consume an optional `:fresh` flag and report whether it was present."""
    return tokens.try_skip_many((
        (TokenType.SYMBOL, ":"),
        (TokenType.IDENTIFIER, "fresh"),
    ))


def _parse_separator(tokens):
    """Parse an optional `:separator('...')` clause; None when absent."""
    is_separator = tokens.try_skip_many((
        (TokenType.SYMBOL, ":"),
        (TokenType.IDENTIFIER, "separator"),
    ))
    if not is_separator:
        return None

    tokens.skip(TokenType.SYMBOL, "(")
    value = parse_string(tokens)
    tokens.skip(TokenType.SYMBOL, ")")
    return value


# --- mammoth/styles/parser/style_mapping_parser.py ---

def parse_style_mapping(tokens):
    """Parse `<document matcher> => <html path>` up to the end token."""
    document_matcher = parse_document_matcher(tokens)
    tokens.skip(TokenType.WHITESPACE)
    tokens.skip(TokenType.SYMBOL, "=>")
    tokens.try_skip(TokenType.WHITESPACE)
    html_path = parse_html_path(tokens)
    tokens.skip(TokenType.END)

    return Style(document_matcher, html_path)


# --- mammoth/styles/parser/token_iterator.py ---

# TODO: proper tests for unexpected tokens


class TokenIterator(object):
    """Forward-only cursor over a list of tokens.

    Running past the end of the token list is reported as LineParseError
    (the same error as any other unexpected token) rather than IndexError.
    """

    def __init__(self, tokens):
        self._tokens = tokens
        self._index = 0

    def peek_token_type(self):
        """Return the type of the current token without consuming it."""
        return self._current().type

    def next_value(self, token_type=None):
        """Consume the current token and return its value.

        Raises LineParseError when *token_type* is given and does not match.
        """
        return self._next(token_type).value

    def _next(self, token_type=None):
        token = self._current()
        if token_type is None or token.type == token_type:
            self._index += 1
            return token
        raise self._unexpected_token_type(token_type, token)

    def skip(self, token_type, token_value=None):
        """Consume the current token, which must match; return True.

        Raises LineParseError when the token does not match.
        """
        token = self._current()
        if not self._matches(token, token_type, token_value):
            raise self._unexpected_token_type(token_type, token, token_value)
        self._index += 1
        return True

    def try_skip(self, token_type, token_value=None):
        """Consume the current token if it matches; report whether it did."""
        if self.is_next(token_type, token_value):
            self._index += 1
            return True
        return False

    def try_skip_many(self, tokens):
        """Consume a whole sequence of (type, value) pairs, or nothing.

        Returns True when every pair matched; otherwise the cursor is put
        back where it started and False is returned.
        """
        start = self._index
        for token_type, token_value in tokens:
            if not self.is_next(token_type, token_value):
                self._index = start
                return False
            self._index += 1

        return True

    def is_next(self, token_type, token_value=None):
        """Return whether the current token matches; False when exhausted."""
        if self._index >= len(self._tokens):
            return False
        return self._matches(self._tokens[self._index], token_type, token_value)

    def _current(self):
        """Return the token under the cursor or raise LineParseError."""
        if self._index >= len(self._tokens):
            raise LineParseError("Unexpected end of tokens")
        return self._tokens[self._index]

    @staticmethod
    def _matches(token, token_type, token_value):
        """A value of None matches any token of the right type."""
        return token.type == token_type and (token_value is None or token.value == token_value)

    def _unexpected_token_type(self, token_type, token, token_value=None):
        """Build (not raise) the error for a mismatched token.

        Callers write `raise self._unexpected_token_type(...)`, so the
        exception is returned here and raised at the call site.
        """
        if token_value is None:
            expected = "{0}".format(token_type)
        else:
            expected = "{0} {1!r}".format(token_type, token_value)
        return LineParseError("Expected {0} but got {1} {2!r}".format(
            expected, token.type, token.value,
        ))


# --- mammoth/styles/parser/token_parser.py ---

def try_parse_class_name(tokens):
    """Parse an optional `.identifier`; None when no "." follows."""
    if tokens.try_skip(TokenType.SYMBOL, "."):
        return parse_identifier(tokens)
    else:
        return None


def parse_identifier(tokens):
    """Consume an identifier token and decode its escape sequences."""
    return decode_escape_sequences(tokens.next_value(TokenType.IDENTIFIER))


def parse_string(tokens):
    """Consume a string token, drop its quotes, decode escape sequences."""
    return decode_escape_sequences(tokens.next_value(TokenType.STRING)[1:-1])


_ESCAPE_SEQUENCE_REGEX = re.compile(r"\\(.)")


def decode_escape_sequences(value):
    """Replace every backslash escape in *value* by the character it denotes."""
    return _ESCAPE_SEQUENCE_REGEX.sub(_decode_escape_sequence, value)


def _decode_escape_sequence(match):
    """Map `\\n`, `\\r`, `\\t` to control characters; other escapes to the literal character."""
    code = match.group(1)
    if code == "n":
        return "\n"
    elif code == "r":
        return "\r"
    elif code == "t":
        return "\t"
    else:
        return code


# --- mammoth/styles/parser/tokeniser.py ---

Token = collections.namedtuple("Token", ["character_index", "type", "value"])


class TokenType(object):
    """Names of the token types produced by `tokenise`."""

    IDENTIFIER = "identifier"
    SYMBOL = "symbol"
    WHITESPACE = "whitespace"
    STRING = "string"
    UNTERMINATED_STRING = "unterminated string"
    INTEGER = "integer"
    END = "end"


def regex_tokeniser(rules):
    """Build a tokeniser from (token type, regex) pairs.

    Rules are tried in order at each position and the first match wins. A
    final catch-all rule emits single characters as "unknown" tokens. The
    returned function always appends an END token.
    """
    rules = [(token_type, _to_regex(regex)) for token_type, regex in rules]
    rules.append(("unknown", re.compile(".")))

    def tokenise(value):
        tokens = []
        index = 0
        while index < len(value):
            for token_type, regex in rules:
                match = regex.match(value, index)
                if match is not None:
                    tokens.append(Token(index, token_type, match.group(0)))
                    index = match.end()
                    break
            else:
                # Should be impossible
                raise Exception("Remaining: " + value[index:])

        tokens.append(Token(index, TokenType.END, ""))

        return tokens

    return tokenise


def _to_regex(value):
    """Return *value* if it already has a `match` method, else compile it."""
    if hasattr(value, "match"):
        return value
    else:
        return re.compile(value)


_string_prefix = r"'(?:\\.|[^'])*"
_identifier_character = r"(?:[a-zA-Z\-_]|\\.)"

tokenise = regex_tokeniser([
    (TokenType.IDENTIFIER, _identifier_character + "(?:" + _identifier_character + "|[0-9])*"),
    (TokenType.SYMBOL, r":|>|=>|\^=|=|\(|\)|\[|\]|\||!|\."),
    (TokenType.WHITESPACE, r"\s+"),
    (TokenType.STRING, _string_prefix + "'"),
    (TokenType.UNTERMINATED_STRING, _string_prefix),
    (TokenType.INTEGER, "([0-9]+)"),
])
import documents 2 | 3 | 4 | def paragraph(transform_paragraph): 5 | return element_of_type(documents.Paragraph, transform_paragraph) 6 | 7 | 8 | def run(transform_run): 9 | return element_of_type(documents.Run, transform_run) 10 | 11 | 12 | def element_of_type(element_type, transform): 13 | def transform_element(element): 14 | if isinstance(element, element_type): 15 | return transform(element) 16 | else: 17 | return element 18 | 19 | return _each_element(transform_element) 20 | 21 | 22 | def _each_element(transform_element): 23 | def transform_element_and_children(element): 24 | if isinstance(element, (documents.HasChildren, documents.TableCellUnmerged)): 25 | children = list(map(transform_element_and_children, element.children)) 26 | element = element.copy(children=children) 27 | 28 | return transform_element(element) 29 | 30 | return transform_element_and_children 31 | 32 | 33 | def get_descendants_of_type(element, element_type): 34 | return list(filter( 35 | lambda descendant: isinstance(descendant, element_type), 36 | get_descendants(element), 37 | )) 38 | 39 | 40 | def get_descendants(element): 41 | descendants = [] 42 | 43 | def visit(element): 44 | descendants.append(element) 45 | 46 | _visit_descendants(element, visit) 47 | 48 | return descendants 49 | 50 | 51 | def _visit_descendants(element, visit): 52 | if isinstance(element, documents.HasChildren): 53 | for child in element.children: 54 | _visit_descendants(child, visit) 55 | visit(child) 56 | 57 | -------------------------------------------------------------------------------- /mammoth/underline.py: -------------------------------------------------------------------------------- 1 | from . 
class Writer(abc.ABC):
    """Abstract interface implemented by the output writers.

    Deriving from `abc.ABC` (rather than assigning the Python 2 only
    `__metaclass__` attribute, which Python 3 ignores) is what makes the
    `@abc.abstractmethod` markers below take effect: a subclass can only be
    instantiated once it implements every method.
    """

    @abc.abstractmethod
    def text(self, text):
        """Write `text` as character data."""

    @abc.abstractmethod
    def start(self, name, attributes=None):
        """Open the element `name`, optionally with a dict of `attributes`."""

    @abc.abstractmethod
    def end(self, name):
        """Close the element `name`."""

    @abc.abstractmethod
    def self_closing(self, name, attributes=None):
        """Write the element `name` as an element without content."""

    @abc.abstractmethod
    def append(self, html):
        """Append already-rendered output as-is."""

    @abc.abstractmethod
    def as_string(self):
        """Return everything written so far as a single string."""
attributes=None): 15 | attribute_string = _generate_attribute_string(attributes) 16 | self._fragments.append("<{0}{1}>".format(name, attribute_string)) 17 | 18 | def end(self, name): 19 | self._fragments.append("".format(name)) 20 | 21 | def self_closing(self, name, attributes=None): 22 | attribute_string = _generate_attribute_string(attributes) 23 | self._fragments.append("<{0}{1} />".format(name, attribute_string)) 24 | 25 | def append(self, html): 26 | self._fragments.append(html) 27 | 28 | def as_string(self): 29 | return "".join(self._fragments) 30 | 31 | 32 | def _escape_html(text): 33 | return escape(text, {'"': """}) 34 | 35 | 36 | def _generate_attribute_string(attributes): 37 | if attributes is None: 38 | return "" 39 | else: 40 | return "".join( 41 | ' {0}="{1}"'.format(key, _escape_html(attributes[key])) 42 | for key in sorted(attributes) 43 | ) 44 | -------------------------------------------------------------------------------- /mammoth/writers/markdown.py: -------------------------------------------------------------------------------- 1 | from __future__ import unicode_literals 2 | 3 | from .abc import Writer 4 | 5 | import re 6 | 7 | 8 | class _WriterOutput(object): 9 | def __init__(self, start, end=None, generate_end=None, anchor_position=None): 10 | if generate_end is None: 11 | generate_end = _constant(end) 12 | 13 | self.start = start 14 | self.generate_end = generate_end 15 | self.anchor_position = anchor_position 16 | 17 | 18 | def _constant(value): 19 | def get(): 20 | return value 21 | 22 | return get 23 | 24 | 25 | class _MarkdownState(object): 26 | def __init__(self): 27 | self._list_state_stack = [] 28 | self.list_state = None 29 | self.list_item_has_closed = False 30 | 31 | def update_list_state(self, list_state): 32 | self._list_state_stack.append(self.list_state) 33 | self.list_state = list_state 34 | 35 | def pop_list_state(self): 36 | self.list_state = self._list_state_stack.pop() 37 | 38 | 39 | class _MarkdownListState(object): 40 
| def __init__(self, ordered, indentation): 41 | self.ordered = ordered 42 | self.count = 0 43 | self.indentation = indentation 44 | 45 | 46 | def _symmetric_wrapped(end): 47 | return _Wrapped(end, end) 48 | 49 | 50 | class _Wrapped(object): 51 | def __init__(self, start, end): 52 | self._start = start 53 | self._end = end 54 | 55 | def __call__(self, attributes, markdown_state): 56 | return _WriterOutput(self._start, self._end) 57 | 58 | 59 | def _hyperlink(attributes, markdown_state): 60 | href = attributes.get("href", "") 61 | if href: 62 | return _WriterOutput( 63 | "[", "]({0})".format(href), 64 | anchor_position="before", 65 | ) 66 | else: 67 | return _default_output 68 | 69 | 70 | def _image(attributes, markdown_state): 71 | src = attributes.get("src", "") 72 | alt_text = attributes.get("alt", "") 73 | if src or alt_text: 74 | return _WriterOutput("![{0}]({1})".format(alt_text, src), "") 75 | else: 76 | return _default_output 77 | 78 | 79 | def _list(ordered): 80 | def call(attributes, markdown_state): 81 | if markdown_state.list_state is None: 82 | start = "" 83 | end_text = "\n" 84 | indentation = 0 85 | else: 86 | start = "\n" 87 | end_text = "" 88 | indentation = markdown_state.list_state.indentation + 1 89 | 90 | def generate_end(): 91 | markdown_state.pop_list_state() 92 | return end_text 93 | 94 | markdown_state.update_list_state(_MarkdownListState( 95 | ordered=ordered, 96 | indentation=indentation, 97 | )) 98 | 99 | return _WriterOutput(start, generate_end=generate_end) 100 | 101 | return call 102 | 103 | 104 | def _list_item(attributes, markdown_state): 105 | markdown_state.list_item_has_closed = False 106 | 107 | list_state = markdown_state.list_state or _MarkdownListState(ordered=False, indentation=0) 108 | list_state.count += 1 109 | 110 | if list_state.ordered: 111 | bullet = "{0}.".format(list_state.count) 112 | else: 113 | bullet = "-" 114 | 115 | def generate_end(): 116 | if markdown_state.list_item_has_closed: 117 | return "" 118 | else: 
119 | markdown_state.list_item_has_closed = True 120 | return "\n" 121 | 122 | return _WriterOutput( 123 | start=("\t" * list_state.indentation) + bullet + " ", 124 | generate_end=generate_end 125 | ) 126 | 127 | 128 | def _init_writers(): 129 | writers = { 130 | "p": _Wrapped("", "\n\n"), 131 | "br": _Wrapped("", " \n"), 132 | "strong": _symmetric_wrapped("__"), 133 | "em": _symmetric_wrapped("*"), 134 | "a": _hyperlink, 135 | "img": _image, 136 | "ol": _list(ordered=True), 137 | "ul": _list(ordered=False), 138 | "li": _list_item, 139 | } 140 | 141 | for level in range(1, 7): 142 | writers["h{0}".format(level)] = _Wrapped("#" * level + " ", "\n\n") 143 | 144 | return writers 145 | 146 | 147 | _writers = _init_writers() 148 | _default_output = _WriterOutput("", "") 149 | 150 | def _default_writer(attributes, markdown_state): 151 | return _default_output 152 | 153 | 154 | class MarkdownWriter(Writer): 155 | def __init__(self): 156 | self._fragments = [] 157 | self._element_stack = [] 158 | self._markdown_state = _MarkdownState() 159 | 160 | def text(self, text): 161 | self._fragments.append(_escape_markdown(text)) 162 | 163 | def start(self, name, attributes=None): 164 | if attributes is None: 165 | attributes = {} 166 | 167 | output = _writers.get(name, _default_writer)(attributes, self._markdown_state) 168 | self._element_stack.append(output.generate_end) 169 | 170 | anchor_before_start = output.anchor_position == "before" 171 | if anchor_before_start: 172 | self._write_anchor(attributes) 173 | 174 | self._fragments.append(output.start) 175 | 176 | if not anchor_before_start: 177 | self._write_anchor(attributes) 178 | 179 | 180 | 181 | def end(self, name): 182 | end = self._element_stack.pop() 183 | output = end() 184 | self._fragments.append(output) 185 | 186 | def self_closing(self, name, attributes=None): 187 | self.start(name, attributes) 188 | self.end(name) 189 | 190 | def append(self, other): 191 | self._fragments.append(other) 192 | 193 | def 
def open_zip(fileobj, mode):
    """Open `fileobj` as a zip archive in `mode` and wrap it in a `_Zip`."""
    return _Zip(ZipFile(fileobj, mode))


class _Zip(object):
    """Thin wrapper around `zipfile.ZipFile`, usable as a context manager."""

    def __init__(self, zip_file):
        self._zip_file = zip_file

    def __enter__(self):
        return self

    def __exit__(self, *args):
        self._zip_file.close()

    def open(self, name):
        """Return a context manager yielding a file object for the member `name`."""
        return contextlib.closing(self._zip_file.open(name))

    def exists(self, name):
        """Return whether the archive has a member called `name`."""
        try:
            self._zip_file.getinfo(name)
            return True
        except KeyError:
            return False

    def read_str(self, name):
        """Return the contents of the member `name` decoded as UTF-8."""
        return self._zip_file.read(name).decode("utf8")


def update_zip(fileobj, files):
    """Rewrite the zip archive in `fileobj` in place.

    `files` maps member names to their new contents. Members named in `files`
    are added or replaced; every other member of the existing archive is
    copied across unchanged. `fileobj` must be readable, writable and
    seekable.
    """
    source = ZipFile(fileobj, "r")
    try:
        # Build the new archive in memory: the source is still being read
        # from `fileobj`, so it cannot be overwritten yet.
        destination_fileobj = io.BytesIO()
        destination = ZipFile(destination_fileobj, "w")
        try:
            names = set(source.namelist()) | set(files.keys())
            for name in names:
                if name in files:
                    contents = files[name]
                else:
                    contents = source.read(name)
                destination.writestr(name, contents)
        finally:
            destination.close()
    finally:
        source.close()

    fileobj.seek(0)
    destination_fileobj.seek(0)
    shutil.copyfileobj(destination_fileobj, fileobj)
    # Zip readers locate the central directory from the END of the file, so
    # any leftover bytes of a longer original archive would make them read
    # the stale directory. Cut the file off right after the new archive.
    fileobj.truncate()


def split_path(path):
    """Split `path` at its last "/" into `(directory, basename)`.

    A path without any "/" yields an empty directory.
    """
    parts = path.rsplit("/", 1)
    if len(parts) == 1:
        return ("", path)
    else:
        return tuple(parts)
def join_path(*args):
    """Join zip path segments with "/".

    Empty segments are ignored. A segment starting with "/" is absolute: it
    discards everything that came before it.
    """
    segments = []
    for segment in args:
        if not segment:
            continue

        if segment.startswith("/"):
            # Absolute segment: restart the path from here.
            segments = [segment]
        else:
            segments.append(segment)

    return "/".join(segments)
# Content types this recipe can convert, mapped to the file extension used
# for the temporary input file handed to LibreOffice.
_wmf_extensions = {
    "image/x-wmf": ".wmf",
    "image/x-emf": ".emf",
}


def libreoffice_wmf_conversion(image, post_process=None):
    """Convert a WMF/EMF image to PNG by running LibreOffice.

    Images whose content type is not listed in `_wmf_extensions` are returned
    unchanged. Otherwise the image is written to a temporary directory,
    converted with the `libreoffice` command, and a copy of `image` with
    content type "image/png" is passed through `post_process` (identity by
    default) and returned.

    Raises `subprocess.CalledProcessError` if LibreOffice exits with a
    non-zero status.
    """
    if post_process is None:
        post_process = lambda x: x

    wmf_extension = _wmf_extensions.get(image.content_type)
    if wmf_extension is None:
        return image
    else:
        temporary_directory = tempfile.mkdtemp()
        try:
            input_path = os.path.join(temporary_directory, "image" + wmf_extension)
            with io.open(input_path, "wb") as input_fileobj:
                with image.open() as image_fileobj:
                    shutil.copyfileobj(image_fileobj, input_fileobj)

            output_path = os.path.join(temporary_directory, "image.png")
            subprocess.check_call([
                "libreoffice",
                "--headless",
                "--convert-to",
                "png",
                input_path,
                "--outdir",
                temporary_directory,
            ])

            # Read the result into memory so it outlives the temporary directory.
            with io.open(output_path, "rb") as output_fileobj:
                output = output_fileobj.read()

            def open_image():
                return io.BytesIO(output)

            return post_process(image.copy(
                content_type="image/png",
                open=open_image,
            ))
        finally:
            shutil.rmtree(temporary_directory)


def imagemagick_trim(image):
    """Trim the padding around `image` by piping it through ImageMagick.

    Runs `convert - -trim -`, feeding the image on stdin and reading the
    trimmed image from stdout, and returns a copy of `image` that opens the
    trimmed bytes.

    Raises `subprocess.CalledProcessError` if `convert` exits with a non-zero
    status.
    """
    command = ["convert", "-", "-trim", "-"]

    # Read the input up front so that a failure to open the image never
    # leaves a child process behind.
    with image.open() as image_fileobj:
        image_bytes = image_fileobj.read()

    process = subprocess.Popen(command, stdin=subprocess.PIPE, stdout=subprocess.PIPE)
    try:
        # communicate() writes stdin and drains stdout concurrently; writing
        # all of stdin first could block forever once the child fills the
        # stdout pipe.
        output, _ = process.communicate(image_bytes)
    except BaseException:
        # Includes KeyboardInterrupt on purpose: never leak the child process.
        process.kill()
        process.wait()
        raise

    return_code = process.returncode
    if return_code:
        raise subprocess.CalledProcessError(return_code, command)
    else:
        def open_image():
            return io.BytesIO(output)

        return image.copy(open=open_image)
-------------------------------------------------------------------------------- /test-requirements.txt: -------------------------------------------------------------------------------- 1 | funk>=0.4,<0.5 2 | pytest 3 | precisely==0.1.3 4 | pyflakes==2.4.0 5 | spur.local>=0.3.7,<0.4 6 | tempman>=0.1.2,<0.2 7 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /tests/cli_tests.py: -------------------------------------------------------------------------------- 1 | import os 2 | import base64 3 | 4 | import spur 5 | import tempman 6 | 7 | from .testing import assert_equal, generate_test_path 8 | 9 | 10 | _local = spur.LocalShell() 11 | 12 | 13 | def test_html_is_printed_to_stdout_if_output_file_is_not_set(): 14 | docx_path = generate_test_path("single-paragraph.docx") 15 | result = _local.run(["mammoth", docx_path]) 16 | assert_equal(b"", result.stderr_output) 17 | assert_equal(b"

Walking on imported air

", result.output) 18 | 19 | 20 | def test_html_is_written_to_file_if_output_file_is_set(): 21 | with tempman.create_temp_dir() as temp_dir: 22 | output_path = os.path.join(temp_dir.path, "output.html") 23 | docx_path = generate_test_path("single-paragraph.docx") 24 | result = _local.run(["mammoth", docx_path, output_path]) 25 | assert_equal(b"", result.stderr_output) 26 | assert_equal(b"", result.output) 27 | with open(output_path) as output_file: 28 | assert_equal("

Walking on imported air

", output_file.read()) 29 | 30 | 31 | _image_base_64 = b"iVBORw0KGgoAAAANSUhEUgAAAAoAAAAKCAIAAAACUFjqAAAAAXNSR0IArs4c6QAAAAlwSFlzAAAOvgAADr4B6kKxwAAAABNJREFUKFNj/M+ADzDhlWUYqdIAQSwBE8U+X40AAAAASUVORK5CYII=" 32 | 33 | 34 | def test_inline_images_are_included_in_output_if_writing_to_single_file(): 35 | docx_path = generate_test_path("tiny-picture.docx") 36 | result = _local.run(["mammoth", docx_path]) 37 | assert_equal(b"""

""", result.output) 38 | 39 | 40 | def test_images_are_written_to_separate_files_if_output_dir_is_set(): 41 | with tempman.create_temp_dir() as temp_dir: 42 | output_path = os.path.join(temp_dir.path, "tiny-picture.html") 43 | image_path = os.path.join(temp_dir.path, "1.png") 44 | 45 | docx_path = generate_test_path("tiny-picture.docx") 46 | result = _local.run(["mammoth", docx_path, "--output-dir", temp_dir.path]) 47 | assert_equal(b"", result.stderr_output) 48 | assert_equal(b"", result.output) 49 | with open(output_path) as output_file: 50 | assert_equal("""

""", output_file.read()) 51 | 52 | with open(image_path, "rb") as image_file: 53 | assert_equal(_image_base_64, base64.b64encode(image_file.read())) 54 | 55 | 56 | def test_style_map_is_used_if_set(): 57 | with tempman.create_temp_dir() as temp_dir: 58 | docx_path = generate_test_path("single-paragraph.docx") 59 | style_map_path = os.path.join(temp_dir.path, "style-map") 60 | with open(style_map_path, "w") as style_map_file: 61 | style_map_file.write("p => span:fresh") 62 | result = _local.run(["mammoth", docx_path, "--style-map", style_map_path]) 63 | assert_equal(b"", result.stderr_output) 64 | assert_equal(b"Walking on imported air", result.output) 65 | 66 | 67 | def test_output_format_markdown_option_generates_markdown_output(): 68 | docx_path = generate_test_path("single-paragraph.docx") 69 | result = _local.run(["mammoth", docx_path, "--output-format=markdown"]) 70 | assert_equal(b"", result.stderr_output) 71 | assert_equal(b"Walking on imported air\n\n", result.output) 72 | -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | import funk 2 | import pytest 3 | 4 | 5 | @pytest.fixture(name="mocks") 6 | def _fixture_mocks(): 7 | mocks = funk.Mocks() 8 | yield mocks 9 | mocks.verify() 10 | -------------------------------------------------------------------------------- /tests/docx/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mwilliamson/python-mammoth/bd2cf5a02ec9bf0e6ff877e3a962b236b4143f34/tests/docx/__init__.py -------------------------------------------------------------------------------- /tests/docx/comments_xml_tests.py: -------------------------------------------------------------------------------- 1 | from mammoth import documents 2 | from mammoth.docx.xmlparser import element as xml_element 3 | from mammoth.docx.comments_xml import 
read_comments_xml_element 4 | from mammoth.docx import body_xml 5 | from ..testing import assert_equal 6 | 7 | 8 | def test_id_and_body_of_comment_is_read(): 9 | body = [xml_element("w:p")] 10 | comments = read_comments_xml_element(xml_element("w:comments", {}, [ 11 | xml_element("w:comment", {"w:id": "1"}, body), 12 | ]), body_reader=body_xml.reader()) 13 | assert_equal(1, len(comments.value)) 14 | assert_equal(comments.value[0].body, [documents.paragraph(children=[])]) 15 | assert_equal("1", comments.value[0].comment_id) 16 | 17 | 18 | def test_when_optional_attributes_of_comment_are_missing_then_they_are_read_as_none(): 19 | comments = read_comments_xml_element(xml_element("w:comments", {}, [ 20 | xml_element("w:comment", {"w:id": "1"}, []), 21 | ]), body_reader=body_xml.reader()) 22 | comment, = comments.value 23 | assert_equal(None, comment.author_name) 24 | assert_equal(None, comment.author_initials) 25 | 26 | 27 | def test_when_optional_attributes_of_comment_are_blank_then_they_are_read_as_none(): 28 | comments = read_comments_xml_element(xml_element("w:comments", {}, [ 29 | xml_element("w:comment", {"w:id": "1", "w:author": " ", "w:initials": " "}, []), 30 | ]), body_reader=body_xml.reader()) 31 | comment, = comments.value 32 | assert_equal(None, comment.author_name) 33 | assert_equal(None, comment.author_initials) 34 | 35 | 36 | def test_when_optional_attributes_of_comment_are_not_blank_then_they_are_read(): 37 | comments = read_comments_xml_element(xml_element("w:comments", {}, [ 38 | xml_element("w:comment", {"w:id": "1", "w:author": "The Piemaker", "w:initials": "TP"}, []), 39 | ]), body_reader=body_xml.reader()) 40 | comment, = comments.value 41 | assert_equal("The Piemaker", comment.author_name) 42 | assert_equal("TP", comment.author_initials) 43 | -------------------------------------------------------------------------------- /tests/docx/content_types_xml_tests.py: -------------------------------------------------------------------------------- 1 | 
from mammoth.docx.xmlparser import element as xml_element 2 | from mammoth.docx.content_types_xml import read_content_types_xml_element 3 | from ..testing import assert_equal 4 | 5 | 6 | def test_content_type_is_based_on_default_for_extension_if_there_is_no_override(): 7 | element = xml_element("content-types:Types", {}, [ 8 | xml_element("content-types:Default", { 9 | "Extension": "png", 10 | "ContentType": "image/png", 11 | }) 12 | ]) 13 | content_types = read_content_types_xml_element(element) 14 | assert_equal( 15 | "image/png", 16 | content_types.find_content_type("word/media/hat.png"), 17 | ) 18 | 19 | 20 | def test_content_type_is_based_on_override_if_present(): 21 | element = xml_element("content-types:Types", {}, [ 22 | xml_element("content-types:Default", { 23 | "Extension": "png", 24 | "ContentType": "image/png", 25 | }), 26 | xml_element("content-types:Override", { 27 | "PartName": "/word/media/hat.png", 28 | "ContentType": "image/hat" 29 | }), 30 | ]) 31 | content_types = read_content_types_xml_element(element) 32 | assert_equal( 33 | "image/hat", 34 | content_types.find_content_type("word/media/hat.png"), 35 | ) 36 | 37 | 38 | def test_fallback_content_types_have_common_image_types(): 39 | element = xml_element("content-types:Types", {}, []) 40 | content_types = read_content_types_xml_element(element) 41 | assert_equal( 42 | "image/png", 43 | content_types.find_content_type("word/media/hat.png"), 44 | ) 45 | assert_equal( 46 | "image/gif", 47 | content_types.find_content_type("word/media/hat.gif"), 48 | ) 49 | assert_equal( 50 | "image/jpeg", 51 | content_types.find_content_type("word/media/hat.jpg"), 52 | ) 53 | assert_equal( 54 | "image/jpeg", 55 | content_types.find_content_type("word/media/hat.jpeg"), 56 | ) 57 | assert_equal( 58 | "image/bmp", 59 | content_types.find_content_type("word/media/hat.bmp"), 60 | ) 61 | assert_equal( 62 | "image/tiff", 63 | content_types.find_content_type("word/media/hat.tif"), 64 | ) 65 | assert_equal( 66 | 
"image/tiff", 67 | content_types.find_content_type("word/media/hat.tiff"), 68 | ) 69 | 70 | 71 | def test_fallback_content_types_are_case_insensitive(): 72 | element = xml_element("content-types:Types", {}, []) 73 | content_types = read_content_types_xml_element(element) 74 | assert_equal( 75 | "image/png", 76 | content_types.find_content_type("word/media/hat.PnG"), 77 | ) 78 | -------------------------------------------------------------------------------- /tests/docx/document_matchers.py: -------------------------------------------------------------------------------- 1 | from precisely import all_of, has_attrs, instance_of 2 | 3 | from mammoth import documents 4 | 5 | 6 | def create_element_matcher(element_type): 7 | def matcher(**kwargs): 8 | return all_of( 9 | instance_of(element_type), 10 | has_attrs(**kwargs), 11 | ) 12 | 13 | return matcher 14 | 15 | 16 | is_paragraph = create_element_matcher(documents.Paragraph) 17 | is_run = create_element_matcher(documents.Run) 18 | is_hyperlink = create_element_matcher(documents.Hyperlink) 19 | is_checkbox = create_element_matcher(documents.Checkbox) 20 | is_table = create_element_matcher(documents.Table) 21 | is_row = create_element_matcher(documents.TableRow) 22 | 23 | 24 | is_empty_run = is_run(children=[]) 25 | 26 | 27 | def is_text(value): 28 | return all_of( 29 | instance_of(documents.Text), 30 | has_attrs(value=value), 31 | ) 32 | -------------------------------------------------------------------------------- /tests/docx/document_xml_tests.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from mammoth import documents 4 | from mammoth.docx.xmlparser import element as xml_element, text as xml_text 5 | from mammoth.docx.document_xml import read_document_xml_element 6 | from mammoth.docx import body_xml 7 | from ..testing import assert_equal 8 | 9 | 10 | def test_when_body_element_is_present_then_body_is_read(): 11 | text_xml = xml_element("w:t", {}, 
[xml_text("Hello!")]) 12 | run_xml = xml_element("w:r", {}, [text_xml]) 13 | paragraph_xml = xml_element("w:p", {}, [run_xml]) 14 | body_xml = xml_element("w:body", {}, [paragraph_xml]) 15 | document_xml = xml_element("w:document", {}, [body_xml]) 16 | 17 | document = _read_and_get_document_xml_element(document_xml) 18 | 19 | assert_equal( 20 | documents.document([documents.paragraph([documents.run([documents.text("Hello!")])])]), 21 | document 22 | ) 23 | 24 | 25 | def test_when_body_element_is_not_present_then_error_is_raised(): 26 | paragraph_xml = xml_element("w:p", {}, []) 27 | body_xml = xml_element("w:body2", {}, [paragraph_xml]) 28 | document_xml = xml_element("w:document", {}, [body_xml]) 29 | 30 | error = pytest.raises(ValueError, lambda: _read_and_get_document_xml_element(document_xml)) 31 | 32 | assert_equal(str(error.value), "Could not find the body element: are you sure this is a docx file?") 33 | 34 | 35 | def test_footnotes_of_document_are_read(): 36 | notes = [documents.note("footnote", "4", [documents.paragraph([])])] 37 | 38 | body_xml = xml_element("w:body") 39 | document_xml = xml_element("w:document", {}, [body_xml]) 40 | 41 | document = _read_and_get_document_xml_element(document_xml, notes=notes) 42 | footnote = document.notes.find_note("footnote", "4") 43 | assert_equal("4", footnote.note_id) 44 | assert isinstance(footnote.body[0], documents.Paragraph) 45 | 46 | 47 | def _read_and_get_document_xml_element(*args, **kwargs): 48 | body_reader = body_xml.reader() 49 | result = read_document_xml_element(*args, body_reader=body_reader, **kwargs) 50 | assert_equal([], result.messages) 51 | return result.value 52 | -------------------------------------------------------------------------------- /tests/docx/docx_tests.py: -------------------------------------------------------------------------------- 1 | import io 2 | import textwrap 3 | import zipfile 4 | 5 | from mammoth import docx, documents, zips 6 | from ..testing import assert_equal, 
assert_raises, generate_test_path 7 | 8 | 9 | class ReadTests(object): 10 | def test_can_read_document_with_single_paragraph_with_single_run_of_text(self): 11 | with open(generate_test_path("single-paragraph.docx"), "rb") as fileobj: 12 | result = docx.read(fileobj=fileobj) 13 | expected_document = documents.document([ 14 | documents.paragraph([ 15 | documents.run([ 16 | documents.text("Walking on imported air") 17 | ]) 18 | ]) 19 | ]) 20 | assert_equal(expected_document, result.value) 21 | 22 | 23 | _relationship_namespaces = { 24 | "r": "http://schemas.openxmlformats.org/package/2006/relationships", 25 | } 26 | 27 | 28 | def test_main_document_is_found_using_package_relationships(): 29 | fileobj = _create_zip({ 30 | "word/document2.xml": textwrap.dedent("""\ 31 | 32 | 33 | 34 | 35 | 36 | Hello. 37 | 38 | 39 | 40 | 41 | """), 42 | "_rels/.rels": textwrap.dedent("""\ 43 | 44 | 45 | 46 | 47 | """), 48 | }) 49 | result = docx.read(fileobj=fileobj) 50 | expected_document = documents.document([ 51 | documents.paragraph([ 52 | documents.run([ 53 | documents.text("Hello.") 54 | ]) 55 | ]) 56 | ]) 57 | assert_equal(expected_document, result.value) 58 | 59 | 60 | def test_error_is_raised_when_main_document_part_does_not_exist(): 61 | fileobj = _create_zip({ 62 | "_rels/.rels": textwrap.dedent("""\ 63 | 64 | 65 | 66 | 67 | """), 68 | }) 69 | error = assert_raises(IOError, lambda: docx.read(fileobj=fileobj)) 70 | assert_equal( 71 | "Could not find main document part. 
Are you sure this is a valid .docx file?", 72 | str(error), 73 | ) 74 | 75 | class PartPathsTests(object): 76 | def test_main_document_part_is_found_using_package_relationships(self): 77 | fileobj = _create_zip({ 78 | "word/document2.xml": " ", 79 | "_rels/.rels": textwrap.dedent("""\ 80 | 81 | 82 | 83 | 84 | """), 85 | }) 86 | part_paths = self._find_part_paths(fileobj) 87 | assert_equal("word/document2.xml", part_paths.main_document) 88 | 89 | def test_when_relationship_for_main_document_cannot_be_found_then_fallback_is_used(self): 90 | fileobj = _create_zip({ 91 | "word/document.xml": " ", 92 | }) 93 | part_paths = self._find_part_paths(fileobj) 94 | assert_equal("word/document.xml", part_paths.main_document) 95 | 96 | def test_comments_part_is_found_using_main_document_relationships(self): 97 | self._assert_path_is_found_using_main_document_relationships("comments") 98 | 99 | def test_when_relationship_for_comments_cannot_be_found_then_fallback_is_used(self): 100 | self._assert_when_relationship_for_part_cannot_be_found_then_fallback_is_used("comments") 101 | 102 | def test_endnotes_part_is_found_using_main_document_relationships(self): 103 | self._assert_path_is_found_using_main_document_relationships("endnotes") 104 | 105 | def test_when_relationship_for_endnotes_cannot_be_found_then_fallback_is_used(self): 106 | self._assert_when_relationship_for_part_cannot_be_found_then_fallback_is_used("endnotes") 107 | 108 | def test_footnotes_part_is_found_using_main_document_relationships(self): 109 | self._assert_path_is_found_using_main_document_relationships("footnotes") 110 | 111 | def test_when_relationship_for_footnotes_cannot_be_found_then_fallback_is_used(self): 112 | self._assert_when_relationship_for_part_cannot_be_found_then_fallback_is_used("footnotes") 113 | 114 | def test_numbering_part_is_found_using_main_document_relationships(self): 115 | self._assert_path_is_found_using_main_document_relationships("numbering") 116 | 117 | def 
test_when_relationship_for_numbering_cannot_be_found_then_fallback_is_used(self): 118 | self._assert_when_relationship_for_part_cannot_be_found_then_fallback_is_used("numbering") 119 | 120 | def test_styles_part_is_found_using_main_document_relationships(self): 121 | self._assert_path_is_found_using_main_document_relationships("styles") 122 | 123 | def test_when_relationship_for_styles_cannot_be_found_then_fallback_is_used(self): 124 | self._assert_when_relationship_for_part_cannot_be_found_then_fallback_is_used("styles") 125 | 126 | def _assert_path_is_found_using_main_document_relationships(self, name): 127 | fileobj = _create_zip({ 128 | "_rels/.rels": textwrap.dedent("""\ 129 | 130 | 131 | 132 | 133 | """), 134 | "word/document.xml": " ", 135 | "word/_rels/document.xml.rels": textwrap.dedent("""\ 136 | 137 | 138 | 139 | 140 | """.format(name=name)), 141 | "word/target-path.xml": " " 142 | }) 143 | part_paths = self._find_part_paths(fileobj) 144 | assert_equal("word/target-path.xml", getattr(part_paths, name)) 145 | 146 | def _assert_when_relationship_for_part_cannot_be_found_then_fallback_is_used(self, name): 147 | fileobj = _create_zip({ 148 | "_rels/.rels": textwrap.dedent("""\ 149 | 150 | 151 | 152 | 153 | """), 154 | "word/document.xml": " ", 155 | }) 156 | part_paths = self._find_part_paths(fileobj) 157 | assert_equal("word/{0}.xml".format(name), getattr(part_paths, name)) 158 | 159 | 160 | def _find_part_paths(self, fileobj): 161 | return docx._find_part_paths(zips.open_zip(fileobj, "r")) 162 | 163 | 164 | def _create_zip(files): 165 | fileobj = io.BytesIO() 166 | 167 | zip_file = zipfile.ZipFile(fileobj, "w") 168 | try: 169 | for name, contents in files.items(): 170 | zip_file.writestr(name, contents) 171 | finally: 172 | zip_file.close() 173 | 174 | fileobj.seek(0) 175 | return fileobj 176 | -------------------------------------------------------------------------------- /tests/docx/files_tests.py: 
def test_can_open_files_with_file_uri():
    # A file: URI is opened with no base directory configured.
    image_path = generate_test_path("tiny-picture.png")
    with open(image_path, "rb") as source_file:
        expected_contents = source_file.read()

    with Files(None).open("file:///" + image_path) as image_file:
        actual_contents = image_file.read()

    assert_equal(bytes, type(actual_contents))
    assert_equal(expected_contents, actual_contents)


def test_can_open_files_with_relative_uri():
    # A bare file name is opened with the test-data directory as the base.
    with open(generate_test_path("tiny-picture.png"), "rb") as source_file:
        expected_contents = source_file.read()

    base_directory = generate_test_path("")
    with Files(base_directory).open("tiny-picture.png") as image_file:
        actual_contents = image_file.read()

    assert_equal(bytes, type(actual_contents))
    assert_equal(expected_contents, actual_contents)


def test_given_base_is_not_set_when_opening_relative_uri_then_error_is_raised():
    files = Files(None)

    def open_missing_file():
        return files.open("not-a-real-file.png")

    error = assert_raises(InvalidFileReferenceError, open_missing_file)
    assert_equal(
        "could not find external image 'not-a-real-file.png', fileobj has no name",
        str(error),
    )


def test_error_is_raised_if_relative_uri_cannot_be_opened():
    files = Files("/tmp")

    def open_missing_file():
        return files.open("not-a-real-file.png")

    error = assert_raises(InvalidFileReferenceError, open_missing_file)
    # First line comes from Files, second line is the underlying OS error.
    message_lines = [
        "could not open external image: 'not-a-real-file.png' (document directory: '/tmp')",
        "[Errno 2] No such file or directory: '/tmp/not-a-real-file.png'",
    ]
    assert_equal("\n".join(message_lines), str(error))
def test_id_and_body_of_footnote_are_read():
    # One ordinary footnote whose body is a single empty paragraph.
    footnote_element = xml_element("w:footnote", {"w:id": "1"}, [xml_element("w:p")])
    footnotes_element = xml_element("w:footnotes", {}, [footnote_element])

    footnotes = read_footnotes_xml_element(
        footnotes_element,
        body_reader=body_xml.reader(),
    )

    assert_equal(1, len(footnotes.value))
    first_footnote = footnotes.value[0]
    assert isinstance(first_footnote.body[0], documents.Paragraph)
    assert_equal("1", first_footnote.note_id)


def test_continuation_separator_is_ignored():
    _assert_footnote_type_is_ignored("continuationSeparator")


def test_separator_is_ignored():
    _assert_footnote_type_is_ignored("separator")


def _assert_footnote_type_is_ignored(footnote_type):
    # Read a single footnote carrying the given w:type and check that nothing
    # is returned. No body reader is supplied.
    footnote_attributes = {"w:id": "1", "w:type": footnote_type}
    footnote_element = xml_element("w:footnote", footnote_attributes, [xml_element("w:p")])
    footnotes_element = xml_element("w:footnotes", {}, [footnote_element])

    footnotes = read_footnotes_xml_element(footnotes_element, body_reader=None)

    assert_equal(0, len(footnotes.value))
def test_find_level_returns_none_if_num_with_id_cannot_be_found():
    # An empty w:numbering element defines no nums at all.
    empty_numbering = xml_element("w:numbering")
    numbering = _read_numbering_xml_element(empty_numbering)
    assert_equal(None, numbering.find_level("47", "0"))


# Shared fixture: abstract num 42 has a bullet level 0 and a decimal level 1,
# and num 47 refers to that abstract num.
_sample_numbering_xml = xml_element("w:numbering", {}, [
    xml_element(
        "w:abstractNum",
        {"w:abstractNumId": "42"},
        [
            xml_element("w:lvl", {"w:ilvl": "0"}, [
                xml_element("w:numFmt", {"w:val": "bullet"})
            ]),
            xml_element("w:lvl", {"w:ilvl": "1"}, [
                xml_element("w:numFmt", {"w:val": "decimal"})
            ]),
        ],
    ),
    xml_element(
        "w:num",
        {"w:numId": "47"},
        [xml_element("w:abstractNumId", {"w:val": "42"})],
    ),
])


def test_level_includes_level_index():
    numbering = _read_numbering_xml_element(_sample_numbering_xml)
    for level_index in ("0", "1"):
        level = numbering.find_level("47", level_index)
        assert_equal(level_index, level.level_index)


def test_list_is_not_ordered_if_formatted_as_bullet():
    numbering = _read_numbering_xml_element(_sample_numbering_xml)
    bullet_level = numbering.find_level("47", "0")
    assert_equal(False, bullet_level.is_ordered)


def test_list_is_ordered_if_formatted_as_decimal():
    numbering = _read_numbering_xml_element(_sample_numbering_xml)
    decimal_level = numbering.find_level("47", "1")
    assert_equal(True, decimal_level.is_ordered)


def test_list_is_ordered_if_there_is_no_explicit_format():
    # The level has no w:numFmt child.
    abstract_num = xml_element("w:abstractNum", {"w:abstractNumId": "42"}, [
        xml_element("w:lvl", {"w:ilvl": "0"}),
    ])
    num = xml_element("w:num", {"w:numId": "47"}, [
        xml_element("w:abstractNumId", {"w:val": "42"})
    ])

    numbering = _read_numbering_xml_element(
        xml_element("w:numbering", {}, [abstract_num, num])
    )

    assert_equal(True, numbering.find_level("47", "0").is_ordered)


def test_find_level_returns_none_if_level_cannot_be_found():
    numbering = _read_numbering_xml_element(_sample_numbering_xml)
    missing_level = numbering.find_level("47", "2")
    assert_equal(None, missing_level)


def test_num_referencing_non_existent_abstract_num_is_ignored():
    # Num 47 points at abstract num 42, which is never declared.
    dangling_num = xml_element("w:num", {"w:numId": "47"}, [
        xml_element("w:abstractNumId", {"w:val": "42"})
    ])

    numbering = _read_numbering_xml_element(
        xml_element("w:numbering", {}, [dangling_num])
    )

    assert_equal(None, numbering.find_level("47", "0"))


def test_when_abstract_num_has_num_style_link_then_style_is_used_to_find_num():
    # Abstract num 101 only carries a link to numbering style List1, and that
    # style names num 200, which is backed by the decimal abstract num 100.
    decimal_abstract_num = xml_element("w:abstractNum", {"w:abstractNumId": "100"}, [
        xml_element("w:lvl", {"w:ilvl": "0"}, [
            xml_element("w:numFmt", {"w:val": "decimal"}),
        ]),
    ])
    linked_abstract_num = xml_element("w:abstractNum", {"w:abstractNumId": "101"}, [
        xml_element("w:numStyleLink", {"w:val": "List1"}),
    ])
    decimal_num = xml_element("w:num", {"w:numId": "200"}, [
        xml_element("w:abstractNumId", {"w:val": "100"}),
    ])
    linked_num = xml_element("w:num", {"w:numId": "201"}, [
        xml_element("w:abstractNumId", {"w:val": "101"}),
    ])
    numbering_element = xml_element("w:numbering", {}, [
        decimal_abstract_num,
        linked_abstract_num,
        decimal_num,
        linked_num,
    ])
    styles = Styles.create(numbering_styles={"List1": NumberingStyle(num_id="200")})

    numbering = _read_numbering_xml_element(numbering_element, styles=styles)

    assert_equal(True, numbering.find_level("201", "0").is_ordered)


# See: 17.9.23 pStyle (Paragraph Style's Associated Numbering Level) in ECMA-376, 4th Edition
def test_numbering_level_can_be_found_by_paragraph_style_id():
    bullet_abstract_num = xml_element("w:abstractNum", {"w:abstractNumId": "42"}, [
        xml_element("w:lvl", {"w:ilvl": "0"}, [
            xml_element("w:numFmt", {"w:val": "bullet"}),
        ]),
    ])
    # Only this level names a paragraph style through w:pStyle.
    styled_abstract_num = xml_element("w:abstractNum", {"w:abstractNumId": "43"}, [
        xml_element("w:lvl", {"w:ilvl": "0"}, [
            xml_element("w:pStyle", {"w:val": "List"}),
            xml_element("w:numFmt", {"w:val": "decimal"}),
        ]),
    ])

    numbering = _read_numbering_xml_element(
        xml_element("w:numbering", {}, [bullet_abstract_num, styled_abstract_num]),
    )

    list_level = numbering.find_level_by_paragraph_style_id("List")
    assert_equal(True, list_level.is_ordered)
    assert_equal(None, numbering.find_level_by_paragraph_style_id("Paragraph"))


def _read_numbering_xml_element(element, styles=None):
    # Falls back to the empty style set when the caller supplies no styles.
    effective_styles = Styles.EMPTY if styles is None else styles
    return read_numbering_xml_element(element, styles=effective_styles)
def test_relationship_targets_can_be_found_by_type():
    office_document_type = "http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument"
    core_properties_type = "http://schemas.openxmlformats.org/package/2006/relationships/metadata/core-properties"

    # Two relationships share the office-document type; one has another type.
    relationship_elements = [
        xml_element("relationships:Relationship", {
            "Id": "rId2",
            "Target": "docProps/core.xml",
            "Type": core_properties_type,
        }),
        xml_element("relationships:Relationship", {
            "Id": "rId1",
            "Target": "word/document.xml",
            "Type": office_document_type,
        }),
        xml_element("relationships:Relationship", {
            "Id": "rId3",
            "Target": "word/document2.xml",
            "Type": office_document_type,
        }),
    ]
    relationships = read_relationships_xml_element(
        xml_element("relationships:Relationships", {}, relationship_elements)
    )

    targets = relationships.find_targets_by_type(office_document_type)

    assert_equal(["word/document.xml", "word/document2.xml"], targets)


def test_when_there_are_no_relationships_of_requested_type_then_empty_list_is_returned():
    office_document_type = "http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument"
    relationships = read_relationships_xml_element(
        xml_element("relationships:Relationships", {}, [])
    )

    targets = relationships.find_targets_by_type(office_document_type)

    assert_equal([], targets)
def test_reading_embedded_style_map_on_document_without_embedded_style_map_returns_none():
    style_map = read_style_map(_normal_docx())
    assert_equal(None, style_map)


def test_writing_style_map_preserves_unrelated_files():
    docx_file = _normal_docx()
    write_style_map(docx_file, "p => h1")

    with open_zip(docx_file, "r") as zip_file:
        placeholder_contents = zip_file.read_str("placeholder")
        assert_equal("placeholder", placeholder_contents)


def test_embedded_style_map_can_be_read_after_being_written():
    docx_file = _normal_docx()
    write_style_map(docx_file, "p => h1")

    style_map = read_style_map(docx_file)

    assert_equal("p => h1", style_map)


def test_embedded_style_map_is_written_to_separate_file():
    docx_file = _normal_docx()
    write_style_map(docx_file, "p => h1")

    with open_zip(docx_file, "r") as zip_file:
        stored_style_map = zip_file.read_str("mammoth/style-map")
        assert_equal("p => h1", stored_style_map)


def test_embedded_style_map_is_referenced_in_relationships():
    docx_file = _normal_docx()
    write_style_map(docx_file, "p => h1")

    relationships_xml = _read_relationships_xml(docx_file)

    assert_equal(expected_relationships_xml, relationships_xml)


def test_embedded_style_map_has_override_content_type_in_content_types_xml():
    docx_file = _normal_docx()
    write_style_map(docx_file, "p => h1")

    content_types_xml = _read_content_types_xml(docx_file)

    assert_equal(expected_content_types_xml, content_types_xml)
def _normal_docx():
    """Return an in-memory zip standing in for a docx with no style map.

    The archive holds a placeholder entry, the original document
    relationships and the original content types. The returned ``BytesIO``
    is not rewound.
    """
    fileobj = io.BytesIO()
    # The context manager closes the archive even if a write fails. The
    # previous version also contained a stray expression statement here
    # (``expected_relationships_xml``) that had no effect and is removed.
    with ZipFile(fileobj, "w") as zip_file:
        zip_file.writestr("placeholder", "placeholder")
        zip_file.writestr("word/_rels/document.xml.rels", original_relationships_xml)
        zip_file.writestr("[Content_Types].xml", original_content_types_xml)
    return fileobj


def _assert_no_duplicates(values):
    """Fail with an ``AssertionError`` if any value occurs more than once.

    :param values: iterable of hashable values, for example zip entry names.
    :raises AssertionError: naming the first repeated value and its count.
    """
    from collections import Counter

    for value, count in Counter(values).items():
        if count != 1:
            # Raised explicitly: ``assert False, ...`` is removed under
            # ``python -O`` and would let duplicates pass silently.
            raise AssertionError("{0} has count of {1}".format(value, count))
def test_styles_include_names():
    heading_element = _paragraph_style_element("Heading1", "Heading 1")
    styles = read_styles_xml_element(xml_element("w:styles", {}, [heading_element]))

    heading_style = styles.find_paragraph_style_by_id("Heading1")

    assert_equal("Heading 1", heading_style.name)


def test_style_name_is_none_if_name_element_does_not_exist():
    # Both styles are built without a w:name child.
    unnamed_styles = [
        _style_without_name_element("paragraph", "Heading1"),
        _style_without_name_element("character", "Heading1Char"),
    ]
    styles = read_styles_xml_element(xml_element("w:styles", {}, unnamed_styles))

    assert_equal(None, styles.find_paragraph_style_by_id("Heading1").name)
    assert_equal(None, styles.find_character_style_by_id("Heading1Char").name)


def test_numbering_style_is_none_if_no_style_with_that_id_exists():
    styles = read_styles_xml_element(xml_element("w:styles", {}, []))
    numbering_style = styles.find_numbering_style_by_id("List1")
    assert_equal(None, numbering_style)


def test_numbering_style_has_none_num_id_if_style_has_no_paragraph_properties():
    # The numbering style element has no children, so no w:pPr.
    style_element = xml_element("w:style", {"w:type": "numbering", "w:styleId": "List1"})
    styles = read_styles_xml_element(xml_element("w:styles", {}, [style_element]))

    numbering_style = styles.find_numbering_style_by_id("List1")

    assert_equal(None, numbering_style.num_id)


def test_numbering_style_has_num_id_read_from_paragraph_properties():
    # The num id is nested as w:style > w:pPr > w:numPr > w:numId.
    num_id_element = xml_element("w:numId", {"w:val": "42"})
    paragraph_properties = xml_element("w:pPr", {}, [
        xml_element("w:numPr", {}, [num_id_element]),
    ])
    style_element = xml_element(
        "w:style",
        {"w:type": "numbering", "w:styleId": "List1"},
        [paragraph_properties],
    )
    styles = read_styles_xml_element(xml_element("w:styles", {}, [style_element]))

    numbering_style = styles.find_numbering_style_by_id("List1")

    assert_equal("42", numbering_style.num_id)


def _paragraph_style_element(style_id, name):
    # Named w:style element of type "paragraph".
    return _style_element("paragraph", style_id, name)


def _character_style_element(style_id, name):
    # Named w:style element of type "character".
    return _style_element("character", style_id, name)


def _table_style_element(style_id, name):
    # Named w:style element of type "table".
    return _style_element("table", style_id, name)


def _style_element(element_type, style_id, name):
    # A style whose only child is its w:name element.
    name_element = xml_element("w:name", {"w:val": name}, [])
    return _style_element_with_children(element_type, style_id, [name_element])


def _style_without_name_element(element_type, style_id):
    # A style with no children at all.
    return _style_element_with_children(element_type, style_id, [])


def _style_element_with_children(element_type, style_id, children):
    # Builds the w:style element shared by all of the helpers in this module.
    return xml_element(
        "w:style",
        {"w:type": element_type, "w:styleId": style_id},
        children,
    )
-------------------------------------------------------------------------------- 1 | import io 2 | 3 | from mammoth.docx.xmlparser import parse_xml, element as xml_element, text as xml_text 4 | from ..testing import assert_equal 5 | 6 | 7 | def test_can_parse_self_closing_element(): 8 | xml = _parse_xml_string(b"") 9 | assert_equal(xml_element("body", {}, []), xml) 10 | 11 | 12 | def test_can_parse_empty_element_with_separate_closing_tag(): 13 | xml = _parse_xml_string(b"") 14 | assert_equal(xml_element("body", {}, []), xml) 15 | 16 | 17 | def test_can_parse_attributes_of_tag(): 18 | xml = _parse_xml_string(b"") 19 | assert_equal(xml_element("body", {"name": "bob"}, []), xml) 20 | 21 | 22 | def test_can_parse_text_element(): 23 | xml = _parse_xml_string(b"Hello!") 24 | assert_equal(xml_element("body", {}, [xml_text("Hello!")]), xml) 25 | 26 | 27 | def test_can_parse_text_element_before_new_tag(): 28 | xml = _parse_xml_string(b"Hello!
class FindChildTests(object):
    # Exercises find_child on elements built directly, without parsing.

    def test_returns_none_if_no_children(self):
        childless = xml_element("a")
        assert_equal(None, childless.find_child("b"))

    def test_returns_none_if_no_matching_children(self):
        parent = xml_element("a", {}, [xml_element("c")])
        assert_equal(None, parent.find_child("b"))

    def test_returns_first_matching_child(self):
        first_match = xml_element("b", {"id": 1})
        second_match = xml_element("b", {"id": 2})
        parent = xml_element("a", {}, [first_match, second_match])

        found = parent.find_child("b")

        assert_equal(1, found.attributes["id"])

    def test_ignores_text_nodes(self):
        parent = xml_element("a", {}, [xml_text("Hello!")])
        assert_equal(None, parent.find_child("b"))


def _parse_xml_string(string, namespace_mapping=None):
    # Wraps the byte string in a file-like object before handing it to parse_xml.
    source = io.BytesIO(string)
    return parse_xml(source, namespace_mapping)
# Every assertion in this module passes (expected, actual) to assert_equal.
# The first two tests previously passed them the other way round.


def test_collapsing_does_nothing_to_single_text_node():
    assert_equal(
        [html.text("Bluebells")],
        html.collapse([html.text("Bluebells")]))


def test_consecutive_fresh_elements_are_not_collapsed():
    assert_equal(
        [html.element("p"), html.element("p")],
        html.collapse([html.element("p"), html.element("p")]))


def test_consecutive_collapsible_elements_are_collapsed_if_they_have_the_same_tag_and_attributes():
    assert_equal(
        [html.collapsible_element("p", {}, [html.text("One"), html.text("Two")])],
        html.collapse([
            html.collapsible_element("p", {}, [html.text("One")]),
            html.collapsible_element("p", {}, [html.text("Two")])
        ]))


def test_elements_with_different_tag_names_are_not_collapsed():
    assert_equal(
        [
            html.collapsible_element("p", {}, [html.text("One")]),
            html.collapsible_element("div", {}, [html.text("Two")])
        ],

        html.collapse([
            html.collapsible_element("p", {}, [html.text("One")]),
            html.collapsible_element("div", {}, [html.text("Two")])
        ]))


def test_elements_with_different_attributes_are_not_collapsed():
    assert_equal(
        [
            html.collapsible_element("p", {"id": "a"}, [html.text("One")]),
            html.collapsible_element("p", {}, [html.text("Two")])
        ],

        html.collapse([
            html.collapsible_element("p", {"id": "a"}, [html.text("One")]),
            html.collapsible_element("p", {}, [html.text("Two")])
        ]))


def test_children_of_collapsed_element_can_collapse_with_children_of_previous_element():
    # Collapsing the two blockquotes also collapses their inner paragraphs.
    assert_equal(
        [
            html.collapsible_element("blockquote", {}, [
                html.collapsible_element("p", {}, [
                    html.text("One"),
                    html.text("Two")
                ])
            ]),
        ],

        html.collapse([
            html.collapsible_element("blockquote", {}, [
                html.collapsible_element("p", {}, [html.text("One")])
            ]),
            html.collapsible_element("blockquote", {}, [
                html.collapsible_element("p", {}, [html.text("Two")])
            ]),
        ]))


def test_collapsible_element_can_collapse_into_previous_fresh_element():
    assert_equal(
        [html.element("p", {}, [html.text("One"), html.text("Two")])],
        html.collapse([
            html.element("p", {}, [html.text("One")]),
            html.collapsible_element("p", {}, [html.text("Two")])
        ]))


def test_element_with_choice_of_tag_names_can_collapse_into_previous_element_if_it_has_one_of_those_tag_names_as_its_main_tag_name():
    # ["ul", "ol"] following "ol" collapses into it.
    assert_equal(
        [html.collapsible_element(["ol"])],
        html.collapse([
            html.collapsible_element("ol"),
            html.collapsible_element(["ul", "ol"])
        ]))

    # "ol" following ["ul", "ol"] does not collapse: both elements remain.
    assert_equal(
        [
            html.collapsible_element(["ul", "ol"]),
            html.collapsible_element("ol")
        ],
        html.collapse([
            html.collapsible_element(["ul", "ol"]),
            html.collapsible_element("ol")
        ]))


def test_when_separator_is_present_then_separator_is_prepended_to_collapsed_element():
    # The separator shows up as a text node ahead of the second element's children.
    assert_equal(
        [
            html.element("pre", collapsible=False, children=[
                html.text("Hello"),
                html.text("\n"),
                html.text(" the"),
                html.text("re")
            ])
        ],
        html.collapse([
            html.element("pre", collapsible=False, children=[html.text("Hello")]),
            html.element("pre", collapsible=True, separator="\n", children=[html.text(" the"), html.text("re")]),
        ]),
    )
# Every assertion in this module passes (expected, actual) to assert_equal.
# test_empty_children_are_removed previously passed them the other way round.


def test_text_nodes_with_text_are_not_stripped():
    assert_equal(
        [html.text("H")],
        html.strip_empty([html.text("H")]))


def test_empty_text_nodes_are_stripped():
    assert_equal(
        [],
        html.strip_empty([html.text("")]))


def test_elements_with_non_empty_children_are_not_stripped():
    assert_equal(
        [html.element("p", {}, [html.text("H")])],
        html.strip_empty([html.element("p", {}, [html.text("H")])]))


def test_elements_with_no_children_are_stripped():
    assert_equal(
        [],
        html.strip_empty([html.element("p")]))


def test_elements_with_only_empty_children_are_stripped():
    assert_equal(
        [],
        html.strip_empty([html.element("p", {}, [html.text("")])]))


def test_empty_children_are_removed():
    # Only the list item with non-empty text is kept.
    assert_equal(
        [html.element("ul", {}, [
            html.element("li", {}, [html.text("H")])
        ])],

        html.strip_empty([html.element("ul", {}, [
            html.element("li", {}, [html.text("")]),
            html.element("li", {}, [html.text("H")]),
        ])]))


def test_self_closing_elements_are_never_empty():
    assert_equal(
        [html.element("br")],
        html.strip_empty([html.element("br")]))


def test_force_writes_are_never_empty():
    assert_equal(
        [html.force_write],
        html.strip_empty([html.force_write]))
def test_unique_of_empty_list_is_empty_list():
    result = unique([])
    assert_equal([], result)


def test_unique_removes_duplicates_while_preserving_order():
    # "apple" appears twice in the input and once in the expected result.
    values_with_repeat = ["apple", "banana", "apple"]
    result = unique(values_with_repeat)
    assert_equal(["apple", "banana"], result)

Walking on imported air

", result.value) 22 | assert_equal([], result.messages) 23 | 24 | 25 | def test_can_read_xml_files_with_utf8_bom(): 26 | with open(generate_test_path("utf8-bom.docx"), "rb") as fileobj: 27 | result = mammoth.convert_to_html(fileobj=fileobj) 28 | assert_equal("

This XML has a byte order mark.

", result.value) 29 | assert_equal([], result.messages) 30 | 31 | 32 | def test_empty_paragraphs_are_ignored_by_default(): 33 | with open(generate_test_path("empty.docx"), "rb") as fileobj: 34 | result = mammoth.convert_to_html(fileobj=fileobj) 35 | assert_equal("", result.value) 36 | assert_equal([], result.messages) 37 | 38 | 39 | def test_empty_paragraphs_are_preserved_if_ignore_empty_paragraphs_is_false(): 40 | with open(generate_test_path("empty.docx"), "rb") as fileobj: 41 | result = mammoth.convert_to_html(fileobj=fileobj, ignore_empty_paragraphs=False) 42 | assert_equal("

", result.value) 43 | assert_equal([], result.messages) 44 | 45 | 46 | def test_embedded_style_map_is_used_if_present(): 47 | with open(generate_test_path("embedded-style-map.docx"), "rb") as fileobj: 48 | result = mammoth.convert_to_html(fileobj=fileobj) 49 | assert_equal("

Walking on imported air

", result.value) 50 | assert_equal([], result.messages) 51 | 52 | 53 | def test_explicit_style_map_takes_precedence_over_embedded_style_map(): 54 | with open(generate_test_path("embedded-style-map.docx"), "rb") as fileobj: 55 | result = mammoth.convert_to_html(fileobj=fileobj, style_map="p => p") 56 | assert_equal("

Walking on imported air

", result.value) 57 | assert_equal([], result.messages) 58 | 59 | 60 | def test_explicit_style_map_is_combined_with_embedded_style_map(): 61 | with open(generate_test_path("embedded-style-map.docx"), "rb") as fileobj: 62 | result = mammoth.convert_to_html(fileobj=fileobj, style_map="r => strong") 63 | assert_equal("

Walking on imported air

", result.value) 64 | assert_equal([], result.messages) 65 | 66 | 67 | def test_embedded_style_maps_can_be_disabled(): 68 | with open(generate_test_path("embedded-style-map.docx"), "rb") as fileobj: 69 | result = mammoth.convert_to_html(fileobj=fileobj, include_embedded_style_map=False) 70 | assert_equal("

Walking on imported air

", result.value) 71 | assert_equal([], result.messages) 72 | 73 | 74 | def test_embedded_style_map_can_be_written_and_then_read(): 75 | with _copy_of_test_data("single-paragraph.docx") as fileobj: 76 | mammoth.embed_style_map(fileobj, "p => h1") 77 | result = mammoth.convert_to_html(fileobj=fileobj, ignore_empty_paragraphs=False) 78 | assert_equal("

Walking on imported air

", result.value) 79 | assert_equal([], result.messages) 80 | 81 | 82 | def test_embedded_style_map_can_be_retrieved(): 83 | with _copy_of_test_data("single-paragraph.docx") as fileobj: 84 | mammoth.embed_style_map(fileobj, "p => h1") 85 | assert_equal("p => h1", mammoth.read_embedded_style_map(fileobj)) 86 | 87 | 88 | def test_warning_if_style_mapping_is_not_understood(): 89 | style_map = """ 90 | !!!! 91 | p => h1""" 92 | with open(generate_test_path("single-paragraph.docx"), "rb") as fileobj: 93 | result = mammoth.convert_to_html(fileobj=fileobj, style_map=style_map) 94 | assert_equal("

Walking on imported air

", result.value) 95 | warning = "Did not understand this style mapping, so ignored it: !!!!" 96 | assert_equal([results.warning(warning)], result.messages) 97 | 98 | 99 | def test_inline_images_referenced_by_path_relative_to_part_are_included_in_output(): 100 | with open(generate_test_path("tiny-picture.docx"), "rb") as fileobj: 101 | result = mammoth.convert_to_html(fileobj=fileobj) 102 | assert_equal("""

""", result.value) 103 | assert_equal([], result.messages) 104 | 105 | 106 | def test_inline_images_referenced_by_path_relative_to_base_are_included_in_output(): 107 | with open(generate_test_path("tiny-picture-target-base-relative.docx"), "rb") as fileobj: 108 | result = mammoth.convert_to_html(fileobj=fileobj) 109 | assert_equal("""

""", result.value) 110 | assert_equal([], result.messages) 111 | 112 | 113 | def test_images_stored_outside_of_document_are_included_in_output(): 114 | with open(generate_test_path("external-picture.docx"), "rb") as fileobj: 115 | result = mammoth.convert_to_html(fileobj=fileobj) 116 | assert_equal("""

""", result.value) 117 | assert_equal([], result.messages) 118 | 119 | 120 | def test_warn_if_images_stored_outside_of_document_are_specified_when_passing_fileobj_without_name(): 121 | fileobj = io.BytesIO() 122 | with open(generate_test_path("external-picture.docx"), "rb") as source_fileobj: 123 | shutil.copyfileobj(source_fileobj, fileobj) 124 | 125 | result = mammoth.convert_to_html(fileobj=fileobj) 126 | assert_equal("", result.value) 127 | assert_equal([results.warning("could not find external image 'tiny-picture.png', fileobj has no name")], result.messages) 128 | 129 | 130 | def test_warn_if_images_stored_outside_of_document_are_not_found(): 131 | with tempman.create_temp_dir() as temp_dir: 132 | document_path = os.path.join(temp_dir.path, "document.docx") 133 | with open(document_path, "wb") as fileobj: 134 | with open(generate_test_path("external-picture.docx"), "rb") as source_fileobj: 135 | shutil.copyfileobj(source_fileobj, fileobj) 136 | 137 | with open(document_path, "rb") as fileobj: 138 | result = mammoth.convert_to_html(fileobj=fileobj) 139 | assert_equal("", result.value) 140 | expected_warning = "could not open external image: 'tiny-picture.png'" 141 | assert_equal("warning", result.messages[0].type) 142 | assert result.messages[0].message.startswith(expected_warning), "message was: " + result.messages[0].message 143 | assert_equal(1, len(result.messages)) 144 | 145 | 146 | def test_image_conversion_can_be_customised(): 147 | @mammoth.images.img_element 148 | def convert_image(image): 149 | with image.open() as image_bytes: 150 | encoded_src = base64.b64encode(image_bytes.read()).decode("ascii") 151 | 152 | return { 153 | "src": encoded_src[:2] + "," + image.content_type 154 | } 155 | 156 | with open(generate_test_path("tiny-picture.docx"), "rb") as fileobj: 157 | result = mammoth.convert_to_html(fileobj=fileobj, convert_image=convert_image) 158 | assert_equal("""

""", result.value) 159 | assert_equal([], result.messages) 160 | 161 | 162 | def test_simple_list_is_converted_to_list_elements(): 163 | with open(generate_test_path("simple-list.docx"), "rb") as fileobj: 164 | result = mammoth.convert_to_html(fileobj=fileobj) 165 | assert_equal([], result.messages) 166 | assert_equal("
  • Apple
  • Banana
", result.value) 167 | 168 | 169 | def test_word_tables_are_converted_to_html_tables(): 170 | expected_html = ("

Above

" + 171 | "" + 172 | "" + 173 | "" + 174 | "

Top left

Top right

Bottom left

Bottom right

" + 175 | "

Below

") 176 | 177 | 178 | with open(generate_test_path("tables.docx"), "rb") as fileobj: 179 | result = mammoth.convert_to_html(fileobj=fileobj) 180 | assert_equal([], result.messages) 181 | assert_equal(expected_html, result.value) 182 | 183 | 184 | def test_footnotes_are_appended_to_text(): 185 | # TODO: don't duplicate footnotes with multiple references 186 | expected_html = ('
' + 189 | '
  1. A tachyon walks into a bar.

  2. ' + 190 | '
  3. Fin.

') 191 | 192 | with open(generate_test_path("footnotes.docx"), "rb") as fileobj: 193 | result = mammoth.convert_to_html(fileobj=fileobj, id_prefix="doc-42-") 194 | assert_equal([], result.messages) 195 | assert_equal(expected_html, result.value) 196 | 197 | 198 | def test_endnotes_are_appended_to_text(): 199 | expected_html = ('

Ouch' + 200 | '[1].' + 201 | '[2]

' + 202 | '
  1. A tachyon walks into a bar.

  2. ' + 203 | '
  3. Fin.

') 204 | 205 | with open(generate_test_path("endnotes.docx"), "rb") as fileobj: 206 | result = mammoth.convert_to_html(fileobj=fileobj, id_prefix="doc-42-") 207 | assert_equal([], result.messages) 208 | assert_equal(expected_html, result.value) 209 | 210 | 211 | def test_relationships_are_handled_properly_in_footnotes(): 212 | expected_html = ( 213 | '

[1]

' + 214 | '
  1. Example

') 215 | 216 | with open(generate_test_path("footnote-hyperlink.docx"), "rb") as fileobj: 217 | result = mammoth.convert_to_html(fileobj=fileobj, id_prefix="doc-42-") 218 | assert_equal([], result.messages) 219 | assert_equal(expected_html, result.value) 220 | 221 | 222 | def test_when_style_mapping_is_defined_for_comment_references_then_comments_are_included(): 223 | expected_html = ( 224 | '

Ouch' + 225 | '[MW1].' + 226 | '[MW2]

' + 227 | '
Comment [MW1]

A tachyon walks into a bar.

' + 228 | '
Comment [MW2]

Fin.

' 229 | ) 230 | 231 | with open(generate_test_path("comments.docx"), "rb") as fileobj: 232 | result = mammoth.convert_to_html(fileobj=fileobj, id_prefix="doc-42-", style_map="comment-reference => sup") 233 | assert_equal([], result.messages) 234 | assert_equal(expected_html, result.value) 235 | 236 | 237 | def test_text_boxes_are_read(): 238 | with open(generate_test_path("text-box.docx"), "rb") as fileobj: 239 | result = mammoth.convert_to_html(fileobj=fileobj) 240 | assert_equal('

Datum plane

', result.value) 241 | 242 | 243 | def test_underline_is_ignored_by_default(): 244 | with open(generate_test_path("underline.docx"), "rb") as fileobj: 245 | result = mammoth.convert_to_html(fileobj=fileobj) 246 | assert_equal('

The Sunset Tree

', result.value) 247 | 248 | 249 | def test_underline_can_be_configured_with_style_mapping(): 250 | with open(generate_test_path("underline.docx"), "rb") as fileobj: 251 | result = mammoth.convert_to_html(fileobj=fileobj, style_map="u => em") 252 | assert_equal('

The Sunset Tree

', result.value) 253 | 254 | 255 | def test_strikethrough_is_converted_to_s_element_by_default(): 256 | with open(generate_test_path("strikethrough.docx"), "rb") as fileobj: 257 | result = mammoth.convert_to_html(fileobj=fileobj) 258 | assert_equal("

Today's Special: Salmon Sold out

", result.value) 259 | 260 | 261 | def test_strikethrough_conversion_can_be_configured_with_style_mapping(): 262 | with open(generate_test_path("strikethrough.docx"), "rb") as fileobj: 263 | result = mammoth.convert_to_html(fileobj=fileobj, style_map="strike => del") 264 | assert_equal("

Today's Special: Salmon Sold out

", result.value) 265 | 266 | 267 | def test_transform_document_is_applied_to_document_before_conversion(): 268 | def transform_document(document): 269 | document.children[0].style_id = "Heading1" 270 | return document 271 | 272 | with open(generate_test_path("single-paragraph.docx"), "rb") as fileobj: 273 | result = mammoth.convert_to_html(fileobj=fileobj, transform_document=transform_document) 274 | assert_equal("

Walking on imported air

", result.value) 275 | assert_equal([], result.messages) 276 | 277 | 278 | def test_paragraph_transform_only_transforms_paragraphs(): 279 | def transform_paragraph(paragraph): 280 | return paragraph.copy(style_id="Heading1") 281 | transform_document = mammoth.transforms.paragraph(transform_paragraph) 282 | with open(generate_test_path("single-paragraph.docx"), "rb") as fileobj: 283 | result = mammoth.convert_to_html(fileobj=fileobj, transform_document=transform_document) 284 | assert_equal("

Walking on imported air

", result.value) 285 | assert_equal([], result.messages) 286 | 287 | 288 | def test_docx_containing_one_paragraph_can_be_converted_to_markdown(): 289 | with open(generate_test_path("single-paragraph.docx"), "rb") as fileobj: 290 | result = mammoth.convert_to_markdown(fileobj=fileobj) 291 | assert_equal("Walking on imported air\n\n", result.value) 292 | assert_equal([], result.messages) 293 | 294 | 295 | def test_can_extract_raw_text(): 296 | with open(generate_test_path("simple-list.docx"), "rb") as fileobj: 297 | result = mammoth.extract_raw_text(fileobj=fileobj) 298 | assert_equal([], result.messages) 299 | assert_equal("Apple\n\nBanana\n\n", result.value) 300 | 301 | 302 | def test_can_read_strict_format(): 303 | with open(generate_test_path("strict-format.docx"), "rb") as fileobj: 304 | result = mammoth.convert_to_html(fileobj=fileobj) 305 | assert_equal([], result.messages) 306 | assert_equal("

Test

", result.value) 307 | 308 | 309 | def _copy_of_test_data(path): 310 | destination = io.BytesIO() 311 | with open(generate_test_path(path), "rb") as source: 312 | shutil.copyfileobj(source, destination) 313 | return destination 314 | -------------------------------------------------------------------------------- /tests/options_tests.py: -------------------------------------------------------------------------------- 1 | from mammoth.options import read_options, _default_style_map 2 | from mammoth.styles.parser import read_style_mapping 3 | from .testing import assert_equal 4 | 5 | 6 | def test_default_style_map_is_used_if_style_map_is_not_set(): 7 | assert_equal(_default_style_map, read_options({}).value["style_map"]) 8 | 9 | 10 | def test_custom_style_mappings_are_prepended_to_default_style_mappings(): 11 | style_map = read_options({ 12 | "style_map": "p.SectionTitle => h2" 13 | }).value["style_map"] 14 | assert_equal(read_style_mapping("p.SectionTitle => h2").value, style_map[0]) 15 | assert_equal(_default_style_map, style_map[1:]) 16 | 17 | 18 | def test_default_style_mappings_are_ignored_if_include_default_style_map_is_false(): 19 | style_map = read_options({ 20 | "style_map": "p.SectionTitle => h2", 21 | "include_default_style_map": False 22 | }).value["style_map"] 23 | assert_equal([read_style_mapping("p.SectionTitle => h2").value], style_map) 24 | 25 | 26 | def test_lines_starting_with_hash_in_custom_style_map_are_ignored(): 27 | style_map = read_options({ 28 | "style_map": "#p.SectionTitle => h3\np.SectionTitle => h2", 29 | "include_default_style_map": False 30 | }).value["style_map"] 31 | assert_equal([read_style_mapping("p.SectionTitle => h2").value], style_map) 32 | -------------------------------------------------------------------------------- /tests/raw_text_tests.py: -------------------------------------------------------------------------------- 1 | from mammoth.raw_text import extract_raw_text_from_element 2 | from mammoth import documents 3 | 
from .testing import assert_equal 4 | 5 | 6 | def test_text_element_is_converted_to_text_content(): 7 | element = documents.Text("Hello.") 8 | 9 | result = extract_raw_text_from_element(element) 10 | 11 | assert_equal("Hello.", result) 12 | 13 | 14 | def test_tab_element_is_converted_to_tab_character(): 15 | element = documents.tab() 16 | 17 | result = extract_raw_text_from_element(element) 18 | 19 | assert_equal("\t", result) 20 | 21 | 22 | def test_paragraphs_are_terminated_with_newlines(): 23 | element = documents.paragraph( 24 | children=[ 25 | documents.Text("Hello "), 26 | documents.Text("world."), 27 | ], 28 | ) 29 | 30 | result = extract_raw_text_from_element(element) 31 | 32 | assert_equal("Hello world.\n\n", result) 33 | 34 | 35 | def test_children_are_recursively_converted_to_text(): 36 | element = documents.document([ 37 | documents.paragraph( 38 | [ 39 | documents.text("Hello "), 40 | documents.text("world.") 41 | ], 42 | {} 43 | ) 44 | ]) 45 | 46 | result = extract_raw_text_from_element(element) 47 | 48 | assert_equal("Hello world.\n\n", result) 49 | 50 | 51 | def test_non_text_element_without_children_is_converted_to_empty_string(): 52 | element = documents.line_break 53 | assert not hasattr(element, "children") 54 | 55 | result = extract_raw_text_from_element(element) 56 | 57 | assert_equal("", result) 58 | -------------------------------------------------------------------------------- /tests/styles/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mwilliamson/python-mammoth/bd2cf5a02ec9bf0e6ff877e3a962b236b4143f34/tests/styles/__init__.py -------------------------------------------------------------------------------- /tests/styles/document_matcher_tests.py: -------------------------------------------------------------------------------- 1 | from mammoth import document_matchers 2 | from ..testing import assert_equal 3 | 4 | 5 | def 
test_equal_to_matcher_is_case_insensitive(): 6 | matcher = document_matchers.equal_to("Heading 1") 7 | assert_equal(True, matcher.matches("heaDING 1")) 8 | assert_equal(False, matcher.matches("heaDING 2")) 9 | 10 | 11 | def test_starts_with_matcher_matches_string_with_prefix(): 12 | matcher = document_matchers.starts_with("Heading") 13 | assert_equal(True, matcher.matches("Heading")) 14 | assert_equal(True, matcher.matches("Heading 1")) 15 | assert_equal(False, matcher.matches("Custom Heading")) 16 | assert_equal(False, matcher.matches("Head")) 17 | assert_equal(False, matcher.matches("Header 2")) 18 | 19 | 20 | def test_starts_with_matcher_is_case_insensitive(): 21 | matcher = document_matchers.starts_with("Heading") 22 | assert_equal(True, matcher.matches("heaDING")) 23 | -------------------------------------------------------------------------------- /tests/styles/parser/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mwilliamson/python-mammoth/bd2cf5a02ec9bf0e6ff877e3a962b236b4143f34/tests/styles/parser/__init__.py -------------------------------------------------------------------------------- /tests/styles/parser/document_matcher_parser_tests.py: -------------------------------------------------------------------------------- 1 | from mammoth import documents, document_matchers 2 | from mammoth.styles.parser.document_matcher_parser import parse_document_matcher 3 | from mammoth.styles.parser.errors import LineParseError 4 | from mammoth.styles.parser.tokeniser import tokenise 5 | from mammoth.styles.parser.token_iterator import TokenIterator 6 | from ...testing import assert_equal, assert_raises 7 | 8 | 9 | def test_unrecognised_document_element_raises_error(): 10 | error = assert_raises(LineParseError, lambda: read_document_matcher("x")) 11 | assert_equal("Unrecognised document element: x", str(error)) 12 | 13 | 14 | def test_reads_plain_paragraph(): 15 | assert_equal( 16 | 
document_matchers.paragraph(), 17 | read_document_matcher("p") 18 | ) 19 | 20 | 21 | def test_reads_paragraph_with_style_id(): 22 | assert_equal( 23 | document_matchers.paragraph(style_id="Heading1"), 24 | read_document_matcher("p.Heading1") 25 | ) 26 | 27 | 28 | def test_reads_paragraph_with_exact_style_name(): 29 | assert_equal( 30 | document_matchers.paragraph(style_name=document_matchers.equal_to("Heading 1")), 31 | read_document_matcher("p[style-name='Heading 1']") 32 | ) 33 | 34 | 35 | def test_reads_paragraph_with_style_name_prefix(): 36 | assert_equal( 37 | document_matchers.paragraph(style_name=document_matchers.starts_with("Heading")), 38 | read_document_matcher("p[style-name^='Heading']") 39 | ) 40 | 41 | 42 | def test_unrecognised_string_matcher_raises_error(): 43 | error = assert_raises(LineParseError, lambda: read_document_matcher("p[style-name*='Heading']")) 44 | assert_equal("Unrecognised string matcher: *", str(error)) 45 | 46 | 47 | def test_reads_paragraph_ordered_list(): 48 | assert_equal( 49 | document_matchers.paragraph(numbering=documents.numbering_level(1, is_ordered=True)), 50 | read_document_matcher("p:ordered-list(2)") 51 | ) 52 | 53 | 54 | def test_reads_paragraph_unordered_list(): 55 | assert_equal( 56 | document_matchers.paragraph(numbering=documents.numbering_level(1, is_ordered=False)), 57 | read_document_matcher("p:unordered-list(2)") 58 | ) 59 | 60 | 61 | def test_unrecognised_list_type_raises_error(): 62 | error = assert_raises(LineParseError, lambda: read_document_matcher("p:blah")) 63 | assert_equal("Unrecognised list type: blah", str(error)) 64 | 65 | 66 | def test_reads_plain_run(): 67 | assert_equal( 68 | document_matchers.run(), 69 | read_document_matcher("r") 70 | ) 71 | 72 | 73 | def test_reads_run_with_style_id(): 74 | assert_equal( 75 | document_matchers.run(style_id="Emphasis"), 76 | read_document_matcher("r.Emphasis") 77 | ) 78 | 79 | 80 | def test_reads_run_with_style_name(): 81 | assert_equal( 82 | 
document_matchers.run(style_name=document_matchers.equal_to("Emphasis")), 83 | read_document_matcher("r[style-name='Emphasis']") 84 | ) 85 | 86 | 87 | def test_reads_plain_table(): 88 | assert_equal( 89 | document_matchers.table(), 90 | read_document_matcher("table") 91 | ) 92 | 93 | 94 | def test_reads_table_with_style_id(): 95 | assert_equal( 96 | document_matchers.table(style_id="TableNormal"), 97 | read_document_matcher("table.TableNormal") 98 | ) 99 | 100 | 101 | def test_reads_table_with_style_name(): 102 | assert_equal( 103 | document_matchers.table(style_name=document_matchers.equal_to("Normal Table")), 104 | read_document_matcher("table[style-name='Normal Table']") 105 | ) 106 | 107 | 108 | def test_reads_bold(): 109 | assert_equal( 110 | document_matchers.bold, 111 | read_document_matcher("b") 112 | ) 113 | 114 | def test_reads_italic(): 115 | assert_equal( 116 | document_matchers.italic, 117 | read_document_matcher("i") 118 | ) 119 | 120 | def test_reads_underline(): 121 | assert_equal( 122 | document_matchers.underline, 123 | read_document_matcher("u") 124 | ) 125 | 126 | def test_reads_strikethrough(): 127 | assert_equal( 128 | document_matchers.strikethrough, 129 | read_document_matcher("strike") 130 | ) 131 | 132 | def test_reads_all_caps(): 133 | assert_equal( 134 | document_matchers.all_caps, 135 | read_document_matcher("all-caps") 136 | ) 137 | 138 | def test_reads_small_caps(): 139 | assert_equal( 140 | document_matchers.small_caps, 141 | read_document_matcher("small-caps") 142 | ) 143 | 144 | def test_reads_highlight_without_color(): 145 | assert_equal( 146 | document_matchers.highlight(), 147 | read_document_matcher("highlight") 148 | ) 149 | 150 | def test_reads_highlight_with_color(): 151 | assert_equal( 152 | document_matchers.highlight(color="yellow"), 153 | read_document_matcher("highlight[color='yellow']") 154 | ) 155 | 156 | def test_reads_comment_reference(): 157 | assert_equal( 158 | document_matchers.comment_reference, 159 | 
read_document_matcher("comment-reference") 160 | ) 161 | 162 | def test_reads_line_breaks(): 163 | assert_equal( 164 | document_matchers.line_break, 165 | read_document_matcher("br[type='line']"), 166 | ) 167 | 168 | def test_reads_page_breaks(): 169 | assert_equal( 170 | document_matchers.page_break, 171 | read_document_matcher("br[type='page']"), 172 | ) 173 | 174 | def test_reads_column_breaks(): 175 | assert_equal( 176 | document_matchers.column_break, 177 | read_document_matcher("br[type='column']"), 178 | ) 179 | 180 | 181 | def test_unrecognised_break_type_raises_error(): 182 | error = assert_raises(LineParseError, lambda: read_document_matcher("br[type='unknownBreakType']")) 183 | assert_equal("Unrecognised break type: unknownBreakType", str(error)) 184 | 185 | 186 | def read_document_matcher(string): 187 | return parse_document_matcher(TokenIterator(tokenise(string))) 188 | -------------------------------------------------------------------------------- /tests/styles/parser/html_path_parser_tests.py: -------------------------------------------------------------------------------- 1 | from mammoth import html_paths 2 | from mammoth.styles.parser.html_path_parser import parse_html_path 3 | from mammoth.styles.parser.tokeniser import tokenise 4 | from mammoth.styles.parser.token_iterator import TokenIterator 5 | from ...testing import assert_equal 6 | 7 | 8 | def test_can_read_empty_path(): 9 | assert_equal( 10 | html_paths.empty, 11 | read_html_path("") 12 | ) 13 | 14 | def test_can_read_single_element(): 15 | assert_equal( 16 | html_paths.path([html_paths.element(["p"])]), 17 | read_html_path("p") 18 | ) 19 | 20 | 21 | def test_can_read_choice_of_two_elements(): 22 | assert_equal( 23 | html_paths.path([html_paths.element(["ul", "ol"])]), 24 | read_html_path("ul|ol") 25 | ) 26 | 27 | 28 | def test_can_read_choice_of_three_elements(): 29 | assert_equal( 30 | html_paths.path([html_paths.element(["ul", "ol", "p"])]), 31 | read_html_path("ul|ol|p") 32 | ) 33 | 
34 | 35 | def test_can_read_nested_elements(): 36 | assert_equal( 37 | html_paths.path([html_paths.element(["ul"]), html_paths.element(["li"])]), 38 | read_html_path("ul > li") 39 | ) 40 | 41 | 42 | def test_can_read_class_on_element(): 43 | assert_equal( 44 | html_paths.path([html_paths.element(["p"], class_names=["tip"])]), 45 | read_html_path("p.tip") 46 | ) 47 | 48 | 49 | def test_can_read_multiple_classes_on_element(): 50 | assert_equal( 51 | html_paths.path([html_paths.element(["p"], class_names=["tip", "help"])]), 52 | read_html_path("p.tip.help") 53 | ) 54 | 55 | 56 | def test_can_read_attribute_on_element(): 57 | assert_equal( 58 | html_paths.path([html_paths.element(["p"], attributes={"lang": "fr"})]), 59 | read_html_path("p[lang='fr']") 60 | ) 61 | 62 | 63 | def test_can_read_multiple_attributes_on_element(): 64 | assert_equal( 65 | html_paths.path([html_paths.element(["p"], attributes={"lang": "fr", "data-x": "y"})]), 66 | read_html_path("p[lang='fr'][data-x='y']") 67 | ) 68 | 69 | 70 | def test_can_read_when_element_must_be_fresh(): 71 | assert_equal( 72 | html_paths.path([html_paths.element(["p"], fresh=True)]), 73 | read_html_path("p:fresh") 74 | ) 75 | 76 | 77 | def test_can_read_separator_for_elements(): 78 | assert_equal( 79 | html_paths.path([html_paths.element(["p"], separator="x")]), 80 | read_html_path("p:separator('x')") 81 | ) 82 | 83 | 84 | def test_can_read_ignore_element(): 85 | assert_equal( 86 | html_paths.ignore, 87 | read_html_path("!") 88 | ) 89 | 90 | def read_html_path(string): 91 | return parse_html_path(TokenIterator(tokenise(string))) 92 | -------------------------------------------------------------------------------- /tests/styles/parser/style_mapping_parser_tests.py: -------------------------------------------------------------------------------- 1 | from mammoth import html_paths, document_matchers, styles 2 | from mammoth.styles.parser.style_mapping_parser import parse_style_mapping 3 | from mammoth.styles.parser.tokeniser 
import tokenise 4 | from mammoth.styles.parser.token_iterator import TokenIterator 5 | from ...testing import assert_equal 6 | 7 | 8 | def test_document_matcher_is_mapped_to_html_path_using_fat_arrow(): 9 | assert_equal( 10 | styles.style(document_matchers.paragraph(), html_paths.path([html_paths.element(["h1"])])), 11 | read_style_mapping("p => h1") 12 | ) 13 | 14 | 15 | def read_style_mapping(string): 16 | return parse_style_mapping(TokenIterator(tokenise(string))) 17 | -------------------------------------------------------------------------------- /tests/styles/parser/token_parser_tests.py: -------------------------------------------------------------------------------- 1 | from mammoth.styles.parser.tokeniser import Token, TokenType 2 | from mammoth.styles.parser.token_parser import decode_escape_sequences, parse_identifier, parse_string 3 | from mammoth.styles.parser.token_iterator import TokenIterator 4 | from ...testing import assert_equal 5 | 6 | 7 | def test_escape_sequences_in_identifiers_are_decoded(): 8 | assert_equal( 9 | ":", 10 | parse_identifier(TokenIterator([ 11 | Token(0, TokenType.IDENTIFIER, r"\:"), 12 | ])), 13 | ) 14 | 15 | 16 | def test_escape_sequences_in_strings_are_decoded(): 17 | assert_equal( 18 | "\n", 19 | parse_string(TokenIterator([ 20 | Token(0, TokenType.STRING, r"'\n'"), 21 | ])), 22 | ) 23 | 24 | 25 | def test_line_feeds_are_decoded(): 26 | assert_equal("\n", decode_escape_sequences(r"\n")) 27 | 28 | 29 | def test_carriage_returns_are_decoded(): 30 | assert_equal("\r", decode_escape_sequences(r"\r")) 31 | 32 | 33 | def test_tabs_are_decoded(): 34 | assert_equal("\t", decode_escape_sequences(r"\t")) 35 | 36 | 37 | def test_backslashes_are_decoded(): 38 | assert_equal("\\", decode_escape_sequences(r"\\")) 39 | 40 | 41 | def test_colons_are_decoded(): 42 | assert_equal(":", decode_escape_sequences(r"\:")) 43 | -------------------------------------------------------------------------------- /tests/styles/parser/tokeniser_tests.py: 
-------------------------------------------------------------------------------- 1 | from precisely import assert_that, has_attrs, is_sequence 2 | 3 | from mammoth.styles.parser.tokeniser import tokenise 4 | 5 | 6 | def test_unknown_tokens_are_tokenised(): 7 | assert_tokens("~", is_token("unknown", "~")) 8 | 9 | 10 | def test_empty_string_is_tokenised_to_end_of_file_token(): 11 | assert_tokens("") 12 | 13 | 14 | def test_whitespace_is_tokenised(): 15 | assert_tokens(" \t\t ", is_token("whitespace", " \t\t ")) 16 | 17 | 18 | def test_identifiers_are_tokenised(): 19 | assert_tokens("Overture", is_token("identifier", "Overture")) 20 | 21 | 22 | def test_escape_sequences_in_identifiers_are_tokenised(): 23 | assert_tokens(r"\:", is_token("identifier", r"\:")) 24 | 25 | 26 | def test_integers_are_tokenised(): 27 | assert_tokens("123", is_token("integer", "123")) 28 | 29 | 30 | def test_strings_are_tokenised(): 31 | assert_tokens("'Tristan'", is_token("string", "'Tristan'")) 32 | 33 | 34 | def test_escape_sequences_in_strings_are_tokenised(): 35 | assert_tokens(r"'Tristan\''", is_token("string", r"'Tristan\''")) 36 | 37 | 38 | def test_unterminated_strings_are_tokenised(): 39 | assert_tokens("'Tristan", is_token("unterminated string", "'Tristan")) 40 | 41 | 42 | def test_arrows_are_tokenised(): 43 | assert_tokens("=>=>", is_token("symbol", "=>"), is_token("symbol", "=>")) 44 | 45 | 46 | def test_dots_are_tokenised(): 47 | assert_tokens(".", is_token("symbol", ".")) 48 | 49 | 50 | def test_colons_are_tokenised(): 51 | assert_tokens("::", is_token("symbol", ":"), is_token("symbol", ":")) 52 | 53 | 54 | def test_greater_thans_are_tokenised(): 55 | assert_tokens(">>", is_token("symbol", ">"), is_token("symbol", ">")) 56 | 57 | 58 | def test_equals_are_tokenised(): 59 | assert_tokens("==", is_token("symbol", "="), is_token("symbol", "=")) 60 | 61 | 62 | def test_open_parens_are_tokenised(): 63 | assert_tokens("((", is_token("symbol", "("), is_token("symbol", "(")) 64 | 65 | 66 
| def test_close_parens_are_tokenised(): 67 | assert_tokens("))", is_token("symbol", ")"), is_token("symbol", ")")) 68 | 69 | 70 | def test_open_square_brackets_are_tokenised(): 71 | assert_tokens("[[", is_token("symbol", "["), is_token("symbol", "[")) 72 | 73 | 74 | def test_close_square_brackets_are_tokenised(): 75 | assert_tokens("]]", is_token("symbol", "]"), is_token("symbol", "]")) 76 | 77 | 78 | def test_choices_are_tokenised(): 79 | assert_tokens("||", is_token("symbol", "|"), is_token("symbol", "|")) 80 | 81 | 82 | def test_bangs_are_tokenised(): 83 | assert_tokens("!!", is_token("symbol", "!"), is_token("symbol", "!")) 84 | 85 | 86 | def test_can_tokenise_multiple_tokens(): 87 | assert_tokens("The Magic Position", 88 | is_token("identifier", "The"), 89 | is_token("whitespace", " "), 90 | is_token("identifier", "Magic"), 91 | is_token("whitespace", " "), 92 | is_token("identifier", "Position"), 93 | ) 94 | 95 | 96 | def assert_tokens(string, *expected): 97 | expected = list(expected) 98 | expected.append(is_token("end", "")) 99 | assert_that( 100 | tokenise(string), 101 | is_sequence(*expected), 102 | ) 103 | 104 | 105 | def is_token(token_type, value): 106 | return has_attrs( 107 | type=token_type, 108 | value=value, 109 | ) 110 | -------------------------------------------------------------------------------- /tests/test-data/comments.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mwilliamson/python-mammoth/bd2cf5a02ec9bf0e6ff877e3a962b236b4143f34/tests/test-data/comments.docx -------------------------------------------------------------------------------- /tests/test-data/embedded-style-map.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mwilliamson/python-mammoth/bd2cf5a02ec9bf0e6ff877e3a962b236b4143f34/tests/test-data/embedded-style-map.docx 
-------------------------------------------------------------------------------- /tests/test-data/empty.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mwilliamson/python-mammoth/bd2cf5a02ec9bf0e6ff877e3a962b236b4143f34/tests/test-data/empty.docx -------------------------------------------------------------------------------- /tests/test-data/endnotes.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mwilliamson/python-mammoth/bd2cf5a02ec9bf0e6ff877e3a962b236b4143f34/tests/test-data/endnotes.docx -------------------------------------------------------------------------------- /tests/test-data/external-picture.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mwilliamson/python-mammoth/bd2cf5a02ec9bf0e6ff877e3a962b236b4143f34/tests/test-data/external-picture.docx -------------------------------------------------------------------------------- /tests/test-data/footnote-hyperlink.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mwilliamson/python-mammoth/bd2cf5a02ec9bf0e6ff877e3a962b236b4143f34/tests/test-data/footnote-hyperlink.docx -------------------------------------------------------------------------------- /tests/test-data/footnotes.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mwilliamson/python-mammoth/bd2cf5a02ec9bf0e6ff877e3a962b236b4143f34/tests/test-data/footnotes.docx -------------------------------------------------------------------------------- /tests/test-data/hyperlinks/word/_rels/document.xml.rels: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 
-------------------------------------------------------------------------------- /tests/test-data/hyperlinks/word/document.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | coconuts 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | -------------------------------------------------------------------------------- /tests/test-data/simple-list.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mwilliamson/python-mammoth/bd2cf5a02ec9bf0e6ff877e3a962b236b4143f34/tests/test-data/simple-list.docx -------------------------------------------------------------------------------- /tests/test-data/simple/word/document.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | Hello. 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | -------------------------------------------------------------------------------- /tests/test-data/single-paragraph.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mwilliamson/python-mammoth/bd2cf5a02ec9bf0e6ff877e3a962b236b4143f34/tests/test-data/single-paragraph.docx -------------------------------------------------------------------------------- /tests/test-data/strict-format.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mwilliamson/python-mammoth/bd2cf5a02ec9bf0e6ff877e3a962b236b4143f34/tests/test-data/strict-format.docx -------------------------------------------------------------------------------- /tests/test-data/strikethrough.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mwilliamson/python-mammoth/bd2cf5a02ec9bf0e6ff877e3a962b236b4143f34/tests/test-data/strikethrough.docx 
-------------------------------------------------------------------------------- /tests/test-data/tables.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mwilliamson/python-mammoth/bd2cf5a02ec9bf0e6ff877e3a962b236b4143f34/tests/test-data/tables.docx -------------------------------------------------------------------------------- /tests/test-data/text-box.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mwilliamson/python-mammoth/bd2cf5a02ec9bf0e6ff877e3a962b236b4143f34/tests/test-data/text-box.docx -------------------------------------------------------------------------------- /tests/test-data/tiny-picture-target-base-relative.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mwilliamson/python-mammoth/bd2cf5a02ec9bf0e6ff877e3a962b236b4143f34/tests/test-data/tiny-picture-target-base-relative.docx -------------------------------------------------------------------------------- /tests/test-data/tiny-picture.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mwilliamson/python-mammoth/bd2cf5a02ec9bf0e6ff877e3a962b236b4143f34/tests/test-data/tiny-picture.docx -------------------------------------------------------------------------------- /tests/test-data/tiny-picture.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mwilliamson/python-mammoth/bd2cf5a02ec9bf0e6ff877e3a962b236b4143f34/tests/test-data/tiny-picture.png -------------------------------------------------------------------------------- /tests/test-data/underline.docx: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/mwilliamson/python-mammoth/bd2cf5a02ec9bf0e6ff877e3a962b236b4143f34/tests/test-data/underline.docx -------------------------------------------------------------------------------- /tests/test-data/utf8-bom.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mwilliamson/python-mammoth/bd2cf5a02ec9bf0e6ff877e3a962b236b4143f34/tests/test-data/utf8-bom.docx -------------------------------------------------------------------------------- /tests/testing.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from precisely import assert_that, equal_to 4 | 5 | 6 | def generate_test_path(path): 7 | this_dir = os.path.dirname(__file__) 8 | return os.path.join(this_dir, "test-data", path) 9 | 10 | 11 | def assert_equal(expected, actual): 12 | assert_that(actual, equal_to(expected)) 13 | 14 | 15 | def assert_raises(exception, func): 16 | try: 17 | func() 18 | assert False, "Expected " + exception.__name__ 19 | except exception as error: 20 | return error 21 | 22 | -------------------------------------------------------------------------------- /tests/transforms_tests.py: -------------------------------------------------------------------------------- 1 | import cobble 2 | 3 | from mammoth import documents, transforms 4 | from mammoth.transforms import get_descendants, get_descendants_of_type, _each_element 5 | from .testing import assert_equal 6 | 7 | 8 | class ParagraphTests(object): 9 | def test_paragraph_is_transformed(self): 10 | paragraph = documents.paragraph(children=[]) 11 | result = transforms.paragraph(lambda _: documents.tab())(paragraph) 12 | assert_equal(documents.tab(), result) 13 | 14 | def test_non_paragraph_elements_are_not_transformed(self): 15 | run = documents.run(children=[]) 16 | result = transforms.paragraph(lambda _: documents.tab())(run) 17 | assert_equal(documents.run(children=[]), result) 18 | 19 | 
20 | class RunTests(object): 21 | def test_run_is_transformed(self): 22 | run = documents.run(children=[]) 23 | result = transforms.run(lambda _: documents.tab())(run) 24 | assert_equal(documents.tab(), result) 25 | 26 | def test_non_paragraph_elements_are_not_transformed(self): 27 | paragraph = documents.paragraph(children=[]) 28 | result = transforms.run(lambda _: documents.tab())(paragraph) 29 | assert_equal(documents.paragraph(children=[]), result) 30 | 31 | 32 | class EachElementTests(object): 33 | def test_all_descendants_are_transformed(self): 34 | @cobble.data 35 | class Count(documents.HasChildren): 36 | count = cobble.field() 37 | 38 | root = Count(count=None, children=[ 39 | Count(count=None, children=[ 40 | Count(count=None, children=[]), 41 | ]), 42 | ]) 43 | 44 | current_count = [0] 45 | def set_count(node): 46 | current_count[0] += 1 47 | return node.copy(count=current_count[0]) 48 | 49 | result = _each_element(set_count)(root) 50 | 51 | assert_equal(Count(count=3, children=[ 52 | Count(count=2, children=[ 53 | Count(count=1, children=[]), 54 | ]), 55 | ]), result) 56 | 57 | 58 | class GetDescendantsTests(object): 59 | def test_returns_nothing_if_element_type_has_no_children(self): 60 | assert_equal([], get_descendants(documents.tab())) 61 | 62 | def test_returns_nothing_if_element_has_empty_children(self): 63 | assert_equal([], get_descendants(documents.paragraph(children=[]))) 64 | 65 | def test_includes_children(self): 66 | children = [documents.text("child 1"), documents.text("child 2")] 67 | element = documents.paragraph(children=children) 68 | assert_equal(children, get_descendants(element)) 69 | 70 | def test_includes_indirect_descendants(self): 71 | grandchild = documents.text("grandchild") 72 | child = documents.run(children=[grandchild]) 73 | element = documents.paragraph(children=[child]) 74 | assert_equal([grandchild, child], get_descendants(element)) 75 | 76 | 77 | class GetDescendantsOfTypeTests(object): 78 | def 
test_filters_descendants_to_type(self): 79 | tab = documents.tab() 80 | run = documents.run(children=[]) 81 | element = documents.paragraph(children=[tab, run]) 82 | assert_equal([run], get_descendants_of_type(element, documents.Run)) 83 | -------------------------------------------------------------------------------- /tests/writers/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /tests/writers/markdown_tests.py: -------------------------------------------------------------------------------- 1 | from __future__ import unicode_literals 2 | 3 | from mammoth.writers.markdown import MarkdownWriter 4 | from ..testing import assert_equal 5 | 6 | 7 | def test_special_markdown_characters_are_escaped(): 8 | writer = _create_writer() 9 | writer.text(r"\*") 10 | assert_equal(r"\\\*", writer.as_string()) 11 | 12 | 13 | def test_unrecognised_elements_are_treated_as_normal_text(): 14 | writer = _create_writer() 15 | writer.start("blah"); 16 | writer.text("Hello"); 17 | writer.end("blah"); 18 | assert_equal("Hello", writer.as_string()) 19 | 20 | 21 | def test_paragraphs_are_terminated_with_double_new_line(): 22 | writer = _create_writer() 23 | writer.start("p"); 24 | writer.text("Hello"); 25 | writer.end("p"); 26 | assert_equal("Hello\n\n", writer.as_string()) 27 | 28 | 29 | def test_h1_elements_are_converted_to_heading_with_leading_hash(): 30 | writer = _create_writer() 31 | writer.start("h1"); 32 | writer.text("Hello"); 33 | writer.end("h1"); 34 | assert_equal("# Hello\n\n", writer.as_string()) 35 | 36 | 37 | def test_h6_elements_are_converted_to_heading_with_six_leading_hashes(): 38 | writer = _create_writer() 39 | writer.start("h6"); 40 | writer.text("Hello"); 41 | writer.end("h6"); 42 | assert_equal("###### Hello\n\n", writer.as_string()) 43 | 44 | 45 | def test_br_is_written_as_two_spaces_followed_by_newline(): 46 | writer = 
_create_writer() 47 | writer.text("Hello"); 48 | writer.self_closing("br"); 49 | assert_equal("Hello \n", writer.as_string()) 50 | 51 | 52 | def test_strong_text_is_surrounded_by_two_underscores(): 53 | writer = _create_writer() 54 | writer.text("Hello "); 55 | writer.start("strong"); 56 | writer.text("World") 57 | writer.end("strong") 58 | assert_equal("Hello __World__", writer.as_string()) 59 | 60 | 61 | def test_emphasised_text_is_surrounded_by_one_asterix(): 62 | writer = _create_writer() 63 | writer.text("Hello "); 64 | writer.start("em"); 65 | writer.text("World") 66 | writer.end("em") 67 | assert_equal("Hello *World*", writer.as_string()) 68 | 69 | 70 | def test_anchor_tags_are_written_as_hyperlinks(): 71 | writer = _create_writer() 72 | writer.start("a", {"href": "http://example.com"}); 73 | writer.text("Hello"); 74 | writer.end("a"); 75 | assert_equal("[Hello](http://example.com)", writer.as_string()) 76 | 77 | 78 | def test_anchor_tags_without_href_attribute_are_treated_as_ordinary_text(): 79 | writer = _create_writer() 80 | writer.start("a"); 81 | writer.text("Hello"); 82 | writer.end("a"); 83 | assert_equal("Hello", writer.as_string()) 84 | 85 | 86 | def test_elements_with_ids_have_anchor_tags_with_ids_appended_to_start_of_markdown_element(): 87 | writer = _create_writer() 88 | writer.start("h1", {"id": "start"}) 89 | writer.text("Hello") 90 | writer.end("h1") 91 | assert_equal('# Hello\n\n', writer.as_string()) 92 | 93 | 94 | def test_links_have_anchors_before_opening_square_bracket(): 95 | writer = _create_writer() 96 | writer.start("a", {"href": "http://example.com", "id": "start"}) 97 | writer.text("Hello") 98 | writer.end("a") 99 | assert_equal('[Hello](http://example.com)', writer.as_string()) 100 | 101 | 102 | def test_image_elements_are_written_as_markdown_images(): 103 | writer = _create_writer() 104 | writer.self_closing("img", {"src": "http://example.com/image.jpg", "alt": "Alt Text"}) 105 | assert_equal("![Alt 
Text](http://example.com/image.jpg)", writer.as_string()) 106 | 107 | 108 | def test_images_are_written_even_if_they_dont_have_alt_text(): 109 | writer = _create_writer() 110 | writer.self_closing("img", {"src": "http://example.com/image.jpg"}) 111 | assert_equal("![](http://example.com/image.jpg)", writer.as_string()) 112 | 113 | 114 | def test_images_are_written_even_if_they_dont_have_a_src_attribute(): 115 | writer = _create_writer() 116 | writer.self_closing("img", {"alt": "Alt Text"}) 117 | assert_equal("![Alt Text]()", writer.as_string()) 118 | 119 | 120 | def test_image_elements_are_ignored_if_they_have_no_src_and_no_alt_text(): 121 | writer = _create_writer() 122 | writer.self_closing("img") 123 | assert_equal("", writer.as_string()) 124 | 125 | 126 | def test_list_item_outside_of_list_is_treated_as_unordered_list(): 127 | writer = _create_writer() 128 | writer.start("li") 129 | writer.text("Fruit") 130 | writer.end("li") 131 | assert_equal("- Fruit\n", writer.as_string()) 132 | 133 | 134 | def test_ol_element_is_written_as_ordered_list_with_sequential_numbering(): 135 | writer = _create_writer() 136 | writer.start("ol") 137 | writer.start("li") 138 | writer.text("Fruit") 139 | writer.end("li") 140 | writer.start("li") 141 | writer.text("Condiments") 142 | writer.end("li") 143 | writer.end("ol") 144 | assert_equal("1. Fruit\n2. 
Condiments\n\n", writer.as_string()) 145 | 146 | 147 | def test_ul_element_is_written_as_unordered_list_using_hyphens_as_bullets(): 148 | writer = _create_writer() 149 | writer.start("ul") 150 | writer.start("li") 151 | writer.text("Fruit") 152 | writer.end("li") 153 | writer.start("li") 154 | writer.text("Condiments") 155 | writer.end("li") 156 | writer.end("ul") 157 | assert_equal("- Fruit\n- Condiments\n\n", writer.as_string()) 158 | 159 | 160 | def test_numbering_is_separate_for_nested_list_and_parent_list(): 161 | writer = _create_writer() 162 | writer.start("ol") 163 | 164 | writer.start("li") 165 | writer.text("Fruit") 166 | writer.start("ol") 167 | writer.start("li") 168 | writer.text("Apple") 169 | writer.end("li") 170 | writer.start("li") 171 | writer.text("Banana") 172 | writer.end("li") 173 | writer.end("ol") 174 | writer.end("li") 175 | 176 | writer.start("li") 177 | writer.text("Condiments") 178 | writer.end("li") 179 | writer.end("ol") 180 | assert_equal("1. Fruit\n\t1. Apple\n\t2. Banana\n2. 
Condiments\n\n", writer.as_string()) 181 | 182 | 183 | 184 | def _create_writer(): 185 | return MarkdownWriter() 186 | -------------------------------------------------------------------------------- /tests/zips_tests.py: -------------------------------------------------------------------------------- 1 | from mammoth import zips 2 | from .testing import assert_equal 3 | 4 | 5 | def test_split_path_splits_zip_paths_on_last_forward_slash(): 6 | assert_equal(("a", "b"), zips.split_path("a/b")) 7 | assert_equal(("a/b", "c"), zips.split_path("a/b/c")) 8 | assert_equal(("/a/b", "c"), zips.split_path("/a/b/c")) 9 | 10 | 11 | def test_when_path_has_no_forward_slashes_then_split_path_returns_empty_dirname(): 12 | assert_equal(("", "name"), zips.split_path("name")) 13 | 14 | 15 | def test_join_path_joins_arguments_with_forward_slashes(): 16 | assert_equal("a/b", zips.join_path("a", "b")) 17 | assert_equal("a/b/c", zips.join_path("a/b", "c")) 18 | assert_equal("/a/b/c", zips.join_path("/a/b", "c")) 19 | 20 | 21 | def test_empty_parts_are_ignored_when_joining_paths(): 22 | assert_equal("a", zips.join_path("a", "")) 23 | assert_equal("b", zips.join_path("", "b")) 24 | assert_equal("a/b", zips.join_path("a", "", "b")) 25 | 26 | 27 | def test_when_joining_paths_then_absolute_paths_ignore_earlier_paths(): 28 | assert_equal("/b", zips.join_path("a", "/b")) 29 | assert_equal("/b/c", zips.join_path("a", "/b", "c")) 30 | assert_equal("/b", zips.join_path("/a", "/b")) 31 | assert_equal("/a", zips.join_path("/a")) 32 | -------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | [tox] 2 | envlist = py37,py38,py39,py310,py311,py312,pypy3 3 | [testenv] 4 | changedir = {envtmpdir} 5 | deps=-r{toxinidir}/test-requirements.txt 6 | commands= 7 | py.test {toxinidir}/tests 8 | [pytest] 9 | python_classes = *Tests 10 | python_files = *_tests.py 11 | 
--------------------------------------------------------------------------------

Ouch' + 187 | '[1].' + 188 | '[2]