├── .github ├── ISSUE_TEMPLATE.md ├── pull_request_template.md └── workflows │ └── tests.yml ├── .gitignore ├── LICENSE ├── NEWS ├── README.md ├── makefile ├── mammoth ├── __init__.py ├── cli.py ├── conversion.py ├── document_matchers.py ├── documents.py ├── docx │ ├── __init__.py │ ├── body_xml.py │ ├── comments_xml.py │ ├── complex_fields.py │ ├── content_types_xml.py │ ├── dingbats.py │ ├── document_xml.py │ ├── files.py │ ├── notes_xml.py │ ├── numbering_xml.py │ ├── office_xml.py │ ├── relationships_xml.py │ ├── style_map.py │ ├── styles_xml.py │ ├── uris.py │ └── xmlparser.py ├── html │ ├── __init__.py │ └── nodes.py ├── html_paths.py ├── images.py ├── lists.py ├── options.py ├── raw_text.py ├── results.py ├── styles │ ├── __init__.py │ └── parser │ │ ├── __init__.py │ │ ├── document_matcher_parser.py │ │ ├── errors.py │ │ ├── html_path_parser.py │ │ ├── style_mapping_parser.py │ │ ├── token_iterator.py │ │ ├── token_parser.py │ │ └── tokeniser.py ├── transforms.py ├── underline.py ├── writers │ ├── __init__.py │ ├── abc.py │ ├── html.py │ └── markdown.py └── zips.py ├── pyproject.toml ├── recipes └── wmf_images.py ├── setup.cfg ├── setup.py ├── test-requirements.txt ├── tests ├── __init__.py ├── cli_tests.py ├── conftest.py ├── conversion_tests.py ├── docx │ ├── __init__.py │ ├── body_xml_tests.py │ ├── comments_xml_tests.py │ ├── content_types_xml_tests.py │ ├── document_matchers.py │ ├── document_xml_tests.py │ ├── docx_tests.py │ ├── files_tests.py │ ├── notes_xml_tests.py │ ├── numbering_xml_tests.py │ ├── office_xml_tests.py │ ├── relationships_xml_tests.py │ ├── style_map_tests.py │ ├── styles_xml_tests.py │ ├── uris_tests.py │ └── xmlparser_tests.py ├── html │ ├── __init__.py │ ├── collapse_tests.py │ └── strip_empty_tests.py ├── images_tests.py ├── lists_tests.py ├── mammoth_tests.py ├── options_tests.py ├── raw_text_tests.py ├── styles │ ├── __init__.py │ ├── document_matcher_tests.py │ └── parser │ │ ├── __init__.py │ │ ├── 
document_matcher_parser_tests.py │ │ ├── html_path_parser_tests.py │ │ ├── style_mapping_parser_tests.py │ │ ├── token_parser_tests.py │ │ └── tokeniser_tests.py ├── test-data │ ├── comments.docx │ ├── embedded-style-map.docx │ ├── empty.docx │ ├── endnotes.docx │ ├── external-picture.docx │ ├── footnote-hyperlink.docx │ ├── footnotes.docx │ ├── hyperlinks │ │ └── word │ │ │ ├── _rels │ │ │ └── document.xml.rels │ │ │ └── document.xml │ ├── simple-list.docx │ ├── simple │ │ └── word │ │ │ └── document.xml │ ├── single-paragraph.docx │ ├── strict-format.docx │ ├── strikethrough.docx │ ├── tables.docx │ ├── text-box.docx │ ├── tiny-picture-target-base-relative.docx │ ├── tiny-picture.docx │ ├── tiny-picture.png │ ├── underline.docx │ └── utf8-bom.docx ├── testing.py ├── transforms_tests.py ├── writers │ ├── __init__.py │ └── markdown_tests.py └── zips_tests.py └── tox.ini /.github/ISSUE_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | If you're reporting a bug or requesting a feature, please include: 2 | * a minimal example document 3 | * the HTML output that you'd expect 4 | 5 | If you're reporting a bug, it's also useful to know what platform you're 6 | running on, including: 7 | 8 | * the version of Python 9 | * the operating system and version 10 | -------------------------------------------------------------------------------- /.github/pull_request_template.md: -------------------------------------------------------------------------------- 1 | In general, pull requests are not currently accepted. 2 | 3 | Please instead submit an issue if you find a bug or would like to request a feature. 
4 | -------------------------------------------------------------------------------- /.github/workflows/tests.yml: -------------------------------------------------------------------------------- 1 | name: Tests 2 | 3 | on: [push, pull_request] 4 | 5 | jobs: 6 | build: 7 | runs-on: ubuntu-22.04 8 | 9 | strategy: 10 | matrix: 11 | python-version: ["3.7", "3.8", "3.9", "3.10", "3.11", "3.12", "pypy3.9"] 12 | 13 | steps: 14 | 15 | - uses: actions/checkout@v4 16 | 17 | - name: Use Python ${{ matrix.python-version }} 18 | uses: actions/setup-python@v5 19 | with: 20 | python-version: ${{ matrix.python-version }} 21 | 22 | - run: pip install tox 23 | 24 | - run: make README 25 | 26 | - run: tox -e py 27 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | /README 3 | /_virtualenv 4 | /*.egg-info 5 | /.tox 6 | /MANIFEST 7 | /build 8 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2013, Michael Williamson 2 | All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without 5 | modification, are permitted provided that the following conditions are met: 6 | 7 | 1. Redistributions of source code must retain the above copyright notice, this 8 | list of conditions and the following disclaimer. 9 | 2. Redistributions in binary form must reproduce the above copyright notice, 10 | this list of conditions and the following disclaimer in the documentation 11 | and/or other materials provided with the distribution. 12 | 13 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 14 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 15 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 16 | DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR 17 | ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 18 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 19 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 20 | ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 21 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 22 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 23 | -------------------------------------------------------------------------------- /NEWS: -------------------------------------------------------------------------------- 1 | # 1.10.0 2 | 3 | * Add "Heading" and "Body" styles, as found in documents created by Apple Pages, 4 | to the default style map. 5 | 6 | * Handle structured document tags representing checkboxes wrapped in other 7 | elements, such as table cells. Previously, the wrapping elements would have 8 | been ignored. 9 | 10 | * Ignore deleted table rows. 11 | 12 | # 1.9.1 13 | 14 | * Ignore AlternateContent elements when there is no Fallback element. 15 | 16 | # 1.9.0 17 | 18 | * Detect checkboxes, both as complex fields and structured document tags, and 19 | convert them to checkbox inputs. 20 | 21 | * Ignore AlternateContent elements when there is no Fallback element. 22 | 23 | # 1.8.0 24 | 25 | * Add style mapping for highlights. 26 | 27 | # 1.7.1 28 | 29 | * Switch the precedence of numbering properties in paragraph properties and the 30 | numbering in paragraph styles so that the numbering properties in paragraph 31 | properties takes precedence. 32 | 33 | # 1.7.0 34 | 35 | * Support attributes in HTML paths in style mappings. 36 | 37 | * Improve error message when failing to find the body element in a document. 38 | 39 | * Drop support for Python 2.7, Python 3.5 and Python 3.6. 40 | 41 | * Add support for the strict document format. 
42 | 43 | # 1.6.0 44 | 45 | * Support merged paragraphs when revisions are tracked. 46 | 47 | # 1.5.1 48 | 49 | * Add a pyproject.toml to add an explicit build dependency on setuptools. 50 | 51 | # 1.5.0 52 | 53 | * Only use the alt text of image elements as a fallback. If an alt attribute is 54 | returned from the function passed to mammoth.images.img_element, that value 55 | will now be preferred to the alt text of the image element. 56 | 57 | # 1.4.19 58 | 59 | * Ignore w:u elements when w:val is missing. 60 | 61 | # 1.4.18 62 | 63 | * Emit warning instead of throwing exception when image file cannot be found for 64 | a:blip elements. 65 | 66 | # 1.4.17 67 | 68 | * When extracting raw text, convert tab elements to tab characters. 69 | 70 | * Handle internal hyperlinks created with complex fields. 71 | 72 | # 1.4.16 73 | 74 | * Handle w:num with invalid w:abstractNumId. 75 | 76 | # 1.4.15 77 | 78 | * Convert symbols in supported fonts to corresponding Unicode characters. 79 | 80 | # 1.4.14 81 | 82 | * Support numbering defined by paragraph style. 83 | 84 | # 1.4.13 85 | 86 | * Add style mapping for all caps. 87 | 88 | # 1.4.12 89 | 90 | * Handle underline elements where w:val is "none". 91 | 92 | # 1.4.11 93 | 94 | * Read font size for runs. 95 | * Support soft hyphens. 96 | 97 | # 1.4.10 98 | 99 | * Update supported Python versions to 2.7 and 3.4 to 3.8. 100 | 101 | # 1.4.9 102 | 103 | * Improve list support by following w:numStyleLink in w:abstractNum. 104 | 105 | # 1.4.8 106 | 107 | * Preserve empty table rows. 108 | 109 | # 1.4.7 110 | 111 | * Always write files as UTF-8 in the CLI. 112 | 113 | # 1.4.6 114 | 115 | * Fix: default style mappings caused footnotes, endnotes and comments 116 | containing multiple paragraphs to be converted into a single paragraph. 117 | 118 | # 1.4.5 119 | 120 | * Read the children of v:rect elements. 121 | 122 | # 1.4.4 123 | 124 | * Parse paragraph indents. 125 | 126 | * Read part paths using relationships. 
This improves support for documents 127 | created by Word Online. 128 | 129 | # 1.4.3 130 | 131 | * Add style mapping for small caps. 132 | 133 | * Add style mapping for tables. 134 | 135 | # 1.4.2 136 | 137 | * Read children of v:group elements. 138 | 139 | # 1.4.1 140 | 141 | * Read w:noBreakHyphen elements as non-breaking hyphen characters. 142 | 143 | # 1.4.0 144 | 145 | * Extract the default data URI image converter to the images module. 146 | 147 | * Add anchor on hyperlinks as fragment if present. 148 | 149 | * Convert target frames on hyperlinks to targets on anchors. 150 | 151 | * Detect header rows in tables and convert to thead > tr > th. 152 | 153 | # 1.3.5 154 | 155 | * Handle complex fields that do not have a "separate" fldChar. 156 | 157 | # 1.3.4 158 | 159 | * Add transforms.run. 160 | 161 | # 1.3.3 162 | 163 | * Read children of w:object elements. 164 | 165 | * Add support for document transforms. 166 | 167 | # 1.3.2 168 | 169 | * Handle hyperlinks created with complex fields. 170 | 171 | # 1.3.1 172 | 173 | * Handle absolute paths within zip files. This should fix an issue where some 174 | images within a document couldn't be found. 175 | 176 | # 1.3.0 177 | 178 | * Allow style names to be mapped by prefix. For instance: 179 | 180 | r[style-name^='Code '] => code 181 | 182 | * Add default style mappings for Heading 5 and Heading 6. 183 | 184 | * Allow escape sequences in style IDs, style names and CSS class names. 185 | 186 | * Allow a separator to be specified when HTML elements are collapsed. 187 | 188 | * Add include_embedded_style_map argument to allow embedded style maps to be 189 | disabled. 190 | 191 | * Include embedded styles when explicit style map is passed. 192 | 193 | # 1.2.2 194 | 195 | * Ignore bold, italic, underline and strikethrough elements that have a value of 196 | false or 0. 197 | 198 | # 1.2.1 199 | 200 | * Ignore v:imagedata elements without relationship ID with warning. 
201 | 202 | # 1.2.0 203 | 204 | * Use alt text title as alt text for images when the alt text description is 205 | blank or missing. 206 | 207 | # 1.1.1 208 | 209 | * Handle comments without author initials. 210 | 211 | * Change numbering of comments to be global rather than per-user to match the 212 | behaviour of Word. 213 | 214 | # 1.1.0 215 | 216 | * Add support for comments. 217 | 218 | # 1.0.4 219 | 220 | * Add support for w:sdt elements. This allows the bodies of content controls, 221 | such as bibliographies, to be converted. 222 | 223 | # 1.0.3 224 | 225 | * Add support for table cells spanning multiple rows. 226 | 227 | # 1.0.2 228 | 229 | * Add support for table cells spanning multiple columns. 230 | 231 | # 1.0.1 232 | 233 | * Improve script installation on Windows by using entry_points instead of 234 | scripts in setup.py. 235 | 236 | # 1.0.0 237 | 238 | * Remove deprecated convert_underline argument. 239 | 240 | * Officially support ID prefixes. 241 | 242 | * Generated IDs no longer insert a hyphen after the ID prefix. 243 | 244 | * The default ID prefix is now the empty string rather than a random number 245 | followed by a hyphen. 246 | 247 | * Rename mammoth.images.inline to mammoth.images.img_element to better reflect 248 | its behaviour. 249 | 250 | # 0.3.31 251 | 252 | * Improve collapsing of similar non-fresh HTML elements. 253 | 254 | # 0.3.30 255 | 256 | * Allow bold and italic style mappings to be configured. 257 | 258 | # 0.3.29 259 | 260 | * Handle references to missing styles when reading documents. 261 | 262 | # 0.3.28 263 | 264 | * Improve support for lists made in LibreOffice. Specifically, this changes the 265 | default style mapping for paragraphs with a style of "Normal" to have the 266 | lowest precedence. 267 | 268 | # 0.3.27 269 | 270 | * Handle XML where the child nodes of an element contains text nodes. 271 | 272 | # 0.3.26 273 | 274 | * Always use mc:Fallback when reading mc:AlternateContent elements. 
275 | 276 | # 0.3.25 277 | 278 | * Remove duplicate messages from results. 279 | 280 | * Read v:imagedata with r:id attribute. 281 | 282 | * Read children of v:roundrect. 283 | 284 | * Ignore office-word:wrap, v:shadow and v:shapetype. 285 | 286 | # 0.3.24 287 | 288 | * Continue with warning if external images cannot be found. 289 | 290 | * Add support for embedded style maps. 291 | 292 | # 0.3.23 293 | 294 | * Fix Python 3 support. 295 | 296 | # 0.3.22 297 | 298 | * Generate warnings for not-understood style mappings and continue, rather than 299 | stopping with an error. 300 | 301 | * Support file objects without a name attribute again (broken since 0.3.20). 302 | 303 | # 0.3.21 304 | 305 | * Ignore w:numPr elements without w:numId or w:ilvl children. 306 | 307 | # 0.3.20 308 | 309 | * Add support for linked images. 310 | 311 | # 0.3.19 312 | 313 | * Fix: cannot extract raw text from elements without children 314 | 315 | # 0.3.18 316 | 317 | * Support links and images in footnotes and endnotes. 318 | 319 | # 0.3.17 320 | 321 | * Add support for underlines in style map. 322 | 323 | * Add support for strikethrough. 324 | 325 | # 0.3.16 326 | 327 | * Add basic support for text boxes. The contents of the text box are treated as 328 | a separate paragraph that appears after the paragraph containing the text box. 329 | 330 | # 0.3.15 331 | 332 | * Support styles defined without a name 333 | 334 | # 0.3.14 335 | 336 | * Add ignore_empty_paragraphs option, which defaults to True. 337 | 338 | # 0.3.13 339 | 340 | * Always use forward slashes in ZIP paths. This should fix image handling on 341 | Windows. 342 | 343 | # 0.3.12 344 | 345 | * Make style names case-insensitive in style mappings. This should make style 346 | mappings easier to write, especially since Microsoft Word sometimes represents 347 | style names in the UI differently from in the style definition. For instance, 348 | the style displayed in Word as "Heading 1" has a style name of "heading 1". 
349 | This hopefully shouldn't cause an issue for anyone, but if you were relying 350 | on case-sensitivity, please do get in touch. 351 | 352 | # 0.3.11 353 | 354 | * Add support for hyperlinks to bookmarks in the same document. 355 | 356 | # 0.3.10 357 | 358 | * Add basic support for Markdown. Not all features are currently supported. 359 | 360 | # 0.3.9 361 | 362 | * Add default style mappings for builtin footnote and endnote styles in 363 | Microsoft Word and LibreOffice. 364 | 365 | * Allow style mappings with a zero-element HTML path. 366 | 367 | * Emit warnings when image types are unlikely to be supported by web browsers. 368 | 369 | # 0.3.8 370 | 371 | * Add support for endnotes. 372 | 373 | # 0.3.7 374 | 375 | * Add support for superscript and subscript text. 376 | 377 | # 0.3.6 378 | 379 | * Add support for footnotes. 380 | 381 | # 0.3.5 382 | 383 | * Add support for line breaks. 384 | 385 | # 0.3.4 386 | 387 | * Add optional underline conversion. 388 | 389 | # 0.3.3 390 | 391 | * Add `mammoth.images.inline`, and document custom image conversion. 392 | 393 | # 0.3.2 394 | 395 | * Add the function `mammoth.extract_raw_text`. 396 | 397 | # 0.3.1 398 | 399 | * Add support for tables 400 | 401 | # 0.3.0 402 | 403 | * Rename --styles CLI argument to --style-map. 404 | 405 | * Rename styles argument in convert_to_html to style_map. 406 | 407 | * Allow paragraphs and runs to be matched by style name. For instance, to match 408 | a paragraph with the style name `Heading 1`: 409 | 410 | p[style-name='Heading 1'] 411 | -------------------------------------------------------------------------------- /makefile: -------------------------------------------------------------------------------- 1 | .PHONY: test 2 | 3 | test: 4 | _virtualenv/bin/pyflakes mammoth tests 5 | sh -c '. 
_virtualenv/bin/activate; py.test tests' 6 | 7 | .PHONY: test-all 8 | 9 | test-all: 10 | tox 11 | 12 | .PHONY: upload 13 | 14 | upload: setup assert-converted-readme build-dist 15 | _virtualenv/bin/twine upload dist/* 16 | make clean 17 | 18 | .PHONY: build-dist 19 | 20 | build-dist: 21 | rm -rf dist 22 | _virtualenv/bin/pyproject-build 23 | 24 | README: README.md 25 | pandoc --from=markdown --to=rst README.md > README || cp README.md README 26 | 27 | .PHONY: assert-converted-readme 28 | 29 | assert-converted-readme: 30 | test "`cat README`" != "`cat README.md`" 31 | 32 | .PHONY: clean 33 | 34 | clean: 35 | rm -f README 36 | rm -f MANIFEST 37 | rm -rf dist 38 | 39 | .PHONY: bootstrap 40 | 41 | bootstrap: _virtualenv setup 42 | _virtualenv/bin/pip install -e . 43 | ifneq ($(wildcard test-requirements.txt),) 44 | _virtualenv/bin/pip install -r test-requirements.txt 45 | endif 46 | make clean 47 | 48 | .PHONY: setup 49 | 50 | setup: README 51 | 52 | _virtualenv: 53 | python3 -m venv _virtualenv 54 | _virtualenv/bin/pip install --upgrade pip 55 | _virtualenv/bin/pip install --upgrade setuptools 56 | _virtualenv/bin/pip install --upgrade wheel 57 | _virtualenv/bin/pip install --upgrade build twine 58 | -------------------------------------------------------------------------------- /mammoth/__init__.py: -------------------------------------------------------------------------------- 1 | from . 
# Sentinel distinguishing "argument omitted" from any explicit value.
_undefined = object()


def convert_to_html(*args, **kwargs):
    """Convert a document to HTML.

    Thin wrapper that forwards every argument to ``convert`` with
    ``output_format="html"``.
    """
    return convert(*args, output_format="html", **kwargs)


def convert_to_markdown(*args, **kwargs):
    """Convert a document to Markdown.

    Thin wrapper that forwards every argument to ``convert`` with
    ``output_format="markdown"``.
    """
    return convert(*args, output_format="markdown", **kwargs)


def convert(fileobj, transform_document=None, id_prefix=None, include_embedded_style_map=_undefined, **kwargs):
    """Read a .docx file object and convert it using the given options.

    Args:
        fileobj: File object for the .docx document.
        transform_document: Optional callable applied to the document that
            was read, before conversion. Defaults to the identity function.
        id_prefix: Forwarded to the document converter.
        include_embedded_style_map: When omitted or truthy, the style map
            embedded in the document is read and passed on as the
            ``embedded_style_map`` option.
        **kwargs: Remaining conversion options, parsed by
            ``options.read_options``.

    Returns:
        The result produced by binding option parsing, document reading and
        conversion together.
    """
    use_embedded_style_map = (
        True if include_embedded_style_map is _undefined
        else include_embedded_style_map
    )

    if transform_document is None:
        def transform_document(document):
            return document

    if use_embedded_style_map:
        kwargs["embedded_style_map"] = read_style_map(fileobj)

    def convert_with_options(convert_options):
        def to_output(document):
            return conversion.convert_document_element_to_html(
                document,
                id_prefix=id_prefix,
                **convert_options
            )

        return docx.read(fileobj).map(transform_document).bind(to_output)

    return options.read_options(kwargs).bind(convert_with_options)


def extract_raw_text(fileobj):
    """Read the document and map it to its raw text."""
    document_result = docx.read(fileobj)
    return document_result.map(extract_raw_text_from_element)


def embed_style_map(fileobj, style_map):
    """Write ``style_map`` into the document behind ``fileobj``."""
    write_style_map(fileobj, style_map)


def read_embedded_style_map(fileobj):
    """Return the style map embedded in the document behind ``fileobj``."""
    return read_style_map(fileobj)
def main():
    """Command-line entry point: convert a .docx file and write the output.

    Conversion messages are written to stderr, one per line. With
    ``--output-dir`` the images are written as separate files next to the
    generated document; otherwise the output goes to ``output-path`` or to
    stdout when no path was given.
    """
    args = _parse_args()

    if args.style_map is None:
        style_map = None
    else:
        with open(args.style_map) as style_map_fileobj:
            style_map = style_map_fileobj.read()

    with open(args.path, "rb") as docx_fileobj:
        if args.output_dir is None:
            convert_image = None
            output_path = args.output
        else:
            convert_image = mammoth.images.img_element(ImageWriter(args.output_dir))
            output_path = os.path.join(args.output_dir, _output_filename(args.path))

        result = mammoth.convert(
            docx_fileobj,
            style_map=style_map,
            convert_image=convert_image,
            output_format=args.output_format,
        )
        for message in result.messages:
            sys.stderr.write(message.message)
            sys.stderr.write("\n")

        _write_output(output_path, result.value)


def _output_filename(docx_path):
    """Return the output file name used with --output-dir for ``docx_path``.

    The final extension of the base name is replaced by ``.html``. A name
    without any extension is kept whole; the previous ``rpartition(".")``
    logic produced an empty stem in that case, i.e. a file called ``.html``.
    """
    stem = os.path.splitext(os.path.basename(docx_path))[0]
    return "{0}.html".format(stem)


class ImageWriter(object):
    """Callable that saves each image into a directory and returns its src.

    Images are numbered from 1 in the order they are written; the file
    extension is the part of the image's content type after the slash.
    """

    def __init__(self, output_dir):
        self._output_dir = output_dir
        self._image_number = 1

    def __call__(self, element):
        """Copy the image to ``<number>.<extension>`` and return ``{"src": name}``."""
        extension = element.content_type.partition("/")[2]
        image_filename = "{0}.{1}".format(self._image_number, extension)
        with open(os.path.join(self._output_dir, image_filename), "wb") as image_dest:
            with element.open() as image_source:
                shutil.copyfileobj(image_source, image_dest)

        # Only advance the counter once the image has been written.
        self._image_number += 1

        return {"src": image_filename}


def _write_output(path, contents):
    """Write ``contents`` as UTF-8 to ``path``, or to stdout if ``path`` is None."""
    if path is None:
        # Write bytes to the underlying buffer so the output is UTF-8
        # regardless of the console encoding.
        stdout = sys.stdout.buffer
        stdout.write(contents.encode("utf-8"))
        stdout.flush()
    else:
        with open(path, "w", encoding="utf-8") as fileobj:
            fileobj.write(contents)


def _parse_args():
    """Build the argument parser and parse ``sys.argv``."""
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "path",
        metavar="docx-path",
        help="Path to the .docx file to convert.")

    # output-path and --output-dir select two different output modes.
    output_group = parser.add_mutually_exclusive_group()
    output_group.add_argument(
        "output",
        nargs="?",
        metavar="output-path",
        help="Output path for the generated document. Images will be stored inline in the output document. Output is written to stdout if not set.")
    output_group.add_argument(
        "--output-dir",
        help="Output directory for generated HTML and images. Images will be stored in separate files. Mutually exclusive with output-path.")

    parser.add_argument(
        "--output-format",
        required=False,
        choices=writers.formats(),
        help="Output format.")
    parser.add_argument(
        "--style-map",
        required=False,
        help="File containing a style map.")
    return parser.parse_args()


if __name__ == "__main__":
    main()
class underline:
    """Matcher marker for underlined text."""
    element_type = "underline"


class strikethrough:
    """Matcher marker for struck-through text."""
    element_type = "strikethrough"


class all_caps:
    """Matcher marker for all-caps text."""
    element_type = "all_caps"


class small_caps:
    """Matcher marker for small-caps text."""
    element_type = "small_caps"


def highlight(color=None):
    """Return a highlight matcher for ``color`` (``None`` when not given)."""
    return HighlightMatcher(color)


# Single-field record; element_type is attached as a class attribute.
HighlightMatcher = collections.namedtuple("HighlightMatcher", "color")
HighlightMatcher.element_type = "highlight"


class comment_reference:
    """Matcher marker for comment references."""
    element_type = "comment_reference"


# Single-field record identifying which kind of break is matched.
BreakMatcher = collections.namedtuple("BreakMatcher", "break_type")
BreakMatcher.element_type = "break"


line_break = BreakMatcher(break_type="line")
page_break = BreakMatcher(break_type="page")
column_break = BreakMatcher(break_type="column")
# Table element: children (declared on HasChildren) plus the table's style
# ID and style name.
@cobble.data
class Table(HasChildren):
    style_id = cobble.field()
    style_name = cobble.field()

# Table row: children plus a flag marking the row as a header row.
@cobble.data
class TableRow(HasChildren):
    is_header = cobble.field()

# Table cell: children plus the number of columns and rows it spans.
@cobble.data
class TableCell(HasChildren):
    colspan = cobble.field()
    rowspan = cobble.field()

# Table cell that additionally carries a vmerge value. Unlike the classes
# above it does not derive from HasChildren/Element, so it declares its own
# children field and its own copy().
# NOTE(review): presumably an intermediate form used before vertically
# merged cells are resolved into rowspans -- confirm against the reader.
@cobble.data
class TableCellUnmerged:
    children = cobble.field()
    colspan = cobble.field()
    rowspan = cobble.field()
    vmerge = cobble.field()

    # Return a copy of this cell with the given fields replaced.
    def copy(self, **kwargs):
        return cobble.copy(self, **kwargs)
def document(children, notes=None, comments=None):
    """Build a Document, defaulting to empty notes and an empty comment list."""
    resolved_notes = Notes({}) if notes is None else notes
    resolved_comments = [] if comments is None else comments
    return Document(children, resolved_notes, comments=resolved_comments)


def paragraph(children, style_id=None, style_name=None, numbering=None, alignment=None, indent=None):
    """Build a Paragraph; a missing indent becomes an all-``None`` indent."""
    resolved_indent = paragraph_indent() if indent is None else indent
    return Paragraph(
        children,
        style_id,
        style_name,
        numbering,
        alignment=alignment,
        indent=resolved_indent,
    )


def paragraph_indent(start=None, end=None, first_line=None, hanging=None):
    """Build a ParagraphIndent from the four optional indent values."""
    return ParagraphIndent(
        start=start,
        end=end,
        first_line=first_line,
        hanging=hanging,
    )


def run(
    children,
    style_id=None,
    style_name=None,
    is_bold=None,
    is_italic=None,
    is_underline=None,
    is_strikethrough=None,
    is_all_caps=None,
    is_small_caps=None,
    vertical_alignment=None,
    font=None,
    font_size=None,
    highlight=None,
):
    """Build a Run.

    Every ``is_*`` flag is coerced to a bool, so ``None`` becomes ``False``.
    A missing vertical alignment defaults to ``VerticalAlignment.baseline``.
    """
    raw_flags = {
        "is_bold": is_bold,
        "is_italic": is_italic,
        "is_underline": is_underline,
        "is_strikethrough": is_strikethrough,
        "is_all_caps": is_all_caps,
        "is_small_caps": is_small_caps,
    }
    flags = {name: bool(value) for name, value in raw_flags.items()}

    if vertical_alignment is None:
        alignment = VerticalAlignment.baseline
    else:
        alignment = vertical_alignment

    return Run(
        children=children,
        style_id=style_id,
        style_name=style_name,
        vertical_alignment=alignment,
        font=font,
        font_size=font_size,
        highlight=highlight,
        **flags
    )
class Notes(object):
    """Lookup table of notes keyed by ``(note_type, note_id)``."""

    def __init__(self, notes):
        """Wrap ``notes``, a mapping from ``(note_type, note_id)`` to note."""
        self._notes = notes

    def find_note(self, note_type, note_id):
        """Return the note with the given type and ID.

        Raises:
            KeyError: If no such note is stored.
        """
        return self._notes[(note_type, note_id)]

    def resolve(self, reference):
        """Return the note that ``reference`` points at.

        ``reference`` is any object with ``note_type`` and ``note_id``.
        """
        return self.find_note(reference.note_type, reference.note_id)

    def __eq__(self, other):
        # Return NotImplemented for foreign types so that Python can try the
        # other operand's comparison before falling back to "not equal".
        if not isinstance(other, Notes):
            return NotImplemented
        return self._notes == other._notes

    # No __ne__ is needed: Python 3 derives it from __eq__. Because __eq__ is
    # defined without __hash__, instances remain unhashable, as before.

    def __repr__(self):
        return "Notes({0!r})".format(self._notes)


def notes(notes_list):
    """Build a Notes lookup from an iterable of notes.

    When several notes share a key, the last one wins.
    """
    return Notes({_note_key(note): note for note in notes_list})


def _note_key(note):
    """Return the ``(note_type, note_id)`` key under which ``note`` is stored."""
    return (note.note_type, note.note_id)
def read(fileobj):
    """Read a .docx file object and return the result of reading its main part.

    Notes and comments are read first; their combined result is then bound to
    the reading of the main document, which receives both of them.  The
    ``name`` attribute of ``fileobj`` (or ``None`` when it has none) is passed
    on as the document path.
    """
    zip_file = open_zip(fileobj, "r")
    part_paths = _find_part_paths(zip_file)
    document_path = getattr(fileobj, "name", None)
    read_part_with_body = _part_with_body_reader(
        document_path,
        zip_file,
        part_paths=part_paths,
    )

    def read_main_document(referents):
        # Index 0 is used as the notes and index 1 as the comments, matching
        # the order in which the two results are combined below.
        return _read_document(
            zip_file,
            read_part_with_body,
            notes=referents[0],
            comments=referents[1],
            part_paths=part_paths,
        )

    notes_result = _read_notes(read_part_with_body, part_paths)
    comments_result = _read_comments(read_part_with_body, part_paths)
    return results.combine([notes_result, comments_result]).bind(read_main_document)
def _part_with_body_reader(document_path, zip_file, part_paths):
    """Return a function that reads one part of the package with a body reader.

    Content types, styles and numbering are read from ``zip_file`` once, up
    front; each falls back to its empty value when the entry does not exist.
    The returned ``read_part(name, reader, default=...)`` builds a body reader
    using the relationships of the part called ``name`` and calls ``reader``
    on the parsed part with that body reader as the ``body_reader`` keyword.
    When ``default`` is given it is returned if the part does not exist;
    otherwise the part is read unconditionally.
    """
    content_types = _try_read_entry_or_default(
        zip_file,
        "[Content_Types].xml",
        read_content_types_xml_element,
        empty_content_types,
    )
    styles = _try_read_entry_or_default(
        zip_file,
        part_paths.styles,
        read_styles_xml_element,
        Styles.EMPTY,
    )

    def read_numbering(numbering_element):
        return read_numbering_xml_element(numbering_element, styles=styles)

    numbering = _try_read_entry_or_default(
        zip_file,
        part_paths.numbering,
        read_numbering,
        default=Numbering.EMPTY,
    )

    def read_part(name, reader, default=_undefined):
        part_relationships = _read_relationships(
            zip_file,
            _find_relationships_path_for(name),
        )

        # External files are resolved relative to the directory of the
        # document path; without a document path there is no base directory.
        if document_path is None:
            files = Files(None)
        else:
            files = Files(os.path.dirname(document_path))

        body_reader = body_xml.reader(
            numbering=numbering,
            content_types=content_types,
            relationships=part_relationships,
            styles=styles,
            docx_file=zip_file,
            files=files,
        )
        reader_with_body = partial(reader, body_reader=body_reader)

        if default is not _undefined:
            return _try_read_entry_or_default(zip_file, name, reader_with_body, default=default)
        return _read_entry(zip_file, name, reader_with_body)

    return read_part
def read_comments_xml_element(element, body_reader):
    """Read every ``w:comment`` child of ``element`` into a combined result.

    The children of each comment element are read with ``body_reader`` and
    mapped to a ``documents.comment`` whose id is the ``w:id`` attribute.
    The ``w:author`` and ``w:initials`` attributes are optional: a missing,
    empty or whitespace-only value becomes ``None``.
    """
    def optional_attribute(comment_element, name):
        stripped = comment_element.attributes.get(name, "").strip()
        return stripped if stripped else None

    def read_comment(comment_element):
        def build_comment(body):
            return documents.comment(
                comment_id=comment_element.attributes["w:id"],
                body=body,
                author_name=optional_attribute(comment_element, "w:author"),
                author_initials=optional_attribute(comment_element, "w:initials"),
            )

        return body_reader.read_all(comment_element.children).map(build_comment)

    comment_elements = element.find_children("w:comment")
    return results.combine(lists.map(read_comment, comment_elements))
read_content_types_xml_element(element): 2 | extension_defaults = dict(map( 3 | _read_default, 4 | element.find_children("content-types:Default") 5 | )) 6 | overrides = dict(map( 7 | _read_override, 8 | element.find_children("content-types:Override") 9 | )) 10 | return _ContentTypes(extension_defaults, overrides) 11 | 12 | 13 | def _read_default(element): 14 | extension = element.attributes["Extension"] 15 | content_type = element.attributes["ContentType"] 16 | return extension, content_type 17 | 18 | 19 | def _read_override(element): 20 | part_name = element.attributes["PartName"] 21 | content_type = element.attributes["ContentType"] 22 | return part_name.lstrip("/"), content_type 23 | 24 | 25 | class _ContentTypes(object): 26 | _image_content_types = { 27 | "png": "png", 28 | "gif": "gif", 29 | "jpeg": "jpeg", 30 | "jpg": "jpeg", 31 | "tif": "tiff", 32 | "tiff": "tiff", 33 | "bmp": "bmp", 34 | } 35 | 36 | def __init__(self, extension_defaults, overrides): 37 | self._extension_defaults = extension_defaults 38 | self._overrides = overrides 39 | 40 | def find_content_type(self, path): 41 | if path in self._overrides: 42 | return self._overrides[path] 43 | 44 | extension = _get_extension(path) 45 | default_type = self._extension_defaults.get(extension) 46 | if default_type is not None: 47 | return default_type 48 | 49 | image_type = self._image_content_types.get(extension.lower()) 50 | if image_type is not None: 51 | return "image/" + image_type 52 | 53 | return None 54 | 55 | empty_content_types = _ContentTypes({}, {}) 56 | 57 | def _get_extension(path): 58 | return path.rpartition(".")[2] 59 | -------------------------------------------------------------------------------- /mammoth/docx/document_xml.py: -------------------------------------------------------------------------------- 1 | from .. 
class Files(object):
    """Opens files referenced by URI, relative to an optional base directory."""

    def __init__(self, base):
        # Directory against which relative URIs are resolved, or None.
        self._base = base

    def open(self, uri):
        """Open ``uri`` and return a context manager for reading it.

        A URI that ``_is_absolute`` accepts is opened with ``urlopen``; any
        other URI is joined onto the base directory and opened in binary
        mode.

        Raises ``InvalidFileReferenceError`` when the URI is relative and
        there is no base directory, or when opening fails with ``IOError``.
        """
        try:
            if _is_absolute(uri):
                remote = urlopen(uri)
                return contextlib.closing(remote)

            if self._base is None:
                raise InvalidFileReferenceError("could not find external image '{0}', fileobj has no name".format(uri))

            local_path = os.path.join(self._base, uri)
            return open(local_path, "rb")
        except IOError as error:
            raise InvalidFileReferenceError(
                "could not open external image: '{0}' (document directory: '{1}')\n{2}".format(
                    uri, self._base, str(error))
            )
def _read_notes(note_type, element, body_reader):
    """Read the ``w:<note_type>`` children of ``element`` into a combined result.

    Children whose ``w:type`` attribute is ``continuationSeparator`` or
    ``separator`` are skipped.  The children of every remaining note element
    are read with ``body_reader`` and mapped to a ``documents.note`` carrying
    ``note_type`` and the element's ``w:id`` attribute.
    """
    def is_real_note(candidate):
        return candidate.attributes.get("w:type") not in ["continuationSeparator", "separator"]

    def read_note(note_element):
        def build_note(body):
            return documents.note(
                note_type=note_type,
                note_id=note_element.attributes["w:id"],
                body=body,
            )

        return body_reader.read_all(note_element.children).map(build_note)

    candidates = element.find_children("w:" + note_type)
    note_elements = lists.filter(is_real_note, candidates)
    return results.combine(lists.map(read_note, note_elements))
class Numbering(object):
    """Resolves numbering levels from abstract numbering definitions.

    ``abstract_nums`` maps abstract numbering ids to objects with ``levels``
    and ``num_style_link``; ``nums`` maps numbering ids to objects with
    ``abstract_num_id``; ``styles`` is asked for numbering styles by id.
    """

    def __init__(self, abstract_nums, nums, styles):
        self._abstract_nums = abstract_nums
        # Levels that name a paragraph style, indexed by that style id so
        # that a level can be found from a paragraph style alone.
        self._levels_by_paragraph_style_id = dict(
            (level.paragraph_style_id, self._to_numbering_level(level))
            for abstract_num in abstract_nums.values()
            for level in abstract_num.levels.values()
            if level.paragraph_style_id is not None
        )
        self._nums = nums
        self._styles = styles

    def find_level(self, num_id, level):
        """Return the numbering level for ``num_id`` at ``level``, or ``None``.

        When the abstract numbering links to a numbering style, the lookup
        continues with the numbering id of that style.  ``None`` is returned
        when any step of the lookup is missing: an unknown numbering id, an
        unknown abstract numbering id, an unknown level, or a style link that
        names a style which does not exist.  A chain of style links that
        leads back to a numbering id already visited also yields ``None``
        instead of recursing without bound.
        """
        visited_num_ids = set()
        while num_id not in visited_num_ids:
            visited_num_ids.add(num_id)

            num = self._nums.get(num_id)
            if num is None:
                return None

            abstract_num = self._abstract_nums.get(num.abstract_num_id)
            if abstract_num is None:
                return None

            if abstract_num.num_style_link is None:
                return self._to_numbering_level(abstract_num.levels.get(level))

            style = self._styles.find_numbering_style_by_id(abstract_num.num_style_link)
            if style is None:
                # Dangling style link: there is nothing to resolve against.
                return None

            num_id = style.num_id

        # The style links form a cycle, so no level can be resolved.
        return None

    def find_level_by_paragraph_style_id(self, style_id):
        """Return the level whose paragraph style is ``style_id``, or ``None``."""
        return self._levels_by_paragraph_style_id.get(style_id)

    def _to_numbering_level(self, abstract_num_level):
        """Convert an abstract level to a document numbering level.

        ``None`` is passed through unchanged.
        """
        if abstract_num_level is None:
            return None
        else:
            return numbering_level(
                level_index=abstract_num_level.level_index,
                is_ordered=abstract_num_level.is_ordered,
            )
class Relationships(object):
    """Lookup table over a collection of relationships.

    Each relationship must expose ``relationship_id``, ``target`` and
    ``type`` attributes.  Targets can be looked up by relationship id or
    listed by relationship type.
    """

    def __init__(self, relationships):
        self._targets_by_id = {}
        self._targets_by_type = {}
        # A single pass fills both indexes, so a one-shot iterable such as a
        # generator is handled correctly.  A later relationship with the same
        # id replaces an earlier one.
        for relationship in relationships:
            self._targets_by_id[relationship.relationship_id] = relationship.target
            self._targets_by_type.setdefault(relationship.type, []).append(relationship.target)

    def find_target_by_relationship_id(self, key):
        """Return the target of the relationship with id ``key``.

        Raises ``KeyError`` when there is no such relationship.
        """
        return self._targets_by_id[key]

    def find_targets_by_type(self, relationship_type):
        """Return a new list of the targets with type ``relationship_type``.

        Targets keep the order in which the relationships were given.  An
        unknown type gives an empty list.  The lookup never changes the
        index, and the returned list is a copy, so callers cannot alter the
        stored targets through it.
        """
        return list(self._targets_by_type.get(relationship_type, ()))
def read_relationships_xml_element(element):
    """Build a ``Relationships`` from the relationship children of ``element``.

    Every ``relationships:Relationship`` child is converted with
    ``_read_relationship``, in document order.
    """
    relationship_elements = element.find_children("relationships:Relationship")
    relationships = [
        _read_relationship(relationship_element)
        for relationship_element in relationship_elements
    ]
    return Relationships(relationships)
}) 34 | 35 | return ElementTree.tostring(relationships, "UTF-8") 36 | 37 | 38 | def _generate_content_types_xml(content_types_xml): 39 | content_types_uri = "http://schemas.openxmlformats.org/package/2006/content-types" 40 | override_name = "{" + content_types_uri + "}Override" 41 | 42 | types = ElementTree.fromstring(content_types_xml) 43 | _add_or_update_element(types, override_name, "PartName", { 44 | "PartName": _style_map_absolute_path, 45 | "ContentType": "text/prs.mammoth.style-map", 46 | }) 47 | 48 | return ElementTree.tostring(types, "UTF-8") 49 | 50 | 51 | def _add_or_update_element(parent, name, identifying_attribute, attributes): 52 | existing_child = _find_child(parent, name, identifying_attribute, attributes) 53 | if existing_child is None: 54 | ElementTree.SubElement(parent, name, attributes) 55 | else: 56 | existing_child.attrib = attributes 57 | 58 | 59 | def _find_child(parent, name, identifying_attribute, attributes): 60 | for element in parent.iter(): 61 | if element.tag == name and element.get(identifying_attribute) == attributes.get(identifying_attribute): 62 | return element 63 | 64 | 65 | def read_style_map(fileobj): 66 | with open_zip(fileobj, "r") as zip_file: 67 | if zip_file.exists(_style_map_path): 68 | return zip_file.read_str(_style_map_path) 69 | 70 | 71 | -------------------------------------------------------------------------------- /mammoth/docx/styles_xml.py: -------------------------------------------------------------------------------- 1 | import collections 2 | 3 | 4 | class Styles(object): 5 | @staticmethod 6 | def create(paragraph_styles=None, character_styles=None, table_styles=None, numbering_styles=None): 7 | if paragraph_styles is None: 8 | paragraph_styles = {} 9 | if character_styles is None: 10 | character_styles = {} 11 | if table_styles is None: 12 | table_styles = {} 13 | if numbering_styles is None: 14 | numbering_styles = {} 15 | 16 | return Styles( 17 | paragraph_styles=paragraph_styles, 18 | 
def read_styles_xml_element(element):
    """Read the ``w:style`` children of ``element`` into a ``Styles``.

    Styles are grouped by their ``w:type`` attribute: ``paragraph``,
    ``character`` and ``table`` styles are stored as read by
    ``_read_style_element``, while ``numbering`` styles are stored as read by
    ``_read_numbering_style_element``.  Within a group, a later style with
    the same style id replaces an earlier one.

    A style whose ``w:type`` is missing or is not one of the types above is
    skipped.  Previously a missing ``w:type`` raised ``KeyError`` and aborted
    the whole read.
    """
    paragraph_styles = {}
    character_styles = {}
    table_styles = {}
    numbering_styles = {}
    styles_by_type = {
        "paragraph": paragraph_styles,
        "character": character_styles,
        "table": table_styles,
    }

    for style_element in element.find_children("w:style"):
        style = _read_style_element(style_element)
        # .get() rather than indexing: a style without a type is ignored in
        # the same way as a style with an unrecognised type.
        element_type = style_element.attributes.get("w:type")
        if element_type == "numbering":
            numbering_styles[style.style_id] = _read_numbering_style_element(style_element)
        else:
            style_set = styles_by_type.get(element_type)
            if style_set is not None:
                style_set[style.style_id] = style

    return Styles(
        paragraph_styles=paragraph_styles,
        character_styles=character_styles,
        table_styles=table_styles,
        numbering_styles=numbering_styles,
    )
def replace_fragment(uri, fragment):
    """Return ``uri`` with its fragment replaced by ``fragment``.

    Everything from the first "#" in ``uri`` onwards is dropped before
    "#" and ``fragment`` are appended; a ``uri`` without "#" simply has them
    appended.
    """
    without_fragment, _, _ = uri.partition("#")
    return without_fragment + "#" + fragment
class NullXmlElement(object):
    """Stand-in returned by ``find_child_or_null`` when a child is absent.

    It has no attributes and no children, and every lookup on it comes back
    empty, so lookups can be chained without checking for ``None`` at each
    step.
    """

    # Class-level containers shared by every use of the null element; they
    # are meant to stay empty and should not be mutated.
    attributes = {}
    children = []

    def find_child_or_null(self, name):
        """Return this null element again, whatever ``name`` is."""
        return self

    def find_child(self, name):
        """Return ``None``: a null element never has a child."""
        return None

    def find_children(self, name):
        """Return an empty ``XmlElementList``.

        ``XmlElement`` offers ``find_children``; providing it here as well
        means a chain that passes through a missing child does not fail with
        ``AttributeError``.
        """
        return XmlElementList([])
def tag(tag_names, attributes=None, collapsible=None, separator=None):
    """Build a ``Tag`` from loosely-typed arguments.

    ``tag_names`` may be a list of names or a single value, which is wrapped
    in a one-element list.  ``attributes`` defaults to a new empty dict, and
    ``collapsible`` is coerced to ``bool`` (so ``None`` becomes ``False``).
    ``separator`` is passed through unchanged.
    """
    names = tag_names if isinstance(tag_names, list) else [tag_names]
    tag_attributes = {} if attributes is None else attributes
    return Tag(
        tag_names=names,
        attributes=tag_attributes,
        collapsible=bool(collapsible),
        separator=separator,
    )
def _try_collapse(collapsed, node):
    """Try to merge ``node`` into the last entry of ``collapsed``.

    Merging happens only when ``collapsed`` is non-empty, both its last entry
    and ``node`` are ``Element`` instances, ``node`` is collapsible, and the
    two match according to ``_is_match``.  On a merge, the separator of
    ``node`` (if it has one) is appended to the last entry's children as
    text, followed by each child of ``node`` added through
    ``_collapsing_add``.

    Returns ``True`` when ``node`` was merged, ``False`` when the caller
    still has to add it.  The last entry's children list is mutated in place.
    """
    if not collapsed:
        return False

    previous = collapsed[-1]
    both_are_elements = isinstance(previous, Element) and isinstance(node, Element)
    # The remaining checks only run for elements, preserving short-circuiting.
    if not (both_are_elements and node.collapsible and _is_match(previous, node)):
        return False

    merged_children = previous.children
    if node.separator:
        merged_children.append(text(node.separator))

    for child in node.children:
        _collapsing_add(merged_children, child)

    return True
@cobble.data
class Element(Node):
    """HTML element node: a tag together with its child nodes.

    The read-only properties below forward to the tag, so an element can be
    inspected without reaching through ``.tag``.
    """

    # The tag describing this element.
    tag = cobble.field()
    # The child nodes of this element.
    children = cobble.field()

    @property
    def tag_name(self):
        """The ``tag_name`` of the tag."""
        return self.tag.tag_name

    @property
    def tag_names(self):
        """The ``tag_names`` of the tag."""
        return self.tag.tag_names

    @property
    def attributes(self):
        """The ``attributes`` of the tag."""
        return self.tag.attributes

    @property
    def collapsible(self):
        """The ``collapsible`` flag of the tag."""
        return self.tag.collapsible

    @property
    def separator(self):
        """The ``separator`` of the tag."""
        return self.tag.separator

    _VOID_TAG_NAMES = {"br", "hr", "img", "input"}

    def is_void(self):
        """Return whether this element has no children and a void tag name."""
        if self.children:
            return False
        return self.tag_name in self._VOID_TAG_NAMES
def element(names, attributes=None, class_names=None, fresh=None, separator=None):
    """Build an `HtmlPathElement` for the given tag names.

    Args:
        names: tag names, passed to `html.tag` as `tag_names`.
        attributes: optional mapping of attribute names to values. It is
            copied, so the caller's mapping is never modified.
        class_names: optional list of class names. When non-empty, they are
            joined with single spaces and stored under the "class"
            attribute, replacing any "class" entry from `attributes`.
        fresh: when truthy, the resulting tag is not collapsible. None is
            treated as False.
        separator: passed through to `html.tag` unchanged.

    Returns:
        An `HtmlPathElement` wrapping the tag created by `html.tag`.
    """
    # Work on a private copy: writing "class" into the caller's dict would
    # leak the class names into every later use of that dict.
    attributes = {} if attributes is None else dict(attributes)

    if class_names:
        attributes["class"] = " ".join(class_names)

    return HtmlPathElement(html.tag(
        tag_names=names,
        attributes=attributes,
        # `not None` and `not False` are both True, so an omitted `fresh`
        # yields a collapsible tag.
        collapsible=not fresh,
        separator=separator,
    ))
def unique(values):
    """Return the distinct items of `values` as a list, in first-seen order.

    Items are tracked in a set, so they must be hashable.
    """
    already_seen = set()
    distinct = []

    for item in values:
        if item in already_seen:
            continue
        already_seen.add(item)
        distinct.append(item)

    return distinct
import lists, results 3 | 4 | 5 | def read_options(options): 6 | custom_style_map_text = options.pop("style_map", "") or "" 7 | embedded_style_map_text = options.pop("embedded_style_map", "") or "" 8 | include_default_style_map = options.pop("include_default_style_map", True) 9 | 10 | read_style_map_result = results.combine([ 11 | _read_style_map(custom_style_map_text), 12 | _read_style_map(embedded_style_map_text), 13 | ]) 14 | 15 | custom_style_map, embedded_style_map = read_style_map_result.value 16 | style_map = custom_style_map + embedded_style_map 17 | 18 | if include_default_style_map: 19 | style_map += _default_style_map 20 | 21 | options["ignore_empty_paragraphs"] = options.get("ignore_empty_paragraphs", True) 22 | options["style_map"] = style_map 23 | return read_style_map_result.map(lambda _: options) 24 | 25 | 26 | def _read_style_map(style_text): 27 | lines = filter(None, map(_get_line, style_text.split("\n"))) 28 | return results.combine(lists.map(read_style_mapping, lines)) \ 29 | .map(lambda style_mappings: lists.filter(None, style_mappings)) 30 | 31 | 32 | def _get_line(line): 33 | line = line.strip() 34 | if line.startswith("#"): 35 | return None 36 | else: 37 | return line 38 | 39 | 40 | _default_style_map_result = _read_style_map(""" 41 | p.Heading1 => h1:fresh 42 | p.Heading2 => h2:fresh 43 | p.Heading3 => h3:fresh 44 | p.Heading4 => h4:fresh 45 | p.Heading5 => h5:fresh 46 | p.Heading6 => h6:fresh 47 | p[style-name='Heading 1'] => h1:fresh 48 | p[style-name='Heading 2'] => h2:fresh 49 | p[style-name='Heading 3'] => h3:fresh 50 | p[style-name='Heading 4'] => h4:fresh 51 | p[style-name='Heading 5'] => h5:fresh 52 | p[style-name='Heading 6'] => h6:fresh 53 | p[style-name='heading 1'] => h1:fresh 54 | p[style-name='heading 2'] => h2:fresh 55 | p[style-name='heading 3'] => h3:fresh 56 | p[style-name='heading 4'] => h4:fresh 57 | p[style-name='heading 5'] => h5:fresh 58 | p[style-name='heading 6'] => h6:fresh 59 | 60 | # Apple Pages 61 | p.Heading => 
h1:fresh 62 | p[style-name='Heading'] => h1:fresh 63 | 64 | r[style-name='Strong'] => strong 65 | 66 | p[style-name='footnote text'] => p:fresh 67 | r[style-name='footnote reference'] => 68 | p[style-name='endnote text'] => p:fresh 69 | r[style-name='endnote reference'] => 70 | p[style-name='annotation text'] => p:fresh 71 | r[style-name='annotation reference'] => 72 | 73 | # LibreOffice 74 | p[style-name='Footnote'] => p:fresh 75 | r[style-name='Footnote anchor'] => 76 | p[style-name='Endnote'] => p:fresh 77 | r[style-name='Endnote anchor'] => 78 | 79 | p:unordered-list(1) => ul > li:fresh 80 | p:unordered-list(2) => ul|ol > li > ul > li:fresh 81 | p:unordered-list(3) => ul|ol > li > ul|ol > li > ul > li:fresh 82 | p:unordered-list(4) => ul|ol > li > ul|ol > li > ul|ol > li > ul > li:fresh 83 | p:unordered-list(5) => ul|ol > li > ul|ol > li > ul|ol > li > ul|ol > li > ul > li:fresh 84 | p:ordered-list(1) => ol > li:fresh 85 | p:ordered-list(2) => ul|ol > li > ol > li:fresh 86 | p:ordered-list(3) => ul|ol > li > ul|ol > li > ol > li:fresh 87 | p:ordered-list(4) => ul|ol > li > ul|ol > li > ul|ol > li > ol > li:fresh 88 | p:ordered-list(5) => ul|ol > li > ul|ol > li > ul|ol > li > ul|ol > li > ol > li:fresh 89 | 90 | r[style-name='Hyperlink'] => 91 | 92 | p[style-name='Normal'] => p:fresh 93 | 94 | # Apple Pages 95 | p.Body => p:fresh 96 | p[style-name='Body'] => p:fresh 97 | """) 98 | 99 | 100 | assert not _default_style_map_result.messages 101 | _default_style_map = _default_style_map_result.value 102 | -------------------------------------------------------------------------------- /mammoth/raw_text.py: -------------------------------------------------------------------------------- 1 | from . 
def combine(results):
    """Merge several results into one.

    The combined result's value is the list of each input's value, in
    order, and its messages are all the inputs' messages concatenated in
    order (`Result` then removes duplicates).

    `results` is consumed in a single pass.
    """
    collected_values = []
    collected_messages = []

    for item in results:
        collected_values.append(item.value)
        collected_messages.extend(item.messages)

    return Result(collected_values, collected_messages)
def read_style_mapping(string):
    """Parse a single style-mapping line.

    Returns a successful result holding the parsed style mapping. If the
    line cannot be parsed (a `LineParseError` is raised), returns a result
    whose value is None and which carries one warning quoting the line.
    """
    try:
        style_mapping = parse_style_mapping(TokenIterator(tokenise(string)))
    except LineParseError:
        message = "Did not understand this style mapping, so ignored it: " + string
        return results.Result(None, [results.warning(message)])

    return results.success(style_mapping)
def _parse_style_name(tokens):
    """Parse an optional `[style-name<matcher>]` clause.

    Returns None, consuming nothing, when the next token is not "[".
    Otherwise consumes the whole bracketed clause and returns the string
    matcher produced by `_parse_string_matcher`.
    """
    if not tokens.try_skip(TokenType.SYMBOL, "["):
        return None

    tokens.skip(TokenType.IDENTIFIER, "style-name")
    matcher = _parse_string_matcher(tokens)
    tokens.skip(TokenType.SYMBOL, "]")
    return matcher
def _parse_break(tokens):
    """Parse the `[type='...']` clause that follows `br`.

    Returns the document matcher for a line, page or column break.

    Raises:
        LineParseError: if the clause is malformed or names an unknown
            break type.
    """
    opening_tokens = (
        (TokenType.SYMBOL, "["),
        (TokenType.IDENTIFIER, "type"),
        (TokenType.SYMBOL, "="),
    )
    for token_type, token_value in opening_tokens:
        tokens.skip(token_type, token_value)

    break_type = parse_string(tokens)
    tokens.skip(TokenType.SYMBOL, "]")

    if break_type == "line":
        return document_matchers.line_break
    if break_type == "page":
        return document_matchers.page_break
    if break_type == "column":
        return document_matchers.column_break

    raise LineParseError("Unrecognised break type: {0}".format(break_type))
def _parse_element(tokens):
    """Parse one element of an HTML path.

    Reads, in order: the tag names, any attributes/class names, the
    optional `:fresh` marker and the optional `:separator(...)` clause,
    then builds the element with `html_paths.element`.
    """
    names = _parse_tag_names(tokens)
    parsed_attributes = _parse_attribute_or_class_names(tokens)
    fresh = _parse_is_fresh(tokens)
    separator = _parse_separator(tokens)

    merged = {}
    for entry in parsed_attributes:
        existing = merged.get(entry.name)
        if entry.append and existing:
            # Appending entries (class names) accumulate, space-separated.
            merged[entry.name] = existing + " " + entry.value
        else:
            # Plain attributes, or the first value seen, overwrite.
            merged[entry.name] = entry.value

    return html_paths.element(
        names,
        attributes=merged,
        fresh=fresh,
        separator=separator,
    )
def _try_parse_attribute_or_class_name(tokens):
    """Parse an attribute (`[name='value']`) or class name (`.name`) if present.

    Returns the parsed item, or None (consuming nothing) when the next
    token starts neither form.
    """
    parsers_by_symbol = (
        ("[", _parse_attribute),
        (".", _parse_class_name),
    )
    for symbol, parse in parsers_by_symbol:
        if tokens.is_next(TokenType.SYMBOL, symbol):
            return parse(tokens)

    return None
class TokenIterator(object):
    """Cursor over a list of tokens, used by the style-mapping parsers.

    Each token is expected to expose `type` and `value` attributes. No
    bounds checking is done: reading past the last token raises
    IndexError.
    """

    def __init__(self, tokens):
        self._tokens = tokens
        self._index = 0

    def peek_token_type(self):
        """Return the type of the current token without consuming it."""
        return self._tokens[self._index].type

    def next_value(self, token_type=None):
        """Consume the current token and return its value.

        When `token_type` is given, the token must have that type,
        otherwise LineParseError is raised.
        """
        return self._next(token_type).value

    def _next(self, token_type=None):
        current = self._tokens[self._index]
        if token_type is not None and current.type != token_type:
            raise self._unexpected_token_type(token_type, current)

        self._index += 1
        return current

    def skip(self, token_type, token_value=None):
        """Consume the current token, which must match; return True.

        Raises LineParseError if the token does not match.
        """
        if not self.is_next(token_type, token_value):
            raise self._unexpected_token_type(token_type, self._tokens[self._index])

        self._index += 1
        return True

    def try_skip(self, token_type, token_value=None):
        """Consume the current token if it matches; return whether it did."""
        if not self.is_next(token_type, token_value):
            return False

        self._index += 1
        return True

    def try_skip_many(self, tokens):
        """Consume a run of tokens matching `tokens`, all or nothing.

        `tokens` is an iterable of `(token_type, token_value)` pairs, where
        a value of None matches any value. On any mismatch the position is
        restored and False is returned.
        """
        checkpoint = self._index
        for token_type, token_value in tokens:
            if self.is_next(token_type, token_value):
                self._index += 1
            else:
                self._index = checkpoint
                return False

        return True

    def is_next(self, token_type, token_value=None):
        """Return whether the current token has the given type (and value)."""
        current = self._tokens[self._index]
        if current.type != token_type:
            return False
        return token_value is None or current.value == token_value

    def _unexpected_token_type(self, token_type, token):
        # Raises directly, so the `raise` at each call site never gets a
        # value to raise.
        raise LineParseError()
match = regex.match(value, index) 29 | if match is not None: 30 | tokens.append(Token(index, token_type, match.group(0))) 31 | index = match.end() 32 | break 33 | else: 34 | # Should be impossible 35 | raise Exception("Remaining: " + value[index:]) 36 | 37 | tokens.append(Token(index, TokenType.END, "")) 38 | 39 | return tokens 40 | 41 | return tokenise 42 | 43 | 44 | def _to_regex(value): 45 | if hasattr(value, "match"): 46 | return value 47 | else: 48 | return re.compile(value) 49 | 50 | 51 | _string_prefix = r"'(?:\\.|[^'])*" 52 | _identifier_character = r"(?:[a-zA-Z\-_]|\\.)" 53 | 54 | tokenise = regex_tokeniser([ 55 | (TokenType.IDENTIFIER, _identifier_character + "(?:" + _identifier_character + "|[0-9])*"), 56 | (TokenType.SYMBOL, r":|>|=>|\^=|=|\(|\)|\[|\]|\||!|\."), 57 | (TokenType.WHITESPACE, r"\s+"), 58 | (TokenType.STRING, _string_prefix + "'"), 59 | (TokenType.UNTERMINATED_STRING, _string_prefix), 60 | (TokenType.INTEGER, "([0-9]+)"), 61 | ]) 62 | -------------------------------------------------------------------------------- /mammoth/transforms.py: -------------------------------------------------------------------------------- 1 | from . 
def _visit_descendants(element, visit):
    """Call `visit` on every descendant of `element`.

    Only instances of `documents.HasChildren` are descended into. Each
    child's own descendants are visited before the child itself;
    `element` is never passed to `visit`.
    """
    if not isinstance(element, documents.HasChildren):
        return

    for descendant in element.children:
        _visit_descendants(descendant, visit)
        visit(descendant)
class Writer(object):
    """Interface shared by the output writers.

    NOTE(review): `__metaclass__` is the Python 2 way of setting a
    metaclass; Python 3 does not interpret this attribute, so the abstract
    methods are presumably not enforced there -- confirm before relying on
    instantiation failing.
    """

    __metaclass__ = abc.ABCMeta

    @abc.abstractmethod
    def text(self, text):
        """Write `text` as text content."""

    @abc.abstractmethod
    def start(self, name, attributes=None):
        """Open the element `name` with the optional `attributes`."""

    @abc.abstractmethod
    def end(self, name):
        """Close the element `name`."""

    @abc.abstractmethod
    def self_closing(self, name, attributes=None):
        """Write the element `name` with no content."""

    @abc.abstractmethod
    def append(self, html):
        """Append `html` to the output."""

    @abc.abstractmethod
    def as_string(self):
        """Return everything written so far as a single string."""
attributes=None): 15 | attribute_string = _generate_attribute_string(attributes) 16 | self._fragments.append("<{0}{1}>".format(name, attribute_string)) 17 | 18 | def end(self, name): 19 | self._fragments.append("{0}>".format(name)) 20 | 21 | def self_closing(self, name, attributes=None): 22 | attribute_string = _generate_attribute_string(attributes) 23 | self._fragments.append("<{0}{1} />".format(name, attribute_string)) 24 | 25 | def append(self, html): 26 | self._fragments.append(html) 27 | 28 | def as_string(self): 29 | return "".join(self._fragments) 30 | 31 | 32 | def _escape_html(text): 33 | return escape(text, {'"': """}) 34 | 35 | 36 | def _generate_attribute_string(attributes): 37 | if attributes is None: 38 | return "" 39 | else: 40 | return "".join( 41 | ' {0}="{1}"'.format(key, _escape_html(attributes[key])) 42 | for key in sorted(attributes) 43 | ) 44 | -------------------------------------------------------------------------------- /mammoth/writers/markdown.py: -------------------------------------------------------------------------------- 1 | from __future__ import unicode_literals 2 | 3 | from .abc import Writer 4 | 5 | import re 6 | 7 | 8 | class _WriterOutput(object): 9 | def __init__(self, start, end=None, generate_end=None, anchor_position=None): 10 | if generate_end is None: 11 | generate_end = _constant(end) 12 | 13 | self.start = start 14 | self.generate_end = generate_end 15 | self.anchor_position = anchor_position 16 | 17 | 18 | def _constant(value): 19 | def get(): 20 | return value 21 | 22 | return get 23 | 24 | 25 | class _MarkdownState(object): 26 | def __init__(self): 27 | self._list_state_stack = [] 28 | self.list_state = None 29 | self.list_item_has_closed = False 30 | 31 | def update_list_state(self, list_state): 32 | self._list_state_stack.append(self.list_state) 33 | self.list_state = list_state 34 | 35 | def pop_list_state(self): 36 | self.list_state = self._list_state_stack.pop() 37 | 38 | 39 | class _MarkdownListState(object): 
def _image(attributes, markdown_state):
    """Return the writer output for an `img` element.

    Args:
        attributes: the element's attributes; "src" and "alt" are read and
            default to the empty string.
        markdown_state: unused; present so that every writer shares the
            same signature.

    Returns:
        A `_WriterOutput` whose start is the Markdown image `![alt](src)`
        when either attribute is non-empty, otherwise the default (empty)
        output.
    """
    src = attributes.get("src", "")
    alt_text = attributes.get("alt", "")

    if not (src or alt_text):
        return _default_output

    # The template must contain the image syntax: formatting an empty
    # string discards both arguments and the image is silently dropped.
    return _WriterOutput("![{0}]({1})".format(alt_text, src), "")
def _init_writers():
    """Build the mapping from HTML tag name to Markdown writer.

    Each writer is called with `(attributes, markdown_state)`. Besides the
    entries listed explicitly, `h1` to `h6` are mapped to ATX headings
    with the matching number of "#" characters.
    """
    writers = {
        "p": _Wrapped("", "\n\n"),
        # A Markdown hard line break is two (or more) trailing spaces
        # followed by a newline; a single space is just a soft break.
        "br": _Wrapped("", "  \n"),
        "strong": _symmetric_wrapped("__"),
        "em": _symmetric_wrapped("*"),
        "a": _hyperlink,
        "img": _image,
        "ol": _list(ordered=True),
        "ul": _list(ordered=False),
        "li": _list_item,
    }

    for level in range(1, 7):
        writers["h{0}".format(level)] = _Wrapped("#" * level + " ", "\n\n")

    return writers
def update_zip(fileobj, files):
    """Rewrite the zip archive held in ``fileobj`` in place.

    ``fileobj`` is a file-like object containing a zip archive; it is read
    with ``ZipFile``, then sought, written and truncated, so it must
    support all of those operations. ``files`` maps entry names to their
    new contents. Entries named in ``files`` are replaced or added, and
    every other entry is copied over from the existing archive.

    Existing entries keep their original order and new entries are
    appended in the iteration order of ``files``, so the result is
    deterministic.
    """
    destination_fileobj = io.BytesIO()
    with ZipFile(fileobj, "r") as source:
        with ZipFile(destination_fileobj, "w") as destination:
            written = set()
            for name in list(source.namelist()) + list(files):
                if name in written:
                    # Either a duplicate entry in the source archive or a
                    # replaced name that has already been handled.
                    continue
                written.add(name)
                if name in files:
                    contents = files[name]
                else:
                    contents = source.read(name)
                destination.writestr(name, contents)

    fileobj.seek(0)
    destination_fileobj.seek(0)
    shutil.copyfileobj(destination_fileobj, fileobj)
    # The new archive may be shorter than the old one. Without truncating,
    # stale trailing bytes (including the old end-of-central-directory
    # record) would remain and corrupt the archive.
    fileobj.truncate()
join_path(*args): 68 | non_empty_paths = list(filter(None, args)) 69 | 70 | relevant_paths = [] 71 | for path in non_empty_paths: 72 | if path.startswith("/"): 73 | relevant_paths = [path] 74 | else: 75 | relevant_paths.append(path) 76 | 77 | return "/".join(relevant_paths) 78 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools"] 3 | build-backend = "setuptools.build_meta" 4 | -------------------------------------------------------------------------------- /recipes/wmf_images.py: -------------------------------------------------------------------------------- 1 | import io 2 | import os 3 | import shutil 4 | import subprocess 5 | import tempfile 6 | 7 | 8 | # An example of how to use LibreOffice and ImageMagick to convert WMF images to 9 | # PNGs. 10 | # 11 | # libreoffice_wmf_conversion uses LibreOffice to convert the image to a PNG. 12 | # This normally creates an image with a large amount of padding, so 13 | # imagemagick_trim can be used to trim the image. 14 | # 15 | # The image can then be converted using a normal image handler, such as 16 | # mammoth.images.data_uri. 
def libreoffice_wmf_conversion(image, post_process=None):
    """Convert a WMF or EMF image to PNG by running LibreOffice.

    The image's ``content_type`` is looked up in ``_wmf_extensions``. If it
    has no entry, ``image`` is returned unchanged and ``post_process`` is
    not applied. Otherwise the image data is written into a temporary
    directory, converted with a headless ``libreoffice --convert-to png``
    run, and a copy of ``image`` is built with content type ``image/png``
    whose ``open`` yields the converted bytes. If ``post_process`` is given
    it is called with that copy and its result is returned; otherwise the
    copy itself is returned.

    ``subprocess.check_call`` raises ``subprocess.CalledProcessError`` when
    LibreOffice exits with a non-zero status. The temporary directory is
    removed in every case.
    """
    extension = _wmf_extensions.get(image.content_type)
    if extension is None:
        return image

    work_dir = tempfile.mkdtemp()
    try:
        source_path = os.path.join(work_dir, "image" + extension)
        with io.open(source_path, "wb") as source_fileobj, image.open() as image_fileobj:
            shutil.copyfileobj(image_fileobj, source_fileobj)

        command = [
            "libreoffice",
            "--headless",
            "--convert-to",
            "png",
            source_path,
            "--outdir",
            work_dir,
        ]
        subprocess.check_call(command)

        # LibreOffice names its output after the input file's stem.
        with io.open(os.path.join(work_dir, "image.png"), "rb") as png_fileobj:
            png_bytes = png_fileobj.read()

        converted = image.copy(
            content_type="image/png",
            open=lambda: io.BytesIO(png_bytes),
        )
        if post_process is None:
            return converted
        return post_process(converted)
    finally:
        shutil.rmtree(work_dir)
def read(fname):
    """Return the contents of ``fname`` as a string.

    ``fname`` is joined onto the directory containing this file, so a
    relative name is looked up next to this script rather than in the
    current working directory.
    """
    path = os.path.join(os.path.dirname(__file__), fname)
    # Use a context manager so the handle is closed deterministically
    # instead of being left for the garbage collector.
    with open(path) as fileobj:
        return fileobj.read()
-------------------------------------------------------------------------------- /test-requirements.txt: -------------------------------------------------------------------------------- 1 | funk>=0.4,<0.5 2 | pytest 3 | precisely==0.1.3 4 | pyflakes==2.4.0 5 | spur.local>=0.3.7,<0.4 6 | tempman>=0.1.2,<0.2 7 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /tests/cli_tests.py: -------------------------------------------------------------------------------- 1 | import os 2 | import base64 3 | 4 | import spur 5 | import tempman 6 | 7 | from .testing import assert_equal, generate_test_path 8 | 9 | 10 | _local = spur.LocalShell() 11 | 12 | 13 | def test_html_is_printed_to_stdout_if_output_file_is_not_set(): 14 | docx_path = generate_test_path("single-paragraph.docx") 15 | result = _local.run(["mammoth", docx_path]) 16 | assert_equal(b"", result.stderr_output) 17 | assert_equal(b"
Walking on imported air
", result.output) 18 | 19 | 20 | def test_html_is_written_to_file_if_output_file_is_set(): 21 | with tempman.create_temp_dir() as temp_dir: 22 | output_path = os.path.join(temp_dir.path, "output.html") 23 | docx_path = generate_test_path("single-paragraph.docx") 24 | result = _local.run(["mammoth", docx_path, output_path]) 25 | assert_equal(b"", result.stderr_output) 26 | assert_equal(b"", result.output) 27 | with open(output_path) as output_file: 28 | assert_equal("Walking on imported air
", output_file.read()) 29 | 30 | 31 | _image_base_64 = b"iVBORw0KGgoAAAANSUhEUgAAAAoAAAAKCAIAAAACUFjqAAAAAXNSR0IArs4c6QAAAAlwSFlzAAAOvgAADr4B6kKxwAAAABNJREFUKFNj/M+ADzDhlWUYqdIAQSwBE8U+X40AAAAASUVORK5CYII=" 32 | 33 | 34 | def test_inline_images_are_included_in_output_if_writing_to_single_file(): 35 | docx_path = generate_test_path("tiny-picture.docx") 36 | result = _local.run(["mammoth", docx_path]) 37 | assert_equal(b"""