├── .github
    ├── ISSUE_TEMPLATE.md
    ├── pull_request_template.md
    └── workflows
    │   └── tests.yml
├── .gitignore
├── LICENSE
├── NEWS
├── README.md
├── makefile
├── mammoth
    ├── __init__.py
    ├── cli.py
    ├── conversion.py
    ├── document_matchers.py
    ├── documents.py
    ├── docx
    │   ├── __init__.py
    │   ├── body_xml.py
    │   ├── comments_xml.py
    │   ├── complex_fields.py
    │   ├── content_types_xml.py
    │   ├── dingbats.py
    │   ├── document_xml.py
    │   ├── files.py
    │   ├── notes_xml.py
    │   ├── numbering_xml.py
    │   ├── office_xml.py
    │   ├── relationships_xml.py
    │   ├── style_map.py
    │   ├── styles_xml.py
    │   ├── uris.py
    │   └── xmlparser.py
    ├── html
    │   ├── __init__.py
    │   └── nodes.py
    ├── html_paths.py
    ├── images.py
    ├── lists.py
    ├── options.py
    ├── raw_text.py
    ├── results.py
    ├── styles
    │   ├── __init__.py
    │   └── parser
    │   │   ├── __init__.py
    │   │   ├── document_matcher_parser.py
    │   │   ├── errors.py
    │   │   ├── html_path_parser.py
    │   │   ├── style_mapping_parser.py
    │   │   ├── token_iterator.py
    │   │   ├── token_parser.py
    │   │   └── tokeniser.py
    ├── transforms.py
    ├── underline.py
    ├── writers
    │   ├── __init__.py
    │   ├── abc.py
    │   ├── html.py
    │   └── markdown.py
    └── zips.py
├── pyproject.toml
├── recipes
    └── wmf_images.py
├── setup.cfg
├── setup.py
├── test-requirements.txt
├── tests
    ├── __init__.py
    ├── cli_tests.py
    ├── conftest.py
    ├── conversion_tests.py
    ├── docx
    │   ├── __init__.py
    │   ├── body_xml_tests.py
    │   ├── comments_xml_tests.py
    │   ├── content_types_xml_tests.py
    │   ├── document_matchers.py
    │   ├── document_xml_tests.py
    │   ├── docx_tests.py
    │   ├── files_tests.py
    │   ├── notes_xml_tests.py
    │   ├── numbering_xml_tests.py
    │   ├── office_xml_tests.py
    │   ├── relationships_xml_tests.py
    │   ├── style_map_tests.py
    │   ├── styles_xml_tests.py
    │   ├── uris_tests.py
    │   └── xmlparser_tests.py
    ├── html
    │   ├── __init__.py
    │   ├── collapse_tests.py
    │   └── strip_empty_tests.py
    ├── images_tests.py
    ├── lists_tests.py
    ├── mammoth_tests.py
    ├── options_tests.py
    ├── raw_text_tests.py
    ├── styles
    │   ├── __init__.py
    │   ├── document_matcher_tests.py
    │   └── parser
    │   │   ├── __init__.py
    │   │   ├── document_matcher_parser_tests.py
    │   │   ├── html_path_parser_tests.py
    │   │   ├── style_mapping_parser_tests.py
    │   │   ├── token_parser_tests.py
    │   │   └── tokeniser_tests.py
    ├── test-data
    │   ├── comments.docx
    │   ├── embedded-style-map.docx
    │   ├── empty.docx
    │   ├── endnotes.docx
    │   ├── external-picture.docx
    │   ├── footnote-hyperlink.docx
    │   ├── footnotes.docx
    │   ├── hyperlinks
    │   │   └── word
    │   │   │   ├── _rels
    │   │   │       └── document.xml.rels
    │   │   │   └── document.xml
    │   ├── simple-list.docx
    │   ├── simple
    │   │   └── word
    │   │   │   └── document.xml
    │   ├── single-paragraph.docx
    │   ├── strict-format.docx
    │   ├── strikethrough.docx
    │   ├── tables.docx
    │   ├── text-box.docx
    │   ├── tiny-picture-target-base-relative.docx
    │   ├── tiny-picture.docx
    │   ├── tiny-picture.png
    │   ├── underline.docx
    │   └── utf8-bom.docx
    ├── testing.py
    ├── transforms_tests.py
    ├── writers
    │   ├── __init__.py
    │   └── markdown_tests.py
    └── zips_tests.py
└── tox.ini


/.github/ISSUE_TEMPLATE.md:
--------------------------------------------------------------------------------
 1 | If you're reporting a bug or requesting a feature, please include:
 2 | * a minimal example document
 3 | * the HTML output that you'd expect
 4 | 
 5 | If you're reporting a bug, it's also useful to know what platform you're
 6 | running on, including:
 7 | 
 8 | * the version of Python
 9 | * the operating system and version
10 | 


--------------------------------------------------------------------------------
/.github/pull_request_template.md:
--------------------------------------------------------------------------------
1 | In general, pull requests are not currently accepted.
2 | 
3 | Please instead submit an issue if you find a bug or would like to request a feature.
4 | 


--------------------------------------------------------------------------------
/.github/workflows/tests.yml:
--------------------------------------------------------------------------------
 1 | name: Tests
 2 | 
 3 | on: [push, pull_request]
 4 | 
 5 | jobs:
 6 |     build:
 7 |         runs-on: ubuntu-22.04
 8 | 
 9 |         strategy:
10 |             matrix:
11 |                 python-version: ["3.7", "3.8", "3.9", "3.10", "3.11", "3.12", "pypy3.9"]
12 | 
13 |         steps:
14 | 
15 |         - uses: actions/checkout@v4
16 | 
17 |         - name: Use Python ${{ matrix.python-version }}
18 |           uses: actions/setup-python@v5
19 |           with:
20 |               python-version: ${{ matrix.python-version }}
21 | 
22 |         - run: pip install tox
23 | 
24 |         - run: make README
25 | 
26 |         - run: tox -e py
27 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | *.pyc
2 | /README
3 | /_virtualenv
4 | /*.egg-info
5 | /.tox
6 | /MANIFEST
7 | /build
8 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | Copyright (c) 2013, Michael Williamson
 2 | All rights reserved.
 3 | 
 4 | Redistribution and use in source and binary forms, with or without
 5 | modification, are permitted provided that the following conditions are met: 
 6 | 
 7 | 1. Redistributions of source code must retain the above copyright notice, this
 8 |    list of conditions and the following disclaimer. 
 9 | 2. Redistributions in binary form must reproduce the above copyright notice,
10 |    this list of conditions and the following disclaimer in the documentation
11 |    and/or other materials provided with the distribution. 
12 | 
13 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
14 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
15 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
16 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
17 | ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
18 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
19 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
20 | ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
21 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
22 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
23 | 


--------------------------------------------------------------------------------
/NEWS:
--------------------------------------------------------------------------------
  1 | # 1.10.0
  2 | 
  3 | * Add "Heading" and "Body" styles, as found in documents created by Apple Pages,
  4 |   to the default style map.
  5 | 
  6 | * Handle structured document tags representing checkboxes wrapped in other
  7 |   elements, such as table cells. Previously, the wrapping elements would have
  8 |   been ignored.
  9 | 
 10 | * Ignore deleted table rows.
 11 | 
 12 | # 1.9.1
 13 | 
 14 | * Ignore AlternateContent elements when there is no Fallback element.
 15 | 
 16 | # 1.9.0
 17 | 
 18 | * Detect checkboxes, both as complex fields and structured document tags, and
 19 |   convert them to checkbox inputs.
 20 | 
 21 | * Ignore AlternateContent elements when there is no Fallback element.
 22 | 
 23 | # 1.8.0
 24 | 
 25 | * Add style mapping for highlights.
 26 | 
 27 | # 1.7.1
 28 | 
 29 | * Switch the precedence of numbering properties in paragraph properties and the
 30 |   numbering in paragraph styles so that the numbering properties in paragraph
 31 |   properties take precedence.
 32 | 
 33 | # 1.7.0
 34 | 
 35 | * Support attributes in HTML paths in style mappings.
 36 | 
 37 | * Improve error message when failing to find the body element in a document.
 38 | 
 39 | * Drop support for Python 2.7, Python 3.5 and Python 3.6.
 40 | 
 41 | * Add support for the strict document format.
 42 | 
 43 | # 1.6.0
 44 | 
 45 | * Support merged paragraphs when revisions are tracked.
 46 | 
 47 | # 1.5.1
 48 | 
 49 | * Add a pyproject.toml to add an explicit build dependency on setuptools.
 50 | 
 51 | # 1.5.0
 52 | 
 53 | * Only use the alt text of image elements as a fallback. If an alt attribute is
 54 |   returned from the function passed to mammoth.images.img_element, that value
 55 |   will now be preferred to the alt text of the image element.
 56 | 
 57 | # 1.4.19
 58 | 
 59 | * Ignore w:u elements when w:val is missing.
 60 | 
 61 | # 1.4.18
 62 | 
 63 | * Emit warning instead of throwing exception when image file cannot be found for
 64 |   a:blip elements.
 65 | 
 66 | # 1.4.17
 67 | 
 68 | * When extracting raw text, convert tab elements to tab characters.
 69 | 
 70 | * Handle internal hyperlinks created with complex fields.
 71 | 
 72 | # 1.4.16
 73 | 
 74 | * Handle w:num with invalid w:abstractNumId.
 75 | 
 76 | # 1.4.15
 77 | 
 78 | * Convert symbols in supported fonts to corresponding Unicode characters.
 79 | 
 80 | # 1.4.14
 81 | 
 82 | * Support numbering defined by paragraph style.
 83 | 
 84 | # 1.4.13
 85 | 
 86 | * Add style mapping for all caps.
 87 | 
 88 | # 1.4.12
 89 | 
 90 | * Handle underline elements where w:val is "none".
 91 | 
 92 | # 1.4.11
 93 | 
 94 | * Read font size for runs.
 95 | * Support soft hyphens.
 96 | 
 97 | # 1.4.10
 98 | 
 99 | * Update supported Python versions to 2.7 and 3.4 to 3.8.
100 | 
101 | # 1.4.9
102 | 
103 | * Improve list support by following w:numStyleLink in w:abstractNum.
104 | 
105 | # 1.4.8
106 | 
107 | * Preserve empty table rows.
108 | 
109 | # 1.4.7
110 | 
111 | * Always write files as UTF-8 in the CLI.
112 | 
113 | # 1.4.6
114 | 
115 | * Fix: default style mappings caused footnotes, endnotes and comments
116 |   containing multiple paragraphs to be converted into a single paragraph.
117 | 
118 | # 1.4.5
119 | 
120 | * Read the children of v:rect elements.
121 | 
122 | # 1.4.4
123 | 
124 | * Parse paragraph indents.
125 | 
126 | * Read part paths using relationships. This improves support for documents
127 |   created by Word Online.
128 | 
129 | # 1.4.3
130 | 
131 | * Add style mapping for small caps.
132 | 
133 | * Add style mapping for tables.
134 | 
135 | # 1.4.2
136 | 
137 | * Read children of v:group elements.
138 | 
139 | # 1.4.1
140 | 
141 | * Read w:noBreakHyphen elements as non-breaking hyphen characters.
142 | 
143 | # 1.4.0
144 | 
145 | * Extract the default data URI image converter to the images module.
146 | 
147 | * Add anchor on hyperlinks as fragment if present.
148 | 
149 | * Convert target frames on hyperlinks to targets on anchors.
150 | 
151 | * Detect header rows in tables and convert to thead > tr > th.
152 | 
153 | # 1.3.5
154 | 
155 | * Handle complex fields that do not have a "separate" fldChar.
156 | 
157 | # 1.3.4
158 | 
159 | * Add transforms.run.
160 | 
161 | # 1.3.3
162 | 
163 | * Read children of w:object elements.
164 | 
165 | * Add support for document transforms.
166 | 
167 | # 1.3.2
168 | 
169 | * Handle hyperlinks created with complex fields.
170 | 
171 | # 1.3.1
172 | 
173 | * Handle absolute paths within zip files. This should fix an issue where some
174 |   images within a document couldn't be found.
175 | 
176 | # 1.3.0
177 | 
178 | * Allow style names to be mapped by prefix. For instance:
179 | 
180 |       r[style-name^='Code '] => code
181 | 
182 | * Add default style mappings for Heading 5 and Heading 6.
183 | 
184 | * Allow escape sequences in style IDs, style names and CSS class names.
185 | 
186 | * Allow a separator to be specified when HTML elements are collapsed.
187 | 
188 | * Add include_embedded_style_map argument to allow embedded style maps to be
189 |   disabled.
190 | 
191 | * Include embedded styles when explicit style map is passed.
192 | 
193 | # 1.2.2
194 | 
195 | * Ignore bold, italic, underline and strikethrough elements that have a value of
196 |   false or 0.
197 | 
198 | # 1.2.1
199 | 
200 | * Ignore v:imagedata elements without relationship ID with warning.
201 | 
202 | # 1.2.0
203 | 
204 | * Use alt text title as alt text for images when the alt text description is
205 |   blank or missing.
206 | 
207 | # 1.1.1
208 | 
209 | * Handle comments without author initials.
210 | 
211 | * Change numbering of comments to be global rather than per-user to match the
212 |   behaviour of Word.
213 | 
214 | # 1.1.0
215 | 
216 | * Add support for comments.
217 | 
218 | # 1.0.4
219 | 
220 | * Add support for w:sdt elements. This allows the bodies of content controls,
221 |   such as bibliographies, to be converted.
222 | 
223 | # 1.0.3
224 | 
225 | * Add support for table cells spanning multiple rows.
226 | 
227 | # 1.0.2
228 | 
229 | * Add support for table cells spanning multiple columns.
230 | 
231 | # 1.0.1
232 | 
233 | * Improve script installation on Windows by using entry_points instead of
234 |   scripts in setup.py.
235 | 
236 | # 1.0.0
237 | 
238 | * Remove deprecated convert_underline argument.
239 | 
240 | * Officially support ID prefixes.
241 | 
242 | * Generated IDs no longer insert a hyphen after the ID prefix.
243 | 
244 | * The default ID prefix is now the empty string rather than a random number
245 |   followed by a hyphen.
246 | 
247 | * Rename mammoth.images.inline to mammoth.images.img_element to better reflect
248 |   its behaviour.
249 | 
250 | # 0.3.31
251 | 
252 | * Improve collapsing of similar non-fresh HTML elements.
253 | 
254 | # 0.3.30
255 | 
256 | * Allow bold and italic style mappings to be configured.
257 | 
258 | # 0.3.29
259 | 
260 | * Handle references to missing styles when reading documents.
261 | 
262 | # 0.3.28
263 | 
264 | * Improve support for lists made in LibreOffice. Specifically, this changes the
265 |   default style mapping for paragraphs with a style of "Normal" to have the
266 |   lowest precedence.
267 | 
268 | # 0.3.27
269 | 
270 | * Handle XML where the child nodes of an element contain text nodes.
271 | 
272 | # 0.3.26
273 | 
274 | * Always use mc:Fallback when reading mc:AlternateContent elements.
275 | 
276 | # 0.3.25
277 | 
278 | * Remove duplicate messages from results.
279 | 
280 | * Read v:imagedata with r:id attribute.
281 | 
282 | * Read children of v:roundrect.
283 | 
284 | * Ignore office-word:wrap, v:shadow and v:shapetype.
285 | 
286 | # 0.3.24
287 | 
288 | * Continue with warning if external images cannot be found.
289 | 
290 | * Add support for embedded style maps.
291 | 
292 | # 0.3.23
293 | 
294 | * Fix Python 3 support.
295 | 
296 | # 0.3.22
297 | 
298 | * Generate warnings for not-understood style mappings and continue, rather than
299 |   stopping with an error.
300 | 
301 | * Support file objects without a name attribute again (broken since 0.3.20).
302 | 
303 | # 0.3.21
304 | 
305 | * Ignore w:numPr elements without w:numId or w:ilvl children.
306 | 
307 | # 0.3.20
308 | 
309 | * Add support for linked images.
310 | 
311 | # 0.3.19
312 | 
313 | * Fix: cannot extract raw text from elements without children
314 | 
315 | # 0.3.18
316 | 
317 | * Support links and images in footnotes and endnotes.
318 | 
319 | # 0.3.17
320 | 
321 | * Add support for underlines in style map.
322 | 
323 | * Add support for strikethrough.
324 | 
325 | # 0.3.16
326 | 
327 | * Add basic support for text boxes. The contents of the text box are treated as
328 |   a separate paragraph that appears after the paragraph containing the text box.
329 | 
330 | # 0.3.15
331 | 
332 | * Support styles defined without a name
333 | 
334 | # 0.3.14
335 | 
336 | * Add ignore_empty_paragraphs option, which defaults to True.
337 | 
338 | # 0.3.13
339 | 
340 | * Always use forward slashes in ZIP paths. This should fix image handling on
341 |   Windows.
342 | 
343 | # 0.3.12
344 | 
345 | * Make style names case-insensitive in style mappings. This should make style
346 |   mappings easier to write, especially since Microsoft Word sometimes represents
347 |   style names in the UI differently from in the style definition. For instance,
348 |   the style displayed in Word as "Heading 1" has a style name of "heading 1".
349 |   This hopefully shouldn't cause an issue for anyone, but if you were relying
350 |   on case-sensitivity, please do get in touch.
351 | 
352 | # 0.3.11
353 | 
354 | * Add support for hyperlinks to bookmarks in the same document.
355 | 
356 | # 0.3.10
357 | 
358 | * Add basic support for Markdown. Not all features are currently supported.
359 | 
360 | # 0.3.9
361 | 
362 | * Add default style mappings for builtin footnote and endnote styles in
363 |   Microsoft Word and LibreOffice.
364 | 
365 | * Allow style mappings with a zero-element HTML path.
366 | 
367 | * Emit warnings when image types are unlikely to be supported by web browsers.
368 | 
369 | # 0.3.8
370 | 
371 | * Add support for endnotes.
372 | 
373 | # 0.3.7
374 | 
375 | * Add support for superscript and subscript text.
376 | 
377 | # 0.3.6
378 | 
379 | * Add support for footnotes.
380 | 
381 | # 0.3.5
382 | 
383 | * Add support for line breaks.
384 | 
385 | # 0.3.4
386 | 
387 | * Add optional underline conversion.
388 | 
389 | # 0.3.3
390 | 
391 | * Add `mammoth.images.inline`, and document custom image conversion.
392 | 
393 | # 0.3.2
394 | 
395 | * Add the function `mammoth.extract_raw_text`.
396 | 
397 | # 0.3.1
398 | 
399 | * Add support for tables
400 | 
401 | # 0.3.0
402 | 
403 | * Rename --styles CLI argument to --style-map.
404 | 
405 | * Rename styles argument in convert_to_html to style_map.
406 | 
407 | * Allow paragraphs and runs to be matched by style name. For instance, to match
408 |   a paragraph with the style name `Heading 1`:
409 | 
410 |     p[style-name='Heading 1']
411 | 


--------------------------------------------------------------------------------
/makefile:
--------------------------------------------------------------------------------
 1 | .PHONY: test
 2 | 
 3 | test:
 4 | 	_virtualenv/bin/pyflakes mammoth tests
 5 | 	sh -c '. _virtualenv/bin/activate; py.test tests'
 6 | 
 7 | .PHONY: test-all
 8 | 
 9 | test-all:
10 | 	tox
11 | 
12 | .PHONY: upload
13 | 
14 | upload: setup assert-converted-readme build-dist
15 | 	_virtualenv/bin/twine upload dist/*
16 | 	make clean
17 | 
18 | .PHONY: build-dist
19 | 
20 | build-dist:
21 | 	rm -rf dist
22 | 	_virtualenv/bin/pyproject-build
23 | 
24 | README: README.md
25 | 	pandoc --from=markdown --to=rst README.md > README || cp README.md README
26 | 
27 | .PHONY: assert-converted-readme
28 | 
29 | assert-converted-readme:
30 | 	test "`cat README`" != "`cat README.md`"
31 | 
32 | .PHONY: clean
33 | 
34 | clean:
35 | 	rm -f README
36 | 	rm -f MANIFEST
37 | 	rm -rf dist
38 | 
39 | .PHONY: bootstrap
40 | 
41 | bootstrap: _virtualenv setup
42 | 	_virtualenv/bin/pip install -e .
43 | ifneq ($(wildcard test-requirements.txt),)
44 | 	_virtualenv/bin/pip install -r test-requirements.txt
45 | endif
46 | 	make clean
47 | 
48 | .PHONY: setup
49 | 
50 | setup: README
51 | 
52 | _virtualenv:
53 | 	python3 -m venv _virtualenv
54 | 	_virtualenv/bin/pip install --upgrade pip
55 | 	_virtualenv/bin/pip install --upgrade setuptools
56 | 	_virtualenv/bin/pip install --upgrade wheel
57 | 	_virtualenv/bin/pip install --upgrade build twine
58 | 


--------------------------------------------------------------------------------
/mammoth/__init__.py:
--------------------------------------------------------------------------------
 1 | from . import docx, conversion, options, images, transforms, underline
 2 | from .raw_text import extract_raw_text_from_element
 3 | from .docx.style_map import write_style_map, read_style_map
 4 | 
 5 | __all__ = ["convert_to_html", "extract_raw_text", "images", "transforms", "underline"]
 6 | 
 7 | 
 8 | _undefined = object()
 9 | 
10 | 
def convert_to_html(*args, **kwargs):
    """Convert a docx file to HTML.

    All positional and keyword arguments are forwarded unchanged to
    ``convert``, with ``output_format`` fixed to ``"html"``.
    """
    html_result = convert(*args, output_format="html", **kwargs)
    return html_result
13 | 
14 | 
def convert_to_markdown(*args, **kwargs):
    """Convert a docx file to Markdown.

    All positional and keyword arguments are forwarded unchanged to
    ``convert``, with ``output_format`` fixed to ``"markdown"``.
    """
    markdown_result = convert(*args, output_format="markdown", **kwargs)
    return markdown_result
17 | 
18 | 
def convert(fileobj, transform_document=None, id_prefix=None, include_embedded_style_map=_undefined, **kwargs):
    """Read the docx document in ``fileobj`` and convert it.

    :param fileobj: file object for the docx document.
    :param transform_document: optional function applied to the document
        after it has been read; when ``None`` the document is left as-is.
    :param id_prefix: forwarded to the document converter.
    :param include_embedded_style_map: when left unset or truthy, the style
        map read from ``fileobj`` is passed along as the
        ``embedded_style_map`` option.
    :param kwargs: remaining conversion options, parsed by
        ``options.read_options``.
    :return: the result of binding option parsing, document reading and
        conversion together.
    """
    embedded_style_map_wanted = (
        True if include_embedded_style_map is _undefined else include_embedded_style_map
    )

    def _identity(document):
        return document

    if transform_document is None:
        transform_document = _identity
    if embedded_style_map_wanted:
        kwargs["embedded_style_map"] = read_style_map(fileobj)

    def _convert_with_options(convert_options):
        def _render(document):
            return conversion.convert_document_element_to_html(
                document,
                id_prefix=id_prefix,
                **convert_options
            )

        return docx.read(fileobj).map(transform_document).bind(_render)

    return options.read_options(kwargs).bind(_convert_with_options)
35 |     
36 | 
def extract_raw_text(fileobj):
    """Read the docx document in ``fileobj`` and map the read result through
    ``extract_raw_text_from_element``."""
    read_result = docx.read(fileobj)
    return read_result.map(extract_raw_text_from_element)
39 | 
40 | 
def embed_style_map(fileobj, style_map):
    """Write ``style_map`` into the docx file ``fileobj``.

    Delegates to ``write_style_map``; nothing is returned.
    """
    write_style_map(fileobj, style_map=style_map)
43 | 
def read_embedded_style_map(fileobj):
    """Return whatever ``read_style_map`` reads from the docx file ``fileobj``."""
    embedded_style_map = read_style_map(fileobj)
    return embedded_style_map
46 | 


--------------------------------------------------------------------------------
/mammoth/cli.py:
--------------------------------------------------------------------------------
  1 | import argparse
  2 | import io
  3 | import os
  4 | import shutil
  5 | import sys
  6 | 
  7 | import mammoth
  8 | from . import writers
  9 | 
 10 | 
 11 | def main():
 12 |     args = _parse_args()
 13 |     
 14 |     if args.style_map is None:
 15 |         style_map = None
 16 |     else:
 17 |         with open(args.style_map) as style_map_fileobj:
 18 |             style_map = style_map_fileobj.read()
 19 |     
 20 |     with open(args.path, "rb") as docx_fileobj:
 21 |         if args.output_dir is None:
 22 |             convert_image = None
 23 |             output_path = args.output
 24 |         else:
 25 |             convert_image = mammoth.images.img_element(ImageWriter(args.output_dir))
 26 |             output_filename = "{0}.html".format(os.path.basename(args.path).rpartition(".")[0])
 27 |             output_path = os.path.join(args.output_dir, output_filename)
 28 |         
 29 |         result = mammoth.convert(
 30 |             docx_fileobj,
 31 |             style_map=style_map,
 32 |             convert_image=convert_image,
 33 |             output_format=args.output_format,
 34 |         )
 35 |         for message in result.messages:
 36 |             sys.stderr.write(message.message)
 37 |             sys.stderr.write("\n")
 38 |         
 39 |         _write_output(output_path, result.value)
 40 | 
 41 | 
 42 | class ImageWriter(object):
 43 |     def __init__(self, output_dir):
 44 |         self._output_dir = output_dir
 45 |         self._image_number = 1
 46 |         
 47 |     def __call__(self, element):
 48 |         extension = element.content_type.partition("/")[2]
 49 |         image_filename = "{0}.{1}".format(self._image_number, extension)
 50 |         with open(os.path.join(self._output_dir, image_filename), "wb") as image_dest:
 51 |             with element.open() as image_source:
 52 |                 shutil.copyfileobj(image_source, image_dest)
 53 |         
 54 |         self._image_number += 1
 55 |         
 56 |         return {"src": image_filename}
 57 | 
 58 | 
 59 | def _write_output(path, contents):
 60 |     if path is None:
 61 |         if sys.version_info[0] <= 2:
 62 |             stdout = sys.stdout
 63 |         else:
 64 |             stdout = sys.stdout.buffer
 65 | 
 66 |         stdout.write(contents.encode("utf-8"))
 67 |         stdout.flush()
 68 |     else:
 69 |         with io.open(path, "w", encoding="utf-8") as fileobj:
 70 |             fileobj.write(contents)
 71 | 
 72 | 
 73 | def _parse_args():
 74 |     parser = argparse.ArgumentParser()
 75 |     parser.add_argument(
 76 |         "path",
 77 |         metavar="docx-path",
 78 |         help="Path to the .docx file to convert.")
 79 |     
 80 |     output_group = parser.add_mutually_exclusive_group()
 81 |     output_group.add_argument(
 82 |         "output",
 83 |         nargs="?",
 84 |         metavar="output-path",
 85 |         help="Output path for the generated document. Images will be stored inline in the output document. Output is written to stdout if not set.")
 86 |     output_group.add_argument(
 87 |         "--output-dir",
 88 |         help="Output directory for generated HTML and images. Images will be stored in separate files. Mutually exclusive with output-path.")
 89 |     
 90 |     parser.add_argument(
 91 |         "--output-format",
 92 |         required=False,
 93 |         choices=writers.formats(),
 94 |         help="Output format.")
 95 |     parser.add_argument(
 96 |         "--style-map",
 97 |         required=False,
 98 |         help="File containg a style map.")
 99 |     return parser.parse_args()
100 | 
101 | 
102 | if __name__ == "__main__":
103 |     main()
104 | 
105 | 


--------------------------------------------------------------------------------
/mammoth/document_matchers.py:
--------------------------------------------------------------------------------
 1 | import collections
 2 | 
 3 | import cobble
 4 | 
 5 | 
 6 | def paragraph(style_id=None, style_name=None, numbering=None):
 7 |     return ParagraphMatcher(style_id, style_name, numbering)
 8 | 
 9 | 
10 | ParagraphMatcher = collections.namedtuple("ParagraphMatcher", ["style_id", "style_name", "numbering"])
11 | ParagraphMatcher.element_type = "paragraph"
12 | 
13 | 
def run(style_id=None, style_name=None):
    """Build a ``RunMatcher`` from the given criteria (all optional)."""
    return RunMatcher(style_id=style_id, style_name=style_name)


# Immutable record of run matching criteria.
RunMatcher = collections.namedtuple("RunMatcher", "style_id style_name")
RunMatcher.element_type = "run"
20 | 
21 | 
def table(style_id=None, style_name=None):
    """Build a ``TableMatcher`` from the given criteria (all optional)."""
    return TableMatcher(style_id=style_id, style_name=style_name)


# Immutable record of table matching criteria.
TableMatcher = collections.namedtuple("TableMatcher", "style_id style_name")
TableMatcher.element_type = "table"
28 | 
29 | 
# Marker classes: each one carries only the element type it stands for.

class bold:
    """Marker with element type "bold"."""
    element_type = "bold"


class italic:
    """Marker with element type "italic"."""
    element_type = "italic"


class underline:
    """Marker with element type "underline"."""
    element_type = "underline"


class strikethrough:
    """Marker with element type "strikethrough"."""
    element_type = "strikethrough"


class all_caps:
    """Marker with element type "all_caps"."""
    element_type = "all_caps"


class small_caps:
    """Marker with element type "small_caps"."""
    element_type = "small_caps"
52 | 
53 | 
def highlight(color=None):
    """Build a ``HighlightMatcher`` for the given color (optional)."""
    return HighlightMatcher(color)


# Immutable record holding the highlight color to match.
HighlightMatcher = collections.namedtuple("HighlightMatcher", "color")
HighlightMatcher.element_type = "highlight"
60 | 
class comment_reference:
    """Marker with element type "comment_reference"."""
    element_type = "comment_reference"
63 | 
64 | 
# Immutable record holding the break type to match.
BreakMatcher = collections.namedtuple("BreakMatcher", "break_type")
BreakMatcher.element_type = "break"


# One shared matcher per break type.
line_break = BreakMatcher(break_type="line")
page_break = BreakMatcher(break_type="page")
column_break = BreakMatcher(break_type="column")
72 | 
73 | 
74 | def equal_to(value):
75 |     return StringMatcher(_operator_equal_to, value)
76 | 
77 | 
78 | def _operator_equal_to(first, second):
79 |     return first.upper() == second.upper()
80 | 
81 | 
82 | def starts_with(value):
83 |     return StringMatcher(_operator_starts_with, value)
84 | 
85 | def _operator_starts_with(first, second):
86 |     return second.upper().startswith(first.upper())
87 | 
88 | 
@cobble.data
class StringMatcher(object):
    """Pairs a two-argument comparison function with a reference value."""
    operator = cobble.field()
    value = cobble.field()

    def matches(self, other):
        """Apply the stored operator to the stored value and ``other``,
        in that order, and return its result."""
        compare = self.operator
        return compare(self.value, other)
96 | 


--------------------------------------------------------------------------------
/mammoth/documents.py:
--------------------------------------------------------------------------------
  1 | import cobble
  2 | 
  3 | 
class Element(object):
    """Base class for document elements; provides ``copy``."""

    def copy(self, **kwargs):
        """Return the result of ``cobble.copy`` for this element, forwarding
        ``kwargs`` (presumably field overrides; see the cobble documentation)."""
        return cobble.copy(self, **kwargs)
  7 | 
  8 | 
class HasChildren(Element):
    """Base class for elements that declare a ``children`` field."""
    children = cobble.field()
 11 | 
 12 | 
@cobble.data
class Document(HasChildren):
    """Document element: ``children`` (declared on HasChildren) plus the
    document's notes and comments."""
    notes = cobble.field()
    comments = cobble.field()
 17 | 
@cobble.data
class Paragraph(HasChildren):
    """Paragraph element with its style, numbering, alignment and indent."""
    style_id = cobble.field()
    style_name = cobble.field()
    numbering = cobble.field()
    alignment = cobble.field()
    # The paragraph() factory in this module supplies a ParagraphIndent here.
    indent = cobble.field()
 25 | 
 26 | 
@cobble.data
class ParagraphIndent(object):
    """Indentation values of a paragraph; built by paragraph_indent()."""
    start = cobble.field()
    end = cobble.field()
    first_line = cobble.field()
    hanging = cobble.field()
 33 | 
 34 | 
# NOTE(review): no use of Indent is visible in this part of the file; the
# paragraph() factory uses ParagraphIndent instead. Confirm whether this class
# is still needed.
@cobble.data
class Indent(object):
    """Indentation values expressed as left/right rather than start/end."""
    left = cobble.field()
    right = cobble.field()
    first_line = cobble.field()
    hanging = cobble.field()
 41 | 
 42 | 
@cobble.data
class Run(HasChildren):
    """Run element with its style and character formatting.

    The run() factory in this module coerces the ``is_*`` flags to bool and
    defaults ``vertical_alignment`` to ``VerticalAlignment.baseline``.
    """
    style_id = cobble.field()
    style_name = cobble.field()
    is_bold = cobble.field()
    is_italic = cobble.field()
    is_underline = cobble.field()
    is_strikethrough = cobble.field()
    is_all_caps = cobble.field()
    is_small_caps = cobble.field()
    vertical_alignment = cobble.field()
    font = cobble.field()
    font_size = cobble.field()
    highlight = cobble.field()
 57 | 
@cobble.data
class Text(Element):
    """Text element holding a single ``value``."""
    value = cobble.field()
 61 | 
@cobble.data
class Hyperlink(HasChildren):
    """Hyperlink element; the hyperlink() factory defaults ``href``,
    ``anchor`` and ``target_frame`` to None."""
    href = cobble.field()
    anchor = cobble.field()
    target_frame = cobble.field()
 67 | 
@cobble.data
class Checkbox(Element):
    """Checkbox element with its ``checked`` state."""
    checked = cobble.field()

# Lower-case alias, matching the factory-style names used in this module.
checkbox = Checkbox
 73 | 
@cobble.data
class Table(HasChildren):
    """Table element with its style id and style name."""
    style_id = cobble.field()
    style_name = cobble.field()
 78 | 
@cobble.data
class TableRow(HasChildren):
    """Table row element with an ``is_header`` flag."""
    is_header = cobble.field()
 82 | 
@cobble.data
class TableCell(HasChildren):
    """Table cell element with its column and row span."""
    colspan = cobble.field()
    rowspan = cobble.field()
 87 | 
@cobble.data
class TableCellUnmerged:
    """Table cell that additionally carries a ``vmerge`` value.

    Unlike the other element classes here it does not derive from Element and
    declares ``children`` and ``copy`` itself.
    NOTE(review): presumably an intermediate form used before vertically
    merged cells are resolved into TableCell -- confirm against the reader.
    """
    children = cobble.field()
    colspan = cobble.field()
    rowspan = cobble.field()
    vmerge = cobble.field()

    def copy(self, **kwargs):
        """Return the result of ``cobble.copy`` for this cell, forwarding
        ``kwargs``."""
        return cobble.copy(self, **kwargs)
 97 | 
@cobble.data
class Break(Element):
    """Break element identified by its ``break_type``."""
    break_type = cobble.field()

# Module-level instances for the three break types.
line_break = Break("line")
page_break = Break("page")
column_break = Break("column")
105 | 
106 | 
@cobble.data
class Tab(Element):
    """Tab element; it has no fields."""
    pass
110 | 
111 | 
@cobble.data
class Image(Element):
    """Image element with its alt text, content type and ``open`` field."""
    alt_text = cobble.field()
    content_type = cobble.field()
    # NOTE(review): presumably a callable returning a readable file object for
    # the image data -- confirm against the code that constructs images.
    open = cobble.field()
117 | 
118 | 
def document(children, notes=None, comments=None):
    """Build a ``Document``.

    ``notes`` defaults to ``Notes({})`` and ``comments`` to a new empty list.
    """
    notes = Notes({}) if notes is None else notes
    comments = [] if comments is None else comments
    return Document(children, notes, comments=comments)
125 | 
def paragraph(children, style_id=None, style_name=None, numbering=None, alignment=None, indent=None):
    """Build a ``Paragraph``; ``indent`` defaults to ``paragraph_indent()``."""
    resolved_indent = paragraph_indent() if indent is None else indent
    return Paragraph(
        children,
        style_id,
        style_name,
        numbering,
        alignment=alignment,
        indent=resolved_indent,
    )
131 | 
def paragraph_indent(start=None, end=None, first_line=None, hanging=None):
    """Build a ``ParagraphIndent``; every component defaults to ``None``."""
    components = dict(start=start, end=end, first_line=first_line, hanging=hanging)
    return ParagraphIndent(**components)
134 | 
def run(
    children,
    style_id=None,
    style_name=None,
    is_bold=None,
    is_italic=None,
    is_underline=None,
    is_strikethrough=None,
    is_all_caps=None,
    is_small_caps=None,
    vertical_alignment=None,
    font=None,
    font_size=None,
    highlight=None,
):
    """Build a ``Run``.

    The ``is_*`` toggles are coerced with ``bool()`` so that an omitted toggle
    is stored as ``False``. An omitted ``vertical_alignment`` becomes
    ``VerticalAlignment.baseline``. The other arguments are passed through.
    """
    resolved_alignment = (
        VerticalAlignment.baseline if vertical_alignment is None else vertical_alignment
    )
    toggles = dict(
        is_bold=bool(is_bold),
        is_italic=bool(is_italic),
        is_underline=bool(is_underline),
        is_strikethrough=bool(is_strikethrough),
        is_all_caps=bool(is_all_caps),
        is_small_caps=bool(is_small_caps),
    )
    return Run(
        children=children,
        style_id=style_id,
        style_name=style_name,
        vertical_alignment=resolved_alignment,
        font=font,
        font_size=font_size,
        highlight=highlight,
        **toggles
    )
167 | 
class VerticalAlignment(object):
    """String constants for a run's vertical alignment; ``run()`` defaults to ``baseline``."""
    baseline = "baseline"
    superscript = "superscript"
    subscript = "subscript"
172 | 
173 | text = Text
174 | 
175 | _tab = Tab()
176 | 
def tab():
    """Return the module-level ``Tab`` instance; every call yields the same object."""
    return _tab
179 | 
180 | 
181 | image = Image
182 | 
def hyperlink(children, href=None, anchor=None, target_frame=None):
    """Build a ``Hyperlink``; ``href``, ``anchor`` and ``target_frame`` default to ``None``."""
    return Hyperlink(
        children=children,
        href=href,
        anchor=anchor,
        target_frame=target_frame,
    )
185 | 
186 | 
@cobble.data
class Bookmark(Element):
    """Bookmark element with a single field, ``name``."""
    name = cobble.field()
190 | 
191 | bookmark = Bookmark
192 | 
193 | 
def table(children, style_id=None, style_name=None):
    """Build a ``Table``; both style fields default to ``None``."""
    style_fields = dict(style_id=style_id, style_name=style_name)
    return Table(children=children, **style_fields)
196 | 
def table_row(children, is_header=None):
    """Build a ``TableRow``; ``is_header`` is coerced to a bool (``None`` becomes ``False``)."""
    header_flag = bool(is_header)
    return TableRow(children=children, is_header=header_flag)
199 | 
def table_cell(children, colspan=None, rowspan=None):
    """Build a ``TableCell``; an omitted ``colspan`` or ``rowspan`` is stored as 1."""
    return TableCell(
        children=children,
        colspan=1 if colspan is None else colspan,
        rowspan=1 if rowspan is None else rowspan,
    )
206 | 
def table_cell_unmerged(children, colspan, rowspan, vmerge):
    """Build a ``TableCellUnmerged``; all four values are required and stored unchanged."""
    return TableCellUnmerged(
        children=children,
        colspan=colspan,
        rowspan=rowspan,
        vmerge=vmerge,
    )
209 | 
def numbering_level(level_index, is_ordered):
    """Build a ``_NumberingLevel`` with the index stored as ``str`` and the flag as ``bool``."""
    index_text = str(level_index)
    ordered_flag = bool(is_ordered)
    return _NumberingLevel(index_text, ordered_flag)
212 | 
@cobble.data
class _NumberingLevel(object):
    """Numbering level record; built by ``numbering_level()`` with a str index and bool flag."""
    level_index = cobble.field()
    is_ordered = cobble.field()
217 | 
@cobble.data
class Note(Element):
    """Note element; ``(note_type, note_id)`` is the key used to index notes in ``Notes``."""
    note_type = cobble.field()
    note_id = cobble.field()
    body = cobble.field()
223 | 
224 | 
225 | note = Note
226 | 
227 | 
class Notes(object):
    """Lookup of notes keyed by ``(note_type, note_id)`` tuples."""

    def __init__(self, notes):
        # Mapping from (note_type, note_id) to the note itself.
        self._notes = notes

    def find_note(self, note_type, note_id):
        """Return the note stored under the given type and id.

        Raises ``KeyError`` when the mapping has no such entry.
        """
        key = (note_type, note_id)
        return self._notes[key]

    def resolve(self, reference):
        """Return the note that ``reference`` (with ``note_type``/``note_id``) points at."""
        return self.find_note(reference.note_type, reference.note_id)

    def __eq__(self, other):
        if not isinstance(other, Notes):
            return False
        return self._notes == other._notes

    def __ne__(self, other):
        return not self.__eq__(other)
243 | 
def notes(notes_list):
    """Index ``notes_list`` by ``_note_key`` and wrap the mapping in ``Notes``.

    When several notes share a key, the last one in the list wins.
    """
    keyed_notes = {}
    for entry in notes_list:
        keyed_notes[_note_key(entry)] = entry
    return Notes(keyed_notes)
249 | 
250 | def _note_key(note):
251 |     return (note.note_type, note.note_id)
252 | 
@cobble.data
class NoteReference(Element):
    """Reference to a note; ``Notes.resolve`` reads its ``note_type`` and ``note_id``."""
    note_type = cobble.field()
    note_id = cobble.field()
257 | 
258 | note_reference = NoteReference
259 | 
260 | 
@cobble.data
class Comment(object):
    """Comment record; ``comment()`` defaults both author fields to ``None``."""
    comment_id = cobble.field()
    body = cobble.field()
    author_name = cobble.field()
    author_initials = cobble.field()
267 | 
def comment(comment_id, body, author_name=None, author_initials=None):
    """Build a ``Comment``; the author name and initials are optional."""
    author_fields = dict(author_name=author_name, author_initials=author_initials)
    return Comment(comment_id=comment_id, body=body, **author_fields)
275 | 
@cobble.data
class CommentReference(Element):
    """Reference to a comment, identified by ``comment_id``."""
    comment_id = cobble.field()
279 | 
280 | comment_reference = CommentReference
281 | 
def element_visitor(args):
    """Return ``cobble.visitor(Element, args=args)``, a visitor base for ``Element`` subclasses."""
    return cobble.visitor(Element, args=args)
284 | 


--------------------------------------------------------------------------------
/mammoth/docx/__init__.py:
--------------------------------------------------------------------------------
  1 | from functools import partial
  2 | import os
  3 | 
  4 | import cobble
  5 | 
  6 | from .. import results, lists, zips
  7 | from .document_xml import read_document_xml_element
  8 | from .content_types_xml import empty_content_types, read_content_types_xml_element
  9 | from .relationships_xml import read_relationships_xml_element, Relationships
 10 | from .numbering_xml import read_numbering_xml_element, Numbering
 11 | from .styles_xml import read_styles_xml_element, Styles
 12 | from .notes_xml import read_endnotes_xml_element, read_footnotes_xml_element
 13 | from .comments_xml import read_comments_xml_element
 14 | from .files import Files
 15 | from . import body_xml, office_xml
 16 | from ..zips import open_zip
 17 | 
 18 | 
 19 | _empty_result = results.success([])
 20 | 
 21 | 
 22 | def read(fileobj):
 23 |     zip_file = open_zip(fileobj, "r")
 24 |     part_paths = _find_part_paths(zip_file)
 25 |     read_part_with_body = _part_with_body_reader(
 26 |         getattr(fileobj, "name", None),
 27 |         zip_file,
 28 |         part_paths=part_paths,
 29 |     )
 30 | 
 31 |     return results.combine([
 32 |         _read_notes(read_part_with_body, part_paths),
 33 |         _read_comments(read_part_with_body, part_paths),
 34 |     ]).bind(lambda referents:
 35 |         _read_document(zip_file, read_part_with_body, notes=referents[0], comments=referents[1], part_paths=part_paths)
 36 |     )
 37 | 
 38 | 
@cobble.data
class _PartPaths(object):
    """Zip entry paths of the package parts read by this module.

    Populated by ``_find_part_paths``.
    """
    main_document = cobble.field()
    comments = cobble.field()
    endnotes = cobble.field()
    footnotes = cobble.field()
    numbering = cobble.field()
    styles = cobble.field()
 47 | 
 48 | 
 49 | def _find_part_paths(zip_file):
 50 |     package_relationships = _read_relationships(zip_file, "_rels/.rels")
 51 |     document_filename = _find_document_filename(zip_file, package_relationships)
 52 | 
 53 |     document_relationships = _read_relationships(
 54 |         zip_file,
 55 |         _find_relationships_path_for(document_filename),
 56 |     )
 57 | 
 58 |     def find(name):
 59 |         return _find_part_path(
 60 |             zip_file=zip_file,
 61 |             relationships=document_relationships,
 62 |             relationship_type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/" + name,
 63 |             fallback_path="word/{0}.xml".format(name),
 64 |             base_path=zips.split_path(document_filename)[0],
 65 |         )
 66 | 
 67 |     return _PartPaths(
 68 |         main_document=document_filename,
 69 |         comments=find("comments"),
 70 |         endnotes=find("endnotes"),
 71 |         footnotes=find("footnotes"),
 72 |         numbering=find("numbering"),
 73 |         styles=find("styles"),
 74 |     )
 75 | 
 76 | 
 77 | def _find_document_filename(zip_file, relationships):
 78 |     path = _find_part_path(
 79 |         zip_file,
 80 |         relationships,
 81 |         relationship_type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument",
 82 |         base_path="",
 83 |         fallback_path="word/document.xml",
 84 |     )
 85 |     if zip_file.exists(path):
 86 |         return path
 87 |     else:
 88 |         raise IOError("Could not find main document part. Are you sure this is a valid .docx file?")
 89 | 
 90 | 
 91 | def _find_part_path(zip_file, relationships, relationship_type, base_path, fallback_path):
 92 |     targets = [
 93 |         zips.join_path(base_path, target).lstrip("/")
 94 |         for target in relationships.find_targets_by_type(relationship_type)
 95 |     ]
 96 |     valid_targets = list(filter(lambda target: zip_file.exists(target), targets))
 97 |     if len(valid_targets) == 0:
 98 |         return fallback_path
 99 |     else:
100 |         return valid_targets[0]
101 | 
102 | 
def _read_notes(read_part_with_body, part_paths):
    """Read the footnotes and endnotes parts and flatten them into one result.

    A part that is missing from the zip contributes an empty successful result.
    """
    def footnotes_reader(root, body_reader):
        return read_footnotes_xml_element(root, body_reader=body_reader)

    def endnotes_reader(root, body_reader):
        return read_endnotes_xml_element(root, body_reader=body_reader)

    footnotes = read_part_with_body(part_paths.footnotes, footnotes_reader, default=_empty_result)
    endnotes = read_part_with_body(part_paths.endnotes, endnotes_reader, default=_empty_result)

    combined = results.combine([footnotes, endnotes])
    return combined.map(lists.flatten)
116 | 
117 | 
def _read_comments(read_part_with_body, part_paths):
    """Read the comments part, or return an empty successful result when it is missing."""
    def comments_reader(root, body_reader):
        return read_comments_xml_element(root, body_reader=body_reader)

    return read_part_with_body(part_paths.comments, comments_reader, default=_empty_result)
124 | 
125 | 
def _read_document(zip_file, read_part_with_body, notes, comments, part_paths):
    """Read the main document part, giving the already-read notes and comments to its reader.

    ``zip_file`` is accepted but not used here; the part reader already holds it.
    """
    document_reader = partial(read_document_xml_element, notes=notes, comments=comments)
    return read_part_with_body(part_paths.main_document, document_reader)
135 | 
136 | 
def _part_with_body_reader(document_path, zip_file, part_paths):
    """Return a ``read_part(name, reader, default=...)`` function for parts of this package.

    Content types, styles and numbering are read once here and shared by every
    part read. Relationships are read per part, from the ".rels" entry that
    sits next to that part.
    """
    content_types = _try_read_entry_or_default(
        zip_file,
        "[Content_Types].xml",
        read_content_types_xml_element,
        empty_content_types,
    )
    styles = _try_read_entry_or_default(
        zip_file,
        part_paths.styles,
        read_styles_xml_element,
        Styles.EMPTY,
    )

    def numbering_reader(element):
        return read_numbering_xml_element(element, styles=styles)

    numbering = _try_read_entry_or_default(
        zip_file,
        part_paths.numbering,
        numbering_reader,
        default=Numbering.EMPTY,
    )

    def read_part(name, reader, default=_undefined):
        part_relationships = _read_relationships(zip_file, _find_relationships_path_for(name))

        # The base directory for external files is only known when the
        # document came from a named file.
        if document_path is None:
            base_directory = None
        else:
            base_directory = os.path.dirname(document_path)

        body_reader = body_xml.reader(
            numbering=numbering,
            content_types=content_types,
            relationships=part_relationships,
            styles=styles,
            docx_file=zip_file,
            files=Files(base_directory),
        )
        bound_reader = partial(reader, body_reader=body_reader)

        # Without a default the part is required; otherwise a missing entry
        # yields the default.
        if default is _undefined:
            return _read_entry(zip_file, name, bound_reader)
        return _try_read_entry_or_default(zip_file, name, bound_reader, default=default)

    return read_part
177 | 
178 | 
179 | 
def _find_relationships_path_for(name):
    """Return the path of the relationships entry for part ``name``: "<dir>/_rels/<basename>.rels"."""
    directory, filename = zips.split_path(name)
    relationships_filename = filename + ".rels"
    return zips.join_path(directory, "_rels", relationships_filename)
183 | 
184 | 
def _read_relationships(zip_file, name):
    """Read the relationships entry ``name``, or return ``Relationships.EMPTY`` if it is absent."""
    return _try_read_entry_or_default(
        zip_file=zip_file,
        name=name,
        reader=read_relationships_xml_element,
        default=Relationships.EMPTY,
    )
192 | 
def _try_read_entry_or_default(zip_file, name, reader, default):
    """Read entry ``name`` with ``reader`` if it exists in the zip; otherwise return ``default``."""
    if not zip_file.exists(name):
        return default
    return _read_entry(zip_file, name, reader)
198 | 
199 | 
def _read_entry(zip_file, name, reader):
    """Open entry ``name``, parse it with ``office_xml.read`` and return ``reader(root)``."""
    with zip_file.open(name) as entry_file:
        root_element = office_xml.read(entry_file)
        # The reader runs while the entry is still open, as before.
        return reader(root_element)
203 | 
204 | 
205 | _undefined = object()
206 | 


--------------------------------------------------------------------------------
/mammoth/docx/comments_xml.py:
--------------------------------------------------------------------------------
 1 | from .. import lists
 2 | from .. import documents
 3 | from .. import results
 4 | 
 5 | 
 6 | def read_comments_xml_element(element, body_reader):
 7 |     def read_comments_xml_element(element):
 8 |         comment_elements = element.find_children("w:comment")
 9 |         return results.combine(lists.map(_read_comment_element, comment_elements))
10 | 
11 | 
12 |     def _read_comment_element(element):
13 |         def read_optional_attribute(name):
14 |             return element.attributes.get(name, "").strip() or None
15 | 
16 |         return body_reader.read_all(element.children).map(lambda body:
17 |             documents.comment(
18 |                 comment_id=element.attributes["w:id"],
19 |                 body=body,
20 |                 author_name=read_optional_attribute("w:author"),
21 |                 author_initials=read_optional_attribute("w:initials"),
22 |             ))
23 | 
24 |     return read_comments_xml_element(element)
25 | 


--------------------------------------------------------------------------------
/mammoth/docx/complex_fields.py:
--------------------------------------------------------------------------------
class unknown(object):
    """Marker class with no members.

    NOTE(review): presumably stands for a complex field whose type was not
    recognised -- confirm against the body reader.
    """
    pass
 3 | 
 4 | 
class Begin:
    """Holds the keyword-only ``fld_char`` value it was constructed with."""
    def __init__(self, *, fld_char):
        self.fld_char = fld_char
 8 | 
 9 | 
def begin(*, fld_char):
    """Return a ``Begin`` wrapping ``fld_char`` (keyword-only)."""
    return Begin(fld_char=fld_char)
12 | 
13 | 
class Hyperlink(object):
    """Holds ``kwargs`` for a hyperlink complex field.

    NOTE(review): presumably the keyword arguments later passed to
    ``documents.hyperlink`` -- confirm against the body reader.
    """
    def __init__(self, kwargs):
        self.kwargs = kwargs
17 | 
18 | 
def hyperlink(kwargs):
    """Return a ``Hyperlink`` complex field holding ``kwargs``."""
    return Hyperlink(kwargs=kwargs)
21 | 
22 | 
class Checkbox:
    """Holds the keyword-only ``checked`` value it was constructed with."""
    def __init__(self, *, checked):
        self.checked = checked
26 | 
27 | 
def checkbox(*, checked):
    """Return a ``Checkbox`` complex field with the given ``checked`` value (keyword-only)."""
    return Checkbox(checked=checked)
30 | 


--------------------------------------------------------------------------------
/mammoth/docx/content_types_xml.py:
--------------------------------------------------------------------------------
 1 | def read_content_types_xml_element(element):
 2 |     extension_defaults = dict(map(
 3 |         _read_default,
 4 |         element.find_children("content-types:Default")
 5 |     ))
 6 |     overrides = dict(map(
 7 |         _read_override,
 8 |         element.find_children("content-types:Override")
 9 |     ))
10 |     return _ContentTypes(extension_defaults, overrides)
11 | 
12 | 
13 | def _read_default(element):
14 |     extension = element.attributes["Extension"]
15 |     content_type = element.attributes["ContentType"]
16 |     return extension, content_type
17 | 
18 | 
19 | def _read_override(element):
20 |     part_name = element.attributes["PartName"]
21 |     content_type = element.attributes["ContentType"]
22 |     return part_name.lstrip("/"), content_type
23 | 
24 | 
25 | class _ContentTypes(object):
26 |     _image_content_types = {
27 |         "png": "png",
28 |         "gif": "gif",
29 |         "jpeg": "jpeg",
30 |         "jpg": "jpeg",
31 |         "tif": "tiff",
32 |         "tiff": "tiff",
33 |         "bmp": "bmp",
34 |     }
35 |     
36 |     def __init__(self, extension_defaults, overrides):
37 |         self._extension_defaults = extension_defaults
38 |         self._overrides = overrides
39 |     
40 |     def find_content_type(self, path):
41 |         if path in self._overrides:
42 |             return self._overrides[path]
43 | 
44 |         extension = _get_extension(path)
45 |         default_type = self._extension_defaults.get(extension)
46 |         if default_type is not None:
47 |             return default_type
48 | 
49 |         image_type = self._image_content_types.get(extension.lower())
50 |         if image_type is not None:
51 |             return "image/" + image_type
52 |         
53 |         return None
54 | 
55 | empty_content_types = _ContentTypes({}, {})
56 | 
57 | def _get_extension(path):
58 |     return path.rpartition(".")[2]
59 | 


--------------------------------------------------------------------------------
/mammoth/docx/document_xml.py:
--------------------------------------------------------------------------------
 1 | from .. import documents
 2 | 
 3 | 
 4 | def read_document_xml_element(
 5 |         element,
 6 |         body_reader,
 7 |         notes=None,
 8 |         comments=None):
 9 | 
10 |     if notes is None:
11 |         notes = []
12 |     if comments is None:
13 |         comments = []
14 | 
15 |     body_element = element.find_child("w:body")
16 | 
17 |     if body_element is None:
18 |         raise ValueError("Could not find the body element: are you sure this is a docx file?")
19 | 
20 |     return body_reader.read_all(body_element.children) \
21 |         .map(lambda children: documents.document(
22 |             children,
23 |             notes=documents.notes(notes),
24 |             comments=comments
25 |         ))
26 | 


--------------------------------------------------------------------------------
/mammoth/docx/files.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import contextlib
 3 | try:
 4 |     from urllib2 import urlopen
 5 | except ImportError:
 6 |     from urllib.request import urlopen
 7 | try:
 8 |     from urllib.parse import urlparse
 9 | except ImportError:
10 |     from urlparse import urlparse
11 | 
12 | 
class Files(object):
    """Opens files referenced by URI, resolving relative URIs against a base directory."""

    def __init__(self, base):
        # Directory used for relative URIs; None when the document has no known location.
        self._base = base

    def open(self, uri):
        """Open ``uri`` for reading.

        URIs with a scheme are fetched with ``urlopen``. Other URIs are opened
        in binary mode relative to the base directory. Raises
        ``InvalidFileReferenceError`` when there is no base directory for a
        relative URI, or when opening fails with ``IOError``.

        NOTE(review): ``uri`` appears to come from the document being
        converted; opening arbitrary URLs and paths for untrusted documents
        may be undesirable -- confirm callers restrict this.
        """
        try:
            if _is_absolute(uri):
                return contextlib.closing(urlopen(uri))
            if self._base is None:
                raise InvalidFileReferenceError("could not find external image '{0}', fileobj has no name".format(uri))
            return open(os.path.join(self._base, uri), "rb")
        except IOError as error:
            message = "could not open external image: '{0}' (document directory: '{1}')\n{2}".format(
                uri, self._base, str(error))
            raise InvalidFileReferenceError(message)
29 | 
30 | 
31 | def _is_absolute(url):
32 |     return urlparse(url).scheme != ""
33 | 
34 | 
class InvalidFileReferenceError(ValueError):
    """Raised by ``Files.open`` when a referenced file cannot be located or opened."""
    pass
37 | 


--------------------------------------------------------------------------------
/mammoth/docx/notes_xml.py:
--------------------------------------------------------------------------------
 1 | import functools
 2 | 
 3 | from .. import lists
 4 | from .. import documents
 5 | from .. import results
 6 | 
 7 | 
 8 | def _read_notes(note_type, element, body_reader):
 9 |     def read_notes_xml_element(element):
10 |         note_elements = lists.filter(
11 |             _is_note_element,
12 |             element.find_children("w:" + note_type),
13 |         )
14 |         return results.combine(lists.map(_read_note_element, note_elements))
15 | 
16 | 
17 |     def _is_note_element(element):
18 |         return element.attributes.get("w:type") not in ["continuationSeparator", "separator"]
19 | 
20 | 
21 |     def _read_note_element(element):
22 |         return body_reader.read_all(element.children).map(lambda body: 
23 |             documents.note(
24 |                 note_type=note_type,
25 |                 note_id=element.attributes["w:id"],
26 |                 body=body
27 |             ))
28 |     
29 |     return read_notes_xml_element(element)
30 | 
31 | read_footnotes_xml_element = functools.partial(_read_notes, "footnote")
32 | read_endnotes_xml_element = functools.partial(_read_notes, "endnote")
33 | 


--------------------------------------------------------------------------------
/mammoth/docx/numbering_xml.py:
--------------------------------------------------------------------------------
  1 | import cobble
  2 | 
  3 | from ..documents import numbering_level
  4 | from .styles_xml import Styles
  5 | 
  6 | 
  7 | def read_numbering_xml_element(element, styles):
  8 |     abstract_nums = _read_abstract_nums(element)
  9 |     nums = _read_nums(element)
 10 |     return Numbering(abstract_nums=abstract_nums, nums=nums, styles=styles)
 11 | 
 12 | 
 13 | def _read_abstract_nums(element):
 14 |     abstract_num_elements = element.find_children("w:abstractNum")
 15 |     return dict(map(_read_abstract_num, abstract_num_elements))
 16 | 
 17 | 
 18 | def _read_abstract_num(element):
 19 |     abstract_num_id = element.attributes.get("w:abstractNumId")
 20 |     levels = _read_abstract_num_levels(element)
 21 |     num_style_link = element.find_child_or_null("w:numStyleLink").attributes.get("w:val")
 22 |     return abstract_num_id, _AbstractNum(levels=levels, num_style_link=num_style_link)
 23 | 
 24 | 
@cobble.data
class _AbstractNum(object):
    """Abstract numbering definition: levels by index, plus an optional numbering style link."""
    levels = cobble.field()
    num_style_link = cobble.field()
 29 | 
 30 | 
@cobble.data
class _AbstractNumLevel(object):
    """One level of an abstract num, as read from a ``w:lvl`` element."""
    level_index = cobble.field()
    is_ordered = cobble.field()
    paragraph_style_id = cobble.field()
 36 | 
 37 | 
 38 | def _read_abstract_num_levels(element):
 39 |     levels = map(_read_abstract_num_level, element.find_children("w:lvl"))
 40 |     return dict(
 41 |         (level.level_index, level)
 42 |         for level in levels
 43 |     )
 44 | 
 45 | 
 46 | def _read_abstract_num_level(element):
 47 |     level_index = element.attributes["w:ilvl"]
 48 |     num_fmt = element.find_child_or_null("w:numFmt").attributes.get("w:val")
 49 |     is_ordered = num_fmt != "bullet"
 50 |     paragraph_style_id = element.find_child_or_null("w:pStyle").attributes.get("w:val")
 51 |     return _AbstractNumLevel(
 52 |         level_index=level_index,
 53 |         is_ordered=is_ordered,
 54 |         paragraph_style_id=paragraph_style_id,
 55 |     )
 56 | 
 57 | 
 58 | def _read_nums(element):
 59 |     num_elements = element.find_children("w:num")
 60 |     return dict(
 61 |         _read_num(num_element)
 62 |         for num_element in num_elements
 63 |     )
 64 | 
 65 | 
 66 | def _read_num(element):
 67 |     num_id = element.attributes.get("w:numId")
 68 |     abstract_num_id = element.find_child_or_null("w:abstractNumId").attributes["w:val"]
 69 |     return num_id, _Num(abstract_num_id=abstract_num_id)
 70 | 
 71 | 
@cobble.data
class _Num(object):
    """Numbering instance that points at an abstract num through ``abstract_num_id``."""
    abstract_num_id = cobble.field()
 75 | 
 76 | 
 77 | class Numbering(object):
 78 |     def __init__(self, abstract_nums, nums, styles):
 79 |         self._abstract_nums = abstract_nums
 80 |         self._levels_by_paragraph_style_id = dict(
 81 |             (level.paragraph_style_id, self._to_numbering_level(level))
 82 |             for abstract_num in abstract_nums.values()
 83 |             for level in abstract_num.levels.values()
 84 |             if level.paragraph_style_id is not None
 85 |         )
 86 |         self._nums = nums
 87 |         self._styles = styles
 88 | 
 89 |     def find_level(self, num_id, level):
 90 |         num = self._nums.get(num_id)
 91 |         if num is None:
 92 |             return None
 93 |         else:
 94 |             abstract_num = self._abstract_nums.get(num.abstract_num_id)
 95 |             if abstract_num is None:
 96 |                 return None
 97 |             elif abstract_num.num_style_link is None:
 98 |                 return self._to_numbering_level(abstract_num.levels.get(level))
 99 |             else:
100 |                 style = self._styles.find_numbering_style_by_id(abstract_num.num_style_link)
101 |                 return self.find_level(style.num_id, level)
102 | 
103 |     def find_level_by_paragraph_style_id(self, style_id):
104 |         return self._levels_by_paragraph_style_id.get(style_id)
105 | 
106 |     def _to_numbering_level(self, abstract_num_level):
107 |         if abstract_num_level is None:
108 |             return None
109 |         else:
110 |             return numbering_level(
111 |                 level_index=abstract_num_level.level_index,
112 |                 is_ordered=abstract_num_level.is_ordered,
113 |             )
114 | 
115 | 
116 | Numbering.EMPTY = Numbering(abstract_nums={}, nums={}, styles=Styles.EMPTY)
117 | 


--------------------------------------------------------------------------------
/mammoth/docx/office_xml.py:
--------------------------------------------------------------------------------
 1 | from ..lists import flat_map
 2 | from .xmlparser import parse_xml, XmlElement
 3 | 
 4 | 
 5 | _namespaces = [
 6 |     # Transitional format
 7 |     ("w", "http://schemas.openxmlformats.org/wordprocessingml/2006/main"),
 8 |     ("r", "http://schemas.openxmlformats.org/officeDocument/2006/relationships"),
 9 |     ("wp", "http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing"),
10 |     ("a", "http://schemas.openxmlformats.org/drawingml/2006/main"),
11 |     ("pic", "http://schemas.openxmlformats.org/drawingml/2006/picture"),
12 | 
13 |     # Strict format
14 |     ("w", "http://purl.oclc.org/ooxml/wordprocessingml/main"),
15 |     ("r", "http://purl.oclc.org/ooxml/officeDocument/relationships"),
16 |     ("wp", "http://purl.oclc.org/ooxml/drawingml/wordprocessingDrawing"),
17 |     ("a", "http://purl.oclc.org/ooxml/drawingml/main"),
18 |     ("pic", "http://purl.oclc.org/ooxml/drawingml/picture"),
19 | 
20 |     # Common
21 |     ("content-types", "http://schemas.openxmlformats.org/package/2006/content-types"),
22 |     ("relationships", "http://schemas.openxmlformats.org/package/2006/relationships"),
23 |     ("mc", "http://schemas.openxmlformats.org/markup-compatibility/2006"),
24 |     ("v", "urn:schemas-microsoft-com:vml"),
25 |     ("office-word", "urn:schemas-microsoft-com:office:word"),
26 | 
27 |     # [MS-DOCX]: Word Extensions to the Office Open XML (.docx) File Format
28 |     # https://learn.microsoft.com/en-us/openspecs/office_standards/ms-docx/b839fe1f-e1ca-4fa6-8c26-5954d0abbccd
29 |     ("wordml", "http://schemas.microsoft.com/office/word/2010/wordml"),
30 | ]
31 | 
32 | 
def read(fileobj):
    """Parse ``fileobj`` as XML with this module's namespaces and return the root node.

    ``mc:AlternateContent`` elements are collapsed before the root is returned.
    """
    parsed_root = parse_xml(fileobj, _namespaces)
    collapsed_nodes = _collapse_alternate_content(parsed_root)
    return collapsed_nodes[0]
35 | 
36 | 
37 | def _collapse_alternate_content(node):
38 |     if isinstance(node, XmlElement):
39 |         if node.name == "mc:AlternateContent":
40 |             return node.find_child_or_null("mc:Fallback").children
41 |         else:
42 |             node.children = flat_map(_collapse_alternate_content, node.children)
43 |             return [node]
44 |     else:
45 |         return [node]
46 | 


--------------------------------------------------------------------------------
/mammoth/docx/relationships_xml.py:
--------------------------------------------------------------------------------
 1 | import collections
 2 | 
 3 | 
 4 | class Relationships(object):
 5 |     def __init__(self, relationships):
 6 |         self._targets_by_id = dict(
 7 |             (relationship.relationship_id, relationship.target)
 8 |             for relationship in relationships
 9 |         )
10 |         self._targets_by_type = collections.defaultdict(list)
11 |         for relationship in relationships:
12 |             self._targets_by_type[relationship.type].append(relationship.target)
13 |     
14 |     def find_target_by_relationship_id(self, key):
15 |         return self._targets_by_id[key]
16 |     
17 |     def find_targets_by_type(self, relationship_type):
18 |         return self._targets_by_type[relationship_type]
19 | 
20 | 
21 | Relationships.EMPTY = Relationships([])
22 | 
23 | 
24 | Relationship = collections.namedtuple("Relationship", ["relationship_id", "target", "type"])
25 | 
26 | 
def read_relationships_xml_element(element):
    """Build a ``Relationships`` from the ``relationships:Relationship`` children of ``element``."""
    relationship_elements = element.find_children("relationships:Relationship")
    relationships = [_read_relationship(child) for child in relationship_elements]
    return Relationships(relationships)
30 | 
31 | 
def _read_relationship(element):
    """Read the ``Id``, ``Target`` and ``Type`` attributes into a ``Relationship`` tuple.

    Raises ``KeyError`` when any of the three attributes is missing.
    """
    attributes = element.attributes
    return Relationship(
        relationship_id=attributes["Id"],
        target=attributes["Target"],
        type=attributes["Type"],
    )
39 | 


--------------------------------------------------------------------------------
/mammoth/docx/style_map.py:
--------------------------------------------------------------------------------
 1 | from xml.etree import ElementTree
 2 | 
 3 | from ..zips import open_zip, update_zip
 4 | 
 5 | 
 6 | _style_map_path = "mammoth/style-map"
 7 | _style_map_absolute_path = "/" + _style_map_path
 8 | _relationships_path = "word/_rels/document.xml.rels"
 9 | _content_types_path = "[Content_Types].xml"
10 | 
11 | 
def write_style_map(fileobj, style_map):
    """Embed ``style_map`` in the docx held by ``fileobj``.

    The style map is written to "mammoth/style-map" as UTF-8. The document
    relationships and content types are regenerated to reference it.
    """
    with open_zip(fileobj, "r") as zip_file:
        existing_relationships = zip_file.read_str(_relationships_path)
        relationships_xml = _generate_relationships_xml(existing_relationships)
        existing_content_types = zip_file.read_str(_content_types_path)
        content_types_xml = _generate_content_types_xml(existing_content_types)

    replacements = {
        _style_map_path: style_map.encode("utf8"),
        _relationships_path: relationships_xml,
        _content_types_path: content_types_xml,
    }
    update_zip(fileobj, replacements)
22 | 
def _generate_relationships_xml(relationships_xml):
    """Return ``relationships_xml`` (serialised as UTF-8) with the style map relationship added or updated."""
    relationships_uri = "http://schemas.openxmlformats.org/package/2006/relationships"
    relationship_tag = "{" + relationships_uri + "}Relationship"
    style_map_relationship = {
        "Id": "rMammothStyleMap",
        "Type": "http://schemas.zwobble.org/mammoth/style-map",
        "Target": _style_map_absolute_path,
    }

    root = ElementTree.fromstring(relationships_xml)
    _add_or_update_element(root, relationship_tag, "Id", style_map_relationship)
    return ElementTree.tostring(root, "UTF-8")
36 | 
37 | 
def _generate_content_types_xml(content_types_xml):
    """Return ``content_types_xml`` (serialised as UTF-8) with an ``Override`` for the style map added or updated."""
    content_types_uri = "http://schemas.openxmlformats.org/package/2006/content-types"
    override_tag = "{" + content_types_uri + "}Override"
    style_map_override = {
        "PartName": _style_map_absolute_path,
        "ContentType": "text/prs.mammoth.style-map",
    }

    root = ElementTree.fromstring(content_types_xml)
    _add_or_update_element(root, override_tag, "PartName", style_map_override)
    return ElementTree.tostring(root, "UTF-8")
49 | 
50 | 
def _add_or_update_element(parent, name, identifying_attribute, attributes):
    """Give the matching element under ``parent`` exactly ``attributes``, creating it if absent.

    Matching is delegated to ``_find_child``, which compares the tag and the
    identifying attribute.
    """
    match = _find_child(parent, name, identifying_attribute, attributes)
    if match is not None:
        match.attrib = attributes
        return
    ElementTree.SubElement(parent, name, attributes)
57 |     
58 | 
59 | def _find_child(parent, name, identifying_attribute, attributes):
60 |     for element in parent.iter():
61 |         if element.tag == name and element.get(identifying_attribute) == attributes.get(identifying_attribute):
62 |             return element
63 | 
64 | 
def read_style_map(fileobj):
    """Return the style map embedded in the docx as a string, or ``None`` if there is none."""
    with open_zip(fileobj, "r") as zip_file:
        if not zip_file.exists(_style_map_path):
            return None
        return zip_file.read_str(_style_map_path)
69 | 
70 | 
71 | 


--------------------------------------------------------------------------------
/mammoth/docx/styles_xml.py:
--------------------------------------------------------------------------------
 1 | import collections
 2 | 
 3 | 
 4 | class Styles(object):
 5 |     @staticmethod
 6 |     def create(paragraph_styles=None, character_styles=None, table_styles=None, numbering_styles=None):
 7 |         if paragraph_styles is None:
 8 |             paragraph_styles = {}
 9 |         if character_styles is None:
10 |             character_styles = {}
11 |         if table_styles is None:
12 |             table_styles = {}
13 |         if numbering_styles is None:
14 |             numbering_styles = {}
15 | 
16 |         return Styles(
17 |             paragraph_styles=paragraph_styles,
18 |             character_styles=character_styles,
19 |             table_styles=table_styles,
20 |             numbering_styles=numbering_styles,
21 |         )
22 | 
23 |     def __init__(self, paragraph_styles, character_styles, table_styles, numbering_styles):
24 |         self._paragraph_styles = paragraph_styles
25 |         self._character_styles = character_styles
26 |         self._table_styles = table_styles
27 |         self._numbering_styles = numbering_styles
28 | 
29 |     def find_paragraph_style_by_id(self, style_id):
30 |         return self._paragraph_styles.get(style_id)
31 | 
32 |     def find_character_style_by_id(self, style_id):
33 |         return self._character_styles.get(style_id)
34 | 
35 |     def find_table_style_by_id(self, style_id):
36 |         return self._table_styles.get(style_id)
37 | 
38 |     def find_numbering_style_by_id(self, style_id):
39 |         return self._numbering_styles.get(style_id)
40 | 
41 | 
# Shared instance holding no styles of any kind.
Styles.EMPTY = Styles.create()
48 | 
49 | 
def read_styles_xml_element(element):
    """Read the `w:style` children of `element` into a Styles lookup.

    Each style is filed under its `w:type` (paragraph, character, table or
    numbering); styles of any other type are ignored. A style element without
    a `w:type` attribute raises KeyError, as does a recognised style without
    `w:styleId`.

    When several definitions of the same kind share a style ID, only the first
    is kept: ECMA-376 Part 1, 17.7.4.17 says the first instance keeps the
    identifier and later ones are reassigned, so later definitions must not
    overwrite the first.
    """
    paragraph_styles = {}
    character_styles = {}
    table_styles = {}
    numbering_styles = {}
    style_sets = {
        "paragraph": paragraph_styles,
        "character": character_styles,
        "table": table_styles,
        "numbering": numbering_styles,
    }

    for style_element in element.find_children("w:style"):
        element_type = style_element.attributes["w:type"]
        style_set = style_sets.get(element_type)
        if style_set is None:
            # Unrecognised style type: nothing to file it under.
            continue

        style_id = style_element.attributes["w:styleId"]
        if style_id in style_set:
            # Duplicate style ID: the first definition wins.
            continue

        if element_type == "numbering":
            style_set[style_id] = _read_numbering_style_element(style_element)
        else:
            style_set[style_id] = _read_style_element(style_element)

    return Styles(
        paragraph_styles=paragraph_styles,
        character_styles=character_styles,
        table_styles=table_styles,
        numbering_styles=numbering_styles,
    )
77 | 
78 | 
79 | Style = collections.namedtuple("Style", ["style_id", "name"])
80 | 
81 | 
82 | def _read_style_element(element):
83 |     style_id = element.attributes["w:styleId"]
84 |     name = element.find_child_or_null("w:name").attributes.get("w:val")
85 |     return Style(style_id=style_id, name=name)
86 | 
87 | 
88 | NumberingStyle = collections.namedtuple("NumberingStyle", ["num_id"])
89 | 
90 | 
91 | def _read_numbering_style_element(element):
92 |     num_id = element \
93 |         .find_child_or_null("w:pPr") \
94 |         .find_child_or_null("w:numPr") \
95 |         .find_child_or_null("w:numId") \
96 |         .attributes.get("w:val")
97 | 
98 |     return NumberingStyle(num_id=num_id)
99 | 


--------------------------------------------------------------------------------
/mammoth/docx/uris.py:
--------------------------------------------------------------------------------
 1 | def uri_to_zip_entry_name(base, uri):
 2 |     if uri.startswith("/"):
 3 |         return uri[1:]
 4 |     else:
 5 |         return base + "/" + uri
 6 | 
 7 | 
 8 | def replace_fragment(uri, fragment):
 9 |     hash_index = uri.find("#")
10 |     if hash_index != -1:
11 |         uri = uri[:hash_index]
12 |     return uri + "#" + fragment
13 | 


--------------------------------------------------------------------------------
/mammoth/docx/xmlparser.py:
--------------------------------------------------------------------------------
  1 | import xml.dom.minidom
  2 | 
  3 | import cobble
  4 | 
  5 | 
  6 | @cobble.data
  7 | class XmlElement(object):
  8 |     name = cobble.field()
  9 |     attributes = cobble.field()
 10 |     children = cobble.field()
 11 | 
 12 |     def find_child_or_null(self, name):
 13 |         return self.find_child(name) or null_xml_element
 14 | 
 15 |     def find_child(self, name):
 16 |         for child in self.children:
 17 |             if isinstance(child, XmlElement) and child.name == name:
 18 |                 return child
 19 | 
 20 | 
 21 |     def find_children(self, name):
 22 |         return XmlElementList(filter(
 23 |             lambda child: child.node_type == node_types.element and child.name == name,
 24 |             self.children
 25 |         ))
 26 | 
 27 | 
 28 | class XmlElementList(object):
 29 |     def __init__(self, elements):
 30 |         self._elements = elements
 31 | 
 32 |     def __iter__(self):
 33 |         return iter(self._elements)
 34 | 
 35 |     def find_children(self, name):
 36 |         children = []
 37 |         for element in self._elements:
 38 |             for child in element.find_children(name):
 39 |                 children.append(child)
 40 |         return XmlElementList(children)
 41 | 
 42 | 
class NullXmlElement(object):
    """Stand-in used when a looked-up child element does not exist.

    It has no attributes and no children, and looking up a child on it yields
    the same object again, so chained `find_child_or_null` calls never hit None.
    """
    attributes = {}
    children = []

    def find_child_or_null(self, name):
        # Nothing can be found here: return this object so lookups keep chaining.
        return self

    def find_child(self, name):
        return None


# Shared instance returned by XmlElement.find_child_or_null.
null_xml_element = NullXmlElement()
 55 | 
 56 | 
@cobble.data
class XmlText(object):
    """A text node in the converted XML tree; `value` holds its text."""
    value = cobble.field()
 60 | 
 61 | 
 62 | def element(name, attributes=None, children=None):
 63 |     return XmlElement(name, attributes or {}, children or [])
 64 | 
# Alias so text nodes can be built with `text(...)`, alongside `element(...)`.
text = XmlText


class node_types(object):
    """Integer tags identifying the kind of a node in the converted tree."""
    element = 1
    text = 3


# Tag each node class with its kind; XmlElement.find_children checks `node_type`.
XmlElement.node_type = node_types.element
XmlText.node_type = node_types.text
 75 | 
 76 | 
 77 | 
 78 | def parse_xml(fileobj, namespace_mapping=None):
 79 |     if namespace_mapping is None:
 80 |         namespace_prefixes = {}
 81 |     else:
 82 |         namespace_prefixes = dict((uri, prefix) for prefix, uri in namespace_mapping)
 83 | 
 84 |     document = xml.dom.minidom.parse(fileobj)
 85 | 
 86 |     def convert_node(node):
 87 |         if node.nodeType == xml.dom.Node.ELEMENT_NODE:
 88 |             return convert_element(node)
 89 |         elif node.nodeType == xml.dom.Node.TEXT_NODE:
 90 |             return XmlText(node.nodeValue)
 91 |         else:
 92 |             return None
 93 | 
 94 |     def convert_element(element):
 95 |         converted_name = convert_name(element)
 96 | 
 97 |         converted_attributes = dict(
 98 |             (convert_name(attribute), attribute.value)
 99 |             for attribute in element.attributes.values()
100 |             if attribute.namespaceURI != "http://www.w3.org/2000/xmlns/"
101 |         )
102 | 
103 |         converted_children = []
104 |         for child_node in element.childNodes:
105 |             converted_child_node = convert_node(child_node)
106 |             if converted_child_node is not None:
107 |                 converted_children.append(converted_child_node)
108 | 
109 |         return XmlElement(converted_name, converted_attributes, converted_children)
110 | 
111 |     def convert_name(node):
112 |         if node.namespaceURI is None:
113 |             return node.localName
114 |         else:
115 |             prefix = namespace_prefixes.get(node.namespaceURI)
116 |             if prefix is None:
117 |                 return "{%s}%s" % (node.namespaceURI, node.localName)
118 |             else:
119 |                 return "%s:%s" % (prefix, node.localName)
120 | 
121 |     return convert_node(document.documentElement)
122 | 


--------------------------------------------------------------------------------
/mammoth/html/__init__.py:
--------------------------------------------------------------------------------
  1 | from ..lists import flat_map
  2 | from .nodes import TextNode, Tag, Element, ForceWrite, NodeVisitor
  3 | 
  4 | 
def text(value):
    """Create a TextNode holding `value`."""
    return TextNode(value)
  7 | 
  8 | 
  9 | def tag(tag_names, attributes=None, collapsible=None, separator=None):
 10 |     if not isinstance(tag_names, list):
 11 |         tag_names = [tag_names]
 12 |     if attributes is None:
 13 |         attributes = {}
 14 |     return Tag(tag_names=tag_names, attributes=attributes, collapsible=bool(collapsible), separator=separator)
 15 | 
 16 | 
 17 | def element(tag_names, attributes=None, children=None, collapsible=None, separator=None):
 18 |     if children is None:
 19 |         children = []
 20 |         
 21 |     element_tag = tag(tag_names=tag_names, attributes=attributes, collapsible=collapsible, separator=separator)
 22 |     return Element(element_tag, children)
 23 | 
 24 | 
def collapsible_element(tag_names, attributes=None, children=None):
    """Create an element, as `element` does, whose tag is marked collapsible."""
    return element(tag_names, attributes, children, collapsible=True)


# Shared ForceWrite node instance.
force_write = ForceWrite()
 30 | 
 31 | 
def strip_empty(nodes):
    """Return `nodes` with empty text nodes and empty non-void elements removed."""
    return flat_map(_strip_empty_node, nodes)
 34 | 
 35 | 
def _strip_empty_node(node):
    """Return a list holding the stripped form of `node`, or no items if it is empty."""
    return StripEmpty().visit(node)
 38 | 
 39 | 
 40 | class StripEmpty(NodeVisitor):
 41 |     def visit_text_node(self, node):
 42 |         if node.value:
 43 |             return [node]
 44 |         else:
 45 |             return []
 46 |     
 47 |     def visit_element(self, element):
 48 |         children = strip_empty(element.children)
 49 |         if len(children) == 0 and not element.is_void():
 50 |             return []
 51 |         else:
 52 |             return [Element(element.tag, children)]
 53 |     
 54 |     def visit_force_write(self, node):
 55 |         return [node]
 56 | 
 57 | 
 58 | def collapse(nodes):
 59 |     collapsed = []
 60 |     
 61 |     for node in nodes:
 62 |         _collapsing_add(collapsed, node)
 63 |     
 64 |     return collapsed
 65 | 
class _CollapseNode(NodeVisitor):
    """Visitor returning a node with its children collapsed.

    Text and force-write nodes are returned unchanged; elements are rebuilt
    with the same tag and collapsed children.
    """
    def visit_text_node(self, node):
        return node
    
    def visit_element(self, element):
        return Element(element.tag, collapse(element.children))
    
    def visit_force_write(self, node):
        return node
    
# Function form of the visitor, bound to a single shared instance.
_collapse_node = _CollapseNode().visit
 77 | 
 78 | 
 79 | def _collapsing_add(collapsed, node):
 80 |     collapsed_node = _collapse_node(node)
 81 |     if not _try_collapse(collapsed, collapsed_node):
 82 |         collapsed.append(collapsed_node)
 83 |     
 84 | def _try_collapse(collapsed, node):
 85 |     if not collapsed:
 86 |         return False
 87 | 
 88 |     last = collapsed[-1]
 89 |     if not isinstance(last, Element) or not isinstance(node, Element):
 90 |         return False
 91 |     
 92 |     if not node.collapsible:
 93 |         return False
 94 |         
 95 |     if not _is_match(last, node):
 96 |         return False
 97 |     
 98 |     if node.separator:
 99 |         last.children.append(text(node.separator))
100 |     
101 |     for child in node.children:
102 |         _collapsing_add(last.children, child)
103 |         
104 |     return True
105 | 
106 | def _is_match(first, second):
107 |     return first.tag_name in second.tag_names and first.attributes == second.attributes
108 | 
109 | 
def write(writer, nodes):
    """Write each of `nodes`, in order, to `writer`."""
    _NodeWriter(writer).visit_all(nodes)
113 |         
114 | 
class _NodeWriter(NodeVisitor):
    """Visitor that emits nodes through a writer object."""

    def __init__(self, writer):
        self._writer = writer

    def visit_text_node(self, node):
        self._writer.text(node.value)

    def visit_element(self, element):
        name = element.tag_name
        attributes = element.attributes

        if element.is_void():
            self._writer.self_closing(name, attributes)
            return

        self._writer.start(name, attributes)
        self.visit_all(element.children)
        self._writer.end(name)

    def visit_force_write(self, element):
        # Force-write nodes produce no output of their own.
        pass

    def visit_all(self, nodes):
        for node in nodes:
            self.visit(node)
136 | 


--------------------------------------------------------------------------------
/mammoth/html/nodes.py:
--------------------------------------------------------------------------------
 1 | import cobble
 2 | 
 3 | 
class Node(object):
    """Base class of the HTML node types; the node visitor is generated from it."""
    pass
 6 | 
 7 | 
@cobble.data
class TextNode(Node):
    """A run of text; `value` holds the text."""
    value = cobble.field()
11 | 
12 | 
@cobble.data
class Tag(object):
    """Description of an HTML tag: its names, attributes and collapsing options."""
    tag_names = cobble.field()
    attributes = cobble.field()
    collapsible = cobble.field()
    separator = cobble.field()

    @property
    def tag_name(self):
        # The first of the tag names is the one reported as the tag's name.
        return self.tag_names[0]
23 | 
24 | 
@cobble.data
class Element(Node):
    """An HTML element: a tag plus child nodes.

    The properties below simply expose the corresponding values of the tag.
    """
    tag = cobble.field()
    children = cobble.field()

    @property
    def tag_name(self):
        """The tag's primary name."""
        return self.tag.tag_name

    @property
    def tag_names(self):
        """All names of the tag."""
        return self.tag.tag_names

    @property
    def attributes(self):
        """The tag's attributes."""
        return self.tag.attributes

    @property
    def collapsible(self):
        """Whether the tag is marked collapsible."""
        return self.tag.collapsible

    @property
    def separator(self):
        """The tag's separator."""
        return self.tag.separator

    _VOID_TAG_NAMES = {"br", "hr", "img", "input"}

    def is_void(self):
        """Return whether this is a childless element with a void tag name."""
        if self.children:
            return False
        return self.tag_name in self._VOID_TAG_NAMES
54 | 
55 | 
@cobble.visitable
class ForceWrite(Node):
    """Marker node with no data of its own."""
    pass


# Base class for visitors over the Node subclasses defined in this module.
NodeVisitor = cobble.visitor(Node)
62 | 


--------------------------------------------------------------------------------
/mammoth/html_paths.py:
--------------------------------------------------------------------------------
 1 | import cobble
 2 | 
 3 | from . import html
 4 | 
 5 | 
def path(elements):
    """Create an HtmlPath from the given path elements."""
    return HtmlPath(elements)
 8 | 
 9 | 
def element(names, attributes=None, class_names=None, fresh=None, separator=None):
    """Create an HtmlPathElement.

    `names` is passed to the tag as its tag names. `attributes` defaults to an
    empty dict. When `class_names` is non-empty, the "class" attribute is set
    to the names joined by spaces, replacing any "class" already present. The
    resulting tag is collapsible unless `fresh` is truthy.

    The caller's `attributes` dict is never modified: it is copied before the
    "class" attribute is written.
    """
    if attributes is None:
        attributes = {}

    if class_names:
        # Copy first so the caller's dict is not mutated behind its back.
        attributes = dict(attributes)
        attributes["class"] = " ".join(class_names)

    return HtmlPathElement(html.tag(
        tag_names=names,
        attributes=attributes,
        collapsible=not fresh,
        separator=separator,
    ))
26 | 
27 | 
@cobble.data
class HtmlPath(object):
    """A sequence of path elements, outermost first."""
    elements = cobble.field()

    def wrap(self, generate_nodes):
        """Wrap the nodes produced by `generate_nodes` in every element of the path.

        Wrapping starts with the last element, so the first element of the
        path ends up outermost.
        """
        wrapped = generate_nodes()

        for path_element in self.elements[::-1]:
            wrapped = path_element.wrap_nodes(wrapped)

        return wrapped
39 | 
40 | 
@cobble.data
class HtmlPathElement(object):
    """A single step of an HTML path, described by its tag."""
    tag = cobble.field()

    def wrap(self, generate_nodes):
        """Wrap the nodes produced by calling `generate_nodes`."""
        children = generate_nodes()
        return self.wrap_nodes(children)

    def wrap_nodes(self, nodes):
        """Return a one-item list holding an element with this tag around `nodes`."""
        return [html.Element(self.tag, nodes)]
51 | 
# Path with no elements: wrapping with it returns the generated nodes unchanged.
empty = path([])


class ignore(object):
    """Path-like object that discards content: `wrap` always returns no nodes."""
    @staticmethod
    def wrap(generate_nodes):
        # `generate_nodes` is deliberately never called.
        return []
59 | 


--------------------------------------------------------------------------------
/mammoth/images.py:
--------------------------------------------------------------------------------
 1 | import base64
 2 | 
 3 | from . import html
 4 | 
 5 | 
 6 | def img_element(func):
 7 |     def convert_image(image):
 8 |         attributes = {}
 9 |         if image.alt_text:
10 |             attributes["alt"] = image.alt_text
11 |         attributes.update(func(image))
12 | 
13 |         return [html.element("img", attributes)]
14 | 
15 |     return convert_image
16 | 
17 | # Undocumented, but retained for backwards-compatibility with 0.3.x
18 | inline = img_element
19 | 
20 | 
@img_element
def data_uri(image):
    """Return a `src` attribute embedding the image as a base64 data URI."""
    with image.open() as image_bytes:
        raw = image_bytes.read()

    encoded_src = base64.b64encode(raw).decode("ascii")
    return {"src": "data:{0};base64,{1}".format(image.content_type, encoded_src)}
29 | 


--------------------------------------------------------------------------------
/mammoth/lists.py:
--------------------------------------------------------------------------------
 1 | import sys
 2 | 
 3 | 
 4 | def flatten(values):
 5 |     return flat_map(lambda x: x, values)
 6 | 
 7 | 
 8 | def unique(values):
 9 |     output = []
10 |     seen = set()
11 |     for value in values:
12 |         if value not in seen:
13 |             seen.add(value)
14 |             output.append(value)
15 |     return output
16 | 
17 | 
def flat_map(func, values):
    """Apply `func` to each value and concatenate the iterables it returns into a list."""
    output = []
    for value in values:
        output.extend(func(value))
    return output
24 | 
25 | 
def find_index(predicate, values):
    """Return the index of the first value satisfying `predicate`, or None."""
    matching_indexes = (
        index
        for index, value in enumerate(values)
        if predicate(value)
    )
    return next(matching_indexes, None)
30 | 
31 | 
# `map` and `filter` that always return lists, regardless of Python version.
if sys.version_info[0] != 2:
    import builtins

    def map(*args, **kwargs):
        """Like the builtin `map`, but returns a list."""
        return list(builtins.map(*args, **kwargs))

    def filter(*args, **kwargs):
        """Like the builtin `filter`, but returns a list."""
        return list(builtins.filter(*args, **kwargs))
else:
    # The Python 2 builtins already return lists.
    map = map
    filter = filter
41 | 


--------------------------------------------------------------------------------
/mammoth/options.py:
--------------------------------------------------------------------------------
  1 | from .styles.parser import read_style_mapping
  2 | from . import lists, results
  3 | 
  4 | 
  5 | def read_options(options):
  6 |     custom_style_map_text = options.pop("style_map", "") or ""
  7 |     embedded_style_map_text = options.pop("embedded_style_map", "") or ""
  8 |     include_default_style_map = options.pop("include_default_style_map", True)
  9 | 
 10 |     read_style_map_result = results.combine([
 11 |         _read_style_map(custom_style_map_text),
 12 |         _read_style_map(embedded_style_map_text),
 13 |     ])
 14 | 
 15 |     custom_style_map, embedded_style_map = read_style_map_result.value
 16 |     style_map = custom_style_map + embedded_style_map
 17 | 
 18 |     if include_default_style_map:
 19 |         style_map += _default_style_map
 20 | 
 21 |     options["ignore_empty_paragraphs"] = options.get("ignore_empty_paragraphs", True)
 22 |     options["style_map"] = style_map
 23 |     return read_style_map_result.map(lambda _: options)
 24 | 
 25 | 
 26 | def _read_style_map(style_text):
 27 |     lines = filter(None, map(_get_line, style_text.split("\n")))
 28 |     return results.combine(lists.map(read_style_mapping, lines)) \
 29 |         .map(lambda style_mappings: lists.filter(None, style_mappings))
 30 | 
 31 | 
 32 | def _get_line(line):
 33 |     line = line.strip()
 34 |     if line.startswith("#"):
 35 |         return None
 36 |     else:
 37 |         return line
 38 | 
 39 | 
# Built-in style mappings; read_options appends these after any custom and
# embedded mappings unless include_default_style_map is false.
_default_style_map_result = _read_style_map("""
p.Heading1 => h1:fresh
p.Heading2 => h2:fresh
p.Heading3 => h3:fresh
p.Heading4 => h4:fresh
p.Heading5 => h5:fresh
p.Heading6 => h6:fresh
p[style-name='Heading 1'] => h1:fresh
p[style-name='Heading 2'] => h2:fresh
p[style-name='Heading 3'] => h3:fresh
p[style-name='Heading 4'] => h4:fresh
p[style-name='Heading 5'] => h5:fresh
p[style-name='Heading 6'] => h6:fresh
p[style-name='heading 1'] => h1:fresh
p[style-name='heading 2'] => h2:fresh
p[style-name='heading 3'] => h3:fresh
p[style-name='heading 4'] => h4:fresh
p[style-name='heading 5'] => h5:fresh
p[style-name='heading 6'] => h6:fresh

# Apple Pages
p.Heading => h1:fresh
p[style-name='Heading'] => h1:fresh

r[style-name='Strong'] => strong

p[style-name='footnote text'] => p:fresh
r[style-name='footnote reference'] =>
p[style-name='endnote text'] => p:fresh
r[style-name='endnote reference'] =>
p[style-name='annotation text'] => p:fresh
r[style-name='annotation reference'] =>

# LibreOffice
p[style-name='Footnote'] => p:fresh
r[style-name='Footnote anchor'] =>
p[style-name='Endnote'] => p:fresh
r[style-name='Endnote anchor'] =>

p:unordered-list(1) => ul > li:fresh
p:unordered-list(2) => ul|ol > li > ul > li:fresh
p:unordered-list(3) => ul|ol > li > ul|ol > li > ul > li:fresh
p:unordered-list(4) => ul|ol > li > ul|ol > li > ul|ol > li > ul > li:fresh
p:unordered-list(5) => ul|ol > li > ul|ol > li > ul|ol > li > ul|ol > li > ul > li:fresh
p:ordered-list(1) => ol > li:fresh
p:ordered-list(2) => ul|ol > li > ol > li:fresh
p:ordered-list(3) => ul|ol > li > ul|ol > li > ol > li:fresh
p:ordered-list(4) => ul|ol > li > ul|ol > li > ul|ol > li > ol > li:fresh
p:ordered-list(5) => ul|ol > li > ul|ol > li > ul|ol > li > ul|ol > li > ol > li:fresh

r[style-name='Hyperlink'] =>

p[style-name='Normal'] => p:fresh

# Apple Pages
p.Body => p:fresh
p[style-name='Body'] => p:fresh
""")


# Every built-in mapping must parse without warnings; checked once at import time.
assert not _default_style_map_result.messages
_default_style_map = _default_style_map_result.value
102 | 


--------------------------------------------------------------------------------
/mammoth/raw_text.py:
--------------------------------------------------------------------------------
 1 | from . import documents
 2 | 
 3 | 
 4 | def extract_raw_text_from_element(element):
 5 |     if isinstance(element, documents.Text):
 6 |         return element.value
 7 |     elif isinstance(element, documents.Tab):
 8 |         return "\t"
 9 |     else:
10 |         text = "".join(map(extract_raw_text_from_element, getattr(element, "children", [])))
11 |         if isinstance(element, documents.Paragraph):
12 |             return text + "\n\n"
13 |         else:
14 |             return text
15 | 


--------------------------------------------------------------------------------
/mammoth/results.py:
--------------------------------------------------------------------------------
 1 | import collections
 2 | 
 3 | from .lists import unique
 4 | 
 5 | 
 6 | class Result(object):
 7 |     def __init__(self, value, messages):
 8 |         self.value = value
 9 |         self.messages = unique(messages)
10 |     
11 |     def map(self, func):
12 |         return Result(func(self.value), self.messages)
13 |     
14 |     def bind(self, func):
15 |         result = func(self.value)
16 |         return Result(result.value, self.messages + result.messages)
17 | 
18 | 
# A message attached to a result: its type (such as "warning") and its text.
Message = collections.namedtuple("Message", ["type", "message"])


def warning(message):
    """Create a Message of type "warning" with the given text."""
    return Message(type="warning", message=message)
24 | 
25 | 
def success(value):
    """Wrap `value` in a Result carrying no messages."""
    return Result(value=value, messages=[])
28 | 
29 | 
def combine(results):
    """Merge several results into one.

    The value of the returned result is the list of the individual values, in
    order; its messages are all of their messages, in order.
    """
    values = []
    messages = []
    for result in results:
        values.append(result.value)
        messages.extend(result.messages)

    return Result(values, messages)
39 | 
40 | 
def map(func, *args):
    """Call `func` with the values of the results in `args`, keeping all their messages."""
    def apply_to_values(values):
        return func(*values)

    return combine(args).map(apply_to_values)
43 | 


--------------------------------------------------------------------------------
/mammoth/styles/__init__.py:
--------------------------------------------------------------------------------
1 | import collections
2 | 
3 | 
def style(document_matcher, html_path):
    """Create a Style pairing a document matcher with an HTML path."""
    return Style(document_matcher=document_matcher, html_path=html_path)


# A style mapping: what to match in the document, and the HTML path to produce.
Style = collections.namedtuple("Style", ["document_matcher", "html_path"])
9 | 


--------------------------------------------------------------------------------
/mammoth/styles/parser/__init__.py:
--------------------------------------------------------------------------------
 1 | from .errors import LineParseError
 2 | from .style_mapping_parser import parse_style_mapping
 3 | from .tokeniser import tokenise
 4 | from .token_iterator import TokenIterator
 5 | from ... import results
 6 | 
 7 | 
 8 | def read_style_mapping(string):
 9 |     try:
10 |         tokens = tokenise(string)
11 |         return results.success(parse_style_mapping(TokenIterator(tokens)))
12 |     except LineParseError:
13 |         warning = "Did not understand this style mapping, so ignored it: " + string
14 |         return results.Result(None, [results.warning(warning)])
15 | 


--------------------------------------------------------------------------------
/mammoth/styles/parser/document_matcher_parser.py:
--------------------------------------------------------------------------------
  1 | from ... import documents, document_matchers
  2 | from .errors import LineParseError
  3 | from .tokeniser import TokenType
  4 | from .token_parser import try_parse_class_name, parse_string
  5 | 
  6 | 
  7 | def parse_document_matcher(tokens):
  8 |     if tokens.try_skip(TokenType.IDENTIFIER, "p"):
  9 |         style_id = try_parse_class_name(tokens)
 10 |         style_name = _parse_style_name(tokens)
 11 |         numbering = _parse_numbering(tokens)
 12 | 
 13 |         return document_matchers.paragraph(
 14 |             style_id=style_id,
 15 |             style_name=style_name,
 16 |             numbering=numbering,
 17 |         )
 18 | 
 19 |     elif tokens.try_skip(TokenType.IDENTIFIER, "r"):
 20 |         style_id = try_parse_class_name(tokens)
 21 |         style_name = _parse_style_name(tokens)
 22 | 
 23 |         return document_matchers.run(
 24 |             style_id=style_id,
 25 |             style_name=style_name,
 26 |         )
 27 | 
 28 |     elif tokens.try_skip(TokenType.IDENTIFIER, "table"):
 29 |         style_id = try_parse_class_name(tokens)
 30 |         style_name = _parse_style_name(tokens)
 31 | 
 32 |         return document_matchers.table(
 33 |             style_id=style_id,
 34 |             style_name=style_name,
 35 |         )
 36 | 
 37 |     elif tokens.try_skip(TokenType.IDENTIFIER, "b"):
 38 |         return document_matchers.bold
 39 | 
 40 |     elif tokens.try_skip(TokenType.IDENTIFIER, "i"):
 41 |         return document_matchers.italic
 42 | 
 43 |     elif tokens.try_skip(TokenType.IDENTIFIER, "u"):
 44 |         return document_matchers.underline
 45 | 
 46 |     elif tokens.try_skip(TokenType.IDENTIFIER, "strike"):
 47 |         return document_matchers.strikethrough
 48 | 
 49 |     elif tokens.try_skip(TokenType.IDENTIFIER, "all-caps"):
 50 |         return document_matchers.all_caps
 51 | 
 52 |     elif tokens.try_skip(TokenType.IDENTIFIER, "small-caps"):
 53 |         return document_matchers.small_caps
 54 | 
 55 |     elif tokens.try_skip(TokenType.IDENTIFIER, "highlight"):
 56 |         return _parse_highlight(tokens)
 57 | 
 58 |     elif tokens.try_skip(TokenType.IDENTIFIER, "comment-reference"):
 59 |         return document_matchers.comment_reference
 60 | 
 61 |     elif tokens.try_skip(TokenType.IDENTIFIER, "br"):
 62 |         return _parse_break(tokens)
 63 | 
 64 |     else:
 65 |         raise LineParseError("Unrecognised document element: {0}".format(tokens.next_value(TokenType.IDENTIFIER)))
 66 | 
 67 | def _parse_style_name(tokens):
 68 |     if tokens.try_skip(TokenType.SYMBOL, "["):
 69 |         tokens.skip(TokenType.IDENTIFIER, "style-name")
 70 |         string_matcher = _parse_string_matcher(tokens)
 71 |         tokens.skip(TokenType.SYMBOL, "]")
 72 |         return string_matcher
 73 |     else:
 74 |         return None
 75 | 
 76 | 
 77 | def _parse_string_matcher(tokens):
 78 |     if tokens.try_skip(TokenType.SYMBOL, "="):
 79 |         return document_matchers.equal_to(parse_string(tokens))
 80 |     elif tokens.try_skip(TokenType.SYMBOL, "^="):
 81 |         return document_matchers.starts_with(parse_string(tokens))
 82 |     else:
 83 |         raise LineParseError("Unrecognised string matcher: {0}".format(tokens.next_value()))
 84 | 
 85 | def _parse_numbering(tokens):
 86 |     if tokens.try_skip(TokenType.SYMBOL, ":"):
 87 |         is_ordered = _parse_list_type(tokens)
 88 |         tokens.skip(TokenType.SYMBOL, "(")
 89 |         level = int(tokens.next_value(TokenType.INTEGER)) - 1
 90 |         tokens.skip(TokenType.SYMBOL, ")")
 91 |         return documents.numbering_level(level, is_ordered=is_ordered)
 92 | 
 93 | 
 94 | def _parse_list_type(tokens):
 95 |     list_type = tokens.next_value(TokenType.IDENTIFIER)
 96 |     if list_type == "ordered-list":
 97 |         return True
 98 |     elif list_type == "unordered-list":
 99 |         return False
100 |     else:
101 |         raise LineParseError("Unrecognised list type: {0}".format(list_type))
102 | 
103 | 
def _parse_highlight(tokens):
    """Parse the optional `[color=...]` clause of a highlight matcher.

    Without the clause, the matcher is created with a color of None.
    """
    color = None
    if tokens.try_skip(TokenType.SYMBOL, "["):
        tokens.skip(TokenType.IDENTIFIER, "color")
        tokens.skip(TokenType.SYMBOL, "=")
        color = parse_string(tokens)
        tokens.skip(TokenType.SYMBOL, "]")

    return document_matchers.highlight(color=color)
114 | 
115 | 
def _parse_break(tokens):
    """Parse the mandatory `[type=...]` clause of a break matcher.

    The type must be "line", "page" or "column"; anything else raises
    LineParseError.
    """
    tokens.skip(TokenType.SYMBOL, "[")
    tokens.skip(TokenType.IDENTIFIER, "type")
    tokens.skip(TokenType.SYMBOL, "=")
    type_name = parse_string(tokens)
    tokens.skip(TokenType.SYMBOL, "]")

    breaks = {
        "line": document_matchers.line_break,
        "page": document_matchers.page_break,
        "column": document_matchers.column_break,
    }
    if type_name not in breaks:
        raise LineParseError("Unrecognised break type: {0}".format(type_name))
    return breaks[type_name]
131 | 


--------------------------------------------------------------------------------
/mammoth/styles/parser/errors.py:
--------------------------------------------------------------------------------
class LineParseError(Exception):
    """Raised when a style mapping line cannot be parsed."""
3 | 


--------------------------------------------------------------------------------
/mammoth/styles/parser/html_path_parser.py:
--------------------------------------------------------------------------------
  1 | import cobble
  2 | 
  3 | from ... import html_paths
  4 | from .tokeniser import TokenType
  5 | from .token_parser import parse_identifier, parse_string
  6 | 
  7 | 
@cobble.data
class _AttributeOrClassName(object):
    """A parsed attribute or class name of an HTML path element.

    When `append` is set and the attribute already has a value, `value` is
    added to it after a space rather than replacing it.
    """
    name = cobble.field()
    value = cobble.field()
    append = cobble.field()
 13 | 
 14 | 
 15 | def parse_html_path(tokens):
 16 |     if tokens.try_skip(TokenType.SYMBOL, "!"):
 17 |         return html_paths.ignore
 18 |     else:
 19 |         return html_paths.path(_parse_html_path_elements(tokens))
 20 | 
 21 | 
 22 | def _parse_html_path_elements(tokens):
 23 |     elements = []
 24 | 
 25 |     if tokens.peek_token_type() == TokenType.IDENTIFIER:
 26 |         elements.append(_parse_element(tokens))
 27 | 
 28 |         while tokens.try_skip_many(((TokenType.WHITESPACE, None), (TokenType.SYMBOL, ">"))):
 29 |             tokens.skip(TokenType.WHITESPACE)
 30 |             elements.append(_parse_element(tokens))
 31 | 
 32 |     return elements
 33 | 
 34 | 
 35 | def _parse_element(tokens):
 36 |     tag_names = _parse_tag_names(tokens)
 37 |     attributes_list = _parse_attribute_or_class_names(tokens)
 38 |     is_fresh = _parse_is_fresh(tokens)
 39 |     separator = _parse_separator(tokens)
 40 | 
 41 |     attributes = {}
 42 |     for attribute in attributes_list:
 43 |         if attribute.append and attributes.get(attribute.name):
 44 |             attributes[attribute.name] += " " + attribute.value
 45 |         else:
 46 |             attributes[attribute.name] = attribute.value
 47 | 
 48 |     return html_paths.element(
 49 |         tag_names,
 50 |         attributes=attributes,
 51 |         fresh=is_fresh,
 52 |         separator=separator,
 53 |     )
 54 | 
 55 | 
 56 | def _parse_tag_names(tokens):
 57 |     tag_names = [parse_identifier(tokens)]
 58 | 
 59 |     while tokens.try_skip(TokenType.SYMBOL, "|"):
 60 |         tag_names.append(parse_identifier(tokens))
 61 | 
 62 |     return tag_names
 63 | 
 64 | 
 65 | def _parse_attribute_or_class_names(tokens):
 66 |     attribute_or_class_names = []
 67 | 
 68 |     while True:
 69 |         attribute_or_class_name = _try_parse_attribute_or_class_name(tokens)
 70 |         if attribute_or_class_name is None:
 71 |             break
 72 |         else:
 73 |             attribute_or_class_names.append(attribute_or_class_name)
 74 | 
 75 |     return attribute_or_class_names
 76 | 
 77 | 
 78 | def _try_parse_attribute_or_class_name(tokens):
 79 |     if tokens.is_next(TokenType.SYMBOL, "["):
 80 |         return _parse_attribute(tokens)
 81 |     if tokens.is_next(TokenType.SYMBOL, "."):
 82 |         return _parse_class_name(tokens)
 83 |     else:
 84 |         return None
 85 | 
 86 | 
 87 | def _parse_attribute(tokens):
 88 |     tokens.skip(TokenType.SYMBOL, "[")
 89 |     name = parse_identifier(tokens)
 90 |     tokens.skip(TokenType.SYMBOL, "=")
 91 |     value = parse_string(tokens)
 92 |     tokens.skip(TokenType.SYMBOL, "]")
 93 |     return _AttributeOrClassName(name=name, value=value, append=False)
 94 | 
 95 | 
 96 | def _parse_class_name(tokens):
 97 |     tokens.skip(TokenType.SYMBOL, ".")
 98 |     class_name = parse_identifier(tokens)
 99 |     return _AttributeOrClassName(name="class", value=class_name, append=True)
100 | 
101 | 
def _parse_is_fresh(tokens):
    """Consume ``:fresh`` if it is next; return whether it was present."""
    fresh_marker = (
        (TokenType.SYMBOL, ":"),
        (TokenType.IDENTIFIER, "fresh"),
    )
    return tokens.try_skip_many(fresh_marker)
107 | 
108 | 
def _parse_separator(tokens):
    """Parse an optional ``:separator(<string>)`` suffix.

    Returns the decoded string, or ``None`` when the suffix is absent.
    """
    separator_marker = (
        (TokenType.SYMBOL, ":"),
        (TokenType.IDENTIFIER, "separator"),
    )
    if not tokens.try_skip_many(separator_marker):
        return None

    tokens.skip(TokenType.SYMBOL, "(")
    separator = parse_string(tokens)
    tokens.skip(TokenType.SYMBOL, ")")
    return separator
121 | 


--------------------------------------------------------------------------------
/mammoth/styles/parser/style_mapping_parser.py:
--------------------------------------------------------------------------------
 1 | from .tokeniser import TokenType
 2 | from .document_matcher_parser import parse_document_matcher
 3 | from .html_path_parser import parse_html_path
 4 | from ...styles import Style
 5 | 
 6 | 
 7 | def parse_style_mapping(tokens):
 8 |     document_matcher = parse_document_matcher(tokens)
 9 |     tokens.skip(TokenType.WHITESPACE)
10 |     tokens.skip(TokenType.SYMBOL, "=>")
11 |     tokens.try_skip(TokenType.WHITESPACE)
12 |     html_path = parse_html_path(tokens)
13 |     tokens.skip(TokenType.END)
14 |     
15 |     return Style(document_matcher, html_path)
16 | 


--------------------------------------------------------------------------------
/mammoth/styles/parser/token_iterator.py:
--------------------------------------------------------------------------------
 1 | # TODO: check indices
 2 | # TODO: proper tests for unexpected tokens
 3 | 
 4 | from .errors import LineParseError
 5 | 
 6 | 
 7 | class TokenIterator(object):
 8 |     def __init__(self, tokens):
 9 |         self._tokens = tokens
10 |         self._index = 0
11 | 
12 |     def peek_token_type(self):
13 |         return self._tokens[self._index].type
14 | 
15 |     def next_value(self, token_type=None):
16 |         return self._next(token_type).value
17 | 
18 |     def _next(self, token_type=None):
19 |         token = self._tokens[self._index]
20 |         if token_type is None or token.type == token_type:
21 |             self._index += 1
22 |             return token
23 |         else:
24 |             raise self._unexpected_token_type(token_type, token)
25 | 
26 |     def skip(self, token_type, token_value=None):
27 |         token = self._tokens[self._index]
28 |         if token.type == token_type and (token_value is None or token.value == token_value):
29 |             self._index += 1
30 |             return True
31 |         else:
32 |             raise self._unexpected_token_type(token_type, token)
33 | 
34 |     def try_skip(self, token_type, token_value=None):
35 |         if self.is_next(token_type, token_value):
36 |             self._index += 1
37 |             return True
38 |         else:
39 |             return False
40 | 
41 |     def try_skip_many(self, tokens):
42 |         start = self._index
43 |         for token_type, token_value in tokens:
44 |             token = self._tokens[self._index]
45 |             if not (token.type == token_type and (token_value is None or token.value == token_value)):
46 |                 self._index = start
47 |                 return False
48 |             else:
49 |                 self._index += 1
50 | 
51 |         return True
52 | 
53 |     def is_next(self, token_type, token_value=None):
54 |         token = self._tokens[self._index]
55 |         return token.type == token_type and (token_value is None or token.value == token_value)
56 | 
57 |     def _unexpected_token_type(self, token_type, token):
58 |         raise LineParseError()
59 | 
60 | 


--------------------------------------------------------------------------------
/mammoth/styles/parser/token_parser.py:
--------------------------------------------------------------------------------
 1 | import re
 2 | 
 3 | from .tokeniser import TokenType
 4 | 
 5 | 
 6 | def try_parse_class_name(tokens):
 7 |     if tokens.try_skip(TokenType.SYMBOL, "."):
 8 |         return parse_identifier(tokens)
 9 |     else:
10 |         return None
11 | 
12 | 
def parse_identifier(tokens):
    """Consume an identifier token and return it with escapes decoded."""
    raw_identifier = tokens.next_value(TokenType.IDENTIFIER)
    return decode_escape_sequences(raw_identifier)
15 | 
16 | 
def parse_string(tokens):
    """Consume a string token and return its contents with escapes decoded.

    The first and last characters of the token (the delimiters) are dropped.
    """
    raw_string = tokens.next_value(TokenType.STRING)
    contents = raw_string[1:-1]
    return decode_escape_sequences(contents)
19 | 
20 | 
21 | _ESCAPE_SEQUENCE_REGEX = re.compile(r"\\(.)")
22 | 
23 | 
24 | def decode_escape_sequences(value):
25 |     return _ESCAPE_SEQUENCE_REGEX.sub(_decode_escape_sequence, value)
26 |     
27 |     
28 | def _decode_escape_sequence(match):
29 |     code = match.group(1)
30 |     if code == "n":
31 |         return "\n"
32 |     elif code == "r":
33 |         return "\r"
34 |     elif code == "t":
35 |         return "\t"
36 |     else:
37 |         return code
38 | 


--------------------------------------------------------------------------------
/mammoth/styles/parser/tokeniser.py:
--------------------------------------------------------------------------------
 1 | import collections
 2 | import re
 3 | 
 4 | 
 5 | Token = collections.namedtuple("Token", ["character_index", "type", "value"])
 6 | 
 7 | 
 8 | class TokenType(object):
 9 |     IDENTIFIER = "identifier"
10 |     SYMBOL = "symbol"
11 |     WHITESPACE = "whitespace"
12 |     STRING = "string"
13 |     UNTERMINATED_STRING = "unterminated string"
14 |     INTEGER = "integer"
15 |     END = "end"
16 |     
17 | 
18 | 
def regex_tokeniser(rules):
    """Build a tokeniser from an ordered list of ``(token_type, regex)`` rules.

    Each regex may be a pattern string or an already-compiled pattern. The
    returned ``tokenise(value)`` function tries the rules in order at each
    position and emits a ``Token`` for the first rule that matches. A final
    catch-all rule emits an ``"unknown"`` token for any single character that no
    rule matched. An end token is always appended.
    """
    rules = [(token_type, _to_regex(regex)) for token_type, regex in rules]
    # DOTALL so that the catch-all also matches a newline; without it a newline
    # not covered by the caller's rules would match nothing at all.
    rules.append(("unknown", re.compile(".", re.DOTALL)))

    def tokenise(value):
        tokens = []
        index = 0
        while index < len(value):
            for token_type, regex in rules:
                match = regex.match(value, index)
                if match is not None:
                    tokens.append(Token(index, token_type, match.group(0)))
                    index = match.end()
                    break
            else:
                # Should be impossible
                raise Exception("Remaining: " + value[index:])

        tokens.append(Token(index, TokenType.END, ""))

        return tokens

    return tokenise
42 |     
43 | 
44 | def _to_regex(value):
45 |     if hasattr(value, "match"):
46 |         return value
47 |     else:
48 |         return re.compile(value)
49 | 
50 | 
# An opening quote followed by any run of escaped characters or non-quote
# characters; the closing quote is added by the STRING rule below.
_string_prefix = r"'(?:\\.|[^'])*"
# A letter, hyphen, underscore, or any backslash-escaped character.
_identifier_character = r"(?:[a-zA-Z\-_]|\\.)"

# Rules are tried in order, so STRING must come before UNTERMINATED_STRING.
tokenise = regex_tokeniser([
    (TokenType.IDENTIFIER, "{0}(?:{0}|[0-9])*".format(_identifier_character)),
    (TokenType.SYMBOL, r":|>|=>|\^=|=|\(|\)|\[|\]|\||!|\."),
    (TokenType.WHITESPACE, r"\s+"),
    (TokenType.STRING, _string_prefix + "'"),
    (TokenType.UNTERMINATED_STRING, _string_prefix),
    (TokenType.INTEGER, "([0-9]+)"),
])
62 | 


--------------------------------------------------------------------------------
/mammoth/transforms.py:
--------------------------------------------------------------------------------
 1 | from . import documents
 2 | 
 3 | 
 4 | def paragraph(transform_paragraph):
 5 |     return element_of_type(documents.Paragraph, transform_paragraph)
 6 | 
 7 | 
 8 | def run(transform_run):
 9 |     return element_of_type(documents.Run, transform_run)
10 | 
11 | 
def element_of_type(element_type, transform):
    """Build a transform applying ``transform`` to instances of ``element_type``.

    Elements of any other type are returned unchanged. The per-element function
    is wrapped with ``_each_element``.
    """
    def transform_if_matching(element):
        if not isinstance(element, element_type):
            return element
        return transform(element)

    return _each_element(transform_if_matching)
20 | 
21 | 
22 | def _each_element(transform_element):
23 |     def transform_element_and_children(element):
24 |         if isinstance(element, (documents.HasChildren, documents.TableCellUnmerged)):
25 |             children = list(map(transform_element_and_children, element.children))
26 |             element = element.copy(children=children)
27 | 
28 |         return transform_element(element)
29 | 
30 |     return transform_element_and_children
31 | 
32 | 
def get_descendants_of_type(element, element_type):
    """Return the descendants of ``element`` that are instances of ``element_type``."""
    return [
        descendant
        for descendant in get_descendants(element)
        if isinstance(descendant, element_type)
    ]
38 | 
39 | 
def get_descendants(element):
    """Return a list of every descendant of ``element``, in visiting order."""
    collected = []
    _visit_descendants(element, collected.append)
    return collected
49 | 
50 | 
51 | def _visit_descendants(element, visit):
52 |     if isinstance(element, documents.HasChildren):
53 |         for child in element.children:
54 |             _visit_descendants(child, visit)
55 |             visit(child)
56 | 
57 | 


--------------------------------------------------------------------------------
/mammoth/underline.py:
--------------------------------------------------------------------------------
1 | from . import html
2 | 
3 | 
def element(name):
    """Build an underline converter wrapping nodes in a collapsible ``name`` element.

    The returned function takes a list of nodes and returns a one-item list
    holding the wrapping element, which has no attributes.
    """
    def convert_underline(nodes):
        wrapper = html.collapsible_element(name, {}, nodes)
        return [wrapper]

    return convert_underline
9 | 


--------------------------------------------------------------------------------
/mammoth/writers/__init__.py:
--------------------------------------------------------------------------------
 1 | from .html import HtmlWriter
 2 | from .markdown import MarkdownWriter
 3 | 
 4 | 
 5 | def writer(output_format=None):
 6 |     if output_format is None:
 7 |         output_format = "html"
 8 |     
 9 |     return _writers[output_format]()
10 | 
11 | 
def formats():
    """Return the names of the supported output formats (the keys of ``_writers``)."""
    supported = _writers.keys()
    return supported
14 | 
15 | 
# Maps each output format name to the writer class used to produce it.
_writers = dict(
    html=HtmlWriter,
    markdown=MarkdownWriter,
)
20 | 


--------------------------------------------------------------------------------
/mammoth/writers/abc.py:
--------------------------------------------------------------------------------
 1 | from __future__ import absolute_import
 2 | 
 3 | import abc
 4 | 
 5 | 
 6 | class Writer(object):
 7 |     __metaclass__ = abc.ABCMeta
 8 |     
 9 |     @abc.abstractmethod
10 |     def text(self, text):
11 |         pass
12 |     
13 |     @abc.abstractmethod
14 |     def start(self, name, attributes=None):
15 |         pass
16 | 
17 |     @abc.abstractmethod
18 |     def end(self, name):
19 |         pass
20 |     
21 |     @abc.abstractmethod
22 |     def self_closing(self, name, attributes=None):
23 |         pass
24 |     
25 |     @abc.abstractmethod
26 |     def append(self, html):
27 |         pass
28 |     
29 |     @abc.abstractmethod
30 |     def as_string(self):
31 |         pass
32 | 


--------------------------------------------------------------------------------
/mammoth/writers/html.py:
--------------------------------------------------------------------------------
 1 | from __future__ import unicode_literals
 2 | from xml.sax.saxutils import escape
 3 | 
 4 | from .abc import Writer
 5 | 
 6 | 
 7 | class HtmlWriter(Writer):
 8 |     def __init__(self):
 9 |         self._fragments = []
10 |     
11 |     def text(self, text):
12 |         self._fragments.append(_escape_html(text))
13 |     
14 |     def start(self, name, attributes=None):
15 |         attribute_string = _generate_attribute_string(attributes)
16 |         self._fragments.append("<{0}{1}>".format(name, attribute_string))
17 | 
18 |     def end(self, name):
19 |         self._fragments.append("</{0}>".format(name))
20 |     
21 |     def self_closing(self, name, attributes=None):
22 |         attribute_string = _generate_attribute_string(attributes)
23 |         self._fragments.append("<{0}{1} />".format(name, attribute_string))
24 |     
25 |     def append(self, html):
26 |         self._fragments.append(html)
27 |     
28 |     def as_string(self):
29 |         return "".join(self._fragments)
30 | 
31 | 
32 | def _escape_html(text):
33 |     return escape(text, {'"': "&quot;"})
34 | 
35 | 
36 | def _generate_attribute_string(attributes):
37 |     if attributes is None:
38 |         return ""
39 |     else:
40 |         return "".join(
41 |             ' {0}="{1}"'.format(key, _escape_html(attributes[key]))
42 |             for key in sorted(attributes)
43 |         )
44 | 


--------------------------------------------------------------------------------
/mammoth/writers/markdown.py:
--------------------------------------------------------------------------------
  1 | from __future__ import unicode_literals
  2 | 
  3 | from .abc import Writer
  4 | 
  5 | import re
  6 | 
  7 | 
  8 | class _WriterOutput(object):
  9 |     def __init__(self, start, end=None, generate_end=None, anchor_position=None):
 10 |         if generate_end is None:
 11 |             generate_end = _constant(end)
 12 |         
 13 |         self.start = start
 14 |         self.generate_end = generate_end
 15 |         self.anchor_position = anchor_position
 16 | 
 17 | 
 18 | def _constant(value):
 19 |     def get():
 20 |         return value
 21 |     
 22 |     return get
 23 | 
 24 | 
 25 | class _MarkdownState(object):
 26 |     def __init__(self):
 27 |         self._list_state_stack = []
 28 |         self.list_state = None
 29 |         self.list_item_has_closed = False
 30 |     
 31 |     def update_list_state(self, list_state):
 32 |         self._list_state_stack.append(self.list_state)
 33 |         self.list_state = list_state
 34 |     
 35 |     def pop_list_state(self):
 36 |         self.list_state = self._list_state_stack.pop()
 37 | 
 38 | 
 39 | class _MarkdownListState(object):
 40 |     def __init__(self, ordered, indentation):
 41 |         self.ordered = ordered
 42 |         self.count = 0
 43 |         self.indentation = indentation
 44 | 
 45 | 
 46 | def _symmetric_wrapped(end):
 47 |     return _Wrapped(end, end)
 48 | 
 49 | 
 50 | class _Wrapped(object):
 51 |     def __init__(self, start, end):
 52 |         self._start = start
 53 |         self._end = end
 54 |     
 55 |     def __call__(self, attributes, markdown_state):
 56 |         return _WriterOutput(self._start, self._end)
 57 | 
 58 | 
 59 | def _hyperlink(attributes, markdown_state):
 60 |     href = attributes.get("href", "")
 61 |     if href:
 62 |         return _WriterOutput(
 63 |             "[", "]({0})".format(href),
 64 |             anchor_position="before",
 65 |         )
 66 |     else:
 67 |         return _default_output
 68 | 
 69 | 
 70 | def _image(attributes, markdown_state):
 71 |     src = attributes.get("src", "")
 72 |     alt_text = attributes.get("alt", "")
 73 |     if src or alt_text:
 74 |         return _WriterOutput("![{0}]({1})".format(alt_text, src), "")
 75 |     else:
 76 |         return _default_output
 77 | 
 78 | 
 79 | def _list(ordered):
 80 |     def call(attributes, markdown_state):
 81 |         if markdown_state.list_state is None:
 82 |             start = ""
 83 |             end_text = "\n"
 84 |             indentation = 0
 85 |         else:
 86 |             start = "\n"
 87 |             end_text = ""
 88 |             indentation = markdown_state.list_state.indentation + 1
 89 |         
 90 |         def generate_end():
 91 |             markdown_state.pop_list_state()
 92 |             return end_text
 93 |         
 94 |         markdown_state.update_list_state(_MarkdownListState(
 95 |             ordered=ordered,
 96 |             indentation=indentation,
 97 |         ))
 98 |         
 99 |         return _WriterOutput(start, generate_end=generate_end)
100 |     
101 |     return call
102 | 
103 | 
def _list_item(attributes, markdown_state):
    """Element writer for a list item.

    The start text is one tab per indentation level, then the bullet (``N.`` in
    an ordered list, ``-`` otherwise) and a space. The end text is a newline
    unless a list item has already closed since the last item opened.
    """
    markdown_state.list_item_has_closed = False

    current_list = markdown_state.list_state
    if not current_list:
        # An item outside any list is treated as belonging to an unordered,
        # unindented list.
        current_list = _MarkdownListState(ordered=False, indentation=0)
    current_list.count += 1

    bullet = "{0}.".format(current_list.count) if current_list.ordered else "-"
    prefix = ("\t" * current_list.indentation) + bullet + " "

    def generate_end():
        if markdown_state.list_item_has_closed:
            return ""

        markdown_state.list_item_has_closed = True
        return "\n"

    return _WriterOutput(start=prefix, generate_end=generate_end)
126 | 
127 | 
def _init_writers():
    """Build the mapping from HTML tag name to Markdown element writer."""
    writers = {
        "p": _Wrapped("", "\n\n"),
        "br": _Wrapped("", "  \n"),
        "strong": _symmetric_wrapped("__"),
        "em": _symmetric_wrapped("*"),
        "a": _hyperlink,
        "img": _image,
        "ol": _list(ordered=True),
        "ul": _list(ordered=False),
        "li": _list_item,
    }

    # h1 to h6: one `#` per heading level.
    writers.update(
        ("h{0}".format(level), _Wrapped("#" * level + " ", "\n\n"))
        for level in range(1, 7)
    )

    return writers
145 | 
146 | 
# Element writers keyed by HTML tag name.
_writers = _init_writers()
# Output with empty start and end text.
_default_output = _WriterOutput(start="", end="")
149 | 
150 | def _default_writer(attributes, markdown_state):
151 |     return _default_output
152 | 
153 | 
class MarkdownWriter(Writer):
    """Writer that turns element start/end events into Markdown text."""

    def __init__(self):
        self._fragments = []
        # Holds the end-text generator of every element that is still open.
        self._element_stack = []
        self._markdown_state = _MarkdownState()

    def text(self, text):
        escaped_text = _escape_markdown(text)
        self._fragments.append(escaped_text)

    def start(self, name, attributes=None):
        attributes = {} if attributes is None else attributes

        element_writer = _writers.get(name, _default_writer)
        output = element_writer(attributes, self._markdown_state)
        self._element_stack.append(output.generate_end)

        # The anchor goes before the start text only when the output asks for
        # that; otherwise it follows the start text.
        if output.anchor_position == "before":
            self._write_anchor(attributes)
            self._fragments.append(output.start)
        else:
            self._fragments.append(output.start)
            self._write_anchor(attributes)

    def end(self, name):
        generate_end = self._element_stack.pop()
        self._fragments.append(generate_end())

    def self_closing(self, name, attributes=None):
        self.start(name, attributes)
        self.end(name)

    def append(self, other):
        # Stored as given, without escaping.
        self._fragments.append(other)

    def as_string(self):
        return "".join(self._fragments)

    def _write_anchor(self, attributes):
        # Emits an empty HTML anchor when the element has a non-empty id.
        html_id = attributes.get("id")
        if not html_id:
            return
        self._fragments.append('<a id="{0}"></a>'.format(html_id))
200 | 
201 | 
202 | def _escape_markdown(value):
203 |     return re.sub(r"([\`\*_\{\}\[\]\(\)\#\+\-\.\!])", r"\\\1", re.sub("\\\\", "\\\\\\\\", value))
204 | 


--------------------------------------------------------------------------------
/mammoth/zips.py:
--------------------------------------------------------------------------------
 1 | import contextlib
 2 | import io
 3 | import shutil
 4 | 
 5 | from zipfile import ZipFile
 6 | 
 7 | 
 8 | def open_zip(fileobj, mode):
 9 |     return _Zip(ZipFile(fileobj, mode))
10 | 
11 | 
12 | class _Zip(object):
13 |     def __init__(self, zip_file):
14 |         self._zip_file = zip_file
15 |     
16 |     def __enter__(self):
17 |         return self
18 |     
19 |     def __exit__(self, *args):
20 |         self._zip_file.close()
21 | 
22 |     def open(self, name):
23 |         return contextlib.closing(self._zip_file.open(name))
24 | 
25 |     def exists(self, name):
26 |         try:
27 |             self._zip_file.getinfo(name)
28 |             return True
29 |         except KeyError:
30 |             return False
31 | 
32 |     def read_str(self, name):
33 |         return self._zip_file.read(name).decode("utf8")
34 | 
35 | 
def update_zip(fileobj, files):
    """Rewrite the zip archive in ``fileobj`` in place with ``files`` applied.

    ``files`` maps member names to their new contents. Members named in
    ``files`` are replaced or added; every other member of the original
    archive is copied across unchanged.

    ``fileobj`` must be readable, writable and seekable. It is overwritten from
    the start and then truncated, so no bytes of a longer original archive are
    left after the end of the new one.
    """
    destination_fileobj = io.BytesIO()

    with ZipFile(fileobj, "r") as source:
        with ZipFile(destination_fileobj, "w") as destination:
            names = set(source.namelist()) | set(files.keys())
            for name in names:
                if name in files:
                    contents = files[name]
                else:
                    contents = source.read(name)
                destination.writestr(name, contents)

    fileobj.seek(0)
    destination_fileobj.seek(0)
    shutil.copyfileobj(destination_fileobj, fileobj)
    # Without this, a shorter new archive would be followed by the tail of the
    # old one, including its stale end-of-central-directory record.
    fileobj.truncate()
57 | 
58 | 
def split_path(path):
    """Split ``path`` at its last ``/`` into ``(directory, name)``.

    A path without any ``/`` gives ``("", path)``.
    """
    directory, slash, name = path.rpartition("/")
    if not slash:
        return ("", path)

    return (directory, name)
65 | 
66 | 
def join_path(*args):
    """Join path segments with ``/``, skipping empty ones.

    A segment starting with ``/`` discards every segment before it.
    """
    kept = []
    for segment in args:
        if not segment:
            continue
        if segment.startswith("/"):
            kept = [segment]
        else:
            kept.append(segment)

    return "/".join(kept)
78 | 


--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [build-system]
2 | requires = ["setuptools"]
3 | build-backend = "setuptools.build_meta"
4 | 


--------------------------------------------------------------------------------
/recipes/wmf_images.py:
--------------------------------------------------------------------------------
 1 | import io
 2 | import os
 3 | import shutil
 4 | import subprocess
 5 | import tempfile
 6 | 
 7 | 
 8 | # An example of how to use LibreOffice and ImageMagick to convert WMF images to
 9 | # PNGs.
10 | #
11 | # libreoffice_wmf_conversion uses LibreOffice to convert the image to a PNG.
12 | # This normally creates an image with a large amount of padding, so
13 | # imagemagick_trim can be used to trim the image.
14 | #
15 | # The image can then be converted using a normal image handler, such as
16 | # mammoth.images.data_uri.
17 | #
18 | # Example usage:
19 | #
20 | # def convert_image(image):
21 | #     image = libreoffice_wmf_conversion(image, post_process=imagemagick_trim)
22 | #     return mammoth.images.data_uri(image)
23 | #    
24 | # with open("document.docx", "rb") as fileobj:
25 | #     result = mammoth.convert_to_html(fileobj, convert_image=convert_image)
26 | 
27 | 
28 | _wmf_extensions = {
29 |     "image/x-wmf": ".wmf",
30 |     "image/x-emf": ".emf",
31 | }
32 | 
33 | 
def libreoffice_wmf_conversion(image, post_process=None):
    """Convert a WMF/EMF image to PNG by running LibreOffice.

    Images whose content type is not in ``_wmf_extensions`` are returned
    unchanged, without ``post_process`` being applied. Otherwise the image is
    written to a temporary directory, converted with the ``libreoffice``
    command, and returned as a copy with content type ``image/png`` after being
    passed through ``post_process`` (identity when not given). The temporary
    directory is always removed.
    """
    extension = _wmf_extensions.get(image.content_type)
    if extension is None:
        return image

    if post_process is None:
        post_process = lambda x: x

    work_dir = tempfile.mkdtemp()
    try:
        source_path = os.path.join(work_dir, "image" + extension)
        with io.open(source_path, "wb") as source_file, image.open() as image_fileobj:
            shutil.copyfileobj(image_fileobj, source_file)

        png_path = os.path.join(work_dir, "image.png")
        command = [
            "libreoffice",
            "--headless",
            "--convert-to",
            "png",
            source_path,
            "--outdir",
            work_dir,
        ]
        subprocess.check_call(command)

        with io.open(png_path, "rb") as png_file:
            png_bytes = png_file.read()

        def open_image():
            return io.BytesIO(png_bytes)

        converted = image.copy(
            content_type="image/png",
            open=open_image,
        )
        return post_process(converted)
    finally:
        shutil.rmtree(work_dir)
72 | 
73 | 
def imagemagick_trim(image):
    """Trim ``image`` by piping it through ``convert - -trim -``.

    Returns a copy of ``image`` whose ``open`` yields the command's output.

    Raises ``subprocess.CalledProcessError`` if the command exits with a
    non-zero status.
    """
    command = ["convert", "-", "-trim", "-"]

    with image.open() as image_fileobj:
        image_bytes = image_fileobj.read()

    process = subprocess.Popen(command, stdin=subprocess.PIPE, stdout=subprocess.PIPE)
    try:
        # communicate() feeds stdin and drains stdout together. Writing all of
        # stdin first and only then reading can block both sides once the pipe
        # buffers fill up.
        output, err_output = process.communicate(input=image_bytes)
    except BaseException:
        # Includes KeyboardInterrupt: never leave the child process running.
        process.kill()
        process.wait()
        raise

    return_code = process.poll()
    if return_code:
        raise subprocess.CalledProcessError(return_code, command)

    def open_image():
        return io.BytesIO(output)

    return image.copy(open=open_image)
94 | 
95 | 


--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
1 | [bdist_wheel]
2 | universal = 1
3 | 


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | import os
 4 | from setuptools import setup
 5 | 
 6 | def read(fname):
 7 |     return open(os.path.join(os.path.dirname(__file__), fname)).read()
 8 | 
 9 | 
# Package metadata. The long description is read from the README file that
# sits next to this script.
setup(
    name='mammoth',
    version='1.9.1',
    description='Convert Word documents from docx to simple and clean HTML and Markdown',
    long_description=read("README"),
    author='Michael Williamson',
    author_email='mike@zwobble.org',
    url='https://github.com/mwilliamson/python-mammoth',
    packages=[
        'mammoth',
        'mammoth.docx',
        'mammoth.html',
        'mammoth.styles',
        'mammoth.styles.parser',
        'mammoth.writers',
    ],
    # Installs the `mammoth` command, which runs mammoth.cli:main.
    entry_points={
        "console_scripts": [
            "mammoth=mammoth.cli:main"
        ]
    },
    keywords="docx word office clean html markdown md",
    install_requires=[
        "cobble>=0.1.3,<0.2",
    ],
    python_requires='>=3.7',
    license="BSD-2-Clause",
    classifiers=[
        'Development Status :: 5 - Production/Stable',
        'Intended Audience :: Developers',
        'License :: OSI Approved :: BSD License',
        'Programming Language :: Python',
        'Programming Language :: Python :: 3',
        'Programming Language :: Python :: 3.7',
        'Programming Language :: Python :: 3.8',
        'Programming Language :: Python :: 3.9',
        'Programming Language :: Python :: 3.10',
        'Programming Language :: Python :: 3.11',
        'Programming Language :: Python :: 3.12',
    ],
)
44 | 
45 | 


--------------------------------------------------------------------------------
/test-requirements.txt:
--------------------------------------------------------------------------------
1 | funk>=0.4,<0.5
2 | pytest
3 | precisely==0.1.3
4 | pyflakes==2.4.0
5 | spur.local>=0.3.7,<0.4
6 | tempman>=0.1.2,<0.2
7 | 


--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
1 | 
2 | 


--------------------------------------------------------------------------------
/tests/cli_tests.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import base64
 3 | 
 4 | import spur
 5 | import tempman
 6 | 
 7 | from .testing import assert_equal, generate_test_path
 8 | 
 9 | 
# Shell used by every test in this module to run the `mammoth` command.
_local = spur.LocalShell()
11 | 
12 | 
def test_html_is_printed_to_stdout_if_output_file_is_not_set():
    # With no output path, the HTML is expected on stdout and nothing on stderr.
    source_path = generate_test_path("single-paragraph.docx")
    completed = _local.run(["mammoth", source_path])

    assert_equal(b"", completed.stderr_output)
    assert_equal(b"<p>Walking on imported air</p>", completed.output)
18 | 
19 | 
def test_html_is_written_to_file_if_output_file_is_set():
    # With an output path, stdout stays empty and the HTML goes to the file.
    source_path = generate_test_path("single-paragraph.docx")

    with tempman.create_temp_dir() as temp_dir:
        html_path = os.path.join(temp_dir.path, "output.html")
        completed = _local.run(["mammoth", source_path, html_path])

        assert_equal(b"", completed.stderr_output)
        assert_equal(b"", completed.output)
        with open(html_path) as html_file:
            written = html_file.read()
        assert_equal("<p>Walking on imported air</p>", written)
29 | 
30 | 
31 | _image_base_64 = b"iVBORw0KGgoAAAANSUhEUgAAAAoAAAAKCAIAAAACUFjqAAAAAXNSR0IArs4c6QAAAAlwSFlzAAAOvgAADr4B6kKxwAAAABNJREFUKFNj/M+ADzDhlWUYqdIAQSwBE8U+X40AAAAASUVORK5CYII="
32 | 
33 | 
def test_inline_images_are_included_in_output_if_writing_to_single_file():
    # Without an output directory, the image is embedded as a data URI.
    source_path = generate_test_path("tiny-picture.docx")
    completed = _local.run(["mammoth", source_path])

    expected_html = b'<p><img src="data:image/png;base64,' + _image_base_64 + b'" /></p>'
    assert_equal(expected_html, completed.output)
38 | 
39 | 
40 | def test_images_are_written_to_separate_files_if_output_dir_is_set():
41 |     with tempman.create_temp_dir() as temp_dir:
42 |         output_path = os.path.join(temp_dir.path, "tiny-picture.html")
43 |         image_path = os.path.join(temp_dir.path, "1.png")
44 | 
45 |         docx_path = generate_test_path("tiny-picture.docx")
46 |         result = _local.run(["mammoth", docx_path, "--output-dir", temp_dir.path])
47 |         assert_equal(b"", result.stderr_output)
48 |         assert_equal(b"", result.output)
49 |         with open(output_path) as output_file:
50 |             assert_equal("""<p><img src="1.png" /></p>""", output_file.read())
51 | 
52 |         with open(image_path, "rb") as image_file:
53 |             assert_equal(_image_base_64, base64.b64encode(image_file.read()))
54 | 
55 | 
56 | def test_style_map_is_used_if_set():
57 |     with tempman.create_temp_dir() as temp_dir:
58 |         docx_path = generate_test_path("single-paragraph.docx")
59 |         style_map_path = os.path.join(temp_dir.path, "style-map")
60 |         with open(style_map_path, "w") as style_map_file:
61 |             style_map_file.write("p => span:fresh")
62 |         result = _local.run(["mammoth", docx_path, "--style-map", style_map_path])
63 |         assert_equal(b"", result.stderr_output)
64 |         assert_equal(b"<span>Walking on imported air</span>", result.output)
65 | 
66 | 
67 | def test_output_format_markdown_option_generates_markdown_output():
68 |     docx_path = generate_test_path("single-paragraph.docx")
69 |     result = _local.run(["mammoth", docx_path, "--output-format=markdown"])
70 |     assert_equal(b"", result.stderr_output)
71 |     assert_equal(b"Walking on imported air\n\n", result.output)
72 | 


--------------------------------------------------------------------------------
/tests/conftest.py:
--------------------------------------------------------------------------------
 1 | import funk
 2 | import pytest
 3 | 
 4 | 
 5 | @pytest.fixture(name="mocks")
 6 | def _fixture_mocks():
 7 |     mocks = funk.Mocks()
 8 |     yield mocks
 9 |     mocks.verify()
10 | 


--------------------------------------------------------------------------------
/tests/docx/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mwilliamson/python-mammoth/bd2cf5a02ec9bf0e6ff877e3a962b236b4143f34/tests/docx/__init__.py


--------------------------------------------------------------------------------
/tests/docx/comments_xml_tests.py:
--------------------------------------------------------------------------------
 1 | from mammoth import documents
 2 | from mammoth.docx.xmlparser import element as xml_element
 3 | from mammoth.docx.comments_xml import read_comments_xml_element
 4 | from mammoth.docx import body_xml
 5 | from ..testing import assert_equal
 6 | 
 7 | 
 8 | def test_id_and_body_of_comment_is_read():
 9 |     body = [xml_element("w:p")]
10 |     comments = read_comments_xml_element(xml_element("w:comments", {}, [
11 |         xml_element("w:comment", {"w:id": "1"}, body),
12 |     ]), body_reader=body_xml.reader())
13 |     assert_equal(1, len(comments.value))
14 |     assert_equal(comments.value[0].body, [documents.paragraph(children=[])])
15 |     assert_equal("1", comments.value[0].comment_id)
16 | 
17 | 
18 | def test_when_optional_attributes_of_comment_are_missing_then_they_are_read_as_none():
19 |     comments = read_comments_xml_element(xml_element("w:comments", {}, [
20 |         xml_element("w:comment", {"w:id": "1"}, []),
21 |     ]), body_reader=body_xml.reader())
22 |     comment, = comments.value
23 |     assert_equal(None, comment.author_name)
24 |     assert_equal(None, comment.author_initials)
25 | 
26 | 
27 | def test_when_optional_attributes_of_comment_are_blank_then_they_are_read_as_none():
28 |     comments = read_comments_xml_element(xml_element("w:comments", {}, [
29 |         xml_element("w:comment", {"w:id": "1", "w:author": " ", "w:initials": " "}, []),
30 |     ]), body_reader=body_xml.reader())
31 |     comment, = comments.value
32 |     assert_equal(None, comment.author_name)
33 |     assert_equal(None, comment.author_initials)
34 | 
35 | 
36 | def test_when_optional_attributes_of_comment_are_not_blank_then_they_are_read():
37 |     comments = read_comments_xml_element(xml_element("w:comments", {}, [
38 |         xml_element("w:comment", {"w:id": "1", "w:author": "The Piemaker", "w:initials": "TP"}, []),
39 |     ]), body_reader=body_xml.reader())
40 |     comment, = comments.value
41 |     assert_equal("The Piemaker", comment.author_name)
42 |     assert_equal("TP", comment.author_initials)
43 | 


--------------------------------------------------------------------------------
/tests/docx/content_types_xml_tests.py:
--------------------------------------------------------------------------------
 1 | from mammoth.docx.xmlparser import element as xml_element
 2 | from mammoth.docx.content_types_xml import read_content_types_xml_element
 3 | from ..testing import assert_equal
 4 | 
 5 | 
 6 | def test_content_type_is_based_on_default_for_extension_if_there_is_no_override():
 7 |     element = xml_element("content-types:Types", {}, [
 8 |         xml_element("content-types:Default", {
 9 |             "Extension": "png",
10 |             "ContentType": "image/png",
11 |         })
12 |     ])
13 |     content_types = read_content_types_xml_element(element)
14 |     assert_equal(
15 |         "image/png",
16 |         content_types.find_content_type("word/media/hat.png"),
17 |     )
18 | 
19 | 
20 | def test_content_type_is_based_on_override_if_present():
21 |     element = xml_element("content-types:Types", {}, [
22 |         xml_element("content-types:Default", {
23 |             "Extension": "png",
24 |             "ContentType": "image/png",
25 |         }),
26 |         xml_element("content-types:Override", {
27 |             "PartName": "/word/media/hat.png",
28 |             "ContentType": "image/hat"
29 |         }),
30 |     ])
31 |     content_types = read_content_types_xml_element(element)
32 |     assert_equal(
33 |         "image/hat",
34 |         content_types.find_content_type("word/media/hat.png"),
35 |     )
36 | 
37 | 
38 | def test_fallback_content_types_have_common_image_types():
39 |     element = xml_element("content-types:Types", {}, [])
40 |     content_types = read_content_types_xml_element(element)
41 |     assert_equal(
42 |         "image/png",
43 |         content_types.find_content_type("word/media/hat.png"),
44 |     )
45 |     assert_equal(
46 |         "image/gif",
47 |         content_types.find_content_type("word/media/hat.gif"),
48 |     )
49 |     assert_equal(
50 |         "image/jpeg",
51 |         content_types.find_content_type("word/media/hat.jpg"),
52 |     )
53 |     assert_equal(
54 |         "image/jpeg",
55 |         content_types.find_content_type("word/media/hat.jpeg"),
56 |     )
57 |     assert_equal(
58 |         "image/bmp",
59 |         content_types.find_content_type("word/media/hat.bmp"),
60 |     )
61 |     assert_equal(
62 |         "image/tiff",
63 |         content_types.find_content_type("word/media/hat.tif"),
64 |     )
65 |     assert_equal(
66 |         "image/tiff",
67 |         content_types.find_content_type("word/media/hat.tiff"),
68 |     )
69 | 
70 | 
71 | def test_fallback_content_types_are_case_insensitive():
72 |     element = xml_element("content-types:Types", {}, [])
73 |     content_types = read_content_types_xml_element(element)
74 |     assert_equal(
75 |         "image/png",
76 |         content_types.find_content_type("word/media/hat.PnG"),
77 |     )
78 | 


--------------------------------------------------------------------------------
/tests/docx/document_matchers.py:
--------------------------------------------------------------------------------
 1 | from precisely import all_of, has_attrs, instance_of
 2 | 
 3 | from mammoth import documents
 4 | 
 5 | 
 6 | def create_element_matcher(element_type):
 7 |     def matcher(**kwargs):
 8 |         return all_of(
 9 |             instance_of(element_type),
10 |             has_attrs(**kwargs),
11 |         )
12 | 
13 |     return matcher
14 | 
15 | 
16 | is_paragraph = create_element_matcher(documents.Paragraph)
17 | is_run = create_element_matcher(documents.Run)
18 | is_hyperlink = create_element_matcher(documents.Hyperlink)
19 | is_checkbox = create_element_matcher(documents.Checkbox)
20 | is_table = create_element_matcher(documents.Table)
21 | is_row = create_element_matcher(documents.TableRow)
22 | 
23 | 
24 | is_empty_run = is_run(children=[])
25 | 
26 | 
27 | def is_text(value):
28 |     return all_of(
29 |         instance_of(documents.Text),
30 |         has_attrs(value=value),
31 |     )
32 | 


--------------------------------------------------------------------------------
/tests/docx/document_xml_tests.py:
--------------------------------------------------------------------------------
 1 | import pytest
 2 | 
 3 | from mammoth import documents
 4 | from mammoth.docx.xmlparser import element as xml_element, text as xml_text
 5 | from mammoth.docx.document_xml import read_document_xml_element
 6 | from mammoth.docx import body_xml
 7 | from ..testing import assert_equal
 8 | 
 9 | 
10 | def test_when_body_element_is_present_then_body_is_read():
11 |     text_xml = xml_element("w:t", {}, [xml_text("Hello!")])
12 |     run_xml = xml_element("w:r", {}, [text_xml])
13 |     paragraph_xml = xml_element("w:p", {}, [run_xml])
14 |     body_xml = xml_element("w:body", {}, [paragraph_xml])
15 |     document_xml = xml_element("w:document", {}, [body_xml])
16 | 
17 |     document = _read_and_get_document_xml_element(document_xml)
18 | 
19 |     assert_equal(
20 |         documents.document([documents.paragraph([documents.run([documents.text("Hello!")])])]),
21 |         document
22 |     )
23 | 
24 | 
25 | def test_when_body_element_is_not_present_then_error_is_raised():
26 |     paragraph_xml = xml_element("w:p", {}, [])
27 |     body_xml = xml_element("w:body2", {}, [paragraph_xml])
28 |     document_xml = xml_element("w:document", {}, [body_xml])
29 | 
30 |     error = pytest.raises(ValueError, lambda: _read_and_get_document_xml_element(document_xml))
31 | 
32 |     assert_equal(str(error.value), "Could not find the body element: are you sure this is a docx file?")
33 | 
34 | 
35 | def test_footnotes_of_document_are_read():
36 |     notes = [documents.note("footnote", "4", [documents.paragraph([])])]
37 | 
38 |     body_xml = xml_element("w:body")
39 |     document_xml = xml_element("w:document", {}, [body_xml])
40 | 
41 |     document = _read_and_get_document_xml_element(document_xml, notes=notes)
42 |     footnote = document.notes.find_note("footnote", "4")
43 |     assert_equal("4", footnote.note_id)
44 |     assert isinstance(footnote.body[0], documents.Paragraph)
45 | 
46 | 
47 | def _read_and_get_document_xml_element(*args, **kwargs):
48 |     body_reader = body_xml.reader()
49 |     result = read_document_xml_element(*args, body_reader=body_reader, **kwargs)
50 |     assert_equal([], result.messages)
51 |     return result.value
52 | 


--------------------------------------------------------------------------------
/tests/docx/docx_tests.py:
--------------------------------------------------------------------------------
  1 | import io
  2 | import textwrap
  3 | import zipfile
  4 | 
  5 | from mammoth import docx, documents, zips
  6 | from ..testing import assert_equal, assert_raises, generate_test_path
  7 | 
  8 | 
  9 | class ReadTests(object):
 10 |     def test_can_read_document_with_single_paragraph_with_single_run_of_text(self):
 11 |         with open(generate_test_path("single-paragraph.docx"), "rb") as fileobj:
 12 |             result = docx.read(fileobj=fileobj)
 13 |             expected_document = documents.document([
 14 |                 documents.paragraph([
 15 |                     documents.run([
 16 |                         documents.text("Walking on imported air")
 17 |                     ])
 18 |                 ])
 19 |             ])
 20 |             assert_equal(expected_document, result.value)
 21 | 
 22 | 
 23 | _relationship_namespaces = {
 24 |     "r": "http://schemas.openxmlformats.org/package/2006/relationships",
 25 | }
 26 | 
 27 | 
 28 | def test_main_document_is_found_using_package_relationships():
 29 |     fileobj = _create_zip({
 30 |         "word/document2.xml": textwrap.dedent("""\
 31 |             <?xml version="1.0" encoding="utf-8" ?>
 32 |             <w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
 33 |                 <w:body>
 34 |                     <w:p>
 35 |                         <w:r>
 36 |                             <w:t>Hello.</w:t>
 37 |                         </w:r>
 38 |                     </w:p>
 39 |                 </w:body>
 40 |             </w:document>
 41 |         """),
 42 |         "_rels/.rels": textwrap.dedent("""\
 43 |             <?xml version="1.0" encoding="utf-8"?>
 44 |             <Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
 45 |                 <Relationship Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument" Target="/word/document2.xml" Id="rId1"/>
 46 |             </Relationships>
 47 |         """),
 48 |     })
 49 |     result = docx.read(fileobj=fileobj)
 50 |     expected_document = documents.document([
 51 |         documents.paragraph([
 52 |             documents.run([
 53 |                 documents.text("Hello.")
 54 |             ])
 55 |         ])
 56 |     ])
 57 |     assert_equal(expected_document, result.value)
 58 | 
 59 | 
 60 | def test_error_is_raised_when_main_document_part_does_not_exist():
 61 |     fileobj = _create_zip({
 62 |         "_rels/.rels": textwrap.dedent("""\
 63 |             <?xml version="1.0" encoding="utf-8"?>
 64 |             <Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
 65 |                 <Relationship Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument" Target="/word/document2.xml" Id="rId1"/>
 66 |             </Relationships>
 67 |         """),
 68 |     })
 69 |     error = assert_raises(IOError, lambda: docx.read(fileobj=fileobj))
 70 |     assert_equal(
 71 |         "Could not find main document part. Are you sure this is a valid .docx file?",
 72 |         str(error),
 73 |     )
 74 | 
 75 | class PartPathsTests(object):
 76 |     def test_main_document_part_is_found_using_package_relationships(self):
 77 |         fileobj = _create_zip({
 78 |             "word/document2.xml": " ",
 79 |             "_rels/.rels": textwrap.dedent("""\
 80 |                 <?xml version="1.0" encoding="utf-8"?>
 81 |                 <Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
 82 |                     <Relationship Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument" Target="/word/document2.xml" Id="rId1"/>
 83 |                 </Relationships>
 84 |             """),
 85 |         })
 86 |         part_paths = self._find_part_paths(fileobj)
 87 |         assert_equal("word/document2.xml", part_paths.main_document)
 88 | 
 89 |     def test_when_relationship_for_main_document_cannot_be_found_then_fallback_is_used(self):
 90 |         fileobj = _create_zip({
 91 |             "word/document.xml": " ",
 92 |         })
 93 |         part_paths = self._find_part_paths(fileobj)
 94 |         assert_equal("word/document.xml", part_paths.main_document)
 95 | 
 96 |     def test_comments_part_is_found_using_main_document_relationships(self):
 97 |         self._assert_path_is_found_using_main_document_relationships("comments")
 98 | 
 99 |     def test_when_relationship_for_comments_cannot_be_found_then_fallback_is_used(self):
100 |         self._assert_when_relationship_for_part_cannot_be_found_then_fallback_is_used("comments")
101 | 
102 |     def test_endnotes_part_is_found_using_main_document_relationships(self):
103 |         self._assert_path_is_found_using_main_document_relationships("endnotes")
104 | 
105 |     def test_when_relationship_for_endnotes_cannot_be_found_then_fallback_is_used(self):
106 |         self._assert_when_relationship_for_part_cannot_be_found_then_fallback_is_used("endnotes")
107 | 
108 |     def test_footnotes_part_is_found_using_main_document_relationships(self):
109 |         self._assert_path_is_found_using_main_document_relationships("footnotes")
110 | 
111 |     def test_when_relationship_for_footnotes_cannot_be_found_then_fallback_is_used(self):
112 |         self._assert_when_relationship_for_part_cannot_be_found_then_fallback_is_used("footnotes")
113 | 
114 |     def test_numbering_part_is_found_using_main_document_relationships(self):
115 |         self._assert_path_is_found_using_main_document_relationships("numbering")
116 | 
117 |     def test_when_relationship_for_numbering_cannot_be_found_then_fallback_is_used(self):
118 |         self._assert_when_relationship_for_part_cannot_be_found_then_fallback_is_used("numbering")
119 | 
120 |     def test_styles_part_is_found_using_main_document_relationships(self):
121 |         self._assert_path_is_found_using_main_document_relationships("styles")
122 | 
123 |     def test_when_relationship_for_styles_cannot_be_found_then_fallback_is_used(self):
124 |         self._assert_when_relationship_for_part_cannot_be_found_then_fallback_is_used("styles")
125 | 
126 |     def _assert_path_is_found_using_main_document_relationships(self, name):
127 |         fileobj = _create_zip({
128 |             "_rels/.rels": textwrap.dedent("""\
129 |                 <?xml version="1.0" encoding="utf-8"?>
130 |                 <Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
131 |                     <Relationship Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument" Target="/word/document.xml" Id="rId1"/>
132 |                 </Relationships>
133 |             """),
134 |             "word/document.xml": " ",
135 |             "word/_rels/document.xml.rels": textwrap.dedent("""\
136 |                 <?xml version="1.0" encoding="utf-8"?>
137 |                 <Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
138 |                     <Relationship Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/{name}" Target="target-path.xml" Id="rId2"/>
139 |                 </Relationships>
140 |             """.format(name=name)),
141 |             "word/target-path.xml": " "
142 |         })
143 |         part_paths = self._find_part_paths(fileobj)
144 |         assert_equal("word/target-path.xml", getattr(part_paths, name))
145 | 
146 |     def _assert_when_relationship_for_part_cannot_be_found_then_fallback_is_used(self, name):
147 |         fileobj = _create_zip({
148 |             "_rels/.rels": textwrap.dedent("""\
149 |                 <?xml version="1.0" encoding="utf-8"?>
150 |                 <Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
151 |                     <Relationship Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument" Target="/word/document.xml" Id="rId1"/>
152 |                 </Relationships>
153 |             """),
154 |             "word/document.xml": " ",
155 |         })
156 |         part_paths = self._find_part_paths(fileobj)
157 |         assert_equal("word/{0}.xml".format(name), getattr(part_paths, name))
158 | 
159 | 
160 |     def _find_part_paths(self, fileobj):
161 |         return docx._find_part_paths(zips.open_zip(fileobj, "r"))
162 | 
163 | 
164 | def _create_zip(files):
165 |     fileobj = io.BytesIO()
166 | 
167 |     zip_file = zipfile.ZipFile(fileobj, "w")
168 |     try:
169 |         for name, contents in files.items():
170 |             zip_file.writestr(name, contents)
171 |     finally:
172 |         zip_file.close()
173 | 
174 |     fileobj.seek(0)
175 |     return fileobj
176 | 


--------------------------------------------------------------------------------
/tests/docx/files_tests.py:
--------------------------------------------------------------------------------
 1 | from mammoth.docx.files import Files, InvalidFileReferenceError
 2 | from ..testing import generate_test_path, assert_equal, assert_raises
 3 | 
 4 | 
 5 | def test_can_open_files_with_file_uri():
 6 |     path = generate_test_path("tiny-picture.png")
 7 |     files = Files(None)
 8 |     with files.open("file:///" + path) as image_file:
 9 |         contents = image_file.read()
10 |         assert_equal(bytes, type(contents))
11 |         with open(path, "rb") as source_file:
12 |             assert_equal(source_file.read(), contents)
13 | 
14 | 
15 | def test_can_open_files_with_relative_uri():
16 |     files = Files(generate_test_path(""))
17 |     with files.open("tiny-picture.png") as image_file:
18 |         contents = image_file.read()
19 |         assert_equal(bytes, type(contents))
20 |         with open(generate_test_path("tiny-picture.png"), "rb") as source_file:
21 |             assert_equal(source_file.read(), contents)
22 | 
23 | 
24 | def test_given_base_is_not_set_when_opening_relative_uri_then_error_is_raised():
25 |     files = Files(None)
26 |     error = assert_raises(InvalidFileReferenceError, lambda: files.open("not-a-real-file.png"))
27 |     expected_message = (
28 |         "could not find external image 'not-a-real-file.png', fileobj has no name"
29 |     )
30 |     assert_equal(expected_message, str(error))
31 | 
32 | 
33 | def test_error_is_raised_if_relative_uri_cannot_be_opened():
34 |     files = Files("/tmp")
35 |     error = assert_raises(InvalidFileReferenceError, lambda: files.open("not-a-real-file.png"))
36 |     expected_message = (
37 |         "could not open external image: 'not-a-real-file.png' (document directory: '/tmp')\n" +
38 |         "[Errno 2] No such file or directory: '/tmp/not-a-real-file.png'"
39 |     )
40 |     assert_equal(expected_message, str(error))
41 | 
42 | 
43 | def test_error_is_raised_if_file_uri_cannot_be_opened():
44 |     files = Files("/tmp")
45 |     error = assert_raises(InvalidFileReferenceError, lambda: files.open("file:///not-a-real-file.png"))
46 |     expected_message = "could not open external image: 'file:///not-a-real-file.png' (document directory: '/tmp')\n"
47 |     assert str(error).startswith(expected_message)
48 | 


--------------------------------------------------------------------------------
/tests/docx/notes_xml_tests.py:
--------------------------------------------------------------------------------
 1 | from mammoth import documents
 2 | from mammoth.docx.xmlparser import element as xml_element
 3 | from mammoth.docx.notes_xml import read_footnotes_xml_element
 4 | from mammoth.docx import body_xml
 5 | from ..testing import assert_equal
 6 | 
 7 | 
 8 | def test_id_and_body_of_footnote_are_read():
 9 |     footnote_body = [xml_element("w:p")]
10 |     footnotes = read_footnotes_xml_element(xml_element("w:footnotes", {}, [
11 |         xml_element("w:footnote", {"w:id": "1"}, footnote_body),
12 |     ]), body_reader=body_xml.reader())
13 |     assert_equal(1, len(footnotes.value))
14 |     assert isinstance(footnotes.value[0].body[0], documents.Paragraph)
15 |     assert_equal("1", footnotes.value[0].note_id)
16 | 
17 | 
18 | def test_continuation_separator_is_ignored():
19 |     _assert_footnote_type_is_ignored("continuationSeparator")
20 | 
21 | 
22 | def test_separator_is_ignored():
23 |     _assert_footnote_type_is_ignored("separator")
24 | 
25 | 
26 | def _assert_footnote_type_is_ignored(footnote_type):
27 |     footnote_body = [xml_element("w:p")]
28 |     footnotes = read_footnotes_xml_element(xml_element("w:footnotes", {}, [
29 |         xml_element("w:footnote", {"w:id": "1", "w:type": footnote_type}, footnote_body),
30 |     ]), body_reader=None)
31 |     assert_equal(0, len(footnotes.value))
32 | 
33 | 


--------------------------------------------------------------------------------
/tests/docx/numbering_xml_tests.py:
--------------------------------------------------------------------------------
  1 | from mammoth.docx.xmlparser import element as xml_element
  2 | from mammoth.docx.numbering_xml import read_numbering_xml_element
  3 | from mammoth.docx.styles_xml import NumberingStyle, Styles
  4 | from ..testing import assert_equal
  5 | 
  6 | 
  7 | def test_find_level_returns_none_if_num_with_id_cannot_be_found():
  8 |     numbering = _read_numbering_xml_element(xml_element("w:numbering"))
  9 |     assert_equal(None, numbering.find_level("47", "0"))
 10 | 
 11 | 
 12 | _sample_numbering_xml = xml_element("w:numbering", {}, [
 13 |     xml_element("w:abstractNum", {"w:abstractNumId": "42"}, [
 14 |         xml_element("w:lvl", {"w:ilvl": "0"}, [
 15 |             xml_element("w:numFmt", {"w:val": "bullet"})
 16 |         ]),
 17 |         xml_element("w:lvl", {"w:ilvl": "1"}, [
 18 |             xml_element("w:numFmt", {"w:val": "decimal"})
 19 |         ])
 20 |     ]),
 21 |     xml_element("w:num", {"w:numId": "47"}, [
 22 |         xml_element("w:abstractNumId", {"w:val": "42"})
 23 |     ])
 24 | ])
 25 | 
 26 | 
 27 | def test_level_includes_level_index():
 28 |     numbering = _read_numbering_xml_element(_sample_numbering_xml)
 29 |     assert_equal("0", numbering.find_level("47", "0").level_index)
 30 |     assert_equal("1", numbering.find_level("47", "1").level_index)
 31 | 
 32 | 
 33 | def test_list_is_not_ordered_if_formatted_as_bullet():
 34 |     numbering = _read_numbering_xml_element(_sample_numbering_xml)
 35 |     assert_equal(False, numbering.find_level("47", "0").is_ordered)
 36 | 
 37 | 
 38 | def test_list_is_ordered_if_formatted_as_decimal():
 39 |     numbering = _read_numbering_xml_element(_sample_numbering_xml)
 40 |     assert_equal(True, numbering.find_level("47", "1").is_ordered)
 41 | 
 42 | 
 43 | def test_list_is_ordered_if_there_is_no_explicit_format():
 44 |     element = xml_element("w:numbering", {}, [
 45 |         xml_element("w:abstractNum", {"w:abstractNumId": "42"}, [
 46 |             xml_element("w:lvl", {"w:ilvl": "0"}),
 47 |         ]),
 48 |         xml_element("w:num", {"w:numId": "47"}, [
 49 |             xml_element("w:abstractNumId", {"w:val": "42"})
 50 |         ])
 51 |     ])
 52 | 
 53 |     numbering = _read_numbering_xml_element(element)
 54 | 
 55 |     assert_equal(True, numbering.find_level("47", "0").is_ordered)
 56 | 
 57 | 
 58 | def test_find_level_returns_none_if_level_cannot_be_found():
 59 |     numbering = _read_numbering_xml_element(_sample_numbering_xml)
 60 |     assert_equal(None, numbering.find_level("47", "2"))
 61 | 
 62 | 
 63 | def test_num_referencing_non_existent_abstract_num_is_ignored():
 64 |     element = xml_element("w:numbering", {}, [
 65 |         xml_element("w:num", {"w:numId": "47"}, [
 66 |             xml_element("w:abstractNumId", {"w:val": "42"})
 67 |         ])
 68 |     ])
 69 | 
 70 |     numbering = _read_numbering_xml_element(element)
 71 | 
 72 |     assert_equal(None, numbering.find_level("47", "0"))
 73 | 
 74 | 
 75 | def test_when_abstract_num_has_num_style_link_then_style_is_used_to_find_num():
 76 |     numbering = _read_numbering_xml_element(
 77 |         xml_element("w:numbering", {}, [
 78 |             xml_element("w:abstractNum", {"w:abstractNumId": "100"}, [
 79 |                 xml_element("w:lvl", {"w:ilvl": "0"}, [
 80 |                     xml_element("w:numFmt", {"w:val": "decimal"}),
 81 |                 ]),
 82 |             ]),
 83 |             xml_element("w:abstractNum", {"w:abstractNumId": "101"}, [
 84 |                 xml_element("w:numStyleLink", {"w:val": "List1"}),
 85 |             ]),
 86 |             xml_element("w:num", {"w:numId": "200"}, [
 87 |                 xml_element("w:abstractNumId", {"w:val": "100"}),
 88 |             ]),
 89 |             xml_element("w:num", {"w:numId": "201"}, [
 90 |                 xml_element("w:abstractNumId", {"w:val": "101"}),
 91 |             ])
 92 |         ]),
 93 |         styles=Styles.create(numbering_styles={"List1": NumberingStyle(num_id="200")}),
 94 |     )
 95 |     assert_equal(True, numbering.find_level("201", "0").is_ordered)
 96 | 
 97 | 
 98 | # See: 17.9.23 pStyle (Paragraph Style's Associated Numbering Level) in ECMA-376, 4th Edition
 99 | def test_numbering_level_can_be_found_by_paragraph_style_id():
100 |     numbering = _read_numbering_xml_element(
101 |         xml_element("w:numbering", {}, [
102 |             xml_element("w:abstractNum", {"w:abstractNumId": "42"}, [
103 |                 xml_element("w:lvl", {"w:ilvl": "0"}, [
104 |                     xml_element("w:numFmt", {"w:val": "bullet"}),
105 |                 ]),
106 |             ]),
107 |             xml_element("w:abstractNum", {"w:abstractNumId": "43"}, [
108 |                 xml_element("w:lvl", {"w:ilvl": "0"}, [
109 |                     xml_element("w:pStyle", {"w:val": "List"}),
110 |                     xml_element("w:numFmt", {"w:val": "decimal"}),
111 |                 ]),
112 |             ]),
113 |         ]),
114 |     )
115 | 
116 |     assert_equal(True, numbering.find_level_by_paragraph_style_id("List").is_ordered)
117 |     assert_equal(None, numbering.find_level_by_paragraph_style_id("Paragraph"))
118 | 
119 | 
120 | def _read_numbering_xml_element(element, styles=None):
121 |     if styles is None:
122 |         styles = Styles.EMPTY
123 | 
124 |     return read_numbering_xml_element(element, styles=styles)
125 | 


--------------------------------------------------------------------------------
/tests/docx/office_xml_tests.py:
--------------------------------------------------------------------------------
 1 | from __future__ import unicode_literals
 2 | 
 3 | import io
 4 | 
 5 | from mammoth.docx import xmlparser as xml, office_xml
 6 | from ..testing import assert_equal
 7 | 
 8 | 
 9 | class AlternateContentTests(object):
10 |     def test_when_fallback_is_present_then_fallback_is_read(self):
11 |         xml_string = (
12 |             '<?xml version="1.0" encoding="UTF-8" standalone="yes"?>' +
13 |             '<numbering xmlns:mc="http://schemas.openxmlformats.org/markup-compatibility/2006">' +
14 |             '<mc:AlternateContent>' +
15 |             '<mc:Choice Requires="w14">' +
16 |             '<choice/>' +
17 |             '</mc:Choice>' +
18 |             '<mc:Fallback>' +
19 |             '<fallback/>' +
20 |             '</mc:Fallback>' +
21 |             '</mc:AlternateContent>' +
22 |             '</numbering>')
23 | 
24 |         result = office_xml.read(io.StringIO(xml_string))
25 |         assert_equal([xml.element("fallback")], result.children)
26 | 
27 | 
28 |     def test_when_fallback_is_not_present_then_element_is_ignored(self):
29 |         xml_string = (
30 |             '<?xml version="1.0" encoding="UTF-8" standalone="yes"?>' +
31 |             '<numbering xmlns:mc="http://schemas.openxmlformats.org/markup-compatibility/2006">' +
32 |             '<mc:AlternateContent>' +
33 |             '<mc:Choice Requires="w14">' +
34 |             '<choice/>' +
35 |             '</mc:Choice>' +
36 |             '</mc:AlternateContent>' +
37 |             '</numbering>')
38 | 
39 |         result = office_xml.read(io.StringIO(xml_string))
40 |         assert_equal([], result.children)
41 | 


--------------------------------------------------------------------------------
/tests/docx/relationships_xml_tests.py:
--------------------------------------------------------------------------------
 1 | from mammoth.docx.xmlparser import element as xml_element
 2 | from mammoth.docx.relationships_xml import read_relationships_xml_element
 3 | from ..testing import assert_equal
 4 | 
 5 | 
 6 | def test_relationship_targets_can_be_found_by_id():
 7 |     element = xml_element("relationships:Relationships", {}, [
 8 |         xml_element("relationships:Relationship", {
 9 |             "Id": "rId8",
10 |             "Type": "http://schemas.openxmlformats.org/officeDocument/2006/relationships/hyperlink",
11 |             "Target": "http://example.com",
12 |         }),
13 |         xml_element("relationships:Relationship", {
14 |             "Id": "rId2",
15 |             "Type": "http://schemas.openxmlformats.org/officeDocument/2006/relationships/hyperlink",
16 |             "Target": "http://example.net",
17 |         }),
18 |     ])
19 |     relationships = read_relationships_xml_element(element)
20 |     assert_equal(
21 |         "http://example.com",
22 |         relationships.find_target_by_relationship_id("rId8"),
23 |     )
24 | 
25 | 
26 | def test_relationship_targets_can_be_found_by_type():
27 |     element = xml_element("relationships:Relationships", {}, [
28 |         xml_element("relationships:Relationship", {
29 |             "Id": "rId2",
30 |             "Target": "docProps/core.xml",
31 |             "Type": "http://schemas.openxmlformats.org/package/2006/relationships/metadata/core-properties",
32 |         }),
33 |         xml_element("relationships:Relationship", {
34 |             "Id": "rId1",
35 |             "Target": "word/document.xml",
36 |             "Type": "http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument",
37 |         }),
38 |         xml_element("relationships:Relationship", {
39 |             "Id": "rId3",
40 |             "Target": "word/document2.xml",
41 |             "Type": "http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument",
42 |         }),
43 |     ])
44 |     relationships = read_relationships_xml_element(element)
45 |     assert_equal(
46 |         ["word/document.xml", "word/document2.xml"],
47 |         relationships.find_targets_by_type("http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument"),
48 |     )
49 | 
50 | 
51 | def test_when_there_are_no_relationships_of_requested_type_then_empty_list_is_returned():
52 |     element = xml_element("relationships:Relationships", {}, [])
53 |     relationships = read_relationships_xml_element(element)
54 |     assert_equal(
55 |         [],
56 |         relationships.find_targets_by_type("http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument"),
57 |     )
58 | 


--------------------------------------------------------------------------------
/tests/docx/style_map_tests.py:
--------------------------------------------------------------------------------
  1 | import io
  2 | from zipfile import ZipFile
  3 | 
  4 | from mammoth.docx.style_map import write_style_map, read_style_map
  5 | from mammoth.zips import open_zip
  6 | from mammoth.docx import xmlparser as xml
  7 | from ..testing import assert_equal
  8 | 
  9 | 
 10 | def test_reading_embedded_style_map_on_document_without_embedded_style_map_returns_none():
 11 |     fileobj = _normal_docx()
 12 |     assert_equal(None, read_style_map(fileobj))
 13 | 
 14 | 
 15 | def test_writing_style_map_preserves_unrelated_files():
 16 |     fileobj = _normal_docx()
 17 |     write_style_map(fileobj, "p => h1")
 18 |     with open_zip(fileobj, "r") as zip_file:
 19 |         assert_equal("placeholder", zip_file.read_str("placeholder"))
 20 | 
 21 | def test_embedded_style_map_can_be_read_after_being_written():
 22 |     fileobj = _normal_docx()
 23 |     write_style_map(fileobj, "p => h1")
 24 |     assert_equal("p => h1", read_style_map(fileobj))
 25 | 
 26 | 
 27 | def test_embedded_style_map_is_written_to_separate_file():
 28 |     fileobj = _normal_docx()
 29 |     write_style_map(fileobj, "p => h1")
 30 |     with open_zip(fileobj, "r") as zip_file:
 31 |         assert_equal("p => h1", zip_file.read_str("mammoth/style-map"))
 32 | 
 33 | 
 34 | def test_embedded_style_map_is_referenced_in_relationships():
 35 |     fileobj = _normal_docx()
 36 |     write_style_map(fileobj, "p => h1")
 37 |     assert_equal(expected_relationships_xml, _read_relationships_xml(fileobj))
 38 | 
 39 | def test_embedded_style_map_has_override_content_type_in_content_types_xml():
 40 |     fileobj = _normal_docx()
 41 |     write_style_map(fileobj, "p => h1")
 42 |     assert_equal(expected_content_types_xml, _read_content_types_xml(fileobj))
 43 | 
 44 | 
 45 | def test_can_overwrite_existing_style_map():
 46 |     fileobj = _normal_docx()
 47 |     write_style_map(fileobj, "p => h1")
 48 |     write_style_map(fileobj, "p => h2")
 49 |     with open_zip(fileobj, "r") as zip_file:
 50 |         assert_equal("p => h2", read_style_map(fileobj))
 51 |         _assert_no_duplicates(zip_file._zip_file.namelist())
 52 |         assert_equal(expected_relationships_xml, _read_relationships_xml(fileobj))
 53 |         assert_equal(expected_content_types_xml, _read_content_types_xml(fileobj))
 54 | 
 55 | 
 56 | def _read_relationships_xml(fileobj):
 57 |     with open_zip(fileobj, "r") as zip_file:
 58 |         return xml.parse_xml(
 59 |             io.StringIO(zip_file.read_str("word/_rels/document.xml.rels")),
 60 |             [("r", "http://schemas.openxmlformats.org/package/2006/relationships")],
 61 |         )
 62 | 
 63 | 
 64 | def _read_content_types_xml(fileobj):
 65 |     with open_zip(fileobj, "r") as zip_file:
 66 |         return xml.parse_xml(
 67 |             io.StringIO(zip_file.read_str("[Content_Types].xml")),
 68 |             [("ct", "http://schemas.openxmlformats.org/package/2006/content-types")],
 69 |         )
 70 | 
 71 | 
 72 | original_relationships_xml = ('<?xml version="1.0" encoding="UTF-8" standalone="yes"?>' +
 73 |     '<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">' +
 74 |     '<Relationship Id="rId3" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/settings" Target="settings.xml"/>' +
 75 |     '</Relationships>')
 76 | 
 77 | expected_relationships_xml = xml.element("r:Relationships", {}, [
 78 |     xml.element("r:Relationship", {"Id": "rId3", "Type": "http://schemas.openxmlformats.org/officeDocument/2006/relationships/settings", "Target": "settings.xml"}),
 79 |     xml.element("r:Relationship", {"Id": "rMammothStyleMap", "Type": "http://schemas.zwobble.org/mammoth/style-map", "Target": "/mammoth/style-map"}),
 80 | ])
 81 | 
 82 | original_content_types_xml = ('<?xml version="1.0" encoding="UTF-8" standalone="yes"?>' +
 83 |     '<Types xmlns="http://schemas.openxmlformats.org/package/2006/content-types">' +
 84 |     '<Default Extension="png" ContentType="image/png"/>' +
 85 |     '</Types>'
 86 | )
 87 | 
 88 | expected_content_types_xml = xml.element("ct:Types", {}, [
 89 |     xml.element("ct:Default", {"Extension": "png", "ContentType": "image/png"}),
 90 |     xml.element("ct:Override", {"PartName": "/mammoth/style-map", "ContentType": "text/prs.mammoth.style-map"}),
 91 | ])
 92 | 
 93 | 
 94 | def _normal_docx():
 95 |     fileobj = io.BytesIO()
 96 |     zip_file = ZipFile(fileobj, "w")
 97 |     try:
 98 |         zip_file.writestr("placeholder", "placeholder")
 99 |         zip_file.writestr("word/_rels/document.xml.rels", original_relationships_xml)
100 |         zip_file.writestr("[Content_Types].xml", original_content_types_xml)
101 |         expected_relationships_xml
102 |     finally:
103 |         zip_file.close()
104 |     return fileobj
105 | 
106 | 
107 | def _assert_no_duplicates(values):
108 |     counts = {}
109 |     for value in values:
110 |         counts[value] = counts.get(value, 0) + 1
111 |     for value, count in counts.items():
112 |         if count != 1:
113 |             assert False, "{0} has count of {1}".format(value, count)
114 | 


--------------------------------------------------------------------------------
/tests/docx/styles_xml_tests.py:
--------------------------------------------------------------------------------
  1 | from mammoth.docx.xmlparser import element as xml_element
  2 | from mammoth.docx.styles_xml import read_styles_xml_element
  3 | from ..testing import assert_equal
  4 | 
  5 | 
  6 | def test_paragraph_style_is_null_if_no_style_with_that_id_exists():
  7 |     element = xml_element("w:styles")
  8 |     styles = read_styles_xml_element(element)
  9 |     assert_equal(None, styles.find_paragraph_style_by_id("Heading1"))
 10 | 
 11 | 
 12 | def test_paragraph_style_can_be_found_by_id():
 13 |     element = xml_element("w:styles", {}, [
 14 |         _paragraph_style_element("Heading1", "Heading 1"),
 15 |     ])
 16 |     styles = read_styles_xml_element(element)
 17 |     assert_equal(
 18 |         "Heading1",
 19 |         styles.find_paragraph_style_by_id("Heading1").style_id
 20 |     )
 21 | 
 22 | 
 23 | def test_character_style_can_be_found_by_id():
 24 |     element = xml_element("w:styles", {}, [
 25 |         _character_style_element("Heading1Char", "Heading 1 Char"),
 26 |     ])
 27 |     styles = read_styles_xml_element(element)
 28 |     assert_equal(
 29 |         "Heading1Char",
 30 |         styles.find_character_style_by_id("Heading1Char").style_id
 31 |     )
 32 | 
 33 | 
 34 | def test_table_style_can_be_found_by_id():
 35 |     element = xml_element("w:styles", {}, [
 36 |         _table_style_element("TableNormal", "Normal Table"),
 37 |     ])
 38 |     styles = read_styles_xml_element(element)
 39 |     assert_equal(
 40 |         "TableNormal",
 41 |         styles.find_table_style_by_id("TableNormal").style_id
 42 |     )
 43 | 
 44 | 
 45 | def test_paragraph_and_character_styles_are_distinct():
 46 |     element = xml_element("w:styles", {}, [
 47 |         _paragraph_style_element("Heading1", "Heading 1"),
 48 |         _character_style_element("Heading1Char", "Heading 1 Char"),
 49 |     ])
 50 |     styles = read_styles_xml_element(element)
 51 |     assert_equal(None, styles.find_character_style_by_id("Heading1"))
 52 |     assert_equal(None, styles.find_paragraph_style_by_id("Heading1Char"))
 53 | 
 54 | 
 55 | def test_styles_include_names():
 56 |     element = xml_element("w:styles", {}, [
 57 |         _paragraph_style_element("Heading1", "Heading 1"),
 58 |     ])
 59 |     styles = read_styles_xml_element(element)
 60 |     assert_equal(
 61 |         "Heading 1",
 62 |         styles.find_paragraph_style_by_id("Heading1").name
 63 |     )
 64 | 
 65 | 
 66 | def test_style_name_is_none_if_name_element_does_not_exist():
 67 |     element = xml_element("w:styles", {}, [
 68 |         _style_without_name_element("paragraph", "Heading1"),
 69 |         _style_without_name_element("character", "Heading1Char")
 70 |     ])
 71 |     styles = read_styles_xml_element(element)
 72 |     assert_equal(None, styles.find_paragraph_style_by_id("Heading1").name)
 73 |     assert_equal(None, styles.find_character_style_by_id("Heading1Char").name)
 74 | 
 75 | 
 76 | def test_numbering_style_is_none_if_no_style_with_that_id_exists():
 77 |     element = xml_element("w:styles", {}, [])
 78 |     styles = read_styles_xml_element(element)
 79 |     assert_equal(None, styles.find_numbering_style_by_id("List1"))
 80 | 
 81 | 
 82 | def test_numbering_style_has_none_num_id_if_style_has_no_paragraph_properties():
 83 |     element = xml_element("w:styles", {}, [
 84 |         xml_element("w:style", {"w:type": "numbering", "w:styleId": "List1"}),
 85 |     ])
 86 |     styles = read_styles_xml_element(element)
 87 |     assert_equal(None, styles.find_numbering_style_by_id("List1").num_id)
 88 | 
 89 | 
 90 | def test_numbering_style_has_num_id_read_from_paragraph_properties():
 91 |     element = xml_element("w:styles", {}, [
 92 |         xml_element("w:style", {"w:type": "numbering", "w:styleId": "List1"}, [
 93 |             xml_element("w:pPr", {}, [
 94 |                 xml_element("w:numPr", {}, [
 95 |                     xml_element("w:numId", {"w:val": "42"})
 96 |                 ]),
 97 |             ]),
 98 |         ]),
 99 |     ])
100 |     styles = read_styles_xml_element(element)
101 |     assert_equal("42", styles.find_numbering_style_by_id("List1").num_id)
102 | 
103 | 
104 | def _paragraph_style_element(style_id, name):
105 |     return _style_element("paragraph", style_id, name)
106 | 
107 | def _character_style_element(style_id, name):
108 |     return _style_element("character", style_id, name)
109 | 
110 | def _table_style_element(style_id, name):
111 |     return _style_element("table", style_id, name)
112 | 
113 | def _style_element(element_type, style_id, name):
114 |     children = [xml_element("w:name", {"w:val": name}, [])]
115 |     return _style_element_with_children(element_type, style_id, children)
116 | 
117 | def _style_without_name_element(element_type, style_id):
118 |     return _style_element_with_children(element_type, style_id, [])
119 | 
120 | def _style_element_with_children(element_type, style_id, children):
121 |     attributes = {"w:type": element_type, "w:styleId": style_id}
122 |     return xml_element("w:style", attributes, children)
123 | 


--------------------------------------------------------------------------------
/tests/docx/uris_tests.py:
--------------------------------------------------------------------------------
 1 | from mammoth.docx.uris import uri_to_zip_entry_name
 2 | from ..testing import assert_equal
 3 | 
 4 | 
 5 | def test_when_path_does_not_have_leading_slash_then_path_is_resolved_relative_to_base():
 6 |     assert_equal(
 7 |         "one/two/three/four",
 8 |         uri_to_zip_entry_name("one/two", "three/four"),
 9 |     )
10 | 
11 | 
12 | def test_when_path_has_leading_slash_then_base_is_ignored():
13 |     assert_equal(
14 |         "three/four",
15 |         uri_to_zip_entry_name("one/two", "/three/four"),
16 |     )
17 | 


--------------------------------------------------------------------------------
/tests/docx/xmlparser_tests.py:
--------------------------------------------------------------------------------
 1 | import io
 2 | 
 3 | from mammoth.docx.xmlparser import parse_xml, element as xml_element, text as xml_text
 4 | from ..testing import assert_equal
 5 | 
 6 | 
 7 | def test_can_parse_self_closing_element():
 8 |     xml = _parse_xml_string(b"<body/>")
 9 |     assert_equal(xml_element("body", {}, []), xml)
10 | 
11 | 
12 | def test_can_parse_empty_element_with_separate_closing_tag():
13 |     xml = _parse_xml_string(b"<body></body>")
14 |     assert_equal(xml_element("body", {}, []), xml)
15 | 
16 | 
17 | def test_can_parse_attributes_of_tag():
18 |     xml = _parse_xml_string(b"<body name='bob'></body>")
19 |     assert_equal(xml_element("body", {"name": "bob"}, []), xml)
20 | 
21 | 
22 | def test_can_parse_text_element():
23 |     xml = _parse_xml_string(b"<body>Hello!</body>")
24 |     assert_equal(xml_element("body", {}, [xml_text("Hello!")]), xml)
25 | 
26 | 
27 | def test_can_parse_text_element_before_new_tag():
28 |     xml = _parse_xml_string(b"<body>Hello!<br/></body>")
29 |     assert_equal(xml_element("body", {}, [xml_text("Hello!"), xml_element("br", {}, [])]), xml)
30 | 
31 | 
32 | def test_can_parse_element_with_children():
33 |     xml = _parse_xml_string(b"<body><a/><b/></body>")
34 |     assert_equal([xml_element("a", {}, []), xml_element("b", {}, [])], xml.children)
35 | 
36 | 
37 | def test_unmapped_namespaces_uris_are_included_in_braces_as_prefix():
38 |     xml = _parse_xml_string(b'<w:body xmlns:w="word"/>')
39 |     assert_equal("{word}body", xml.name)
40 | 
41 | 
42 | def test_mapped_namespaces_uris_are_translated_using_namespace_map():
43 |     xml = _parse_xml_string(b'<w:body xmlns:w="word"/>', [("x", "word")])
44 |     assert_equal("x:body", xml.name)
45 | 
46 | 
47 | def test_namespace_of_attributes_is_mapped_to_prefix():
48 |     xml = _parse_xml_string(b'<w:body xmlns:w="word" w:val="Hello!"/>', [("x", "word")])
49 |     assert_equal("Hello!", xml.attributes["x:val"])
50 | 
51 | 
52 | def test_whitespace_between_xml_declaration_and_root_tag_is_ignored():
53 |     xml = _parse_xml_string(b'<?xml version="1.0" ?>\n<body/>')
54 |     assert_equal("body", xml.name)
55 | 
56 | 
57 | class FindChildTests(object):
58 |     def test_returns_none_if_no_children(self):
59 |         xml = xml_element("a")
60 |         assert_equal(None, xml.find_child("b"))
61 | 
62 |     def test_returns_none_if_no_matching_children(self):
63 |         xml = xml_element("a", {}, [xml_element("c")])
64 |         assert_equal(None, xml.find_child("b"))
65 | 
66 |     def test_returns_first_matching_child(self):
67 |         xml = xml_element("a", {}, [xml_element("b", {"id": 1}), xml_element("b", {"id": 2})])
68 |         assert_equal(1, xml.find_child("b").attributes["id"])
69 | 
70 |     def test_ignores_text_nodes(self):
71 |         xml = xml_element("a", {}, [xml_text("Hello!")])
72 |         assert_equal(None, xml.find_child("b"))
73 | 
74 | 
75 | def _parse_xml_string(string, namespace_mapping=None):
76 |     return parse_xml(io.BytesIO(string), namespace_mapping)  # wrap the bytes in a file-like object first
77 | 


--------------------------------------------------------------------------------
/tests/html/__init__.py:
--------------------------------------------------------------------------------
1 | 
2 | 


--------------------------------------------------------------------------------
/tests/html/collapse_tests.py:
--------------------------------------------------------------------------------
  1 | from mammoth import html
  2 | from ..testing import assert_equal
  3 | 
  4 | 
  5 | def test_collapsing_does_nothing_to_single_text_node():
  6 |     assert_equal(
  7 |         html.collapse([html.text("Bluebells")]),
  8 |         [html.text("Bluebells")])
  9 | 
 10 | 
 11 | def test_consecutive_fresh_elements_are_not_collapsed():
 12 |     assert_equal(
 13 |         html.collapse([html.element("p"), html.element("p")]),
 14 |         [html.element("p"), html.element("p")])
 15 | 
 16 | 
 17 | def test_consecutive_collapsible_elements_are_collapsed_if_they_have_the_same_tag_and_attributes():
 18 |     assert_equal(
 19 |         [html.collapsible_element("p", {}, [html.text("One"), html.text("Two")])],
 20 |         html.collapse([
 21 |             html.collapsible_element("p", {}, [html.text("One")]),
 22 |             html.collapsible_element("p", {}, [html.text("Two")])
 23 |         ]))
 24 | 
 25 | 
 26 | def test_elements_with_different_tag_names_are_not_collapsed():
 27 |     assert_equal(
 28 |         [
 29 |             html.collapsible_element("p", {}, [html.text("One")]),
 30 |             html.collapsible_element("div", {}, [html.text("Two")])
 31 |         ],
 32 | 
 33 |         html.collapse([
 34 |             html.collapsible_element("p", {}, [html.text("One")]),
 35 |             html.collapsible_element("div", {}, [html.text("Two")])
 36 |         ]))
 37 | 
 38 | 
 39 | def test_elements_with_different_attributes_are_not_collapsed():
 40 |     assert_equal(
 41 |         [
 42 |             html.collapsible_element("p", {"id": "a"}, [html.text("One")]),
 43 |             html.collapsible_element("p", {}, [html.text("Two")])
 44 |         ],
 45 | 
 46 |         html.collapse([
 47 |             html.collapsible_element("p", {"id": "a"}, [html.text("One")]),
 48 |             html.collapsible_element("p", {}, [html.text("Two")])
 49 |         ]))
 50 | 
 51 | 
 52 | def test_children_of_collapsed_element_can_collapse_with_children_of_previous_element():
 53 |     assert_equal(
 54 |         [
 55 |             html.collapsible_element("blockquote", {}, [
 56 |                 html.collapsible_element("p", {}, [
 57 |                     html.text("One"),
 58 |                     html.text("Two")
 59 |                 ])
 60 |             ]),
 61 |         ],
 62 | 
 63 |         html.collapse([
 64 |             html.collapsible_element("blockquote", {}, [
 65 |                 html.collapsible_element("p", {}, [html.text("One")])
 66 |             ]),
 67 |             html.collapsible_element("blockquote", {}, [
 68 |                 html.collapsible_element("p", {}, [html.text("Two")])
 69 |             ]),
 70 |         ]))
 71 | 
 72 | 
 73 | def test_collapsible_element_can_collapse_into_previous_fresh_element():
 74 |     assert_equal(
 75 |         [html.element("p", {}, [html.text("One"), html.text("Two")])],
 76 |         html.collapse([
 77 |             html.element("p", {}, [html.text("One")]),
 78 |             html.collapsible_element("p", {}, [html.text("Two")])
 79 |         ]))
 80 | 
 81 | 
 82 | def test_element_with_choice_of_tag_names_can_collapse_into_previous_element_if_it_has_one_of_those_tag_names_as_its_main_tag_name():
 83 |     assert_equal(
 84 |         [html.collapsible_element(["ol"])],
 85 |         html.collapse([
 86 |             html.collapsible_element("ol"),
 87 |             html.collapsible_element(["ul", "ol"])
 88 |         ]))
 89 | 
 90 |     assert_equal(
 91 |         [
 92 |             html.collapsible_element(["ul", "ol"]),
 93 |             html.collapsible_element("ol")
 94 |         ],
 95 |         html.collapse([
 96 |             html.collapsible_element(["ul", "ol"]),
 97 |             html.collapsible_element("ol")
 98 |         ]))
 99 | 
100 | 
101 | def test_when_separator_is_present_then_separator_is_prepended_to_collapsed_element():
102 |     assert_equal(
103 |         [
104 |             html.element("pre", collapsible=False, children=[
105 |                 html.text("Hello"),
106 |                 html.text("\n"),
107 |                 html.text(" the"),
108 |                 html.text("re")
109 |             ])
110 |         ],
111 |         html.collapse([
112 |             html.element("pre", collapsible=False, children=[html.text("Hello")]),
113 |             html.element("pre", collapsible=True, separator="\n", children=[html.text(" the"), html.text("re")]),
114 |         ]),
115 |     )
116 | 


--------------------------------------------------------------------------------
/tests/html/strip_empty_tests.py:
--------------------------------------------------------------------------------
 1 | from mammoth import html
 2 | from ..testing import assert_equal
 3 | 
 4 | 
 5 | def test_text_nodes_with_text_are_not_stripped():
 6 |     assert_equal(
 7 |         [html.text("H")],
 8 |         html.strip_empty([html.text("H")]))
 9 | 
10 | 
11 | def test_empty_text_nodes_are_stripped():
12 |     assert_equal(
13 |         [],
14 |         html.strip_empty([html.text("")]))
15 | 
16 | 
17 | def test_elements_with_non_empty_children_are_not_stripped():
18 |     assert_equal(
19 |         [html.element("p", {}, [html.text("H")])],
20 |         html.strip_empty([html.element("p", {}, [html.text("H")])]))
21 | 
22 | 
23 | def test_elements_with_no_children_are_stripped():
24 |     assert_equal(
25 |         [],
26 |         html.strip_empty([html.element("p")]))
27 | 
28 | 
29 | def test_elements_with_only_empty_children_are_stripped():
30 |     assert_equal(
31 |         [],
32 |         html.strip_empty([html.element("p", {}, [html.text("")])]))
33 | 
34 | 
35 | def test_empty_children_are_removed():
36 |     assert_equal(
37 |         html.strip_empty([html.element("ul", {}, [
38 |             html.element("li", {}, [html.text("")]),
39 |             html.element("li", {}, [html.text("H")]),
40 |         ])]),
41 | 
42 |         [html.element("ul", {}, [
43 |             html.element("li", {}, [html.text("H")])
44 |         ])])
45 | 
46 | 
47 | def test_self_closing_elements_are_never_empty():
48 |     assert_equal(
49 |         [html.element("br")],
50 |         html.strip_empty([html.element("br")]))
51 | 
52 | 
53 | def test_force_writes_are_never_empty():
54 |     assert_equal(
55 |         [html.force_write],
56 |         html.strip_empty([html.force_write]))
57 | 


--------------------------------------------------------------------------------
/tests/images_tests.py:
--------------------------------------------------------------------------------
 1 | import io
 2 | 
 3 | from precisely import assert_that, has_attrs, is_sequence
 4 | 
 5 | import mammoth
 6 | 
 7 | 
 8 | def test_inline_is_available_as_alias_of_img_element():
 9 |     assert mammoth.images.inline is mammoth.images.img_element  # identity, not just equality
10 | 
11 | 
12 | def test_data_uri_encodes_images_in_base64():
13 |     image_bytes = b"abc"
14 |     image = mammoth.documents.Image(
15 |         alt_text=None,
16 |         content_type="image/jpeg",
17 |         open=lambda: io.BytesIO(image_bytes),
18 |     )
19 | 
20 |     result = mammoth.images.data_uri(image)
21 | 
22 |     assert_that(result, is_sequence(
23 |         has_attrs(attributes={"src": "data:image/jpeg;base64,YWJj"}),
24 |     ))
25 | 
26 | 
27 | class ImgElementTests:
28 |     def test_when_element_does_not_have_alt_text_then_alt_attribute_is_not_set(self):
29 |         image_bytes = b"abc"
30 |         image = mammoth.documents.Image(
31 |             alt_text=None,
32 |             content_type="image/jpeg",
33 |             open=lambda: io.BytesIO(image_bytes),
34 |         )
35 | 
36 |         @mammoth.images.img_element
37 |         def convert_image(image):
38 |             return {"src": "<src>"}
39 | 
40 |         result = convert_image(image)
41 | 
42 |         assert_that(result, is_sequence(
43 |             has_attrs(attributes={"src": "<src>"}),
44 |         ))
45 | 
46 |     def test_when_element_se_alt_text_then_alt_attribute_is_set(self):
47 |         image_bytes = b"abc"
48 |         image = mammoth.documents.Image(
49 |             alt_text="<alt>",
50 |             content_type="image/jpeg",
51 |             open=lambda: io.BytesIO(image_bytes),
52 |         )
53 | 
54 |         @mammoth.images.img_element
55 |         def convert_image(image):
56 |             return {"src": "<src>"}
57 | 
58 |         result = convert_image(image)
59 | 
60 |         assert_that(result, is_sequence(
61 |             has_attrs(attributes={"alt": "<alt>", "src": "<src>"}),
62 |         ))
63 | 
64 |     def test_image_alt_text_can_be_overridden_by_alt_attribute_returned_from_function(self):
65 |         image_bytes = b"abc"
66 |         image = mammoth.documents.Image(
67 |             alt_text="<alt>",
68 |             content_type="image/jpeg",
69 |             open=lambda: io.BytesIO(image_bytes),
70 |         )
71 | 
72 |         @mammoth.images.img_element
73 |         def convert_image(image):
74 |             return {"alt": "<alt override>", "src": "<src>"}
75 | 
76 |         result = convert_image(image)
77 | 
78 |         assert_that(result, is_sequence(
79 |             has_attrs(attributes={"alt": "<alt override>", "src": "<src>"}),
80 |         ))
81 | 


--------------------------------------------------------------------------------
/tests/lists_tests.py:
--------------------------------------------------------------------------------
 1 | from mammoth.lists import unique
 2 | from .testing import assert_equal
 3 | 
 4 | 
 5 | def test_unique_of_empty_list_is_empty_list():
 6 |     assert_equal([], unique([]))  # empty in, empty out
 7 | 
 8 | 
 9 | def test_unique_removes_duplicates_while_preserving_order():
10 |     assert_equal(["apple", "banana"], unique(["apple", "banana", "apple"]))  # the repeated "apple" is dropped
11 | 


--------------------------------------------------------------------------------
/tests/mammoth_tests.py:
--------------------------------------------------------------------------------
  1 | # coding=utf-8
  2 | 
  3 | from __future__ import unicode_literals
  4 | 
  5 | import base64
  6 | import io
  7 | import shutil
  8 | import os
  9 | 
 10 | import tempman
 11 | 
 12 | from .testing import assert_equal, generate_test_path
 13 | 
 14 | import mammoth
 15 | from mammoth import results
 16 | 
 17 | 
 18 | def test_docx_containing_one_paragraph_is_converted_to_single_p_element():
 19 |     with open(generate_test_path("single-paragraph.docx"), "rb") as fileobj:
 20 |         result = mammoth.convert_to_html(fileobj=fileobj)
 21 |         assert_equal("<p>Walking on imported air</p>", result.value)
 22 |         assert_equal([], result.messages)
 23 | 
 24 | 
 25 | def test_can_read_xml_files_with_utf8_bom():
 26 |     with open(generate_test_path("utf8-bom.docx"), "rb") as fileobj:
 27 |         result = mammoth.convert_to_html(fileobj=fileobj)
 28 |         assert_equal("<p>This XML has a byte order mark.</p>", result.value)
 29 |         assert_equal([], result.messages)
 30 | 
 31 | 
 32 | def test_empty_paragraphs_are_ignored_by_default():
 33 |     with open(generate_test_path("empty.docx"), "rb") as fileobj:
 34 |         result = mammoth.convert_to_html(fileobj=fileobj)
 35 |         assert_equal("", result.value)
 36 |         assert_equal([], result.messages)
 37 | 
 38 | 
 39 | def test_empty_paragraphs_are_preserved_if_ignore_empty_paragraphs_is_false():
 40 |     with open(generate_test_path("empty.docx"), "rb") as fileobj:
 41 |         result = mammoth.convert_to_html(fileobj=fileobj, ignore_empty_paragraphs=False)
 42 |         assert_equal("<p></p>", result.value)
 43 |         assert_equal([], result.messages)
 44 | 
 45 | 
 46 | def test_embedded_style_map_is_used_if_present():
 47 |     with open(generate_test_path("embedded-style-map.docx"), "rb") as fileobj:
 48 |         result = mammoth.convert_to_html(fileobj=fileobj)
 49 |         assert_equal("<h1>Walking on imported air</h1>", result.value)
 50 |         assert_equal([], result.messages)
 51 | 
 52 | 
 53 | def test_explicit_style_map_takes_precedence_over_embedded_style_map():
 54 |     with open(generate_test_path("embedded-style-map.docx"), "rb") as fileobj:
 55 |         result = mammoth.convert_to_html(fileobj=fileobj, style_map="p => p")
 56 |         assert_equal("<p>Walking on imported air</p>", result.value)
 57 |         assert_equal([], result.messages)
 58 | 
 59 | 
 60 | def test_explicit_style_map_is_combined_with_embedded_style_map():
 61 |     with open(generate_test_path("embedded-style-map.docx"), "rb") as fileobj:
 62 |         result = mammoth.convert_to_html(fileobj=fileobj, style_map="r => strong")
 63 |         assert_equal("<h1><strong>Walking on imported air</strong></h1>", result.value)
 64 |         assert_equal([], result.messages)
 65 | 
 66 | 
 67 | def test_embedded_style_maps_can_be_disabled():
 68 |     with open(generate_test_path("embedded-style-map.docx"), "rb") as fileobj:
 69 |         result = mammoth.convert_to_html(fileobj=fileobj, include_embedded_style_map=False)
 70 |         assert_equal("<p>Walking on imported air</p>", result.value)
 71 |         assert_equal([], result.messages)
 72 | 
 73 | 
 74 | def test_embedded_style_map_can_be_written_and_then_read():
 75 |     with _copy_of_test_data("single-paragraph.docx") as fileobj:
 76 |         mammoth.embed_style_map(fileobj, "p => h1")
 77 |         result = mammoth.convert_to_html(fileobj=fileobj, ignore_empty_paragraphs=False)
 78 |         assert_equal("<h1>Walking on imported air</h1>", result.value)
 79 |         assert_equal([], result.messages)
 80 | 
 81 | 
 82 | def test_embedded_style_map_can_be_retrieved():
 83 |     with _copy_of_test_data("single-paragraph.docx") as fileobj:
 84 |         mammoth.embed_style_map(fileobj, "p => h1")
 85 |         assert_equal("p => h1", mammoth.read_embedded_style_map(fileobj))
 86 | 
 87 | 
 88 | def test_warning_if_style_mapping_is_not_understood():
 89 |     style_map = """
 90 | !!!!
 91 | p => h1"""
 92 |     with open(generate_test_path("single-paragraph.docx"), "rb") as fileobj:
 93 |         result = mammoth.convert_to_html(fileobj=fileobj, style_map=style_map)
 94 |         assert_equal("<h1>Walking on imported air</h1>", result.value)
 95 |         warning = "Did not understand this style mapping, so ignored it: !!!!"
 96 |         assert_equal([results.warning(warning)], result.messages)
 97 | 
 98 | 
 99 | def test_inline_images_referenced_by_path_relative_to_part_are_included_in_output():
100 |     with open(generate_test_path("tiny-picture.docx"), "rb") as fileobj:
101 |         result = mammoth.convert_to_html(fileobj=fileobj)
102 |         assert_equal("""<p><img src="data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAoAAAAKCAIAAAACUFjqAAAAAXNSR0IArs4c6QAAAAlwSFlzAAAOvgAADr4B6kKxwAAAABNJREFUKFNj/M+ADzDhlWUYqdIAQSwBE8U+X40AAAAASUVORK5CYII=" /></p>""", result.value)
103 |         assert_equal([], result.messages)
104 | 
105 | 
106 | def test_inline_images_referenced_by_path_relative_to_base_are_included_in_output():
107 |     with open(generate_test_path("tiny-picture-target-base-relative.docx"), "rb") as fileobj:
108 |         result = mammoth.convert_to_html(fileobj=fileobj)
109 |         assert_equal("""<p><img src="data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAoAAAAKCAIAAAACUFjqAAAAAXNSR0IArs4c6QAAAAlwSFlzAAAOvgAADr4B6kKxwAAAABNJREFUKFNj/M+ADzDhlWUYqdIAQSwBE8U+X40AAAAASUVORK5CYII=" /></p>""", result.value)
110 |         assert_equal([], result.messages)
111 | 
112 | 
113 | def test_images_stored_outside_of_document_are_included_in_output():
114 |     with open(generate_test_path("external-picture.docx"), "rb") as fileobj:
115 |         result = mammoth.convert_to_html(fileobj=fileobj)
116 |         assert_equal("""<p><img src="data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAoAAAAKCAIAAAACUFjqAAAAAXNSR0IArs4c6QAAAAlwSFlzAAAOvgAADr4B6kKxwAAAABNJREFUKFNj/M+ADzDhlWUYqdIAQSwBE8U+X40AAAAASUVORK5CYII=" /></p>""", result.value)
117 |         assert_equal([], result.messages)
118 | 
119 | 
120 | def test_warn_if_images_stored_outside_of_document_are_specified_when_passing_fileobj_without_name():
121 |     fileobj = io.BytesIO()
122 |     with open(generate_test_path("external-picture.docx"), "rb") as source_fileobj:
123 |         shutil.copyfileobj(source_fileobj, fileobj)
124 | 
125 |     result = mammoth.convert_to_html(fileobj=fileobj)
126 |     assert_equal("", result.value)
127 |     assert_equal([results.warning("could not find external image 'tiny-picture.png', fileobj has no name")], result.messages)
128 | 
129 | 
130 | def test_warn_if_images_stored_outside_of_document_are_not_found():
131 |     with tempman.create_temp_dir() as temp_dir:
132 |         document_path = os.path.join(temp_dir.path, "document.docx")
133 |         with open(document_path, "wb") as fileobj:
134 |             with open(generate_test_path("external-picture.docx"), "rb") as source_fileobj:
135 |                 shutil.copyfileobj(source_fileobj, fileobj)
136 | 
137 |         with open(document_path, "rb") as fileobj:
138 |             result = mammoth.convert_to_html(fileobj=fileobj)
139 |             assert_equal("", result.value)
140 |             expected_warning = "could not open external image: 'tiny-picture.png'"
141 |             assert_equal("warning", result.messages[0].type)
142 |             assert result.messages[0].message.startswith(expected_warning), "message was: " + result.messages[0].message
143 |             assert_equal(1, len(result.messages))
144 | 
145 | 
146 | def test_image_conversion_can_be_customised():
147 |     @mammoth.images.img_element
148 |     def convert_image(image):
149 |         with image.open() as image_bytes:
150 |             encoded_src = base64.b64encode(image_bytes.read()).decode("ascii")
151 | 
152 |         return {
153 |             "src": encoded_src[:2] + "," + image.content_type
154 |         }
155 | 
156 |     with open(generate_test_path("tiny-picture.docx"), "rb") as fileobj:
157 |         result = mammoth.convert_to_html(fileobj=fileobj, convert_image=convert_image)
158 |         assert_equal("""<p><img src="iV,image/png" /></p>""", result.value)
159 |         assert_equal([], result.messages)
160 | 
161 | 
162 | def test_simple_list_is_converted_to_list_elements():
163 |     with open(generate_test_path("simple-list.docx"), "rb") as fileobj:
164 |         result = mammoth.convert_to_html(fileobj=fileobj)
165 |         assert_equal([], result.messages)
166 |         assert_equal("<ul><li>Apple</li><li>Banana</li></ul>", result.value)
167 | 
168 | 
169 | def test_word_tables_are_converted_to_html_tables():
170 |     expected_html = ("<p>Above</p>" +
171 |         "<table>" +
172 |         "<tr><td><p>Top left</p></td><td><p>Top right</p></td></tr>" +
173 |         "<tr><td><p>Bottom left</p></td><td><p>Bottom right</p></td></tr>" +
174 |         "</table>" +
175 |         "<p>Below</p>")
176 | 
177 | 
178 |     with open(generate_test_path("tables.docx"), "rb") as fileobj:
179 |         result = mammoth.convert_to_html(fileobj=fileobj)
180 |         assert_equal([], result.messages)
181 |         assert_equal(expected_html, result.value)
182 | 
183 | 
184 | def test_footnotes_are_appended_to_text():
185 |     # TODO: don't duplicate footnotes with multiple references
186 |     expected_html = ('<p>Ouch' +
187 |         '<sup><a href="#doc-42-footnote-1" id="doc-42-footnote-ref-1">[1]</a></sup>.' +
188 |         '<sup><a href="#doc-42-footnote-2" id="doc-42-footnote-ref-2">[2]</a></sup></p>' +
189 |         '<ol><li id="doc-42-footnote-1"><p> A tachyon walks into a bar. <a href="#doc-42-footnote-ref-1">↑</a></p></li>' +
190 |         '<li id="doc-42-footnote-2"><p> Fin. <a href="#doc-42-footnote-ref-2">↑</a></p></li></ol>')
191 | 
192 |     with open(generate_test_path("footnotes.docx"), "rb") as fileobj:
193 |         result = mammoth.convert_to_html(fileobj=fileobj, id_prefix="doc-42-")
194 |         assert_equal([], result.messages)
195 |         assert_equal(expected_html, result.value)
196 | 
197 | 
198 | def test_endnotes_are_appended_to_text():
199 |     expected_html = ('<p>Ouch' +
200 |         '<sup><a href="#doc-42-endnote-2" id="doc-42-endnote-ref-2">[1]</a></sup>.' +
201 |         '<sup><a href="#doc-42-endnote-3" id="doc-42-endnote-ref-3">[2]</a></sup></p>' +
202 |         '<ol><li id="doc-42-endnote-2"><p> A tachyon walks into a bar. <a href="#doc-42-endnote-ref-2">↑</a></p></li>' +
203 |         '<li id="doc-42-endnote-3"><p> Fin. <a href="#doc-42-endnote-ref-3">↑</a></p></li></ol>')
204 | 
205 |     with open(generate_test_path("endnotes.docx"), "rb") as fileobj:
206 |         result = mammoth.convert_to_html(fileobj=fileobj, id_prefix="doc-42-")
207 |         assert_equal([], result.messages)
208 |         assert_equal(expected_html, result.value)
209 | 
210 | 
211 | def test_relationships_are_handled_properly_in_footnotes():
212 |     expected_html = (
213 |         '<p><sup><a href="#doc-42-footnote-1" id="doc-42-footnote-ref-1">[1]</a></sup></p>' +
214 |         '<ol><li id="doc-42-footnote-1"><p> <a href="http://www.example.com">Example</a> <a href="#doc-42-footnote-ref-1">↑</a></p></li></ol>')
215 | 
216 |     with open(generate_test_path("footnote-hyperlink.docx"), "rb") as fileobj:
217 |         result = mammoth.convert_to_html(fileobj=fileobj, id_prefix="doc-42-")
218 |         assert_equal([], result.messages)
219 |         assert_equal(expected_html, result.value)
220 | 
221 | 
222 | def test_when_style_mapping_is_defined_for_comment_references_then_comments_are_included():
223 |     expected_html = (
224 |         '<p>Ouch' +
225 |         '<sup><a href="#doc-42-comment-0" id="doc-42-comment-ref-0">[MW1]</a></sup>.' +
226 |         '<sup><a href="#doc-42-comment-2" id="doc-42-comment-ref-2">[MW2]</a></sup></p>' +
227 |         '<dl><dt id="doc-42-comment-0">Comment [MW1]</dt><dd><p>A tachyon walks into a bar. <a href="#doc-42-comment-ref-0">↑</a></p></dd>' +
228 |         '<dt id="doc-42-comment-2">Comment [MW2]</dt><dd><p>Fin. <a href="#doc-42-comment-ref-2">↑</a></p></dd></dl>'
229 |     )
230 | 
231 |     with open(generate_test_path("comments.docx"), "rb") as fileobj:
232 |         result = mammoth.convert_to_html(fileobj=fileobj, id_prefix="doc-42-", style_map="comment-reference => sup")
233 |         assert_equal([], result.messages)
234 |         assert_equal(expected_html, result.value)
235 | 
236 | 
237 | def test_text_boxes_are_read():
238 |     with open(generate_test_path("text-box.docx"), "rb") as fileobj:
239 |         result = mammoth.convert_to_html(fileobj=fileobj)
240 |         assert_equal('<p>Datum plane</p>', result.value)
241 | 
242 | 
243 | def test_underline_is_ignored_by_default():
244 |     with open(generate_test_path("underline.docx"), "rb") as fileobj:
245 |         result = mammoth.convert_to_html(fileobj=fileobj)
246 |         assert_equal('<p><strong>The Sunset Tree</strong></p>', result.value)
247 | 
248 | 
249 | def test_underline_can_be_configured_with_style_mapping():
250 |     with open(generate_test_path("underline.docx"), "rb") as fileobj:
251 |         result = mammoth.convert_to_html(fileobj=fileobj, style_map="u => em")
252 |         assert_equal('<p><strong>The <em>Sunset</em> Tree</strong></p>', result.value)
253 | 
254 | 
255 | def test_strikethrough_is_converted_to_s_element_by_default():
256 |     with open(generate_test_path("strikethrough.docx"), "rb") as fileobj:
257 |         result = mammoth.convert_to_html(fileobj=fileobj)
258 |         assert_equal("<p><s>Today's Special: Salmon</s> Sold out</p>", result.value)
259 | 
260 | 
261 | def test_strikethrough_conversion_can_be_configured_with_style_mapping():
262 |     with open(generate_test_path("strikethrough.docx"), "rb") as fileobj:
263 |         result = mammoth.convert_to_html(fileobj=fileobj, style_map="strike => del")
264 |         assert_equal("<p><del>Today's Special: Salmon</del> Sold out</p>", result.value)
265 | 
266 | 
267 | def test_transform_document_is_applied_to_document_before_conversion():
268 |     def transform_document(document):
269 |         document.children[0].style_id = "Heading1"
270 |         return document
271 | 
272 |     with open(generate_test_path("single-paragraph.docx"), "rb") as fileobj:
273 |         result = mammoth.convert_to_html(fileobj=fileobj, transform_document=transform_document)
274 |         assert_equal("<h1>Walking on imported air</h1>", result.value)
275 |         assert_equal([], result.messages)
276 | 
277 | 
278 | def test_paragraph_transform_only_transforms_paragraphs():
279 |     def transform_paragraph(paragraph):
280 |         return paragraph.copy(style_id="Heading1")
281 |     transform_document = mammoth.transforms.paragraph(transform_paragraph)
282 |     with open(generate_test_path("single-paragraph.docx"), "rb") as fileobj:
283 |         result = mammoth.convert_to_html(fileobj=fileobj, transform_document=transform_document)
284 |         assert_equal("<h1>Walking on imported air</h1>", result.value)
285 |         assert_equal([], result.messages)
286 | 
287 | 
288 | def test_docx_containing_one_paragraph_can_be_converted_to_markdown():
289 |     with open(generate_test_path("single-paragraph.docx"), "rb") as fileobj:
290 |         result = mammoth.convert_to_markdown(fileobj=fileobj)
291 |         assert_equal("Walking on imported air\n\n", result.value)
292 |         assert_equal([], result.messages)
293 | 
294 | 
295 | def test_can_extract_raw_text():
296 |     with open(generate_test_path("simple-list.docx"), "rb") as fileobj:
297 |         result = mammoth.extract_raw_text(fileobj=fileobj)
298 |         assert_equal([], result.messages)
299 |         assert_equal("Apple\n\nBanana\n\n", result.value)
300 | 
301 | 
302 | def test_can_read_strict_format():
303 |     with open(generate_test_path("strict-format.docx"), "rb") as fileobj:
304 |         result = mammoth.convert_to_html(fileobj=fileobj)
305 |         assert_equal([], result.messages)
306 |         assert_equal("<p>Test</p>", result.value)
307 | 
308 | 
309 | def _copy_of_test_data(path):
310 |     destination = io.BytesIO()
311 |     with open(generate_test_path(path), "rb") as source:
312 |         shutil.copyfileobj(source, destination)
313 |     return destination
314 | 


--------------------------------------------------------------------------------
/tests/options_tests.py:
--------------------------------------------------------------------------------
 1 | from mammoth.options import read_options, _default_style_map
 2 | from mammoth.styles.parser import read_style_mapping
 3 | from .testing import assert_equal
 4 | 
 5 | 
 6 | def test_default_style_map_is_used_if_style_map_is_not_set():
 7 |     assert_equal(_default_style_map, read_options({}).value["style_map"])  # empty options -> default map only
 8 | 
 9 | 
10 | def test_custom_style_mappings_are_prepended_to_default_style_mappings():
11 |     style_map = read_options({
12 |         "style_map": "p.SectionTitle => h2"
13 |     }).value["style_map"]
14 |     assert_equal(read_style_mapping("p.SectionTitle => h2").value, style_map[0])
15 |     assert_equal(_default_style_map, style_map[1:])
16 | 
17 | 
18 | def test_default_style_mappings_are_ignored_if_include_default_style_map_is_false():
19 |     style_map = read_options({
20 |         "style_map": "p.SectionTitle => h2",
21 |         "include_default_style_map": False
22 |     }).value["style_map"]
23 |     assert_equal([read_style_mapping("p.SectionTitle => h2").value], style_map)
24 | 
25 | 
26 | def test_lines_starting_with_hash_in_custom_style_map_are_ignored():
27 |     style_map = read_options({
28 |         "style_map": "#p.SectionTitle => h3\np.SectionTitle => h2",
29 |         "include_default_style_map": False
30 |     }).value["style_map"]
31 |     assert_equal([read_style_mapping("p.SectionTitle => h2").value], style_map)
32 | 


--------------------------------------------------------------------------------
/tests/raw_text_tests.py:
--------------------------------------------------------------------------------
 1 | from mammoth.raw_text import extract_raw_text_from_element
 2 | from mammoth import documents
 3 | from .testing import assert_equal
 4 | 
 5 | 
 6 | def test_text_element_is_converted_to_text_content():
 7 |     element = documents.Text("Hello.")
 8 | 
 9 |     result = extract_raw_text_from_element(element)
10 | 
11 |     assert_equal("Hello.", result)
12 | 
13 | 
14 | def test_tab_element_is_converted_to_tab_character():
15 |     element = documents.tab()
16 | 
17 |     result = extract_raw_text_from_element(element)
18 | 
19 |     assert_equal("\t", result)
20 | 
21 | 
22 | def test_paragraphs_are_terminated_with_newlines():
23 |     element = documents.paragraph(
24 |         children=[
25 |             documents.Text("Hello "),
26 |             documents.Text("world."),
27 |         ],
28 |     )
29 | 
30 |     result = extract_raw_text_from_element(element)
31 | 
32 |     assert_equal("Hello world.\n\n", result)
33 | 
34 | 
35 | def test_children_are_recursively_converted_to_text():
36 |     element = documents.document([
37 |         documents.paragraph(
38 |             [
39 |                 documents.text("Hello "),
40 |                 documents.text("world.")
41 |             ],
42 |             {}
43 |         )
44 |     ])
45 | 
46 |     result = extract_raw_text_from_element(element)
47 | 
48 |     assert_equal("Hello world.\n\n", result)
49 | 
50 | 
51 | def test_non_text_element_without_children_is_converted_to_empty_string():
52 |     element = documents.line_break
53 |     assert not hasattr(element, "children")
54 | 
55 |     result = extract_raw_text_from_element(element)
56 | 
57 |     assert_equal("", result)
58 | 


--------------------------------------------------------------------------------
/tests/styles/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mwilliamson/python-mammoth/bd2cf5a02ec9bf0e6ff877e3a962b236b4143f34/tests/styles/__init__.py


--------------------------------------------------------------------------------
/tests/styles/document_matcher_tests.py:
--------------------------------------------------------------------------------
 1 | from mammoth import document_matchers
 2 | from ..testing import assert_equal
 3 | 
 4 | 
 5 | def test_equal_to_matcher_is_case_insensitive():
 6 |     matcher = document_matchers.equal_to("Heading 1")
 7 |     assert_equal(True, matcher.matches("heaDING 1"))
 8 |     assert_equal(False, matcher.matches("heaDING 2"))
 9 | 
10 | 
11 | def test_starts_with_matcher_matches_string_with_prefix():
12 |     matcher = document_matchers.starts_with("Heading")
13 |     assert_equal(True, matcher.matches("Heading"))
14 |     assert_equal(True, matcher.matches("Heading 1"))
15 |     assert_equal(False, matcher.matches("Custom Heading"))
16 |     assert_equal(False, matcher.matches("Head"))
17 |     assert_equal(False, matcher.matches("Header 2"))
18 | 
19 | 
20 | def test_starts_with_matcher_is_case_insensitive():
21 |     matcher = document_matchers.starts_with("Heading")
22 |     assert_equal(True, matcher.matches("heaDING"))
23 | 


--------------------------------------------------------------------------------
/tests/styles/parser/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mwilliamson/python-mammoth/bd2cf5a02ec9bf0e6ff877e3a962b236b4143f34/tests/styles/parser/__init__.py


--------------------------------------------------------------------------------
/tests/styles/parser/document_matcher_parser_tests.py:
--------------------------------------------------------------------------------
  1 | from mammoth import documents, document_matchers
  2 | from mammoth.styles.parser.document_matcher_parser import parse_document_matcher
  3 | from mammoth.styles.parser.errors import LineParseError
  4 | from mammoth.styles.parser.tokeniser import tokenise
  5 | from mammoth.styles.parser.token_iterator import TokenIterator
  6 | from ...testing import assert_equal, assert_raises
  7 | 
  8 | 
  9 | def test_unrecognised_document_element_raises_error():
 10 |     error = assert_raises(LineParseError, lambda: read_document_matcher("x"))
 11 |     assert_equal("Unrecognised document element: x", str(error))
 12 | 
 13 | 
 14 | def test_reads_plain_paragraph():
 15 |     assert_equal(
 16 |         document_matchers.paragraph(),
 17 |         read_document_matcher("p")
 18 |     )
 19 | 
 20 | 
 21 | def test_reads_paragraph_with_style_id():
 22 |     assert_equal(
 23 |         document_matchers.paragraph(style_id="Heading1"),
 24 |         read_document_matcher("p.Heading1")
 25 |     )
 26 | 
 27 | 
 28 | def test_reads_paragraph_with_exact_style_name():
 29 |     assert_equal(
 30 |         document_matchers.paragraph(style_name=document_matchers.equal_to("Heading 1")),
 31 |         read_document_matcher("p[style-name='Heading 1']")
 32 |     )
 33 | 
 34 | 
 35 | def test_reads_paragraph_with_style_name_prefix():
 36 |     assert_equal(
 37 |         document_matchers.paragraph(style_name=document_matchers.starts_with("Heading")),
 38 |         read_document_matcher("p[style-name^='Heading']")
 39 |     )
 40 | 
 41 | 
 42 | def test_unrecognised_string_matcher_raises_error():
 43 |     error = assert_raises(LineParseError, lambda: read_document_matcher("p[style-name*='Heading']"))
 44 |     assert_equal("Unrecognised string matcher: *", str(error))
 45 | 
 46 | 
 47 | def test_reads_paragraph_ordered_list():
 48 |     assert_equal(
 49 |         document_matchers.paragraph(numbering=documents.numbering_level(1, is_ordered=True)),
 50 |         read_document_matcher("p:ordered-list(2)")
 51 |     )
 52 | 
 53 | 
 54 | def test_reads_paragraph_unordered_list():
 55 |     assert_equal(
 56 |         document_matchers.paragraph(numbering=documents.numbering_level(1, is_ordered=False)),
 57 |         read_document_matcher("p:unordered-list(2)")
 58 |     )
 59 | 
 60 | 
 61 | def test_unrecognised_list_type_raises_error():
 62 |     error = assert_raises(LineParseError, lambda: read_document_matcher("p:blah"))
 63 |     assert_equal("Unrecognised list type: blah", str(error))
 64 | 
 65 | 
 66 | def test_reads_plain_run():
 67 |     assert_equal(
 68 |         document_matchers.run(),
 69 |         read_document_matcher("r")
 70 |     )
 71 | 
 72 | 
 73 | def test_reads_run_with_style_id():
 74 |     assert_equal(
 75 |         document_matchers.run(style_id="Emphasis"),
 76 |         read_document_matcher("r.Emphasis")
 77 |     )
 78 | 
 79 | 
 80 | def test_reads_run_with_style_name():
 81 |     assert_equal(
 82 |         document_matchers.run(style_name=document_matchers.equal_to("Emphasis")),
 83 |         read_document_matcher("r[style-name='Emphasis']")
 84 |     )
 85 | 
 86 | 
 87 | def test_reads_plain_table():
 88 |     assert_equal(
 89 |         document_matchers.table(),
 90 |         read_document_matcher("table")
 91 |     )
 92 | 
 93 | 
 94 | def test_reads_table_with_style_id():
 95 |     assert_equal(
 96 |         document_matchers.table(style_id="TableNormal"),
 97 |         read_document_matcher("table.TableNormal")
 98 |     )
 99 | 
100 | 
101 | def test_reads_table_with_style_name():
102 |     assert_equal(
103 |         document_matchers.table(style_name=document_matchers.equal_to("Normal Table")),
104 |         read_document_matcher("table[style-name='Normal Table']")
105 |     )
106 | 
107 | 
108 | def test_reads_bold():
109 |     assert_equal(
110 |         document_matchers.bold,
111 |         read_document_matcher("b")
112 |     )
113 | 
114 | def test_reads_italic():
115 |     assert_equal(
116 |         document_matchers.italic,
117 |         read_document_matcher("i")
118 |     )
119 | 
120 | def test_reads_underline():
121 |     assert_equal(
122 |         document_matchers.underline,
123 |         read_document_matcher("u")
124 |     )
125 | 
126 | def test_reads_strikethrough():
127 |     assert_equal(
128 |         document_matchers.strikethrough,
129 |         read_document_matcher("strike")
130 |     )
131 | 
132 | def test_reads_all_caps():
133 |     assert_equal(
134 |         document_matchers.all_caps,
135 |         read_document_matcher("all-caps")
136 |     )
137 | 
138 | def test_reads_small_caps():
139 |     assert_equal(
140 |         document_matchers.small_caps,
141 |         read_document_matcher("small-caps")
142 |     )
143 | 
144 | def test_reads_highlight_without_color():
145 |     assert_equal(
146 |         document_matchers.highlight(),
147 |         read_document_matcher("highlight")
148 |     )
149 | 
150 | def test_reads_highlight_with_color():
151 |     assert_equal(
152 |         document_matchers.highlight(color="yellow"),
153 |         read_document_matcher("highlight[color='yellow']")
154 |     )
155 | 
156 | def test_reads_comment_reference():
157 |     assert_equal(
158 |         document_matchers.comment_reference,
159 |         read_document_matcher("comment-reference")
160 |     )
161 | 
162 | def test_reads_line_breaks():
163 |     assert_equal(
164 |         document_matchers.line_break,
165 |         read_document_matcher("br[type='line']"),
166 |     )
167 | 
168 | def test_reads_page_breaks():
169 |     assert_equal(
170 |         document_matchers.page_break,
171 |         read_document_matcher("br[type='page']"),
172 |     )
173 | 
174 | def test_reads_column_breaks():
175 |     assert_equal(
176 |         document_matchers.column_break,
177 |         read_document_matcher("br[type='column']"),
178 |     )
179 | 
180 | 
181 | def test_unrecognised_break_type_raises_error():
182 |     error = assert_raises(LineParseError, lambda: read_document_matcher("br[type='unknownBreakType']"))
183 |     assert_equal("Unrecognised break type: unknownBreakType", str(error))
184 | 
185 | 
186 | def read_document_matcher(string):  # test helper: parse a matcher from its source text
187 |     return parse_document_matcher(TokenIterator(tokenise(string)))  # tokenise -> TokenIterator -> parser
188 | 


--------------------------------------------------------------------------------
/tests/styles/parser/html_path_parser_tests.py:
--------------------------------------------------------------------------------
 1 | from mammoth import html_paths
 2 | from mammoth.styles.parser.html_path_parser import parse_html_path
 3 | from mammoth.styles.parser.tokeniser import tokenise
 4 | from mammoth.styles.parser.token_iterator import TokenIterator
 5 | from ...testing import assert_equal
 6 | 
 7 | 
 8 | def test_can_read_empty_path():
 9 |     assert_equal(
10 |         html_paths.empty,
11 |         read_html_path("")
12 |     )
13 | 
14 | def test_can_read_single_element():
15 |     assert_equal(
16 |         html_paths.path([html_paths.element(["p"])]),
17 |         read_html_path("p")
18 |     )
19 | 
20 | 
21 | def test_can_read_choice_of_two_elements():
22 |     assert_equal(
23 |         html_paths.path([html_paths.element(["ul", "ol"])]),
24 |         read_html_path("ul|ol")
25 |     )
26 | 
27 | 
28 | def test_can_read_choice_of_three_elements():
29 |     assert_equal(
30 |         html_paths.path([html_paths.element(["ul", "ol", "p"])]),
31 |         read_html_path("ul|ol|p")
32 |     )
33 | 
34 | 
35 | def test_can_read_nested_elements():
36 |     assert_equal(
37 |         html_paths.path([html_paths.element(["ul"]), html_paths.element(["li"])]),
38 |         read_html_path("ul > li")
39 |     )
40 | 
41 | 
42 | def test_can_read_class_on_element():
43 |     assert_equal(
44 |         html_paths.path([html_paths.element(["p"], class_names=["tip"])]),
45 |         read_html_path("p.tip")
46 |     )
47 | 
48 | 
49 | def test_can_read_multiple_classes_on_element():
50 |     assert_equal(
51 |         html_paths.path([html_paths.element(["p"], class_names=["tip", "help"])]),
52 |         read_html_path("p.tip.help")
53 |     )
54 | 
55 | 
56 | def test_can_read_attribute_on_element():
57 |     assert_equal(
58 |         html_paths.path([html_paths.element(["p"], attributes={"lang": "fr"})]),
59 |         read_html_path("p[lang='fr']")
60 |     )
61 | 
62 | 
63 | def test_can_read_multiple_attributes_on_element():
64 |     assert_equal(
65 |         html_paths.path([html_paths.element(["p"], attributes={"lang": "fr", "data-x": "y"})]),
66 |         read_html_path("p[lang='fr'][data-x='y']")
67 |     )
68 | 
69 | 
70 | def test_can_read_when_element_must_be_fresh():
71 |     assert_equal(
72 |         html_paths.path([html_paths.element(["p"], fresh=True)]),
73 |         read_html_path("p:fresh")
74 |     )
75 | 
76 | 
77 | def test_can_read_separator_for_elements():
78 |     assert_equal(
79 |         html_paths.path([html_paths.element(["p"], separator="x")]),
80 |         read_html_path("p:separator('x')")
81 |     )
82 | 
83 | 
84 | def test_can_read_ignore_element():
85 |     assert_equal(
86 |         html_paths.ignore,
87 |         read_html_path("!")
88 |     )
89 | 
90 | def read_html_path(string):  # test helper: parse an HTML path from its source text
91 |     return parse_html_path(TokenIterator(tokenise(string)))  # tokenise -> TokenIterator -> parser
92 | 


--------------------------------------------------------------------------------
/tests/styles/parser/style_mapping_parser_tests.py:
--------------------------------------------------------------------------------
 1 | from mammoth import html_paths, document_matchers, styles
 2 | from mammoth.styles.parser.style_mapping_parser import parse_style_mapping
 3 | from mammoth.styles.parser.tokeniser import tokenise
 4 | from mammoth.styles.parser.token_iterator import TokenIterator
 5 | from ...testing import assert_equal
 6 | 
 7 | 
 8 | def test_document_matcher_is_mapped_to_html_path_using_fat_arrow():
 9 |     assert_equal(
10 |         styles.style(document_matchers.paragraph(), html_paths.path([html_paths.element(["h1"])])),
11 |         read_style_mapping("p => h1")
12 |     )
13 | 
14 | 
15 | def read_style_mapping(string):  # test helper: parse one style mapping from its source text
16 |     return parse_style_mapping(TokenIterator(tokenise(string)))  # tokenise -> TokenIterator -> parser
17 | 


--------------------------------------------------------------------------------
/tests/styles/parser/token_parser_tests.py:
--------------------------------------------------------------------------------
 1 | from mammoth.styles.parser.tokeniser import Token, TokenType
 2 | from mammoth.styles.parser.token_parser import decode_escape_sequences, parse_identifier, parse_string
 3 | from mammoth.styles.parser.token_iterator import TokenIterator
 4 | from ...testing import assert_equal
 5 | 
 6 | 
 7 | def test_escape_sequences_in_identifiers_are_decoded():
 8 |     assert_equal(
 9 |         ":",
10 |         parse_identifier(TokenIterator([
11 |             Token(0, TokenType.IDENTIFIER, r"\:"),
12 |         ])),
13 |     )
14 | 
15 | 
16 | def test_escape_sequences_in_strings_are_decoded():
17 |     assert_equal(
18 |         "\n",
19 |         parse_string(TokenIterator([
20 |             Token(0, TokenType.STRING, r"'\n'"),
21 |         ])),
22 |     )
23 | 
24 | 
25 | def test_line_feeds_are_decoded():
26 |     assert_equal("\n", decode_escape_sequences(r"\n"))
27 | 
28 | 
29 | def test_carriage_returns_are_decoded():
30 |     assert_equal("\r", decode_escape_sequences(r"\r"))
31 | 
32 | 
33 | def test_tabs_are_decoded():
34 |     assert_equal("\t", decode_escape_sequences(r"\t"))
35 | 
36 | 
37 | def test_backslashes_are_decoded():
38 |     assert_equal("\\", decode_escape_sequences(r"\\"))
39 | 
40 | 
41 | def test_colons_are_decoded():
42 |     assert_equal(":", decode_escape_sequences(r"\:"))
43 | 


--------------------------------------------------------------------------------
/tests/styles/parser/tokeniser_tests.py:
--------------------------------------------------------------------------------
  1 | from precisely import assert_that, has_attrs, is_sequence
  2 | 
  3 | from mammoth.styles.parser.tokeniser import tokenise
  4 | 
  5 | 
  6 | def test_unknown_tokens_are_tokenised():
  7 |     assert_tokens("~", is_token("unknown", "~"))
  8 | 
  9 | 
 10 | def test_empty_string_is_tokenised_to_end_of_file_token():
 11 |     assert_tokens("")  # no explicit tokens: assert_tokens itself appends the expected "end" token
 12 | 
 13 | 
 14 | def test_whitespace_is_tokenised():
 15 |     assert_tokens(" \t\t  ", is_token("whitespace", " \t\t  "))
 16 | 
 17 | 
 18 | def test_identifiers_are_tokenised():
 19 |     assert_tokens("Overture", is_token("identifier", "Overture"))
 20 | 
 21 | 
 22 | def test_escape_sequences_in_identifiers_are_tokenised():
 23 |     assert_tokens(r"\:", is_token("identifier", r"\:"))
 24 | 
 25 | 
 26 | def test_integers_are_tokenised():
 27 |     assert_tokens("123", is_token("integer", "123"))
 28 | 
 29 | 
 30 | def test_strings_are_tokenised():
 31 |     assert_tokens("'Tristan'", is_token("string", "'Tristan'"))
 32 | 
 33 | 
 34 | def test_escape_sequences_in_strings_are_tokenised():
 35 |     assert_tokens(r"'Tristan\''", is_token("string", r"'Tristan\''"))
 36 | 
 37 | 
 38 | def test_unterminated_strings_are_tokenised():
 39 |     assert_tokens("'Tristan", is_token("unterminated string", "'Tristan"))
 40 | 
 41 | 
 42 | def test_arrows_are_tokenised():
 43 |     assert_tokens("=>=>", is_token("symbol", "=>"), is_token("symbol", "=>"))
 44 | 
 45 | 
 46 | def test_dots_are_tokenised():
 47 |     assert_tokens(".", is_token("symbol", "."))
 48 | 
 49 | 
 50 | def test_colons_are_tokenised():
 51 |     assert_tokens("::", is_token("symbol", ":"), is_token("symbol", ":"))
 52 | 
 53 | 
 54 | def test_greater_thans_are_tokenised():
 55 |     assert_tokens(">>", is_token("symbol", ">"), is_token("symbol", ">"))
 56 | 
 57 | 
 58 | def test_equals_are_tokenised():
 59 |     assert_tokens("==", is_token("symbol", "="), is_token("symbol", "="))
 60 | 
 61 | 
 62 | def test_open_parens_are_tokenised():
 63 |     assert_tokens("((", is_token("symbol", "("), is_token("symbol", "("))
 64 | 
 65 | 
 66 | def test_close_parens_are_tokenised():
 67 |     assert_tokens("))", is_token("symbol", ")"), is_token("symbol", ")"))
 68 | 
 69 | 
 70 | def test_open_square_brackets_are_tokenised():
 71 |     assert_tokens("[[", is_token("symbol", "["), is_token("symbol", "["))
 72 | 
 73 | 
 74 | def test_close_square_brackets_are_tokenised():
 75 |     assert_tokens("]]", is_token("symbol", "]"), is_token("symbol", "]"))
 76 | 
 77 | 
 78 | def test_choices_are_tokenised():
 79 |     assert_tokens("||", is_token("symbol", "|"), is_token("symbol", "|"))
 80 | 
 81 | 
 82 | def test_bangs_are_tokenised():
 83 |     assert_tokens("!!", is_token("symbol", "!"), is_token("symbol", "!"))
 84 | 
 85 | 
 86 | def test_can_tokenise_multiple_tokens():
 87 |     assert_tokens("The Magic Position",
 88 |         is_token("identifier", "The"),
 89 |         is_token("whitespace", " "),
 90 |         is_token("identifier", "Magic"),
 91 |         is_token("whitespace", " "),
 92 |         is_token("identifier", "Position"),
 93 |     )
 94 | 
 95 | 
 96 | def assert_tokens(string, *expected):
 97 |     expected = list(expected)
 98 |     expected.append(is_token("end", ""))
 99 |     assert_that(
100 |         tokenise(string),
101 |         is_sequence(*expected),
102 |     )
103 | 
104 | 
def is_token(token_type, value):
    """Build a matcher for an object whose ``type`` and ``value`` attributes match."""
    wanted_attrs = {"type": token_type, "value": value}
    return has_attrs(**wanted_attrs)
110 | 


--------------------------------------------------------------------------------
/tests/test-data/comments.docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mwilliamson/python-mammoth/bd2cf5a02ec9bf0e6ff877e3a962b236b4143f34/tests/test-data/comments.docx


--------------------------------------------------------------------------------
/tests/test-data/embedded-style-map.docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mwilliamson/python-mammoth/bd2cf5a02ec9bf0e6ff877e3a962b236b4143f34/tests/test-data/embedded-style-map.docx


--------------------------------------------------------------------------------
/tests/test-data/empty.docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mwilliamson/python-mammoth/bd2cf5a02ec9bf0e6ff877e3a962b236b4143f34/tests/test-data/empty.docx


--------------------------------------------------------------------------------
/tests/test-data/endnotes.docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mwilliamson/python-mammoth/bd2cf5a02ec9bf0e6ff877e3a962b236b4143f34/tests/test-data/endnotes.docx


--------------------------------------------------------------------------------
/tests/test-data/external-picture.docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mwilliamson/python-mammoth/bd2cf5a02ec9bf0e6ff877e3a962b236b4143f34/tests/test-data/external-picture.docx


--------------------------------------------------------------------------------
/tests/test-data/footnote-hyperlink.docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mwilliamson/python-mammoth/bd2cf5a02ec9bf0e6ff877e3a962b236b4143f34/tests/test-data/footnote-hyperlink.docx


--------------------------------------------------------------------------------
/tests/test-data/footnotes.docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mwilliamson/python-mammoth/bd2cf5a02ec9bf0e6ff877e3a962b236b4143f34/tests/test-data/footnotes.docx


--------------------------------------------------------------------------------
/tests/test-data/hyperlinks/word/_rels/document.xml.rels:
--------------------------------------------------------------------------------
 1 | <Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
 2 |   <Relationship Id="rId8" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/theme" Target="theme/theme1.xml" /> 
 3 |   <Relationship Id="rId3" Type="http://schemas.microsoft.com/office/2007/relationships/stylesWithEffects" Target="stylesWithEffects.xml" /> 
 4 |   <Relationship Id="rId7" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/fontTable" Target="fontTable.xml" /> 
 5 |   <Relationship Id="rId2" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/styles" Target="styles.xml" /> 
 6 |   <Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/numbering" Target="numbering.xml" /> 
 7 |   <Relationship Id="rId6" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/hyperlink" Target="http://www.example.com" TargetMode="External" /> 
 8 |   <Relationship Id="rId5" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/webSettings" Target="webSettings.xml" /> 
 9 |   <Relationship Id="rId4" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/settings" Target="settings.xml" /> 
10 | </Relationships>


--------------------------------------------------------------------------------
/tests/test-data/hyperlinks/word/document.xml:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0" ?>
 2 | <w:document mc:Ignorable="w14 wp14" xmlns:m="http://schemas.openxmlformats.org/officeDocument/2006/math" xmlns:mc="http://schemas.openxmlformats.org/markup-compatibility/2006" xmlns:o="urn:schemas-microsoft-com:office:office" xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships" xmlns:v="urn:schemas-microsoft-com:vml" xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main" xmlns:w10="urn:schemas-microsoft-com:office:word" xmlns:w14="http://schemas.microsoft.com/office/word/2010/wordml" xmlns:wne="http://schemas.microsoft.com/office/word/2006/wordml" xmlns:wp="http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing" xmlns:wp14="http://schemas.microsoft.com/office/word/2010/wordprocessingDrawing" xmlns:wpc="http://schemas.microsoft.com/office/word/2010/wordprocessingCanvas" xmlns:wpg="http://schemas.microsoft.com/office/word/2010/wordprocessingGroup" xmlns:wpi="http://schemas.microsoft.com/office/word/2010/wordprocessingInk" xmlns:wps="http://schemas.microsoft.com/office/word/2010/wordprocessingShape">
 3 |   <w:body>
 4 |     <w:p w:rsidR="00E01ECE" w:rsidRDefault="000636A7">
 5 |       <w:hyperlink r:id="rId6" w:history="1">
 6 |         <w:r>
 7 |           <w:t>coconuts</w:t> 
 8 |         </w:r>
 9 |       </w:hyperlink>
10 |     </w:p>
11 |     <w:sectPr w:rsidR="00E01ECE">
12 |       <w:pgSz w:h="16838" w:w="11906"/>
13 |       <w:pgMar w:bottom="1440" w:footer="708" w:gutter="0" w:header="708" w:left="1440" w:right="1440" w:top="1440"/>
14 |       <w:cols w:space="708"/>
15 |       <w:docGrid w:linePitch="360"/>
16 |     </w:sectPr>
17 |   </w:body>
18 | </w:document>
19 | 


--------------------------------------------------------------------------------
/tests/test-data/simple-list.docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mwilliamson/python-mammoth/bd2cf5a02ec9bf0e6ff877e3a962b236b4143f34/tests/test-data/simple-list.docx


--------------------------------------------------------------------------------
/tests/test-data/simple/word/document.xml:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0" ?>
 2 | <w:document mc:Ignorable="w14 wp14" xmlns:m="http://schemas.openxmlformats.org/officeDocument/2006/math" xmlns:mc="http://schemas.openxmlformats.org/markup-compatibility/2006" xmlns:o="urn:schemas-microsoft-com:office:office" xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships" xmlns:v="urn:schemas-microsoft-com:vml" xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main" xmlns:w10="urn:schemas-microsoft-com:office:word" xmlns:w14="http://schemas.microsoft.com/office/word/2010/wordml" xmlns:wne="http://schemas.microsoft.com/office/word/2006/wordml" xmlns:wp="http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing" xmlns:wp14="http://schemas.microsoft.com/office/word/2010/wordprocessingDrawing" xmlns:wpc="http://schemas.microsoft.com/office/word/2010/wordprocessingCanvas" xmlns:wpg="http://schemas.microsoft.com/office/word/2010/wordprocessingGroup" xmlns:wpi="http://schemas.microsoft.com/office/word/2010/wordprocessingInk" xmlns:wps="http://schemas.microsoft.com/office/word/2010/wordprocessingShape">
 3 |   <w:body>
 4 |     <w:p w:rsidR="00E01ECE" w:rsidRDefault="000636A7">
 5 |       <w:r>
 6 |         <w:t>Hello.</w:t>
 7 |       </w:r>
 8 |       <w:bookmarkStart w:id="0" w:name="_GoBack"/>
 9 |       <w:bookmarkEnd w:id="0"/>
10 |     </w:p>
11 |     <w:sectPr w:rsidR="00E01ECE">
12 |       <w:pgSz w:h="16838" w:w="11906"/>
13 |       <w:pgMar w:bottom="1440" w:footer="708" w:gutter="0" w:header="708" w:left="1440" w:right="1440" w:top="1440"/>
14 |       <w:cols w:space="708"/>
15 |       <w:docGrid w:linePitch="360"/>
16 |     </w:sectPr>
17 |   </w:body>
18 | </w:document>
19 | 


--------------------------------------------------------------------------------
/tests/test-data/single-paragraph.docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mwilliamson/python-mammoth/bd2cf5a02ec9bf0e6ff877e3a962b236b4143f34/tests/test-data/single-paragraph.docx


--------------------------------------------------------------------------------
/tests/test-data/strict-format.docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mwilliamson/python-mammoth/bd2cf5a02ec9bf0e6ff877e3a962b236b4143f34/tests/test-data/strict-format.docx


--------------------------------------------------------------------------------
/tests/test-data/strikethrough.docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mwilliamson/python-mammoth/bd2cf5a02ec9bf0e6ff877e3a962b236b4143f34/tests/test-data/strikethrough.docx


--------------------------------------------------------------------------------
/tests/test-data/tables.docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mwilliamson/python-mammoth/bd2cf5a02ec9bf0e6ff877e3a962b236b4143f34/tests/test-data/tables.docx


--------------------------------------------------------------------------------
/tests/test-data/text-box.docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mwilliamson/python-mammoth/bd2cf5a02ec9bf0e6ff877e3a962b236b4143f34/tests/test-data/text-box.docx


--------------------------------------------------------------------------------
/tests/test-data/tiny-picture-target-base-relative.docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mwilliamson/python-mammoth/bd2cf5a02ec9bf0e6ff877e3a962b236b4143f34/tests/test-data/tiny-picture-target-base-relative.docx


--------------------------------------------------------------------------------
/tests/test-data/tiny-picture.docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mwilliamson/python-mammoth/bd2cf5a02ec9bf0e6ff877e3a962b236b4143f34/tests/test-data/tiny-picture.docx


--------------------------------------------------------------------------------
/tests/test-data/tiny-picture.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mwilliamson/python-mammoth/bd2cf5a02ec9bf0e6ff877e3a962b236b4143f34/tests/test-data/tiny-picture.png


--------------------------------------------------------------------------------
/tests/test-data/underline.docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mwilliamson/python-mammoth/bd2cf5a02ec9bf0e6ff877e3a962b236b4143f34/tests/test-data/underline.docx


--------------------------------------------------------------------------------
/tests/test-data/utf8-bom.docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mwilliamson/python-mammoth/bd2cf5a02ec9bf0e6ff877e3a962b236b4143f34/tests/test-data/utf8-bom.docx


--------------------------------------------------------------------------------
/tests/testing.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | 
 3 | from precisely import assert_that, equal_to
 4 | 
 5 | 
 6 | def generate_test_path(path):
 7 |     this_dir = os.path.dirname(__file__)
 8 |     return os.path.join(this_dir, "test-data", path)
 9 | 
10 | 
def assert_equal(expected, actual):
    """Assert that ``actual`` equals ``expected``.

    Arguments are taken in (expected, actual) order and passed to
    ``assert_that`` with an ``equal_to`` matcher built from ``expected``.
    """
    matcher = equal_to(expected)
    assert_that(actual, matcher)
13 | 
14 | 
def assert_raises(exception, func):
    """Call ``func`` and return the exception it raises.

    Args:
        exception: The exception class (or tuple of classes) that ``func``
            is expected to raise.
        func: A zero-argument callable.

    Returns:
        The caught exception instance, so callers can inspect it further.

    Raises:
        AssertionError: If ``func`` returns without raising ``exception``.
        Any exception raised by ``func`` that is not an instance of
        ``exception`` propagates unchanged.
    """
    try:
        func()
    except exception as error:
        return error
    else:
        # Raised outside the ``try`` body so that it cannot be swallowed by
        # the ``except`` clause when ``exception`` is AssertionError (or a
        # base class such as Exception). An explicit raise, unlike an
        # ``assert`` statement, also survives ``python -O``.
        raise AssertionError("Expected " + exception.__name__)
21 | 
22 | 


--------------------------------------------------------------------------------
/tests/transforms_tests.py:
--------------------------------------------------------------------------------
 1 | import cobble
 2 | 
 3 | from mammoth import documents, transforms
 4 | from mammoth.transforms import get_descendants, get_descendants_of_type, _each_element
 5 | from .testing import assert_equal
 6 | 
 7 | 
 8 | class ParagraphTests(object):
 9 |     def test_paragraph_is_transformed(self):
10 |         paragraph = documents.paragraph(children=[])
11 |         result = transforms.paragraph(lambda _: documents.tab())(paragraph)
12 |         assert_equal(documents.tab(), result)
13 | 
14 |     def test_non_paragraph_elements_are_not_transformed(self):
15 |         run = documents.run(children=[])
16 |         result = transforms.paragraph(lambda _: documents.tab())(run)
17 |         assert_equal(documents.run(children=[]), result)
18 | 
19 | 
class RunTests(object):
    """Checks for the ``transforms.run`` wrapper."""

    def test_run_is_transformed(self):
        # A run is handed to the wrapped function and replaced by its result.
        to_tab = transforms.run(lambda _: documents.tab())
        transformed = to_tab(documents.run(children=[]))
        assert_equal(documents.tab(), transformed)

    def test_non_paragraph_elements_are_not_transformed(self):
        # Despite the method name, this checks that a non-run element
        # (here a paragraph) is returned unchanged by the run transform.
        to_tab = transforms.run(lambda _: documents.tab())
        transformed = to_tab(documents.paragraph(children=[]))
        assert_equal(documents.paragraph(children=[]), transformed)
30 | 
31 | 
class EachElementTests(object):
    """Checks for the private ``_each_element`` traversal helper."""

    def test_all_descendants_are_transformed(self):
        @cobble.data
        class Count(documents.HasChildren):
            count = cobble.field()

        def node(count, children):
            return Count(count=count, children=children)

        tree = node(None, [
            node(None, [
                node(None, []),
            ]),
        ])

        # Mutable holder so the nested function can bump the counter.
        visits = {"seen": 0}

        def number_node(element):
            visits["seen"] += 1
            return element.copy(count=visits["seen"])

        numbered = _each_element(number_node)(tree)

        # The expected numbering has the innermost node as 1 and the root
        # as 3, i.e. children are visited before their parent.
        expected = node(3, [
            node(2, [
                node(1, []),
            ]),
        ])
        assert_equal(expected, numbered)
56 | 
57 | 
class GetDescendantsTests(object):
    """Checks for ``get_descendants``."""

    def test_returns_nothing_if_element_type_has_no_children(self):
        leaf = documents.tab()
        assert_equal([], get_descendants(leaf))

    def test_returns_nothing_if_element_has_empty_children(self):
        empty_paragraph = documents.paragraph(children=[])
        assert_equal([], get_descendants(empty_paragraph))

    def test_includes_children(self):
        texts = [documents.text("child 1"), documents.text("child 2")]
        parent = documents.paragraph(children=texts)
        assert_equal(texts, get_descendants(parent))

    def test_includes_indirect_descendants(self):
        # The expected list has the grandchild ahead of the child.
        leaf = documents.text("grandchild")
        middle = documents.run(children=[leaf])
        root = documents.paragraph(children=[middle])
        assert_equal([leaf, middle], get_descendants(root))
75 | 
76 | 
class GetDescendantsOfTypeTests(object):
    """Checks for ``get_descendants_of_type``."""

    def test_filters_descendants_to_type(self):
        # Only the run should survive the filter; the tab is dropped.
        tab_child = documents.tab()
        run_child = documents.run(children=[])
        parent = documents.paragraph(children=[tab_child, run_child])
        matches = get_descendants_of_type(parent, documents.Run)
        assert_equal([run_child], matches)
83 | 


--------------------------------------------------------------------------------
/tests/writers/__init__.py:
--------------------------------------------------------------------------------
1 | 
2 | 


--------------------------------------------------------------------------------
/tests/writers/markdown_tests.py:
--------------------------------------------------------------------------------
  1 | from __future__ import unicode_literals
  2 | 
  3 | from mammoth.writers.markdown import MarkdownWriter
  4 | from ..testing import assert_equal
  5 | 
  6 | 
  7 | def test_special_markdown_characters_are_escaped():
  8 |     writer = _create_writer()
  9 |     writer.text(r"\*")
 10 |     assert_equal(r"\\\*", writer.as_string())
 11 | 
 12 | 
 13 | def test_unrecognised_elements_are_treated_as_normal_text():
 14 |     writer = _create_writer()
 15 |     writer.start("blah");
 16 |     writer.text("Hello");
 17 |     writer.end("blah");
 18 |     assert_equal("Hello", writer.as_string())
 19 | 
 20 | 
 21 | def test_paragraphs_are_terminated_with_double_new_line():
 22 |     writer = _create_writer()
 23 |     writer.start("p");
 24 |     writer.text("Hello");
 25 |     writer.end("p");
 26 |     assert_equal("Hello\n\n", writer.as_string())
 27 | 
 28 | 
 29 | def test_h1_elements_are_converted_to_heading_with_leading_hash():
 30 |     writer = _create_writer()
 31 |     writer.start("h1");
 32 |     writer.text("Hello");
 33 |     writer.end("h1");
 34 |     assert_equal("# Hello\n\n", writer.as_string())
 35 | 
 36 | 
 37 | def test_h6_elements_are_converted_to_heading_with_six_leading_hashes():
 38 |     writer = _create_writer()
 39 |     writer.start("h6");
 40 |     writer.text("Hello");
 41 |     writer.end("h6");
 42 |     assert_equal("###### Hello\n\n", writer.as_string())
 43 | 
 44 | 
 45 | def test_br_is_written_as_two_spaces_followed_by_newline():
 46 |     writer = _create_writer()
 47 |     writer.text("Hello");
 48 |     writer.self_closing("br");
 49 |     assert_equal("Hello  \n", writer.as_string())
 50 | 
 51 | 
 52 | def test_strong_text_is_surrounded_by_two_underscores():
 53 |     writer = _create_writer()
 54 |     writer.text("Hello ");
 55 |     writer.start("strong");
 56 |     writer.text("World")
 57 |     writer.end("strong")
 58 |     assert_equal("Hello __World__", writer.as_string())
 59 | 
 60 | 
 61 | def test_emphasised_text_is_surrounded_by_one_asterix():
 62 |     writer = _create_writer()
 63 |     writer.text("Hello ");
 64 |     writer.start("em");
 65 |     writer.text("World")
 66 |     writer.end("em")
 67 |     assert_equal("Hello *World*", writer.as_string())
 68 | 
 69 | 
 70 | def test_anchor_tags_are_written_as_hyperlinks():
 71 |     writer = _create_writer()
 72 |     writer.start("a", {"href": "http://example.com"});
 73 |     writer.text("Hello");
 74 |     writer.end("a");
 75 |     assert_equal("[Hello](http://example.com)", writer.as_string())
 76 | 
 77 | 
 78 | def test_anchor_tags_without_href_attribute_are_treated_as_ordinary_text():
 79 |     writer = _create_writer()
 80 |     writer.start("a");
 81 |     writer.text("Hello");
 82 |     writer.end("a");
 83 |     assert_equal("Hello", writer.as_string())
 84 | 
 85 | 
 86 | def test_elements_with_ids_have_anchor_tags_with_ids_appended_to_start_of_markdown_element():
 87 |     writer = _create_writer()
 88 |     writer.start("h1", {"id": "start"})
 89 |     writer.text("Hello")
 90 |     writer.end("h1")
 91 |     assert_equal('# <a id="start"></a>Hello\n\n', writer.as_string())
 92 | 
 93 | 
 94 | def test_links_have_anchors_before_opening_square_bracket():
 95 |     writer = _create_writer()
 96 |     writer.start("a", {"href": "http://example.com", "id": "start"})
 97 |     writer.text("Hello")
 98 |     writer.end("a")
 99 |     assert_equal('<a id="start"></a>[Hello](http://example.com)', writer.as_string())
100 | 
101 | 
def test_image_elements_are_written_as_markdown_images():
    """An ``img`` with src and alt is written in ``![alt](src)`` form."""
    md = _create_writer()
    attributes = {"src": "http://example.com/image.jpg", "alt": "Alt Text"}
    md.self_closing("img", attributes)
    output = md.as_string()
    assert_equal("![Alt Text](http://example.com/image.jpg)", output)
106 | 
107 | 
def test_images_are_written_even_if_they_dont_have_alt_text():
    """An ``img`` with only a src is written with empty alt text."""
    md = _create_writer()
    attributes = {"src": "http://example.com/image.jpg"}
    md.self_closing("img", attributes)
    output = md.as_string()
    assert_equal("![](http://example.com/image.jpg)", output)
112 | 
113 | 
def test_images_are_written_even_if_they_dont_have_a_src_attribute():
    """An ``img`` with only alt text is written with an empty target."""
    md = _create_writer()
    attributes = {"alt": "Alt Text"}
    md.self_closing("img", attributes)
    output = md.as_string()
    assert_equal("![Alt Text]()", output)
118 | 
119 | 
def test_image_elements_are_ignored_if_they_have_no_src_and_no_alt_text():
    """An ``img`` with no attributes at all produces no output."""
    md = _create_writer()
    md.self_closing("img")
    output = md.as_string()
    assert_equal("", output)
124 | 
125 | 
def test_list_item_outside_of_list_is_treated_as_unordered_list():
    """A bare ``li`` with no enclosing list is written with a hyphen bullet."""
    md = _create_writer()
    md.start("li")
    md.text("Fruit")
    md.end("li")
    output = md.as_string()
    assert_equal("- Fruit\n", output)
132 | 
133 | 
def test_ol_element_is_written_as_ordered_list_with_sequential_numbering():
    """Items of an ``ol`` are numbered 1, 2, ... and the list ends with a blank line."""
    md = _create_writer()
    md.start("ol")
    for item in ("Fruit", "Condiments"):
        md.start("li")
        md.text(item)
        md.end("li")
    md.end("ol")
    output = md.as_string()
    assert_equal("1. Fruit\n2. Condiments\n\n", output)
145 | 
146 | 
def test_ul_element_is_written_as_unordered_list_using_hyphens_as_bullets():
    """Items of a ``ul`` get hyphen bullets and the list ends with a blank line."""
    md = _create_writer()
    md.start("ul")
    for item in ("Fruit", "Condiments"):
        md.start("li")
        md.text(item)
        md.end("li")
    md.end("ul")
    output = md.as_string()
    assert_equal("- Fruit\n- Condiments\n\n", output)
158 | 
159 | 
def test_numbering_is_separate_for_nested_list_and_parent_list():
    """A nested ``ol`` restarts at 1, is tab-indented, and leaves the outer count intact."""
    md = _create_writer()
    md.start("ol")

    # First outer item, which contains the nested list.
    md.start("li")
    md.text("Fruit")
    md.start("ol")
    for fruit in ("Apple", "Banana"):
        md.start("li")
        md.text(fruit)
        md.end("li")
    md.end("ol")
    md.end("li")

    # Second outer item: numbering should continue from the outer list.
    md.start("li")
    md.text("Condiments")
    md.end("li")
    md.end("ol")

    expected = "1. Fruit\n\t1. Apple\n\t2. Banana\n2. Condiments\n\n"
    assert_equal(expected, md.as_string())
181 | 
182 | 
183 | 
def _create_writer():
    """Return a fresh ``MarkdownWriter`` built with no arguments."""
    writer = MarkdownWriter()
    return writer
186 | 


--------------------------------------------------------------------------------
/tests/zips_tests.py:
--------------------------------------------------------------------------------
 1 | from mammoth import zips
 2 | from .testing import assert_equal
 3 | 
 4 | 
 5 | def test_split_path_splits_zip_paths_on_last_forward_slash():
 6 |     assert_equal(("a", "b"), zips.split_path("a/b"))
 7 |     assert_equal(("a/b", "c"), zips.split_path("a/b/c"))
 8 |     assert_equal(("/a/b", "c"), zips.split_path("/a/b/c"))
 9 | 
10 | 
def test_when_path_has_no_forward_slashes_then_split_path_returns_empty_dirname():
    """A path with no ``/`` splits into an empty dirname and the name itself."""
    dirname_and_basename = zips.split_path("name")
    assert_equal(("", "name"), dirname_and_basename)
13 | 
14 | 
def test_join_path_joins_arguments_with_forward_slashes():
    """``join_path`` glues its arguments together with ``/``."""
    cases = [
        ("a/b", ("a", "b")),
        ("a/b/c", ("a/b", "c")),
        ("/a/b/c", ("/a/b", "c")),
    ]
    for expected, parts in cases:
        assert_equal(expected, zips.join_path(*parts))
19 | 
20 | 
def test_empty_parts_are_ignored_when_joining_paths():
    """Empty-string arguments contribute neither text nor a separator."""
    cases = [
        ("a", ("a", "")),
        ("b", ("", "b")),
        ("a/b", ("a", "", "b")),
    ]
    for expected, parts in cases:
        assert_equal(expected, zips.join_path(*parts))
25 | 
26 | 
def test_when_joining_paths_then_absolute_paths_ignore_earlier_paths():
    """An argument starting with ``/`` discards everything joined before it."""
    cases = [
        ("/b", ("a", "/b")),
        ("/b/c", ("a", "/b", "c")),
        ("/b", ("/a", "/b")),
        ("/a", ("/a",)),
    ]
    for expected, parts in cases:
        assert_equal(expected, zips.join_path(*parts))
32 | 


--------------------------------------------------------------------------------
/tox.ini:
--------------------------------------------------------------------------------
 1 | [tox]
 2 | envlist = py37,py38,py39,py310,py311,py312,pypy3
 3 | [testenv]
 4 | changedir = {envtmpdir}
 5 | deps=-r{toxinidir}/test-requirements.txt
 6 | commands=
 7 |     py.test {toxinidir}/tests
 8 | [pytest]
 9 | python_classes = *Tests
10 | python_files = *_tests.py
11 | 


--------------------------------------------------------------------------------