├── tests
├── __init__.py
├── docx
│ ├── __init__.py
│ ├── uris_tests.py
│ ├── document_matchers.py
│ ├── notes_xml_tests.py
│ ├── office_xml_tests.py
│ ├── comments_xml_tests.py
│ ├── document_xml_tests.py
│ ├── relationships_xml_tests.py
│ ├── content_types_xml_tests.py
│ ├── files_tests.py
│ ├── xmlparser_tests.py
│ ├── style_map_tests.py
│ ├── styles_xml_tests.py
│ ├── numbering_xml_tests.py
│ └── docx_tests.py
├── html
│ ├── __init__.py
│ ├── strip_empty_tests.py
│ └── collapse_tests.py
├── styles
│ ├── __init__.py
│ ├── parser
│ │ ├── __init__.py
│ │ ├── style_mapping_parser_tests.py
│ │ ├── token_parser_tests.py
│ │ ├── html_path_parser_tests.py
│ │ ├── tokeniser_tests.py
│ │ └── document_matcher_parser_tests.py
│ └── document_matcher_tests.py
├── writers
│ ├── __init__.py
│ └── markdown_tests.py
├── test-data
│ ├── empty.docx
│ ├── tables.docx
│ ├── comments.docx
│ ├── endnotes.docx
│ ├── footnotes.docx
│ ├── text-box.docx
│ ├── underline.docx
│ ├── utf8-bom.docx
│ ├── simple-list.docx
│ ├── tiny-picture.docx
│ ├── tiny-picture.png
│ ├── strict-format.docx
│ ├── strikethrough.docx
│ ├── external-picture.docx
│ ├── single-paragraph.docx
│ ├── embedded-style-map.docx
│ ├── footnote-hyperlink.docx
│ ├── tiny-picture-target-base-relative.docx
│ ├── hyperlinks
│ │ └── word
│ │ │ ├── _rels
│ │ │ └── document.xml.rels
│ │ │ └── document.xml
│ └── simple
│ │ └── word
│ │ └── document.xml
├── conftest.py
├── lists_tests.py
├── testing.py
├── zips_tests.py
├── options_tests.py
├── raw_text_tests.py
├── images_tests.py
├── cli_tests.py
└── transforms_tests.py
├── setup.cfg
├── mammoth
├── styles
│ ├── parser
│ │ ├── errors.py
│ │ ├── __init__.py
│ │ ├── style_mapping_parser.py
│ │ ├── token_parser.py
│ │ ├── tokeniser.py
│ │ ├── token_iterator.py
│ │ ├── html_path_parser.py
│ │ └── document_matcher_parser.py
│ └── __init__.py
├── underline.py
├── docx
│ ├── uris.py
│ ├── complex_fields.py
│ ├── document_xml.py
│ ├── comments_xml.py
│ ├── notes_xml.py
│ ├── relationships_xml.py
│ ├── files.py
│ ├── content_types_xml.py
│ ├── office_xml.py
│ ├── style_map.py
│ ├── xmlparser.py
│ ├── styles_xml.py
│ ├── numbering_xml.py
│ └── __init__.py
├── writers
│ ├── __init__.py
│ ├── abc.py
│ ├── html.py
│ └── markdown.py
├── raw_text.py
├── images.py
├── lists.py
├── results.py
├── html
│ ├── nodes.py
│ └── __init__.py
├── html_paths.py
├── transforms.py
├── __init__.py
├── zips.py
├── document_matchers.py
├── cli.py
├── options.py
└── documents.py
├── .gitignore
├── pyproject.toml
├── test-requirements.txt
├── .github
├── pull_request_template.md
├── ISSUE_TEMPLATE.md
└── workflows
│ └── tests.yml
├── tox.ini
├── makefile
├── LICENSE
├── setup.py
├── recipes
└── wmf_images.py
└── NEWS
/tests/__init__.py:
--------------------------------------------------------------------------------
1 |
2 |
--------------------------------------------------------------------------------
/tests/docx/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tests/html/__init__.py:
--------------------------------------------------------------------------------
1 |
2 |
--------------------------------------------------------------------------------
/tests/styles/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tests/styles/parser/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tests/writers/__init__.py:
--------------------------------------------------------------------------------
1 |
2 |
--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
1 | [bdist_wheel]
2 | universal = 1
3 |
--------------------------------------------------------------------------------
/mammoth/styles/parser/errors.py:
--------------------------------------------------------------------------------
class LineParseError(Exception):
    """Signals that a line of style-mapping text could not be parsed."""
3 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | *.pyc
2 | /README
3 | /_virtualenv
4 | /*.egg-info
5 | /.tox
6 | /MANIFEST
7 | /build
8 |
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [build-system]
2 | requires = ["setuptools"]
3 | build-backend = "setuptools.build_meta"
4 |
--------------------------------------------------------------------------------
/tests/test-data/empty.docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mwilliamson/python-mammoth/HEAD/tests/test-data/empty.docx
--------------------------------------------------------------------------------
/tests/test-data/tables.docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mwilliamson/python-mammoth/HEAD/tests/test-data/tables.docx
--------------------------------------------------------------------------------
/tests/test-data/comments.docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mwilliamson/python-mammoth/HEAD/tests/test-data/comments.docx
--------------------------------------------------------------------------------
/tests/test-data/endnotes.docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mwilliamson/python-mammoth/HEAD/tests/test-data/endnotes.docx
--------------------------------------------------------------------------------
/tests/test-data/footnotes.docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mwilliamson/python-mammoth/HEAD/tests/test-data/footnotes.docx
--------------------------------------------------------------------------------
/tests/test-data/text-box.docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mwilliamson/python-mammoth/HEAD/tests/test-data/text-box.docx
--------------------------------------------------------------------------------
/tests/test-data/underline.docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mwilliamson/python-mammoth/HEAD/tests/test-data/underline.docx
--------------------------------------------------------------------------------
/tests/test-data/utf8-bom.docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mwilliamson/python-mammoth/HEAD/tests/test-data/utf8-bom.docx
--------------------------------------------------------------------------------
/tests/test-data/simple-list.docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mwilliamson/python-mammoth/HEAD/tests/test-data/simple-list.docx
--------------------------------------------------------------------------------
/tests/test-data/tiny-picture.docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mwilliamson/python-mammoth/HEAD/tests/test-data/tiny-picture.docx
--------------------------------------------------------------------------------
/tests/test-data/tiny-picture.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mwilliamson/python-mammoth/HEAD/tests/test-data/tiny-picture.png
--------------------------------------------------------------------------------
/tests/test-data/strict-format.docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mwilliamson/python-mammoth/HEAD/tests/test-data/strict-format.docx
--------------------------------------------------------------------------------
/tests/test-data/strikethrough.docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mwilliamson/python-mammoth/HEAD/tests/test-data/strikethrough.docx
--------------------------------------------------------------------------------
/tests/test-data/external-picture.docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mwilliamson/python-mammoth/HEAD/tests/test-data/external-picture.docx
--------------------------------------------------------------------------------
/tests/test-data/single-paragraph.docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mwilliamson/python-mammoth/HEAD/tests/test-data/single-paragraph.docx
--------------------------------------------------------------------------------
/test-requirements.txt:
--------------------------------------------------------------------------------
1 | funk>=0.4,<0.5
2 | pytest
3 | precisely==0.1.3
4 | pyflakes==2.4.0
5 | spur.local>=0.3.7,<0.4
6 | tempman>=0.1.2,<0.2
7 |
--------------------------------------------------------------------------------
/tests/test-data/embedded-style-map.docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mwilliamson/python-mammoth/HEAD/tests/test-data/embedded-style-map.docx
--------------------------------------------------------------------------------
/tests/test-data/footnote-hyperlink.docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mwilliamson/python-mammoth/HEAD/tests/test-data/footnote-hyperlink.docx
--------------------------------------------------------------------------------
/tests/test-data/tiny-picture-target-base-relative.docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mwilliamson/python-mammoth/HEAD/tests/test-data/tiny-picture-target-base-relative.docx
--------------------------------------------------------------------------------
/.github/pull_request_template.md:
--------------------------------------------------------------------------------
1 | In general, pull requests are not currently accepted.
2 |
3 | Please instead submit an issue if you find a bug or would like to request a feature.
4 |
--------------------------------------------------------------------------------
/tests/conftest.py:
--------------------------------------------------------------------------------
1 | import funk
2 | import pytest
3 |
4 |
@pytest.fixture(name="mocks")
def _fixture_mocks():
    """Yield a fresh ``funk.Mocks`` and call ``verify()`` on it afterwards."""
    mock_registry = funk.Mocks()
    yield mock_registry
    # Runs once the consumer of the fixture has finished with the mocks.
    mock_registry.verify()
10 |
--------------------------------------------------------------------------------
/mammoth/underline.py:
--------------------------------------------------------------------------------
1 | from . import html
2 |
3 |
def element(name):
    """Return a converter that wraps nodes in a collapsible ``name`` element.

    The returned callable takes a list of nodes and returns a one-item list
    holding ``html.collapsible_element(name, {}, nodes)``.
    """
    def convert_underline(nodes):
        wrapped = html.collapsible_element(name, {}, nodes)
        return [wrapped]

    return convert_underline
9 |
--------------------------------------------------------------------------------
/mammoth/styles/__init__.py:
--------------------------------------------------------------------------------
1 | import collections
2 |
3 |
# Pairing of a document matcher with the HTML path it maps to.
Style = collections.namedtuple("Style", ("document_matcher", "html_path"))


def style(document_matcher, html_path):
    """Build a ``Style`` from a document matcher and an HTML path."""
    return Style(document_matcher=document_matcher, html_path=html_path)
9 |
--------------------------------------------------------------------------------
/tox.ini:
--------------------------------------------------------------------------------
1 | [tox]
2 | envlist = py37,py38,py39,py310,py311,py312,pypy3
3 | [testenv]
4 | changedir = {envtmpdir}
5 | deps=-r{toxinidir}/test-requirements.txt
6 | commands=
7 | py.test {toxinidir}/tests
8 | [pytest]
9 | python_classes = *Tests
10 | python_files = *_tests.py
11 |
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE.md:
--------------------------------------------------------------------------------
1 | If you're reporting a bug or requesting a feature, please include:
2 | * a minimal example document
3 | * the HTML output that you'd expect
4 |
5 | If you're reporting a bug, it's also useful to know what platform you're
6 | running on, including:
7 |
8 | * the version of Python
9 | * the operating system and version
10 |
--------------------------------------------------------------------------------
/tests/lists_tests.py:
--------------------------------------------------------------------------------
1 | from mammoth.lists import unique
2 | from .testing import assert_equal
3 |
4 |
def test_unique_of_empty_list_is_empty_list():
    """``unique`` of an empty list is an empty list."""
    actual = unique([])
    assert_equal([], actual)
7 |
8 |
def test_unique_removes_duplicates_while_preserving_order():
    """A repeated value is kept only at its first position."""
    actual = unique(["apple", "banana", "apple"])
    assert_equal(["apple", "banana"], actual)
11 |
--------------------------------------------------------------------------------
/mammoth/docx/uris.py:
--------------------------------------------------------------------------------
def uri_to_zip_entry_name(base, uri):
    """Convert ``uri`` into a zip entry name.

    A ``uri`` with a leading slash is returned without that slash and
    ``base`` is ignored; any other ``uri`` is appended to ``base`` with a
    ``/`` separator.
    """
    if not uri.startswith("/"):
        return "/".join((base, uri))
    return uri[1:]
6 |
7 |
def replace_fragment(uri, fragment):
    """Return ``uri`` with everything from its first ``#`` replaced by ``#fragment``.

    When ``uri`` has no ``#``, ``#fragment`` is simply appended.
    """
    # partition() splits at the first "#"; the head is the whole string when
    # there is no "#" at all.
    without_fragment, _, _ = uri.partition("#")
    return without_fragment + "#" + fragment
13 |
--------------------------------------------------------------------------------
/mammoth/writers/__init__.py:
--------------------------------------------------------------------------------
1 | from .html import HtmlWriter
2 | from .markdown import MarkdownWriter
3 |
4 |
def writer(output_format=None):
    """Instantiate the writer registered for ``output_format``.

    ``None`` selects ``"html"``. A format that is not a key of ``_writers``
    raises ``KeyError``.
    """
    format_name = "html" if output_format is None else output_format
    writer_class = _writers[format_name]
    return writer_class()
10 |
11 |
def formats():
    """Return the keys of the writer registry, i.e. the known format names."""
    known_formats = _writers.keys()
    return known_formats
14 |
15 |
# Registry mapping each output format name to the writer class that is
# instantiated for it.
_writers = {
    "html": HtmlWriter,
    "markdown": MarkdownWriter,
}
20 |
--------------------------------------------------------------------------------
/mammoth/raw_text.py:
--------------------------------------------------------------------------------
1 | from . import documents
2 |
3 |
def extract_raw_text_from_element(element):
    """Recursively collect the raw text of ``element``.

    Text elements give their value and tabs give ``"\\t"``. Any other element
    gives the concatenated text of its ``children`` (none if the attribute is
    absent), with a blank line appended after paragraphs.
    """
    if isinstance(element, documents.Text):
        return element.value
    if isinstance(element, documents.Tab):
        return "\t"

    children = getattr(element, "children", [])
    text = "".join(extract_raw_text_from_element(child) for child in children)
    if isinstance(element, documents.Paragraph):
        text += "\n\n"
    return text
15 |
--------------------------------------------------------------------------------
/tests/testing.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | from precisely import assert_that, equal_to
4 |
5 |
def generate_test_path(path):
    """Return ``path`` located inside the ``test-data`` directory beside this module."""
    return os.path.join(os.path.dirname(__file__), "test-data", path)
9 |
10 |
def assert_equal(expected, actual):
    """Assert, via ``precisely``, that ``actual`` is equal to ``expected``."""
    matcher = equal_to(expected)
    assert_that(actual, matcher)
13 |
14 |
def assert_raises(exception, func):
    """Call ``func`` and return the ``exception`` instance it raises.

    Args:
        exception: Exception class that ``func`` is expected to raise.
        func: Zero-argument callable to invoke.

    Returns:
        The caught exception instance.

    Raises:
        AssertionError: If ``func`` returns without raising ``exception``.
            Exceptions of other types propagate unchanged.
    """
    try:
        func()
    except exception as error:
        return error
    # Raised outside the ``try`` so that it cannot be caught by the handler
    # above when ``exception`` is ``AssertionError`` or a base class of it,
    # and as an explicit ``raise`` so that it survives ``python -O``.
    raise AssertionError("Expected " + exception.__name__)
21 |
22 |
--------------------------------------------------------------------------------
/tests/docx/uris_tests.py:
--------------------------------------------------------------------------------
1 | from mammoth.docx.uris import uri_to_zip_entry_name
2 | from ..testing import assert_equal
3 |
4 |
def test_when_path_does_not_have_leading_slash_then_path_is_resolved_relative_to_base():
    """A URI without a leading slash is appended to the base."""
    actual = uri_to_zip_entry_name("one/two", "three/four")
    assert_equal("one/two/three/four", actual)
10 |
11 |
def test_when_path_has_leading_slash_then_base_is_ignored():
    """A URI with a leading slash is used on its own, minus the slash."""
    actual = uri_to_zip_entry_name("one/two", "/three/four")
    assert_equal("three/four", actual)
17 |
--------------------------------------------------------------------------------
/mammoth/styles/parser/__init__.py:
--------------------------------------------------------------------------------
1 | from .errors import LineParseError
2 | from .style_mapping_parser import parse_style_mapping
3 | from .tokeniser import tokenise
4 | from .token_iterator import TokenIterator
5 | from ... import results
6 |
7 |
def read_style_mapping(string):
    """Parse one style-mapping string into a result.

    Returns a successful result wrapping the parsed mapping, or a result
    whose value is ``None`` and which carries a single warning when parsing
    raises ``LineParseError``.
    """
    try:
        token_stream = TokenIterator(tokenise(string))
        mapping = parse_style_mapping(token_stream)
    except LineParseError:
        message = "Did not understand this style mapping, so ignored it: " + string
        return results.Result(None, [results.warning(message)])
    else:
        return results.success(mapping)
15 |
--------------------------------------------------------------------------------
/mammoth/styles/parser/style_mapping_parser.py:
--------------------------------------------------------------------------------
1 | from .tokeniser import TokenType
2 | from .document_matcher_parser import parse_document_matcher
3 | from .html_path_parser import parse_html_path
4 | from ...styles import Style
5 |
6 |
def parse_style_mapping(tokens):
    """Parse ``<document matcher> => <html path>`` from ``tokens`` into a ``Style``.

    The token stream is consumed in order: matcher, mandatory whitespace, the
    ``=>`` symbol, optional whitespace, the HTML path, then the end token.
    """
    matcher = parse_document_matcher(tokens)

    tokens.skip(TokenType.WHITESPACE)
    tokens.skip(TokenType.SYMBOL, "=>")
    tokens.try_skip(TokenType.WHITESPACE)

    path = parse_html_path(tokens)
    tokens.skip(TokenType.END)

    return Style(document_matcher=matcher, html_path=path)
16 |
--------------------------------------------------------------------------------
/mammoth/docx/complex_fields.py:
--------------------------------------------------------------------------------
class unknown(object):
    """Empty marker class; it defines no attributes or behaviour of its own."""
3 |
4 |
class Begin:
    """Record that stores the keyword-only ``fld_char`` value it is given."""

    def __init__(self, *, fld_char):
        self.fld_char = fld_char


def begin(*, fld_char):
    """Create a ``Begin`` holding ``fld_char`` (keyword-only)."""
    instance = Begin(fld_char=fld_char)
    return instance
12 |
13 |
class Hyperlink(object):
    """Record that stores the ``kwargs`` object it is given, unchanged."""

    def __init__(self, kwargs):
        self.kwargs = kwargs


def hyperlink(kwargs):
    """Create a ``Hyperlink`` holding ``kwargs``."""
    instance = Hyperlink(kwargs=kwargs)
    return instance
21 |
22 |
class Checkbox:
    """Record that stores the keyword-only ``checked`` value it is given."""

    def __init__(self, *, checked):
        self.checked = checked


def checkbox(*, checked):
    """Create a ``Checkbox`` holding ``checked`` (keyword-only)."""
    instance = Checkbox(checked=checked)
    return instance
30 |
--------------------------------------------------------------------------------
/.github/workflows/tests.yml:
--------------------------------------------------------------------------------
1 | name: Tests
2 |
3 | on: [push, pull_request]
4 |
5 | jobs:
6 | build:
7 | runs-on: ubuntu-22.04
8 |
9 | strategy:
10 | matrix:
11 | python-version: ["3.7", "3.8", "3.9", "3.10", "3.11", "3.12", "pypy3.9"]
12 |
13 | steps:
14 |
15 | - uses: actions/checkout@v4
16 |
17 | - name: Use Python ${{ matrix.python-version }}
18 | uses: actions/setup-python@v5
19 | with:
20 | python-version: ${{ matrix.python-version }}
21 |
22 | - run: pip install tox
23 |
24 | - run: make README
25 |
26 | - run: tox -e py
27 |
--------------------------------------------------------------------------------
/mammoth/writers/abc.py:
--------------------------------------------------------------------------------
1 | from __future__ import absolute_import
2 |
3 | import abc
4 |
5 |
class Writer(abc.ABC):
    """Abstract interface listing the operations every writer must provide.

    The class derives from ``abc.ABC`` so that the ``@abc.abstractmethod``
    markers are enforced. A ``__metaclass__`` class attribute is only
    honoured by Python 2; Python 3 ignores it, which would allow ``Writer``
    itself, or a subclass missing some of these methods, to be instantiated.
    """

    @abc.abstractmethod
    def text(self, text):
        """Abstract; concrete writers must override. Receives ``text``."""

    @abc.abstractmethod
    def start(self, name, attributes=None):
        """Abstract; concrete writers must override. Receives ``name`` and optional ``attributes``."""

    @abc.abstractmethod
    def end(self, name):
        """Abstract; concrete writers must override. Receives ``name``."""

    @abc.abstractmethod
    def self_closing(self, name, attributes=None):
        """Abstract; concrete writers must override. Receives ``name`` and optional ``attributes``."""

    @abc.abstractmethod
    def append(self, html):
        """Abstract; concrete writers must override. Receives ``html``."""

    @abc.abstractmethod
    def as_string(self):
        """Abstract; concrete writers must override."""
32 |
--------------------------------------------------------------------------------
/tests/styles/parser/style_mapping_parser_tests.py:
--------------------------------------------------------------------------------
1 | from mammoth import html_paths, document_matchers, styles
2 | from mammoth.styles.parser.style_mapping_parser import parse_style_mapping
3 | from mammoth.styles.parser.tokeniser import tokenise
4 | from mammoth.styles.parser.token_iterator import TokenIterator
5 | from ...testing import assert_equal
6 |
7 |
def test_document_matcher_is_mapped_to_html_path_using_fat_arrow():
    """``p => h1`` maps the paragraph matcher to an ``h1`` path."""
    expected = styles.style(
        document_matchers.paragraph(),
        html_paths.path([html_paths.element(["h1"])]),
    )
    assert_equal(expected, read_style_mapping("p => h1"))
13 |
14 |
def read_style_mapping(string):
    """Tokenise ``string`` and parse it as a style mapping."""
    token_stream = TokenIterator(tokenise(string))
    return parse_style_mapping(token_stream)
17 |
--------------------------------------------------------------------------------
/mammoth/docx/document_xml.py:
--------------------------------------------------------------------------------
1 | from .. import documents
2 |
3 |
def read_document_xml_element(element, body_reader, notes=None, comments=None):
    """Read the ``w:body`` child of ``element`` into a document result.

    Args:
        element: XML element that is searched for a ``w:body`` child.
        body_reader: Reader whose ``read_all`` is given the body's children.
        notes: Notes to attach; defaults to an empty list.
        comments: Comments to attach; defaults to an empty list.

    Returns:
        The result of ``body_reader.read_all`` mapped into a document.

    Raises:
        ValueError: If ``element`` has no ``w:body`` child.
    """
    notes = [] if notes is None else notes
    comments = [] if comments is None else comments

    body_element = element.find_child("w:body")
    if body_element is None:
        raise ValueError("Could not find the body element: are you sure this is a docx file?")

    def build_document(children):
        return documents.document(
            children,
            notes=documents.notes(notes),
            comments=comments
        )

    return body_reader.read_all(body_element.children).map(build_document)
26 |
--------------------------------------------------------------------------------
/mammoth/images.py:
--------------------------------------------------------------------------------
1 | import base64
2 |
3 | from . import html
4 |
5 |
def img_element(func):
    """Wrap ``func`` into an image converter producing a single ``img`` element.

    ``func`` receives the image and returns a dict of attributes. These are
    merged over an ``alt`` attribute, which is set only when the image has
    truthy ``alt_text``.
    """
    def convert_image(image):
        attributes = {"alt": image.alt_text} if image.alt_text else {}
        attributes.update(func(image))
        return [html.element("img", attributes)]

    return convert_image


# Undocumented, but retained for backwards-compatibility with 0.3.x
inline = img_element
19 |
20 |
@img_element
def data_uri(image):
    """Return a ``src`` attribute embedding the image bytes as a base64 data URI."""
    with image.open() as image_bytes:
        raw_bytes = image_bytes.read()

    encoded_src = base64.b64encode(raw_bytes).decode("ascii")
    src = "data:{0};base64,{1}".format(image.content_type, encoded_src)
    return {"src": src}
29 |
--------------------------------------------------------------------------------
/tests/styles/document_matcher_tests.py:
--------------------------------------------------------------------------------
1 | from mammoth import document_matchers
2 | from ..testing import assert_equal
3 |
4 |
def test_equal_to_matcher_is_case_insensitive():
    """``equal_to`` ignores case but still requires the same text."""
    matcher = document_matchers.equal_to("Heading 1")
    cases = [(True, "heaDING 1"), (False, "heaDING 2")]
    for expected, candidate in cases:
        assert_equal(expected, matcher.matches(candidate))
9 |
10 |
def test_starts_with_matcher_matches_string_with_prefix():
    """``starts_with`` matches only strings beginning with the prefix."""
    matcher = document_matchers.starts_with("Heading")
    cases = [
        (True, "Heading"),
        (True, "Heading 1"),
        (False, "Custom Heading"),
        (False, "Head"),
        (False, "Header 2"),
    ]
    for expected, candidate in cases:
        assert_equal(expected, matcher.matches(candidate))
18 |
19 |
def test_starts_with_matcher_is_case_insensitive():
    """``starts_with`` ignores case when comparing the prefix."""
    matcher = document_matchers.starts_with("Heading")
    matched = matcher.matches("heaDING")
    assert_equal(True, matched)
23 |
--------------------------------------------------------------------------------
/mammoth/docx/comments_xml.py:
--------------------------------------------------------------------------------
1 | from .. import lists
2 | from .. import documents
3 | from .. import results
4 |
5 |
def read_comments_xml_element(element, body_reader):
    """Read every ``w:comment`` child of ``element`` into one combined result.

    Each comment body is read with ``body_reader.read_all``. The author name
    and initials come from the ``w:author`` and ``w:initials`` attributes and
    are ``None`` when the attribute is missing or blank.
    """
    def read_optional_attribute(comment_element, name):
        value = comment_element.attributes.get(name, "").strip()
        return value or None

    def read_comment(comment_element):
        def to_comment(body):
            return documents.comment(
                comment_id=comment_element.attributes["w:id"],
                body=body,
                author_name=read_optional_attribute(comment_element, "w:author"),
                author_initials=read_optional_attribute(comment_element, "w:initials"),
            )

        return body_reader.read_all(comment_element.children).map(to_comment)

    comment_elements = element.find_children("w:comment")
    return results.combine(lists.map(read_comment, comment_elements))
25 |
--------------------------------------------------------------------------------
/mammoth/lists.py:
--------------------------------------------------------------------------------
1 | import sys
2 |
3 |
def flatten(values):
    """Concatenate the elements of each iterable in ``values`` into one list."""
    return flat_map(lambda value: value, values)
6 |
7 |
def unique(values):
    """Return the distinct items of ``values`` in first-seen order.

    Items must be hashable because a set tracks what has been seen.
    """
    already_seen = set()
    distinct = []
    for item in values:
        if item in already_seen:
            continue
        already_seen.add(item)
        distinct.append(item)
    return distinct
16 |
17 |
def flat_map(func, values):
    """Apply ``func`` to each value and concatenate the iterables it returns."""
    flattened = []
    for value in values:
        flattened.extend(func(value))
    return flattened
24 |
25 |
def find_index(predicate, values):
    """Return the index of the first value satisfying ``predicate``, else ``None``."""
    matching_indexes = (
        index
        for index, value in enumerate(values)
        if predicate(value)
    )
    return next(matching_indexes, None)
30 |
31 |
# List-returning ``map`` and ``filter``: Python 2's builtins are re-exported
# unchanged, while on other versions the builtins are wrapped in ``list``.
if sys.version_info[0] != 2:
    import builtins

    def map(*args, **kwargs):
        """Call the builtin ``map`` and collect its output in a list."""
        mapped = builtins.map(*args, **kwargs)
        return list(mapped)

    def filter(*args, **kwargs):
        """Call the builtin ``filter`` and collect its output in a list."""
        kept = builtins.filter(*args, **kwargs)
        return list(kept)
else:
    map = map
    filter = filter
41 |
--------------------------------------------------------------------------------
/tests/docx/document_matchers.py:
--------------------------------------------------------------------------------
1 | from precisely import all_of, has_attrs, instance_of
2 |
3 | from mammoth import documents
4 |
5 |
def create_element_matcher(element_type):
    """Return a matcher factory for instances of ``element_type``.

    The factory takes keyword arguments and builds a matcher requiring both
    an instance of ``element_type`` and the given attributes.
    """
    def matcher(**expected_attrs):
        type_matcher = instance_of(element_type)
        attrs_matcher = has_attrs(**expected_attrs)
        return all_of(type_matcher, attrs_matcher)

    return matcher
14 |
15 |
# One matcher factory per document element type; each accepts the expected
# attributes as keyword arguments.
is_paragraph = create_element_matcher(documents.Paragraph)
is_run = create_element_matcher(documents.Run)
is_hyperlink = create_element_matcher(documents.Hyperlink)
is_checkbox = create_element_matcher(documents.Checkbox)
is_table = create_element_matcher(documents.Table)
is_row = create_element_matcher(documents.TableRow)
is_image = create_element_matcher(documents.Image)


# Ready-made matcher for a run whose ``children`` is an empty list.
is_empty_run = is_run(children=[])
26 |
27 |
def is_text(value):
    """Return a matcher for a ``documents.Text`` whose ``value`` matches ``value``."""
    type_matcher = instance_of(documents.Text)
    value_matcher = has_attrs(value=value)
    return all_of(type_matcher, value_matcher)
33 |
--------------------------------------------------------------------------------
/mammoth/styles/parser/token_parser.py:
--------------------------------------------------------------------------------
1 | import re
2 |
3 | from .tokeniser import TokenType
4 |
5 |
def try_parse_class_name(tokens):
    """Parse ``.identifier`` from ``tokens``; return ``None`` when no ``.`` symbol is next."""
    if not tokens.try_skip(TokenType.SYMBOL, "."):
        return None
    return parse_identifier(tokens)
11 |
12 |
def parse_identifier(tokens):
    """Read the next identifier token and decode its escape sequences."""
    raw_identifier = tokens.next_value(TokenType.IDENTIFIER)
    return decode_escape_sequences(raw_identifier)
15 |
16 |
def parse_string(tokens):
    """Read the next string token, drop its first and last characters, and decode escapes."""
    raw_string = tokens.next_value(TokenType.STRING)
    unquoted = raw_string[1:-1]
    return decode_escape_sequences(unquoted)
19 |
20 |
# A backslash followed by any single character other than a newline.
_ESCAPE_SEQUENCE_REGEX = re.compile(r"\\(.)")


def decode_escape_sequences(value):
    """Replace each backslash escape in ``value`` with the character it denotes."""
    return _ESCAPE_SEQUENCE_REGEX.sub(_decode_escape_sequence, value)


def _decode_escape_sequence(match):
    """Translate one escape: ``n``, ``r`` and ``t`` are special, anything else is itself."""
    special_codes = {"n": "\n", "r": "\r", "t": "\t"}
    code = match.group(1)
    return special_codes.get(code, code)
38 |
--------------------------------------------------------------------------------
/mammoth/results.py:
--------------------------------------------------------------------------------
1 | import collections
2 |
3 | from .lists import unique
4 |
5 |
class Result(object):
    """A value paired with the de-duplicated messages produced alongside it."""

    def __init__(self, value, messages):
        self.value = value
        # Duplicate messages are dropped; first occurrences keep their order.
        self.messages = unique(messages)

    def map(self, func):
        """Return a new result holding ``func(value)`` and the same messages."""
        mapped_value = func(self.value)
        return Result(mapped_value, self.messages)

    def bind(self, func):
        """Apply the result-returning ``func`` to the value and merge both message lists."""
        inner = func(self.value)
        return Result(inner.value, self.messages + inner.messages)
17 |
18 |
# A message attached to a result: its type tag plus the message text.
Message = collections.namedtuple("Message", ("type", "message"))


def warning(message):
    """Build a ``Message`` of type ``"warning"`` carrying ``message``."""
    return Message(type="warning", message=message)
24 |
25 |
def success(value):
    """Wrap ``value`` in a ``Result`` that has no messages."""
    no_messages = []
    return Result(value, no_messages)
28 |
29 |
def combine(results):
    """Merge ``results`` into one result whose value is the list of their values.

    The messages of every input result are concatenated in order.
    """
    combined_values = []
    combined_messages = []
    # Single pass, so ``results`` may be any iterable.
    for result in results:
        combined_values.append(result.value)
        combined_messages.extend(result.messages)

    return Result(combined_values, combined_messages)
39 |
40 |
def map(func, *args):
    """Combine the results in ``args`` and call ``func`` with their values as positional arguments."""
    def apply_to_values(values):
        return func(*values)

    return combine(args).map(apply_to_values)
43 |
--------------------------------------------------------------------------------
/mammoth/docx/notes_xml.py:
--------------------------------------------------------------------------------
1 | import functools
2 |
3 | from .. import lists
4 | from .. import documents
5 | from .. import results
6 |
7 |
def _read_notes(note_type, element, body_reader):
    """Read the ``w:<note_type>`` children of ``element`` into one combined result.

    Children whose ``w:type`` attribute is ``continuationSeparator`` or
    ``separator`` are skipped. Each remaining body is read with
    ``body_reader.read_all`` and turned into a note.
    """
    def is_note_element(candidate):
        return candidate.attributes.get("w:type") not in ["continuationSeparator", "separator"]

    def read_note_element(note_element):
        def to_note(body):
            return documents.note(
                note_type=note_type,
                note_id=note_element.attributes["w:id"],
                body=body
            )

        return body_reader.read_all(note_element.children).map(to_note)

    candidates = element.find_children("w:" + note_type)
    note_elements = lists.filter(is_note_element, candidates)
    return results.combine(lists.map(read_note_element, note_elements))


# Public readers: ``_read_notes`` with the note type fixed in advance.
read_footnotes_xml_element = functools.partial(_read_notes, "footnote")
read_endnotes_xml_element = functools.partial(_read_notes, "endnote")
33 |
--------------------------------------------------------------------------------
/tests/zips_tests.py:
--------------------------------------------------------------------------------
1 | from mammoth import zips
2 | from .testing import assert_equal
3 |
4 |
def test_split_path_splits_zip_paths_on_last_forward_slash():
    """``split_path`` separates the final segment from everything before it."""
    cases = [
        (("a", "b"), "a/b"),
        (("a/b", "c"), "a/b/c"),
        (("/a/b", "c"), "/a/b/c"),
    ]
    for expected, path in cases:
        assert_equal(expected, zips.split_path(path))
9 |
10 |
def test_when_path_has_no_forward_slashes_then_split_path_returns_empty_dirname():
    """A bare name splits into an empty dirname and the name itself."""
    actual = zips.split_path("name")
    assert_equal(("", "name"), actual)
13 |
14 |
def test_join_path_joins_arguments_with_forward_slashes():
    """``join_path`` places a forward slash between its arguments."""
    cases = [
        ("a/b", ("a", "b")),
        ("a/b/c", ("a/b", "c")),
        ("/a/b/c", ("/a/b", "c")),
    ]
    for expected, parts in cases:
        assert_equal(expected, zips.join_path(*parts))
19 |
20 |
def test_empty_parts_are_ignored_when_joining_paths():
    """Empty strings contribute nothing to the joined path."""
    cases = [
        ("a", ("a", "")),
        ("b", ("", "b")),
        ("a/b", ("a", "", "b")),
    ]
    for expected, parts in cases:
        assert_equal(expected, zips.join_path(*parts))
25 |
26 |
def test_when_joining_paths_then_absolute_paths_ignore_earlier_paths():
    """An absolute part discards every part that precedes it."""
    cases = [
        ("/b", ("a", "/b")),
        ("/b/c", ("a", "/b", "c")),
        ("/b", ("/a", "/b")),
        ("/a", ("/a",)),
    ]
    for expected, parts in cases:
        assert_equal(expected, zips.join_path(*parts))
32 |
--------------------------------------------------------------------------------
/tests/test-data/hyperlinks/word/_rels/document.xml.rels:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
--------------------------------------------------------------------------------
/makefile:
--------------------------------------------------------------------------------
1 | .PHONY: test
2 |
3 | test:
4 | _virtualenv/bin/pyflakes mammoth tests
5 | sh -c '. _virtualenv/bin/activate; py.test tests'
6 |
7 | .PHONY: test-all
8 |
9 | test-all:
10 | tox
11 |
12 | .PHONY: upload
13 |
# Upload the built distribution, then remove generated files.
upload: setup assert-converted-readme build-dist
	_virtualenv/bin/twine upload dist/*
	# $(MAKE) keeps the same make program and flags for the recursive call.
	$(MAKE) clean
17 |
18 | .PHONY: build-dist
19 |
20 | build-dist:
21 | rm -rf dist
22 | _virtualenv/bin/pyproject-build
23 |
24 | README: README.md
25 | pandoc --from=markdown --to=rst README.md > README || cp README.md README
26 |
27 | .PHONY: assert-converted-readme
28 |
29 | assert-converted-readme:
30 | test "`cat README`" != "`cat README.md`"
31 |
32 | .PHONY: clean
33 |
34 | clean:
35 | rm -f README
36 | rm -f MANIFEST
37 | rm -rf dist
38 |
39 | .PHONY: bootstrap
40 |
# Install the package in editable mode, plus test requirements when present.
bootstrap: _virtualenv setup
	_virtualenv/bin/pip install -e .
ifneq ($(wildcard test-requirements.txt),)
	_virtualenv/bin/pip install -r test-requirements.txt
endif
	# $(MAKE) keeps the same make program and flags for the recursive call.
	$(MAKE) clean
47 |
48 | .PHONY: setup
49 |
50 | setup: README
51 |
52 | _virtualenv:
53 | python3 -m venv _virtualenv
54 | _virtualenv/bin/pip install --upgrade pip
55 | _virtualenv/bin/pip install --upgrade setuptools
56 | _virtualenv/bin/pip install --upgrade wheel
57 | _virtualenv/bin/pip install --upgrade build twine
58 |
--------------------------------------------------------------------------------
/tests/styles/parser/token_parser_tests.py:
--------------------------------------------------------------------------------
1 | from mammoth.styles.parser.tokeniser import Token, TokenType
2 | from mammoth.styles.parser.token_parser import decode_escape_sequences, parse_identifier, parse_string
3 | from mammoth.styles.parser.token_iterator import TokenIterator
4 | from ...testing import assert_equal
5 |
6 |
def test_escape_sequences_in_identifiers_are_decoded():
    """An escaped colon inside an identifier token is parsed as a plain colon."""
    tokens = TokenIterator([
        Token(0, TokenType.IDENTIFIER, r"\:"),
    ])
    identifier = parse_identifier(tokens)
    assert_equal(":", identifier)
14 |
15 |
def test_escape_sequences_in_strings_are_decoded():
    """An escaped line feed inside a string token is parsed as a newline."""
    tokens = TokenIterator([
        Token(0, TokenType.STRING, r"'\n'"),
    ])
    value = parse_string(tokens)
    assert_equal("\n", value)
23 |
24 |
def test_line_feeds_are_decoded():
    """The two-character sequence backslash-n decodes to a line feed."""
    decoded = decode_escape_sequences(r"\n")
    assert_equal("\n", decoded)
27 |
28 |
def test_carriage_returns_are_decoded():
    """The two-character sequence backslash-r decodes to a carriage return."""
    decoded = decode_escape_sequences(r"\r")
    assert_equal("\r", decoded)
31 |
32 |
def test_tabs_are_decoded():
    """The two-character sequence backslash-t decodes to a tab."""
    decoded = decode_escape_sequences(r"\t")
    assert_equal("\t", decoded)
35 |
36 |
def test_backslashes_are_decoded():
    """A doubled backslash decodes to a single backslash."""
    decoded = decode_escape_sequences(r"\\")
    assert_equal("\\", decoded)
39 |
40 |
def test_colons_are_decoded():
    """An escaped colon decodes to a plain colon."""
    decoded = decode_escape_sequences(r"\:")
    assert_equal(":", decoded)
43 |
--------------------------------------------------------------------------------
/tests/docx/notes_xml_tests.py:
--------------------------------------------------------------------------------
1 | from mammoth import documents
2 | from mammoth.docx.xmlparser import element as xml_element
3 | from mammoth.docx.notes_xml import read_footnotes_xml_element
4 | from mammoth.docx import body_xml
5 | from ..testing import assert_equal
6 |
7 |
def test_id_and_body_of_footnote_are_read():
    """A footnote element is read with its ID and its paragraph body."""
    footnote_element = xml_element("w:footnote", {"w:id": "1"}, [xml_element("w:p")])
    footnotes_element = xml_element("w:footnotes", {}, [footnote_element])

    footnotes = read_footnotes_xml_element(
        footnotes_element,
        body_reader=body_xml.reader(),
    )

    assert_equal(1, len(footnotes.value))
    first_footnote = footnotes.value[0]
    assert isinstance(first_footnote.body[0], documents.Paragraph)
    assert_equal("1", first_footnote.note_id)
16 |
17 |
def test_continuation_separator_is_ignored():
    """Footnotes typed as continuation separators are not read."""
    _assert_footnote_type_is_ignored(footnote_type="continuationSeparator")
20 |
21 |
def test_separator_is_ignored():
    """Footnotes typed as separators are not read."""
    _assert_footnote_type_is_ignored(footnote_type="separator")
24 |
25 |
def _assert_footnote_type_is_ignored(footnote_type):
    """Assert that a footnote with the given ``w:type`` yields no notes."""
    footnote_attributes = {"w:id": "1", "w:type": footnote_type}
    footnote_element = xml_element("w:footnote", footnote_attributes, [xml_element("w:p")])
    footnotes_element = xml_element("w:footnotes", {}, [footnote_element])

    footnotes = read_footnotes_xml_element(footnotes_element, body_reader=None)

    assert_equal(0, len(footnotes.value))
32 |
33 |
--------------------------------------------------------------------------------
/mammoth/html/nodes.py:
--------------------------------------------------------------------------------
1 | import cobble
2 |
3 |
class Node(object):
    """Common base class for the HTML node types in this module."""
6 |
7 |
@cobble.data
class TextNode(Node):
    """Node with a single ``value`` field."""

    # Content carried by the node.
    value = cobble.field()
11 |
12 |
@cobble.data
class Tag(object):
    """Description of an HTML tag: names, attributes and output hints."""

    tag_names = cobble.field()
    attributes = cobble.field()
    collapsible = cobble.field()
    separator = cobble.field()

    @property
    def tag_name(self):
        """Return the first entry of ``tag_names``."""
        first_name = self.tag_names[0]
        return first_name
23 |
24 |
@cobble.data
class Element(Node):
    """HTML element: a tag plus child nodes.

    The tag's fields are re-exposed as read-only properties.
    """

    tag = cobble.field()
    children = cobble.field()

    # Tag names that count as void when the element has no children.
    _VOID_TAG_NAMES = {"br", "hr", "img", "input"}

    @property
    def tag_name(self):
        """Return the tag's primary name."""
        return self.tag.tag_name

    @property
    def tag_names(self):
        """Return all names of the tag."""
        return self.tag.tag_names

    @property
    def attributes(self):
        """Return the tag's attributes."""
        return self.tag.attributes

    @property
    def collapsible(self):
        """Return the tag's ``collapsible`` value."""
        return self.tag.collapsible

    @property
    def separator(self):
        """Return the tag's ``separator`` value."""
        return self.tag.separator

    def is_void(self):
        """Return whether this is a childless element with a void tag name."""
        if self.children:
            return False
        return self.tag_name in self._VOID_TAG_NAMES
54 |
55 |
@cobble.visitable
class ForceWrite(Node):
    """Field-less marker node registered as visitable with cobble."""
59 |
60 |
61 | NodeVisitor = cobble.visitor(Node)
62 |
--------------------------------------------------------------------------------
/tests/options_tests.py:
--------------------------------------------------------------------------------
1 | from mammoth.options import read_options, _default_style_map
2 | from mammoth.styles.parser import read_style_mapping
3 | from .testing import assert_equal
4 |
5 |
def test_default_style_map_is_used_if_style_map_is_not_set():
    """With no options given, the resulting style map is the default one."""
    options = read_options({}).value
    assert_equal(_default_style_map, options["style_map"])
8 |
9 |
def test_custom_style_mappings_are_prepended_to_default_style_mappings():
    """A custom mapping comes first, followed by the default mappings."""
    options = {"style_map": "p.SectionTitle => h2"}
    style_map = read_options(options).value["style_map"]

    expected_first = read_style_mapping("p.SectionTitle => h2").value
    assert_equal(expected_first, style_map[0])
    assert_equal(_default_style_map, style_map[1:])
16 |
17 |
def test_default_style_mappings_are_ignored_if_include_default_style_map_is_false():
    """Disabling the default style map leaves only the custom mapping."""
    options = {
        "style_map": "p.SectionTitle => h2",
        "include_default_style_map": False,
    }
    style_map = read_options(options).value["style_map"]

    expected = [read_style_mapping("p.SectionTitle => h2").value]
    assert_equal(expected, style_map)
24 |
25 |
def test_lines_starting_with_hash_in_custom_style_map_are_ignored():
    """Style map lines that begin with ``#`` produce no mapping."""
    options = {
        "style_map": "#p.SectionTitle => h3\np.SectionTitle => h2",
        "include_default_style_map": False,
    }
    style_map = read_options(options).value["style_map"]

    expected = [read_style_mapping("p.SectionTitle => h2").value]
    assert_equal(expected, style_map)
32 |
--------------------------------------------------------------------------------
/mammoth/docx/relationships_xml.py:
--------------------------------------------------------------------------------
1 | import collections
2 |
3 |
class Relationships(object):
    """Lookup table of relationship targets, by relationship ID and by type."""

    def __init__(self, relationships):
        """Index ``relationships`` by ID and by type.

        ``relationships`` may be any iterable, including a one-shot
        generator, of objects exposing ``relationship_id``, ``target`` and
        ``type`` attributes. When an ID occurs more than once, the last
        target wins.
        """
        self._targets_by_id = {}
        self._targets_by_type = collections.defaultdict(list)
        # Build both indexes in a single pass so the input is iterated once;
        # iterating twice would leave the type index empty for generators.
        for relationship in relationships:
            self._targets_by_id[relationship.relationship_id] = relationship.target
            self._targets_by_type[relationship.type].append(relationship.target)

    def find_target_by_relationship_id(self, key):
        """Return the target for relationship ID ``key``.

        Raises ``KeyError`` when no relationship has that ID.
        """
        return self._targets_by_id[key]

    def find_targets_by_type(self, relationship_type):
        """Return the list of targets with the given type (empty if none)."""
        return self._targets_by_type[relationship_type]
19 |
20 |
21 | Relationships.EMPTY = Relationships([])
22 |
23 |
24 | Relationship = collections.namedtuple("Relationship", ["relationship_id", "target", "type"])
25 |
26 |
def read_relationships_xml_element(element):
    """Read every ``relationships:Relationship`` child into a Relationships."""
    relationships = [
        _read_relationship(child)
        for child in element.find_children("relationships:Relationship")
    ]
    return Relationships(relationships)
30 |
31 |
def _read_relationship(element):
    """Build a Relationship from the Id, Target and Type attributes."""
    attributes = element.attributes
    relationship_id = attributes["Id"]
    target = attributes["Target"]
    relationship_type = attributes["Type"]
    return Relationship(
        relationship_id=relationship_id,
        target=target,
        type=relationship_type,
    )
39 |
--------------------------------------------------------------------------------
/mammoth/writers/html.py:
--------------------------------------------------------------------------------
1 | from __future__ import unicode_literals
2 | from xml.sax.saxutils import escape
3 |
4 | from .abc import Writer
5 |
6 |
class HtmlWriter(Writer):
    """Writer that accumulates HTML fragments and joins them on request."""

    def __init__(self):
        # Fragments are collected in order and concatenated in as_string().
        self._fragments = []

    def text(self, text):
        """Append ``text`` after HTML-escaping it."""
        self._fragments.append(_escape_html(text))

    def start(self, name, attributes=None):
        """Append the opening tag for ``name`` with optional attributes."""
        attribute_string = _generate_attribute_string(attributes)
        self._fragments.append("<{0}{1}>".format(name, attribute_string))

    def end(self, name):
        """Append the closing tag for ``name``."""
        # The closing tag needs its leading "</"; without it the output is
        # malformed ("p>" instead of "</p>").
        self._fragments.append("</{0}>".format(name))

    def self_closing(self, name, attributes=None):
        """Append a self-closing tag for ``name`` with optional attributes."""
        attribute_string = _generate_attribute_string(attributes)
        self._fragments.append("<{0}{1} />".format(name, attribute_string))

    def append(self, html):
        """Append ``html`` verbatim, without escaping."""
        self._fragments.append(html)

    def as_string(self):
        """Return everything written so far as a single string."""
        return "".join(self._fragments)
30 |
31 |
32 | def _escape_html(text):
33 | return escape(text, {'"': """})
34 |
35 |
36 | def _generate_attribute_string(attributes):
37 | if attributes is None:
38 | return ""
39 | else:
40 | return "".join(
41 | ' {0}="{1}"'.format(key, _escape_html(attributes[key]))
42 | for key in sorted(attributes)
43 | )
44 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Copyright (c) 2013, Michael Williamson
2 | All rights reserved.
3 |
4 | Redistribution and use in source and binary forms, with or without
5 | modification, are permitted provided that the following conditions are met:
6 |
7 | 1. Redistributions of source code must retain the above copyright notice, this
8 | list of conditions and the following disclaimer.
9 | 2. Redistributions in binary form must reproduce the above copyright notice,
10 | this list of conditions and the following disclaimer in the documentation
11 | and/or other materials provided with the distribution.
12 |
13 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
14 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
15 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
16 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
17 | ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
18 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
19 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
20 | ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
21 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
22 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
23 |
--------------------------------------------------------------------------------
/mammoth/html_paths.py:
--------------------------------------------------------------------------------
1 | import cobble
2 |
3 | from . import html
4 |
5 |
def path(elements):
    """Create an HtmlPath from the given path elements."""
    html_path = HtmlPath(elements)
    return html_path
8 |
9 |
def element(names, attributes=None, class_names=None, fresh=None, separator=None):
    """Create an HtmlPathElement for a tag with the given names.

    ``class_names``, when non-empty, are joined with spaces into a ``class``
    attribute. ``fresh`` is inverted to give the tag's ``collapsible`` flag,
    so an omitted ``fresh`` produces a collapsible tag.
    """
    # Work on a copy: writing the "class" entry into the caller's dict would
    # leak into any other use of that dict.
    attributes = {} if attributes is None else dict(attributes)
    if class_names:
        attributes["class"] = " ".join(class_names)

    return HtmlPathElement(html.tag(
        tag_names=names,
        attributes=attributes,
        collapsible=not fresh,
        separator=separator,
    ))
26 |
27 |
@cobble.data
class HtmlPath(object):
    """Sequence of path elements that nodes can be wrapped in."""

    elements = cobble.field()

    def wrap(self, generate_nodes):
        """Wrap the nodes from ``generate_nodes()`` in every path element.

        Elements are applied last-to-first, so the first element of the path
        ends up outermost.
        """
        wrapped = generate_nodes()
        for path_element in reversed(self.elements):
            wrapped = path_element.wrap_nodes(wrapped)
        return wrapped
39 |
40 |
@cobble.data
class HtmlPathElement(object):
    """Single step of an HTML path, holding the tag to wrap nodes in."""

    tag = cobble.field()

    def wrap(self, generate_nodes):
        """Wrap the nodes produced by calling ``generate_nodes``."""
        nodes = generate_nodes()
        return self.wrap_nodes(nodes)

    def wrap_nodes(self, nodes):
        """Return a one-item list holding an element with ``nodes`` as children."""
        return [html.Element(self.tag, nodes)]
51 |
52 | empty = path([])
53 |
54 |
class ignore(object):
    """Path-like object whose ``wrap`` always produces no nodes."""

    @staticmethod
    def wrap(generate_nodes):
        """Return an empty list; ``generate_nodes`` is never called."""
        no_nodes = []
        return no_nodes
59 |
--------------------------------------------------------------------------------
/mammoth/docx/files.py:
--------------------------------------------------------------------------------
1 | import os
2 | import contextlib
3 | try:
4 | from urllib2 import urlopen
5 | except ImportError:
6 | from urllib.request import urlopen
7 | try:
8 | from urllib.parse import urlparse
9 | except ImportError:
10 | from urlparse import urlparse
11 |
12 |
class Files(object):
    """Opens external files referenced by a document.

    ``base`` is the directory used for relative references (may be None);
    ``external_file_access`` switches external access on or off.
    """

    def __init__(self, base, external_file_access):
        self._external_file_access = external_file_access
        self._base = base

    def open(self, uri):
        """Open ``uri`` and return a context-manageable binary source.

        Raises ExternalFileAccessIsDisabledError when access is disabled and
        InvalidFileReferenceError when the reference cannot be opened.
        """
        if not self._external_file_access:
            raise ExternalFileAccessIsDisabledError(
                "could not open external image '{0}', external file access is disabled".format(uri)
            )

        try:
            if _is_absolute(uri):
                return contextlib.closing(urlopen(uri))

            if self._base is None:
                # No directory to resolve the relative reference against.
                raise InvalidFileReferenceError(
                    "could not find external image '{0}', fileobj has no name".format(uri)
                )

            return open(os.path.join(self._base, uri), "rb")
        except IOError as error:
            message = "could not open external image: '{0}' (document directory: '{1}')\n{2}".format(
                uri,
                self._base,
                str(error),
            )
            raise InvalidFileReferenceError(message)
35 |
36 |
37 | def _is_absolute(url):
38 | return urlparse(url).scheme != ""
39 |
40 |
class InvalidFileReferenceError(ValueError):
    """Raised when a referenced external file cannot be found or opened."""
43 |
44 |
class ExternalFileAccessIsDisabledError(InvalidFileReferenceError):
    """Raised when a file is requested while external file access is off."""
47 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | import os
4 | from setuptools import setup
5 |
def read(fname):
    """Return the contents of ``fname``, resolved relative to this file."""
    path = os.path.join(os.path.dirname(__file__), fname)
    # Use a context manager so the file handle is closed deterministically.
    with open(path) as fileobj:
        return fileobj.read()
8 |
9 |
10 | setup(
11 | name='mammoth',
12 | version='1.11.0',
13 | description='Convert Word documents from docx to simple and clean HTML and Markdown',
14 | long_description=read("README"),
15 | author='Michael Williamson',
16 | author_email='mike@zwobble.org',
17 | url='https://github.com/mwilliamson/python-mammoth',
18 | packages=['mammoth', 'mammoth.docx', 'mammoth.html', 'mammoth.styles', 'mammoth.styles.parser', 'mammoth.writers'],
19 | entry_points={
20 | "console_scripts": [
21 | "mammoth=mammoth.cli:main"
22 | ]
23 | },
24 | keywords="docx word office clean html markdown md",
25 | install_requires=[
26 | "cobble>=0.1.3,<0.2",
27 | ],
28 | python_requires='>=3.7',
29 | license="BSD-2-Clause",
30 | classifiers=[
31 | 'Development Status :: 5 - Production/Stable',
32 | 'Intended Audience :: Developers',
33 | 'License :: OSI Approved :: BSD License',
34 | 'Programming Language :: Python',
35 | 'Programming Language :: Python :: 3',
36 | 'Programming Language :: Python :: 3.7',
37 | 'Programming Language :: Python :: 3.8',
38 | 'Programming Language :: Python :: 3.9',
39 | 'Programming Language :: Python :: 3.10',
40 | 'Programming Language :: Python :: 3.11',
41 | 'Programming Language :: Python :: 3.12',
42 | ],
43 | )
44 |
45 |
--------------------------------------------------------------------------------
/tests/docx/office_xml_tests.py:
--------------------------------------------------------------------------------
1 | from __future__ import unicode_literals
2 |
3 | import io
4 |
5 | from mammoth.docx import xmlparser as xml, office_xml
6 | from ..testing import assert_equal
7 |
8 |
class AlternateContentTests(object):
    """Tests for how office_xml.read handles mc:AlternateContent elements."""

    # NOTE(review): the XML fixtures below were reconstructed because the
    # string literals had been emptied; confirm against the upstream tests.

    def test_when_fallback_is_present_then_fallback_is_read(self):
        """AlternateContent is replaced by the children of its Fallback."""
        xml_string = (
            '<?xml version="1.0" encoding="UTF-8" standalone="yes"?>' +
            '<numbering xmlns:mc="http://schemas.openxmlformats.org/markup-compatibility/2006">' +
            '<mc:AlternateContent>' +
            '<mc:Choice Requires="w14">' +
            '<choice/>' +
            '</mc:Choice>' +
            '<mc:Fallback>' +
            '<fallback/>' +
            '</mc:Fallback>' +
            '</mc:AlternateContent>' +
            '</numbering>')

        result = office_xml.read(io.StringIO(xml_string))
        assert_equal([xml.element("fallback")], result.children)

    def test_when_fallback_is_not_present_then_element_is_ignored(self):
        """AlternateContent without a Fallback contributes no children."""
        xml_string = (
            '<?xml version="1.0" encoding="UTF-8" standalone="yes"?>' +
            '<numbering xmlns:mc="http://schemas.openxmlformats.org/markup-compatibility/2006">' +
            '<mc:AlternateContent>' +
            '<mc:Choice Requires="w14">' +
            '<choice/>' +
            '</mc:Choice>' +
            '</mc:AlternateContent>' +
            '</numbering>')

        result = office_xml.read(io.StringIO(xml_string))
        assert_equal([], result.children)
41 |
--------------------------------------------------------------------------------
/tests/raw_text_tests.py:
--------------------------------------------------------------------------------
1 | from mammoth.raw_text import extract_raw_text_from_element
2 | from mammoth import documents
3 | from .testing import assert_equal
4 |
5 |
def test_text_element_is_converted_to_text_content():
    """A Text element's raw text is its own value."""
    raw_text = extract_raw_text_from_element(documents.Text("Hello."))
    assert_equal("Hello.", raw_text)
12 |
13 |
def test_tab_element_is_converted_to_tab_character():
    """A tab element's raw text is a single tab character."""
    raw_text = extract_raw_text_from_element(documents.tab())
    assert_equal("\t", raw_text)
20 |
21 |
def test_paragraphs_are_terminated_with_newlines():
    """A paragraph's raw text is its children's text plus two newlines."""
    texts = [
        documents.Text("Hello "),
        documents.Text("world."),
    ]
    paragraph = documents.paragraph(children=texts)

    raw_text = extract_raw_text_from_element(paragraph)

    assert_equal("Hello world.\n\n", raw_text)
33 |
34 |
def test_children_are_recursively_converted_to_text():
    """Raw text extraction descends through nested elements."""
    texts = [
        documents.text("Hello "),
        documents.text("world."),
    ]
    paragraph = documents.paragraph(texts, {})
    document = documents.document([paragraph])

    raw_text = extract_raw_text_from_element(document)

    assert_equal("Hello world.\n\n", raw_text)
49 |
50 |
def test_non_text_element_without_children_is_converted_to_empty_string():
    """An element with neither text nor children yields an empty string."""
    line_break = documents.line_break
    # Guard the premise of the test: this element has no children attribute.
    assert not hasattr(line_break, "children")

    raw_text = extract_raw_text_from_element(line_break)

    assert_equal("", raw_text)
58 |
--------------------------------------------------------------------------------
/mammoth/transforms.py:
--------------------------------------------------------------------------------
1 | from . import documents
2 |
3 |
def paragraph(transform_paragraph):
    """Return a transform applying ``transform_paragraph`` to each Paragraph."""
    return element_of_type(
        element_type=documents.Paragraph,
        transform=transform_paragraph,
    )
6 |
7 |
def run(transform_run):
    """Return a transform applying ``transform_run`` to each Run."""
    return element_of_type(
        element_type=documents.Run,
        transform=transform_run,
    )
10 |
11 |
def element_of_type(element_type, transform):
    """Return a transform applying ``transform`` to elements of a given type.

    Elements that are not instances of ``element_type`` are returned as-is.
    """
    def transform_element(element):
        return transform(element) if isinstance(element, element_type) else element

    return _each_element(transform_element)
20 |
21 |
def _each_element(transform_element):
    """Lift ``transform_element`` so it is applied to a whole element tree.

    Children are transformed before their parent, and the parent is copied
    with the transformed children before being transformed itself.
    """
    def transform_element_and_children(element):
        if isinstance(element, (documents.HasChildren, documents.TableCellUnmerged)):
            transformed_children = [
                transform_element_and_children(child)
                for child in element.children
            ]
            element = element.copy(children=transformed_children)

        return transform_element(element)

    return transform_element_and_children
31 |
32 |
def get_descendants_of_type(element, element_type):
    """Return the descendants of ``element`` that are ``element_type`` instances."""
    return [
        descendant
        for descendant in get_descendants(element)
        if isinstance(descendant, element_type)
    ]
38 |
39 |
def get_descendants(element):
    """Return every descendant of ``element`` as a flat list."""
    collected = []
    # Each visited element is appended in the order _visit_descendants yields.
    _visit_descendants(element, collected.append)
    return collected
49 |
50 |
def _visit_descendants(element, visit):
    """Call ``visit`` on each descendant of ``element``.

    A child's own descendants are visited before the child itself.
    """
    if not isinstance(element, documents.HasChildren):
        return

    for child in element.children:
        _visit_descendants(child, visit)
        visit(child)
56 |
57 |
--------------------------------------------------------------------------------
/tests/test-data/hyperlinks/word/document.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 | coconuts
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 |
17 |
18 |
19 |
--------------------------------------------------------------------------------
/tests/html/strip_empty_tests.py:
--------------------------------------------------------------------------------
1 | from mammoth import html
2 | from ..testing import assert_equal
3 |
4 |
def test_text_nodes_with_text_are_not_stripped():
    """A text node with content survives strip_empty."""
    stripped = html.strip_empty([html.text("H")])
    assert_equal([html.text("H")], stripped)
9 |
10 |
def test_empty_text_nodes_are_stripped():
    """A text node with an empty string is removed by strip_empty."""
    stripped = html.strip_empty([html.text("")])
    assert_equal([], stripped)
15 |
16 |
def test_elements_with_non_empty_children_are_not_stripped():
    """An element containing non-empty text survives strip_empty."""
    stripped = html.strip_empty([html.element("p", {}, [html.text("H")])])
    assert_equal([html.element("p", {}, [html.text("H")])], stripped)
21 |
22 |
def test_elements_with_no_children_are_stripped():
    """A childless ``p`` element is removed by strip_empty."""
    stripped = html.strip_empty([html.element("p")])
    assert_equal([], stripped)
27 |
28 |
def test_elements_with_only_empty_children_are_stripped():
    """An element whose only child is empty text is removed by strip_empty."""
    stripped = html.strip_empty([html.element("p", {}, [html.text("")])])
    assert_equal([], stripped)
33 |
34 |
def test_empty_children_are_removed():
    """Empty children are dropped while non-empty siblings are kept."""
    stripped = html.strip_empty([html.element("ul", {}, [
        html.element("li", {}, [html.text("")]),
        html.element("li", {}, [html.text("H")]),
    ])])

    # Expected value goes first, matching every other assertion in this file.
    assert_equal(
        [html.element("ul", {}, [
            html.element("li", {}, [html.text("H")])
        ])],
        stripped)
45 |
46 |
def test_self_closing_elements_are_never_empty():
    """A childless ``br`` element survives strip_empty."""
    stripped = html.strip_empty([html.element("br")])
    assert_equal([html.element("br")], stripped)
51 |
52 |
def test_force_writes_are_never_empty():
    """The force_write node survives strip_empty."""
    stripped = html.strip_empty([html.force_write])
    assert_equal([html.force_write], stripped)
57 |
--------------------------------------------------------------------------------
/tests/test-data/simple/word/document.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 | Hello.
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 |
17 |
18 |
19 |
--------------------------------------------------------------------------------
/mammoth/__init__.py:
--------------------------------------------------------------------------------
1 | from . import docx, conversion, options, images, transforms, underline
2 | from .raw_text import extract_raw_text_from_element
3 | from .docx.style_map import write_style_map, read_style_map
4 |
5 | __all__ = ["convert_to_html", "extract_raw_text", "images", "transforms", "underline"]
6 |
7 |
8 | _undefined = object()
9 |
10 |
def convert_to_html(*args, **kwargs):
    """Call ``convert`` with ``output_format="html"``.

    All other arguments are forwarded unchanged.
    """
    result = convert(*args, output_format="html", **kwargs)
    return result
13 |
14 |
def convert_to_markdown(*args, **kwargs):
    """Call ``convert`` with ``output_format="markdown"``.

    All other arguments are forwarded unchanged.
    """
    result = convert(*args, output_format="markdown", **kwargs)
    return result
17 |
18 |
def convert(
    fileobj,
    transform_document=None,
    id_prefix=None,
    include_embedded_style_map=_undefined,
    external_file_access=_undefined,
    **kwargs
):
    """Read the docx in ``fileobj`` and convert it using the given options.

    ``include_embedded_style_map`` defaults to True and
    ``external_file_access`` defaults to False when left unspecified.
    Remaining keyword arguments are passed to ``options.read_options``.
    """
    if include_embedded_style_map is _undefined:
        include_embedded_style_map = True
    if external_file_access is _undefined:
        external_file_access = False

    if transform_document is None:
        def transform_document(document):
            return document

    if include_embedded_style_map:
        kwargs["embedded_style_map"] = read_style_map(fileobj)

    def convert_with_options(convert_options):
        def convert_document(document):
            return conversion.convert_document_element_to_html(
                document,
                id_prefix=id_prefix,
                **convert_options
            )

        read_result = docx.read(fileobj, external_file_access=external_file_access)
        return read_result.map(transform_document).bind(convert_document)

    return options.read_options(kwargs).bind(convert_with_options)
48 |
49 |
def extract_raw_text(fileobj):
    """Read the docx in ``fileobj`` and map the result to its raw text."""
    read_result = docx.read(fileobj)
    return read_result.map(extract_raw_text_from_element)
52 |
53 |
def embed_style_map(fileobj, style_map):
    """Write ``style_map`` into ``fileobj`` via ``write_style_map``."""
    write_style_map(fileobj, style_map)
    return None
56 |
def read_embedded_style_map(fileobj):
    """Return the result of ``read_style_map`` for ``fileobj``."""
    embedded_style_map = read_style_map(fileobj)
    return embedded_style_map
59 |
--------------------------------------------------------------------------------
/mammoth/docx/content_types_xml.py:
--------------------------------------------------------------------------------
def read_content_types_xml_element(element):
    """Read the Default and Override children into a _ContentTypes lookup."""
    extension_defaults = dict(
        _read_default(child)
        for child in element.find_children("content-types:Default")
    )
    overrides = dict(
        _read_override(child)
        for child in element.find_children("content-types:Override")
    )
    return _ContentTypes(extension_defaults, overrides)
11 |
12 |
13 | def _read_default(element):
14 | extension = element.attributes["Extension"]
15 | content_type = element.attributes["ContentType"]
16 | return extension, content_type
17 |
18 |
19 | def _read_override(element):
20 | part_name = element.attributes["PartName"]
21 | content_type = element.attributes["ContentType"]
22 | return part_name.lstrip("/"), content_type
23 |
24 |
25 | class _ContentTypes(object):
26 | _image_content_types = {
27 | "png": "png",
28 | "gif": "gif",
29 | "jpeg": "jpeg",
30 | "jpg": "jpeg",
31 | "tif": "tiff",
32 | "tiff": "tiff",
33 | "bmp": "bmp",
34 | }
35 |
36 | def __init__(self, extension_defaults, overrides):
37 | self._extension_defaults = extension_defaults
38 | self._overrides = overrides
39 |
40 | def find_content_type(self, path):
41 | if path in self._overrides:
42 | return self._overrides[path]
43 |
44 | extension = _get_extension(path)
45 | default_type = self._extension_defaults.get(extension)
46 | if default_type is not None:
47 | return default_type
48 |
49 | image_type = self._image_content_types.get(extension.lower())
50 | if image_type is not None:
51 | return "image/" + image_type
52 |
53 | return None
54 |
55 | empty_content_types = _ContentTypes({}, {})
56 |
57 | def _get_extension(path):
58 | return path.rpartition(".")[2]
59 |
--------------------------------------------------------------------------------
/mammoth/styles/parser/tokeniser.py:
--------------------------------------------------------------------------------
1 | import collections
2 | import re
3 |
4 |
5 | Token = collections.namedtuple("Token", ["character_index", "type", "value"])
6 |
7 |
class TokenType(object):
    """String constants naming the token categories."""

    IDENTIFIER = "identifier"
    SYMBOL = "symbol"
    WHITESPACE = "whitespace"
    STRING = "string"
    UNTERMINATED_STRING = "unterminated string"
    INTEGER = "integer"
    END = "end"
16 |
17 |
18 |
def regex_tokeniser(rules):
    """Build a tokeniser function from ``(token_type, regex)`` rules.

    Each regex may be a pattern string or a compiled pattern. The returned
    function takes a string and returns a list of Tokens: at each position
    the first matching rule wins, any character no rule matches becomes an
    "unknown" token, and the list always ends with a TokenType.END token.
    """
    rules = [(token_type, _to_regex(regex)) for token_type, regex in rules]
    # DOTALL makes the catch-all consume newlines too; a bare "." would leave
    # "\n" unmatched whenever no earlier rule accepts it.
    rules.append(("unknown", re.compile(".", re.DOTALL)))

    def tokenise(value):
        tokens = []
        index = 0
        while index < len(value):
            for token_type, regex in rules:
                match = regex.match(value, index)
                if match is not None:
                    tokens.append(Token(index, token_type, match.group(0)))
                    index = match.end()
                    break
            else:
                # Should be impossible: the catch-all rule matches any character.
                raise Exception("Remaining: " + value[index:])

        tokens.append(Token(index, TokenType.END, ""))

        return tokens

    return tokenise
42 |
43 |
44 | def _to_regex(value):
45 | if hasattr(value, "match"):
46 | return value
47 | else:
48 | return re.compile(value)
49 |
50 |
51 | _string_prefix = r"'(?:\\.|[^'])*"
52 | _identifier_character = r"(?:[a-zA-Z\-_]|\\.)"
53 |
54 | tokenise = regex_tokeniser([
55 | (TokenType.IDENTIFIER, _identifier_character + "(?:" + _identifier_character + "|[0-9])*"),
56 | (TokenType.SYMBOL, r":|>|=>|\^=|=|\(|\)|\[|\]|\||!|\."),
57 | (TokenType.WHITESPACE, r"\s+"),
58 | (TokenType.STRING, _string_prefix + "'"),
59 | (TokenType.UNTERMINATED_STRING, _string_prefix),
60 | (TokenType.INTEGER, "([0-9]+)"),
61 | ])
62 |
--------------------------------------------------------------------------------
/tests/docx/comments_xml_tests.py:
--------------------------------------------------------------------------------
1 | from mammoth import documents
2 | from mammoth.docx.xmlparser import element as xml_element
3 | from mammoth.docx.comments_xml import read_comments_xml_element
4 | from mammoth.docx import body_xml
5 | from ..testing import assert_equal
6 |
7 |
def test_id_and_body_of_comment_is_read():
    """A comment element is read with its ID and its paragraph body."""
    comment_element = xml_element("w:comment", {"w:id": "1"}, [xml_element("w:p")])
    comments_element = xml_element("w:comments", {}, [comment_element])

    comments = read_comments_xml_element(
        comments_element,
        body_reader=body_xml.reader(),
    )

    assert_equal(1, len(comments.value))
    first_comment = comments.value[0]
    assert_equal(first_comment.body, [documents.paragraph(children=[])])
    assert_equal("1", first_comment.comment_id)
16 |
17 |
def test_when_optional_attributes_of_comment_are_missing_then_they_are_read_as_none():
    """Absent author and initials attributes are read as None."""
    comment_element = xml_element("w:comment", {"w:id": "1"}, [])
    comments_element = xml_element("w:comments", {}, [comment_element])

    comments = read_comments_xml_element(
        comments_element,
        body_reader=body_xml.reader(),
    )

    comment, = comments.value
    assert_equal(None, comment.author_name)
    assert_equal(None, comment.author_initials)
25 |
26 |
def test_when_optional_attributes_of_comment_are_blank_then_they_are_read_as_none():
    """Whitespace-only author and initials attributes are read as None."""
    attributes = {"w:id": "1", "w:author": " ", "w:initials": " "}
    comment_element = xml_element("w:comment", attributes, [])
    comments_element = xml_element("w:comments", {}, [comment_element])

    comments = read_comments_xml_element(
        comments_element,
        body_reader=body_xml.reader(),
    )

    comment, = comments.value
    assert_equal(None, comment.author_name)
    assert_equal(None, comment.author_initials)
34 |
35 |
def test_when_optional_attributes_of_comment_are_not_blank_then_they_are_read():
    """Non-blank author and initials attributes are read verbatim."""
    attributes = {"w:id": "1", "w:author": "The Piemaker", "w:initials": "TP"}
    comment_element = xml_element("w:comment", attributes, [])
    comments_element = xml_element("w:comments", {}, [comment_element])

    comments = read_comments_xml_element(
        comments_element,
        body_reader=body_xml.reader(),
    )

    comment, = comments.value
    assert_equal("The Piemaker", comment.author_name)
    assert_equal("TP", comment.author_initials)
43 |
--------------------------------------------------------------------------------
/mammoth/docx/office_xml.py:
--------------------------------------------------------------------------------
1 | from ..lists import flat_map
2 | from .xmlparser import parse_xml, XmlElement
3 |
4 |
# (prefix, namespace URI) pairs handed to parse_xml() by read().
# The transitional and strict URIs for a given schema share one prefix.
_namespaces = [
    # Transitional format
    ("w", "http://schemas.openxmlformats.org/wordprocessingml/2006/main"),
    ("r", "http://schemas.openxmlformats.org/officeDocument/2006/relationships"),
    ("wp", "http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing"),
    ("a", "http://schemas.openxmlformats.org/drawingml/2006/main"),
    ("pic", "http://schemas.openxmlformats.org/drawingml/2006/picture"),

    # Strict format
    ("w", "http://purl.oclc.org/ooxml/wordprocessingml/main"),
    ("r", "http://purl.oclc.org/ooxml/officeDocument/relationships"),
    ("wp", "http://purl.oclc.org/ooxml/drawingml/wordprocessingDrawing"),
    ("a", "http://purl.oclc.org/ooxml/drawingml/main"),
    ("pic", "http://purl.oclc.org/ooxml/drawingml/picture"),

    # Common
    ("content-types", "http://schemas.openxmlformats.org/package/2006/content-types"),
    ("relationships", "http://schemas.openxmlformats.org/package/2006/relationships"),
    ("mc", "http://schemas.openxmlformats.org/markup-compatibility/2006"),
    ("v", "urn:schemas-microsoft-com:vml"),
    ("office-word", "urn:schemas-microsoft-com:office:word"),

    # [MS-DOCX]: Word Extensions to the Office Open XML (.docx) File Format
    # https://learn.microsoft.com/en-us/openspecs/office_standards/ms-docx/b839fe1f-e1ca-4fa6-8c26-5954d0abbccd
    ("wordml", "http://schemas.microsoft.com/office/word/2010/wordml"),
]
31 |
32 |
def read(fileobj):
    """Parse ``fileobj`` as XML and return the resulting root node.

    The document is parsed with the module's namespace prefixes, then
    ``mc:AlternateContent`` elements are collapsed; the first node of the
    collapsed result is returned.
    """
    parsed = parse_xml(fileobj, _namespaces)
    collapsed = _collapse_alternate_content(parsed)
    return collapsed[0]
35 |
36 |
def _collapse_alternate_content(node):
    """Return the list of nodes that should replace ``node`` in its parent.

    * Non-element nodes are returned unchanged as a one-item list.
    * An ``mc:AlternateContent`` element is replaced by the children of its
      ``mc:Fallback`` child.
    * Any other element has its children collapsed in place and is returned
      as a one-item list.
    """
    if not isinstance(node, XmlElement):
        return [node]

    if node.name == "mc:AlternateContent":
        fallback = node.find_child_or_null("mc:Fallback")
        return fallback.children

    node.children = flat_map(_collapse_alternate_content, node.children)
    return [node]
46 |
--------------------------------------------------------------------------------
/mammoth/styles/parser/token_iterator.py:
--------------------------------------------------------------------------------
1 | # TODO: check indices
2 | # TODO: proper tests for unexpected tokens
3 |
4 | from .errors import LineParseError
5 |
6 |
class TokenIterator(object):
    """Cursor over a sequence of tokens.

    Each token is expected to expose ``type`` and ``value`` attributes. The
    iterator keeps an index into the sequence; the ``skip``/``next`` family
    of methods advance it, while ``peek_token_type`` and ``is_next`` only
    inspect the current token.
    """

    def __init__(self, tokens):
        self._tokens = tokens
        self._index = 0

    def peek_token_type(self):
        """Return the type of the current token without consuming it."""
        return self._tokens[self._index].type

    def next_value(self, token_type=None):
        """Consume the current token and return its value.

        Raises ``LineParseError`` if ``token_type`` is given and the current
        token has a different type.
        """
        return self._next(token_type).value

    def _next(self, token_type=None):
        token = self._tokens[self._index]
        if token_type is None or token.type == token_type:
            self._index += 1
            return token
        else:
            raise self._unexpected_token_type(token_type, token)

    def skip(self, token_type, token_value=None):
        """Consume the current token, which must match, and return ``True``.

        Raises ``LineParseError`` if the current token does not match.
        """
        token = self._tokens[self._index]
        if token.type == token_type and (token_value is None or token.value == token_value):
            self._index += 1
            return True
        else:
            raise self._unexpected_token_type(token_type, token)

    def try_skip(self, token_type, token_value=None):
        """Consume the current token if it matches; report whether it did."""
        if self.is_next(token_type, token_value):
            self._index += 1
            return True
        else:
            return False

    def try_skip_many(self, tokens):
        """Consume a run of tokens matching ``tokens``, all or nothing.

        ``tokens`` is an iterable of ``(token_type, token_value)`` pairs,
        where a ``token_value`` of ``None`` matches any value. If any pair
        fails to match, or the token list is exhausted first, the position
        is restored and ``False`` is returned.
        """
        start = self._index
        for token_type, token_value in tokens:
            # Running off the end of the token list counts as "no match"
            # rather than raising IndexError.
            if self._index >= len(self._tokens) or not self.is_next(token_type, token_value):
                self._index = start
                return False
            self._index += 1

        return True

    def is_next(self, token_type, token_value=None):
        """Return whether the current token has the given type (and value)."""
        token = self._tokens[self._index]
        return token.type == token_type and (token_value is None or token.value == token_value)

    def _unexpected_token_type(self, token_type, token):
        # Callers write ``raise self._unexpected_token_type(...)``, so build
        # and return the error here and let the call site raise it.
        return LineParseError()
59 |
60 |
--------------------------------------------------------------------------------
/mammoth/zips.py:
--------------------------------------------------------------------------------
1 | import contextlib
2 | import io
3 | import shutil
4 |
5 | from zipfile import ZipFile
6 |
7 |
def open_zip(fileobj, mode):
    """Open ``fileobj`` as a zip archive in ``mode`` and wrap it in ``_Zip``."""
    underlying = ZipFile(fileobj, mode)
    return _Zip(underlying)
10 |
11 |
12 | class _Zip(object):
13 | def __init__(self, zip_file):
14 | self._zip_file = zip_file
15 |
16 | def __enter__(self):
17 | return self
18 |
19 | def __exit__(self, *args):
20 | self._zip_file.close()
21 |
22 | def open(self, name):
23 | return contextlib.closing(self._zip_file.open(name))
24 |
25 | def exists(self, name):
26 | try:
27 | self._zip_file.getinfo(name)
28 | return True
29 | except KeyError:
30 | return False
31 |
32 | def read_str(self, name):
33 | return self._zip_file.read(name).decode("utf8")
34 |
35 |
def update_zip(fileobj, files):
    """Rewrite the zip archive held in ``fileobj``, adding or replacing members.

    :param fileobj: a seekable, writable binary file object containing a zip
        archive. Its contents are replaced with the updated archive.
    :param files: mapping of member name to new contents. Names already in
        the archive are replaced; other names are appended.

    Existing members keep their original order, followed by new members in
    the iteration order of ``files``.
    """
    destination_fileobj = io.BytesIO()

    with ZipFile(fileobj, "r") as source:
        with ZipFile(destination_fileobj, "w") as destination:
            # Build an ordered, de-duplicated list of member names so the
            # output does not depend on set iteration order.
            names = []
            seen = set()
            for name in list(source.namelist()) + list(files.keys()):
                if name not in seen:
                    seen.add(name)
                    names.append(name)

            for name in names:
                if name in files:
                    contents = files[name]
                else:
                    contents = source.read(name)
                destination.writestr(name, contents)

    fileobj.seek(0)
    destination_fileobj.seek(0)
    shutil.copyfileobj(destination_fileobj, fileobj)
    # If the new archive is shorter than the old one, drop the leftover
    # bytes: a stale end-of-central-directory record at the tail would
    # otherwise make the file unreadable as a zip.
    fileobj.truncate()
57 |
58 |
def split_path(path):
    """Split ``path`` at its last ``/`` into ``(directory, basename)``.

    A path without any ``/`` yields an empty directory part.
    """
    directory, separator, basename = path.rpartition("/")
    if not separator:
        return ("", path)
    return (directory, basename)
65 |
66 |
def join_path(*args):
    """Join path segments with ``/``.

    Empty segments are ignored. A segment starting with ``/`` discards every
    segment before it.
    """
    kept = []
    for segment in args:
        if not segment:
            continue
        if segment.startswith("/"):
            # An absolute segment restarts the path.
            kept = [segment]
        else:
            kept.append(segment)

    return "/".join(kept)
78 |
--------------------------------------------------------------------------------
/tests/docx/document_xml_tests.py:
--------------------------------------------------------------------------------
1 | import pytest
2 |
3 | from mammoth import documents
4 | from mammoth.docx.xmlparser import element as xml_element, text as xml_text
5 | from mammoth.docx.document_xml import read_document_xml_element
6 | from mammoth.docx import body_xml
7 | from ..testing import assert_equal
8 |
9 |
def test_when_body_element_is_present_then_body_is_read():
    # Build w:document > w:body > w:p > w:r > w:t("Hello!") in one expression.
    document_xml = xml_element("w:document", {}, [
        xml_element("w:body", {}, [
            xml_element("w:p", {}, [
                xml_element("w:r", {}, [
                    xml_element("w:t", {}, [xml_text("Hello!")]),
                ]),
            ]),
        ]),
    ])

    document = _read_and_get_document_xml_element(document_xml)

    expected = documents.document([
        documents.paragraph([documents.run([documents.text("Hello!")])]),
    ])
    assert_equal(expected, document)
23 |
24 |
def test_when_body_element_is_not_present_then_error_is_raised():
    # The only child of w:document is w:body2, so no w:body can be found.
    document_xml = xml_element("w:document", {}, [
        xml_element("w:body2", {}, [xml_element("w:p", {}, [])]),
    ])

    def read_document():
        return _read_and_get_document_xml_element(document_xml)

    error = pytest.raises(ValueError, read_document)

    assert_equal(str(error.value), "Could not find the body element: are you sure this is a docx file?")
33 |
34 |
def test_footnotes_of_document_are_read():
    # Notes passed to the reader should be retrievable from the document.
    footnote_in = documents.note("footnote", "4", [documents.paragraph([])])
    document_xml = xml_element("w:document", {}, [xml_element("w:body")])

    document = _read_and_get_document_xml_element(document_xml, notes=[footnote_in])

    footnote = document.notes.find_note("footnote", "4")
    assert_equal("4", footnote.note_id)
    assert isinstance(footnote.body[0], documents.Paragraph)
45 |
46 |
def _read_and_get_document_xml_element(*args, **kwargs):
    """Read a document element with a default body reader.

    Asserts that reading produced no messages and returns the result's value.
    """
    result = read_document_xml_element(
        *args,
        body_reader=body_xml.reader(),
        **kwargs
    )
    assert_equal([], result.messages)
    return result.value
52 |
--------------------------------------------------------------------------------
/mammoth/document_matchers.py:
--------------------------------------------------------------------------------
1 | import collections
2 |
3 | import cobble
4 |
5 |
def paragraph(style_id=None, style_name=None, numbering=None):
    """Create a ``ParagraphMatcher`` from the given fields."""
    return ParagraphMatcher(
        style_id=style_id,
        style_name=style_name,
        numbering=numbering,
    )


# Named tuple holding the fields passed to paragraph().
ParagraphMatcher = collections.namedtuple(
    "ParagraphMatcher",
    ["style_id", "style_name", "numbering"],
)
ParagraphMatcher.element_type = "paragraph"
12 |
13 |
def run(style_id=None, style_name=None):
    """Create a ``RunMatcher`` from the given fields."""
    return RunMatcher(style_id=style_id, style_name=style_name)


# Named tuple holding the fields passed to run().
RunMatcher = collections.namedtuple(
    "RunMatcher",
    ["style_id", "style_name"],
)
RunMatcher.element_type = "run"
20 |
21 |
def table(style_id=None, style_name=None):
    """Create a ``TableMatcher`` from the given fields."""
    return TableMatcher(style_id=style_id, style_name=style_name)


# Named tuple holding the fields passed to table().
TableMatcher = collections.namedtuple(
    "TableMatcher",
    ["style_id", "style_name"],
)
TableMatcher.element_type = "table"
28 |
29 |
class bold(object):
    """Marker whose ``element_type`` is ``"bold"``."""

    element_type = "bold"
32 |
33 |
class italic(object):
    """Marker whose ``element_type`` is ``"italic"``."""

    element_type = "italic"
36 |
37 |
class underline(object):
    """Marker whose ``element_type`` is ``"underline"``."""

    element_type = "underline"
40 |
41 |
class strikethrough(object):
    """Marker whose ``element_type`` is ``"strikethrough"``."""

    element_type = "strikethrough"
44 |
45 |
class all_caps(object):
    """Marker whose ``element_type`` is ``"all_caps"``."""

    element_type = "all_caps"
48 |
49 |
class small_caps(object):
    """Marker whose ``element_type`` is ``"small_caps"``."""

    element_type = "small_caps"
52 |
53 |
def highlight(color=None):
    """Create a ``HighlightMatcher`` for ``color``."""
    return HighlightMatcher(color)


# Named tuple holding the color passed to highlight().
HighlightMatcher = collections.namedtuple(
    "HighlightMatcher",
    ["color"],
)
HighlightMatcher.element_type = "highlight"
60 |
class comment_reference(object):
    """Marker whose ``element_type`` is ``"comment_reference"``."""

    element_type = "comment_reference"
63 |
64 |
# Named tuple identifying a break by its ``break_type``.
BreakMatcher = collections.namedtuple(
    "BreakMatcher",
    ["break_type"],
)
BreakMatcher.element_type = "break"


# One shared matcher instance per break type.
line_break = BreakMatcher(break_type="line")
page_break = BreakMatcher(break_type="page")
column_break = BreakMatcher(break_type="column")
72 |
73 |
def equal_to(value):
    """Create a ``StringMatcher`` that compares against ``value`` using
    ``_operator_equal_to``."""
    return StringMatcher(operator=_operator_equal_to, value=value)
76 |
77 |
78 | def _operator_equal_to(first, second):
79 | return first.upper() == second.upper()
80 |
81 |
def starts_with(value):
    """Create a ``StringMatcher`` that compares against ``value`` using
    ``_operator_starts_with``."""
    return StringMatcher(operator=_operator_starts_with, value=value)
84 |
85 | def _operator_starts_with(first, second):
86 | return second.upper().startswith(first.upper())
87 |
88 |
@cobble.data
class StringMatcher(object):
    """Pairs a two-argument comparison function with a reference value."""

    # Callable invoked as operator(value, other).
    operator = cobble.field()
    # Reference value passed as the first argument to the operator.
    value = cobble.field()

    def matches(self, other):
        """Return the result of applying the operator to the stored value
        and ``other``."""
        compare = self.operator
        return compare(self.value, other)
96 |
--------------------------------------------------------------------------------
/tests/images_tests.py:
--------------------------------------------------------------------------------
1 | import io
2 |
3 | from precisely import assert_that, has_attrs, is_sequence
4 |
5 | import mammoth
6 |
7 |
def test_inline_is_available_as_alias_of_img_element():
    # Both names must refer to the very same object.
    images = mammoth.images
    assert images.inline is images.img_element
10 |
11 |
def test_data_uri_encodes_images_in_base64():
    # b"abc" base64-encodes to "YWJj"; the expected src must carry that
    # payload, otherwise the test does not check the encoding at all.
    image_bytes = b"abc"
    image = mammoth.documents.Image(
        alt_text=None,
        content_type="image/jpeg",
        open=lambda: io.BytesIO(image_bytes),
    )

    result = mammoth.images.data_uri(image)

    assert_that(result, is_sequence(
        has_attrs(attributes={"src": "data:image/jpeg;base64,YWJj"}),
    ))
25 |
26 |
class ImgElementTests:
    """Tests for the ``mammoth.images.img_element`` decorator.

    The alt and src values are distinct, non-empty placeholders so that each
    assertion can tell where an attribute value came from.
    """

    @staticmethod
    def _create_image(alt_text):
        # Minimal JPEG-typed image whose content is never inspected here.
        return mammoth.documents.Image(
            alt_text=alt_text,
            content_type="image/jpeg",
            open=lambda: io.BytesIO(b"abc"),
        )

    def test_when_element_does_not_have_alt_text_then_alt_attribute_is_not_set(self):
        image = self._create_image(alt_text=None)

        @mammoth.images.img_element
        def convert_image(image):
            return {"src": "<src>"}

        result = convert_image(image)

        assert_that(result, is_sequence(
            has_attrs(attributes={"src": "<src>"}),
        ))

    def test_when_element_se_alt_text_then_alt_attribute_is_set(self):
        image = self._create_image(alt_text="<alt>")

        @mammoth.images.img_element
        def convert_image(image):
            return {"src": "<src>"}

        result = convert_image(image)

        assert_that(result, is_sequence(
            has_attrs(attributes={"alt": "<alt>", "src": "<src>"}),
        ))

    def test_image_alt_text_can_be_overridden_by_alt_attribute_returned_from_function(self):
        image = self._create_image(alt_text="<alt>")

        @mammoth.images.img_element
        def convert_image(image):
            return {"alt": "<alt override>", "src": "<src>"}

        result = convert_image(image)

        assert_that(result, is_sequence(
            has_attrs(attributes={"alt": "<alt override>", "src": "<src>"}),
        ))
81 |
--------------------------------------------------------------------------------
/tests/docx/relationships_xml_tests.py:
--------------------------------------------------------------------------------
1 | from mammoth.docx.xmlparser import element as xml_element
2 | from mammoth.docx.relationships_xml import read_relationships_xml_element
3 | from ..testing import assert_equal
4 |
5 |
def test_relationship_targets_can_be_found_by_id():
    # Two hyperlink relationships; look up the first one by its Id.
    hyperlink_type = "http://schemas.openxmlformats.org/officeDocument/2006/relationships/hyperlink"
    first = xml_element("relationships:Relationship", {
        "Id": "rId8",
        "Type": hyperlink_type,
        "Target": "http://example.com",
    })
    second = xml_element("relationships:Relationship", {
        "Id": "rId2",
        "Type": hyperlink_type,
        "Target": "http://example.net",
    })
    element = xml_element("relationships:Relationships", {}, [first, second])

    relationships = read_relationships_xml_element(element)

    target = relationships.find_target_by_relationship_id("rId8")
    assert_equal("http://example.com", target)
24 |
25 |
def test_relationship_targets_can_be_found_by_type():
    # One core-properties relationship and two officeDocument relationships;
    # only the latter two targets should be returned, in document order.
    office_document_type = "http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument"
    core_properties_type = "http://schemas.openxmlformats.org/package/2006/relationships/metadata/core-properties"
    children = [
        xml_element("relationships:Relationship", {
            "Id": "rId2",
            "Target": "docProps/core.xml",
            "Type": core_properties_type,
        }),
        xml_element("relationships:Relationship", {
            "Id": "rId1",
            "Target": "word/document.xml",
            "Type": office_document_type,
        }),
        xml_element("relationships:Relationship", {
            "Id": "rId3",
            "Target": "word/document2.xml",
            "Type": office_document_type,
        }),
    ]
    element = xml_element("relationships:Relationships", {}, children)

    relationships = read_relationships_xml_element(element)

    targets = relationships.find_targets_by_type(office_document_type)
    assert_equal(["word/document.xml", "word/document2.xml"], targets)
49 |
50 |
def test_when_there_are_no_relationships_of_requested_type_then_empty_list_is_returned():
    # An empty Relationships element has no targets of any type.
    element = xml_element("relationships:Relationships", {}, [])

    relationships = read_relationships_xml_element(element)

    targets = relationships.find_targets_by_type(
        "http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument"
    )
    assert_equal([], targets)
58 |
--------------------------------------------------------------------------------
/tests/docx/content_types_xml_tests.py:
--------------------------------------------------------------------------------
1 | from mammoth.docx.xmlparser import element as xml_element
2 | from mammoth.docx.content_types_xml import read_content_types_xml_element
3 | from ..testing import assert_equal
4 |
5 |
def test_content_type_is_based_on_default_for_extension_if_there_is_no_override():
    # Only a Default entry for the "png" extension is declared.
    png_default = xml_element("content-types:Default", {
        "Extension": "png",
        "ContentType": "image/png",
    })
    element = xml_element("content-types:Types", {}, [png_default])

    content_types = read_content_types_xml_element(element)

    actual = content_types.find_content_type("word/media/hat.png")
    assert_equal("image/png", actual)
18 |
19 |
def test_content_type_is_based_on_override_if_present():
    # The Override for the exact part name should win over the png Default.
    png_default = xml_element("content-types:Default", {
        "Extension": "png",
        "ContentType": "image/png",
    })
    hat_override = xml_element("content-types:Override", {
        "PartName": "/word/media/hat.png",
        "ContentType": "image/hat"
    })
    element = xml_element("content-types:Types", {}, [png_default, hat_override])

    content_types = read_content_types_xml_element(element)

    actual = content_types.find_content_type("word/media/hat.png")
    assert_equal("image/hat", actual)
36 |
37 |
def test_fallback_content_types_have_common_image_types():
    # With no declared types, each common image extension should still
    # resolve to its content type.
    element = xml_element("content-types:Types", {}, [])
    content_types = read_content_types_xml_element(element)

    expectations = [
        ("image/png", "word/media/hat.png"),
        ("image/gif", "word/media/hat.gif"),
        ("image/jpeg", "word/media/hat.jpg"),
        ("image/jpeg", "word/media/hat.jpeg"),
        ("image/bmp", "word/media/hat.bmp"),
        ("image/tiff", "word/media/hat.tif"),
        ("image/tiff", "word/media/hat.tiff"),
    ]
    for expected_content_type, path in expectations:
        assert_equal(expected_content_type, content_types.find_content_type(path))
69 |
70 |
def test_fallback_content_types_are_case_insensitive():
    # A mixed-case extension should resolve like its lower-case form.
    element = xml_element("content-types:Types", {}, [])
    content_types = read_content_types_xml_element(element)

    actual = content_types.find_content_type("word/media/hat.PnG")
    assert_equal("image/png", actual)
78 |
--------------------------------------------------------------------------------
/tests/styles/parser/html_path_parser_tests.py:
--------------------------------------------------------------------------------
1 | from mammoth import html_paths
2 | from mammoth.styles.parser.html_path_parser import parse_html_path
3 | from mammoth.styles.parser.tokeniser import tokenise
4 | from mammoth.styles.parser.token_iterator import TokenIterator
5 | from ...testing import assert_equal
6 |
7 |
def test_can_read_empty_path():
    # An empty string should parse to the shared empty path.
    actual = read_html_path("")
    assert_equal(html_paths.empty, actual)
13 |
def test_can_read_single_element():
    # A bare tag name should yield a one-element path.
    expected = html_paths.path([html_paths.element(["p"])])
    assert_equal(expected, read_html_path("p"))
19 |
20 |
def test_can_read_choice_of_two_elements():
    # "|" separates alternative tag names for a single element.
    expected = html_paths.path([html_paths.element(["ul", "ol"])])
    assert_equal(expected, read_html_path("ul|ol"))
26 |
27 |
def test_can_read_choice_of_three_elements():
    # More than two alternatives can be chained with "|".
    expected = html_paths.path([html_paths.element(["ul", "ol", "p"])])
    assert_equal(expected, read_html_path("ul|ol|p"))
33 |
34 |
def test_can_read_nested_elements():
    # ">" separates successive elements of the path.
    expected = html_paths.path([
        html_paths.element(["ul"]),
        html_paths.element(["li"]),
    ])
    assert_equal(expected, read_html_path("ul > li"))
40 |
41 |
def test_can_read_class_on_element():
    # ".name" after the tag adds a class name.
    expected = html_paths.path([html_paths.element(["p"], class_names=["tip"])])
    assert_equal(expected, read_html_path("p.tip"))
47 |
48 |
def test_can_read_multiple_classes_on_element():
    # Several ".name" suffixes accumulate in order.
    expected = html_paths.path([
        html_paths.element(["p"], class_names=["tip", "help"]),
    ])
    assert_equal(expected, read_html_path("p.tip.help"))
54 |
55 |
def test_can_read_attribute_on_element():
    # "[name='value']" after the tag adds an attribute.
    expected = html_paths.path([
        html_paths.element(["p"], attributes={"lang": "fr"}),
    ])
    assert_equal(expected, read_html_path("p[lang='fr']"))
61 |
62 |
def test_can_read_multiple_attributes_on_element():
    # Several bracketed attributes are all collected.
    expected = html_paths.path([
        html_paths.element(["p"], attributes={"lang": "fr", "data-x": "y"}),
    ])
    assert_equal(expected, read_html_path("p[lang='fr'][data-x='y']"))
68 |
69 |
def test_can_read_when_element_must_be_fresh():
    # The ":fresh" suffix sets fresh=True on the element.
    expected = html_paths.path([html_paths.element(["p"], fresh=True)])
    assert_equal(expected, read_html_path("p:fresh"))
75 |
76 |
def test_can_read_separator_for_elements():
    # The ":separator('x')" suffix sets the element's separator.
    expected = html_paths.path([html_paths.element(["p"], separator="x")])
    assert_equal(expected, read_html_path("p:separator('x')"))
82 |
83 |
def test_can_read_ignore_element():
    # "!" should parse to the shared ignore path.
    actual = read_html_path("!")
    assert_equal(html_paths.ignore, actual)
89 |
def read_html_path(string):
    """Tokenise ``string`` and parse the tokens as an HTML path."""
    tokens = tokenise(string)
    return parse_html_path(TokenIterator(tokens))
92 |
--------------------------------------------------------------------------------
/tests/docx/files_tests.py:
--------------------------------------------------------------------------------
1 | from mammoth.docx.files import ExternalFileAccessIsDisabledError, Files, InvalidFileReferenceError
2 | from ..testing import generate_test_path, assert_equal, assert_raises
3 |
4 |
def test_when_external_file_access_is_disabled_then_opening_file_raises_error():
    # With external access switched off, opening any path must be refused.
    files = Files(None, external_file_access=False)

    def open_external():
        return files.open("/tmp/image.png")

    error = assert_raises(ExternalFileAccessIsDisabledError, open_external)

    assert_equal(
        "could not open external image '/tmp/image.png', external file access is disabled",
        str(error),
    )
12 |
13 |
def test_can_open_files_with_file_uri():
    # Contents read through a file:// URI must match the file on disk.
    path = generate_test_path("tiny-picture.png")
    files = Files(None, external_file_access=True)

    with files.open("file:///" + path) as image_file:
        contents = image_file.read()

    assert_equal(bytes, type(contents))
    with open(path, "rb") as source_file:
        expected = source_file.read()
    assert_equal(expected, contents)
22 |
23 |
def test_can_open_files_with_relative_uri():
    # A relative URI is opened using the base passed to Files.
    files = Files(generate_test_path(""), external_file_access=True)

    with files.open("tiny-picture.png") as image_file:
        contents = image_file.read()

    assert_equal(bytes, type(contents))
    with open(generate_test_path("tiny-picture.png"), "rb") as source_file:
        expected = source_file.read()
    assert_equal(expected, contents)
31 |
32 |
def test_given_base_is_not_set_when_opening_relative_uri_then_error_is_raised():
    # Without a base, a relative URI cannot be resolved.
    files = Files(None, external_file_access=True)

    def open_relative():
        return files.open("not-a-real-file.png")

    error = assert_raises(InvalidFileReferenceError, open_relative)

    assert_equal(
        "could not find external image 'not-a-real-file.png', fileobj has no name",
        str(error),
    )
40 |
41 |
def test_error_is_raised_if_relative_uri_cannot_be_opened():
    # The error message should name both the URI and the document directory,
    # followed by the underlying OS error.
    files = Files("/tmp", external_file_access=True)

    def open_missing():
        return files.open("not-a-real-file.png")

    error = assert_raises(InvalidFileReferenceError, open_missing)

    expected_message = "".join([
        "could not open external image: 'not-a-real-file.png' (document directory: '/tmp')\n",
        "[Errno 2] No such file or directory: '/tmp/not-a-real-file.png'",
    ])
    assert_equal(expected_message, str(error))
50 |
51 |
def test_error_is_raised_if_file_uri_cannot_be_opened():
    # Only the first line of the message is checked; the remainder is not
    # compared.
    files = Files("/tmp", external_file_access=True)

    def open_missing():
        return files.open("file:///not-a-real-file.png")

    error = assert_raises(InvalidFileReferenceError, open_missing)

    expected_prefix = "could not open external image: 'file:///not-a-real-file.png' (document directory: '/tmp')\n"
    assert str(error).startswith(expected_prefix)
57 |
--------------------------------------------------------------------------------
/mammoth/docx/style_map.py:
--------------------------------------------------------------------------------
1 | from xml.etree import ElementTree
2 |
3 | from ..zips import open_zip, update_zip
4 |
5 |
# Zip member name under which the style map is stored.
_style_map_path = "mammoth/style-map"
# Same location with a leading slash, used as the relationship Target and
# the content-type Override PartName.
_style_map_absolute_path = "/" + _style_map_path
# Zip members that write_style_map() reads and rewrites.
_relationships_path = "word/_rels/document.xml.rels"
_content_types_path = "[Content_Types].xml"
10 |
11 |
def write_style_map(fileobj, style_map):
    """Embed ``style_map`` in the docx archive held in ``fileobj``.

    The style map is stored UTF-8 encoded as its own zip member, and the
    relationships and content-types members are regenerated to reference it.
    """
    with open_zip(fileobj, "r") as zip_file:
        relationships_source = zip_file.read_str(_relationships_path)
        relationships_xml = _generate_relationships_xml(relationships_source)
        content_types_source = zip_file.read_str(_content_types_path)
        content_types_xml = _generate_content_types_xml(content_types_source)

    replacements = {
        _style_map_path: style_map.encode("utf8"),
        _relationships_path: relationships_xml,
        _content_types_path: content_types_xml,
    }
    update_zip(fileobj, replacements)
22 |
def _generate_relationships_xml(relationships_xml):
    """Return ``relationships_xml`` serialised with a style-map relationship.

    A ``Relationship`` with Id ``rMammothStyleMap`` is added, or updated if
    one with that Id already exists.
    """
    namespace = "http://schemas.openxmlformats.org/package/2006/relationships"
    relationship_tag = "{{{0}}}Relationship".format(namespace)

    root = ElementTree.fromstring(relationships_xml)
    style_map_relationship = {
        "Id": "rMammothStyleMap",
        "Type": "http://schemas.zwobble.org/mammoth/style-map",
        "Target": _style_map_absolute_path,
    }
    _add_or_update_element(root, relationship_tag, "Id", style_map_relationship)

    return ElementTree.tostring(root, "UTF-8")
36 |
37 |
def _generate_content_types_xml(content_types_xml):
    """Return ``content_types_xml`` serialised with a style-map override.

    An ``Override`` for the style map's part name is added, or updated if one
    with that part name already exists.
    """
    namespace = "http://schemas.openxmlformats.org/package/2006/content-types"
    override_tag = "{{{0}}}Override".format(namespace)

    root = ElementTree.fromstring(content_types_xml)
    style_map_override = {
        "PartName": _style_map_absolute_path,
        "ContentType": "text/prs.mammoth.style-map",
    }
    _add_or_update_element(root, override_tag, "PartName", style_map_override)

    return ElementTree.tostring(root, "UTF-8")
49 |
50 |
def _add_or_update_element(parent, name, identifying_attribute, attributes):
    """Ensure an element ``name`` with ``attributes`` exists under ``parent``.

    If an element with the same tag and the same value for
    ``identifying_attribute`` is found, its attributes are replaced;
    otherwise a new child of ``parent`` is created.
    """
    match = _find_child(parent, name, identifying_attribute, attributes)
    if match is not None:
        match.attrib = attributes
        return

    ElementTree.SubElement(parent, name, attributes)
57 |
58 |
59 | def _find_child(parent, name, identifying_attribute, attributes):
60 | for element in parent.iter():
61 | if element.tag == name and element.get(identifying_attribute) == attributes.get(identifying_attribute):
62 | return element
63 |
64 |
def read_style_map(fileobj):
    """Return the style map embedded in the archive in ``fileobj``.

    Returns ``None`` when the archive has no style-map member.
    """
    with open_zip(fileobj, "r") as zip_file:
        if not zip_file.exists(_style_map_path):
            return None
        return zip_file.read_str(_style_map_path)
69 |
70 |
71 |
--------------------------------------------------------------------------------
/tests/docx/xmlparser_tests.py:
--------------------------------------------------------------------------------
1 | import io
2 |
3 | from mammoth.docx.xmlparser import parse_xml, element as xml_element, text as xml_text
4 | from ..testing import assert_equal
5 |
6 |
def test_can_parse_self_closing_element():
    # Input is a single self-closing root element.
    xml = _parse_xml_string(b"<body/>")
    assert_equal(xml_element("body", {}, []), xml)
10 |
11 |
def test_can_parse_empty_element_with_separate_closing_tag():
    # Same element as the self-closing case, written with an explicit end tag.
    xml = _parse_xml_string(b"<body></body>")
    assert_equal(xml_element("body", {}, []), xml)
15 |
16 |
def test_can_parse_attributes_of_tag():
    # The root element carries a single name="bob" attribute.
    xml = _parse_xml_string(b"<body name='bob'/>")
    assert_equal(xml_element("body", {"name": "bob"}, []), xml)
20 |
21 |
def test_can_parse_text_element():
    # Text content becomes a text node child of the element.
    xml = _parse_xml_string(b"<body>Hello!</body>")
    assert_equal(xml_element("body", {}, [xml_text("Hello!")]), xml)
25 |
26 |
def test_can_parse_text_element_before_new_tag():
    # Text followed by a child element yields a text node and then the element.
    xml = _parse_xml_string(b"<body>Hello!<br/></body>")
    assert_equal(xml_element("body", {}, [xml_text("Hello!"), xml_element("br", {}, [])]), xml)
30 |
31 |
def test_can_parse_element_with_children():
    # Two empty child elements, in document order.
    xml = _parse_xml_string(b"<body><a/><b/></body>")
    assert_equal([xml_element("a", {}, []), xml_element("b", {}, [])], xml.children)
35 |
36 |
def test_unmapped_namespaces_uris_are_included_in_braces_as_prefix():
    # No namespace mapping is supplied, so the URI "word" appears in braces.
    xml = _parse_xml_string(b'<w:body xmlns:w="word"/>')
    assert_equal("{word}body", xml.name)
40 |
41 |
def test_mapped_namespaces_uris_are_translated_using_namespace_map():
    # The document uses prefix "w" for URI "word"; the mapping renames it "x".
    xml = _parse_xml_string(b'<w:body xmlns:w="word"/>', [("x", "word")])
    assert_equal("x:body", xml.name)
45 |
46 |
def test_namespace_of_attributes_is_mapped_to_prefix():
    # A namespaced attribute is exposed under the mapped prefix "x".
    xml = _parse_xml_string(b'<w:body xmlns:w="word" w:val="Hello!"/>', [("x", "word")])
    assert_equal("Hello!", xml.attributes["x:val"])
50 |
51 |
def test_whitespace_between_xml_declaration_and_root_tag_is_ignored():
    # A newline separates the XML declaration from the root element.
    xml = _parse_xml_string(b'<?xml version="1.0" ?>\n<body/>')
    assert_equal("body", xml.name)
55 |
56 |
class FindChildTests(object):
    """Tests for ``find_child`` on XML elements."""

    def test_returns_none_if_no_children(self):
        parent = xml_element("a")
        found = parent.find_child("b")
        assert_equal(None, found)

    def test_returns_none_if_no_matching_children(self):
        parent = xml_element("a", {}, [xml_element("c")])
        found = parent.find_child("b")
        assert_equal(None, found)

    def test_returns_first_matching_child(self):
        # Two "b" children; the one with id 1 comes first.
        children = [xml_element("b", {"id": 1}), xml_element("b", {"id": 2})]
        parent = xml_element("a", {}, children)
        found = parent.find_child("b")
        assert_equal(1, found.attributes["id"])

    def test_ignores_text_nodes(self):
        parent = xml_element("a", {}, [xml_text("Hello!")])
        found = parent.find_child("b")
        assert_equal(None, found)
73 |
74 |
def _parse_xml_string(string, namespace_mapping=None):
    """Parse the bytes ``string`` as XML via an in-memory file object."""
    fileobj = io.BytesIO(string)
    return parse_xml(fileobj, namespace_mapping)
77 |
--------------------------------------------------------------------------------
/recipes/wmf_images.py:
--------------------------------------------------------------------------------
1 | import io
2 | import os
3 | import shutil
4 | import subprocess
5 | import tempfile
6 |
7 |
8 | # An example of how to use LibreOffice and ImageMagick to convert WMF images to
9 | # PNGs.
10 | #
11 | # libreoffice_wmf_conversion uses LibreOffice to convert the image to a PNG.
12 | # This normally creates an image with a large amount of padding, so
13 | # imagemagick_trim can be used to trim the image.
14 | #
15 | # The image can then be converted using a normal image handler, such as
16 | # mammoth.images.data_uri.
17 | #
18 | # Example usage:
19 | #
20 | # def convert_image(image):
21 | # image = libreoffice_wmf_conversion(image, post_process=imagemagick_trim)
22 | # return mammoth.images.data_uri(image)
23 | #
24 | # with open("document.docx", "rb") as fileobj:
25 | # result = mammoth.convert_to_html(fileobj, convert_image=convert_image)
26 |
27 |
# Content types handled by libreoffice_wmf_conversion, mapped to the file
# extension used for the temporary input file.
_wmf_extensions = {
    "image/x-wmf": ".wmf",
    "image/x-emf": ".emf",
}


def libreoffice_wmf_conversion(image, post_process=None):
    """Convert a WMF/EMF image to PNG by running LibreOffice.

    Images whose content type is not in ``_wmf_extensions`` are returned
    unchanged. Otherwise the image is written to a temporary directory,
    converted with ``libreoffice --headless --convert-to png``, and a copy of
    the image with PNG content is passed through ``post_process`` and
    returned. The temporary directory is always removed.

    :param image: object with ``content_type``, ``open()`` and ``copy()``.
    :param post_process: optional callable applied to the converted image;
        defaults to returning it unchanged.
    """
    if post_process is None:
        def post_process(converted):
            return converted

    wmf_extension = _wmf_extensions.get(image.content_type)
    if wmf_extension is None:
        return image

    temporary_directory = tempfile.mkdtemp()
    try:
        input_path = os.path.join(temporary_directory, "image" + wmf_extension)
        output_path = os.path.join(temporary_directory, "image.png")

        with io.open(input_path, "wb") as input_fileobj:
            with image.open() as image_fileobj:
                shutil.copyfileobj(image_fileobj, input_fileobj)

        command = [
            "libreoffice",
            "--headless",
            "--convert-to",
            "png",
            input_path,
            "--outdir",
            temporary_directory,
        ]
        subprocess.check_call(command)

        # Read the PNG into memory now: the directory is deleted on exit.
        with io.open(output_path, "rb") as output_fileobj:
            output = output_fileobj.read()

        def open_image():
            return io.BytesIO(output)

        converted_image = image.copy(
            content_type="image/png",
            open=open_image,
        )
        return post_process(converted_image)
    finally:
        shutil.rmtree(temporary_directory)
72 |
73 |
def imagemagick_trim(image):
    """Trim an image's padding by piping it through ImageMagick ``convert``.

    :param image: object with ``open()`` and ``copy()``.
    :returns: a copy of ``image`` whose ``open`` yields the trimmed bytes.
    :raises subprocess.CalledProcessError: if ``convert`` exits non-zero.
    """
    command = ["convert", "-", "-trim", "-"]

    with image.open() as image_fileobj:
        image_bytes = image_fileobj.read()

    process = subprocess.Popen(command, stdin=subprocess.PIPE, stdout=subprocess.PIPE)
    try:
        # communicate() feeds stdin and drains stdout together, so neither
        # pipe can fill up and block the other side.
        output, _ = process.communicate(image_bytes)
    except BaseException:
        # Do not leave the child running on any failure, including
        # KeyboardInterrupt; then let the error propagate.
        process.kill()
        process.wait()
        raise

    return_code = process.returncode
    if return_code:
        raise subprocess.CalledProcessError(return_code, command)

    def open_image():
        return io.BytesIO(output)

    return image.copy(open=open_image)
94 |
95 |
--------------------------------------------------------------------------------
/tests/cli_tests.py:
--------------------------------------------------------------------------------
1 | import os
2 | import base64
3 |
4 | import spur
5 | import tempman
6 |
7 | from .testing import assert_equal, generate_test_path
8 |
9 |
# Shared local shell used by the tests below to run the "mammoth" command.
_local = spur.LocalShell()
11 |
12 |
def test_html_is_printed_to_stdout_if_output_file_is_not_set():
    # With only an input path, the converted HTML goes to stdout.
    docx_path = generate_test_path("single-paragraph.docx")
    result = _local.run(["mammoth", docx_path])
    assert_equal(b"", result.stderr_output)
    assert_equal(b"<p>Walking on imported air</p>", result.output)
18 |
19 |
def test_html_is_written_to_file_if_output_file_is_set():
    # With an output path, stdout stays empty and the HTML lands in the file.
    with tempman.create_temp_dir() as temp_dir:
        output_path = os.path.join(temp_dir.path, "output.html")
        docx_path = generate_test_path("single-paragraph.docx")
        result = _local.run(["mammoth", docx_path, output_path])
        assert_equal(b"", result.stderr_output)
        assert_equal(b"", result.output)
        with open(output_path) as output_file:
            assert_equal("<p>Walking on imported air</p>", output_file.read())
29 |
30 |
# Base64 encoding of the image bytes expected from tiny-picture.docx; compared
# against the extracted 1.png in the output-dir test below.
_image_base_64 = b"iVBORw0KGgoAAAANSUhEUgAAAAoAAAAKCAIAAAACUFjqAAAAAXNSR0IArs4c6QAAAAlwSFlzAAAOvgAADr4B6kKxwAAAABNJREFUKFNj/M+ADzDhlWUYqdIAQSwBE8U+X40AAAAASUVORK5CYII="
32 |
33 |
def test_inline_images_are_included_in_output_if_writing_to_single_file():
    # When writing to stdout, the image is embedded as a base64 data URI.
    docx_path = generate_test_path("tiny-picture.docx")
    result = _local.run(["mammoth", docx_path])
    assert_equal(b"""<p><img src="data:image/png;base64,""" + _image_base_64 + b"""" /></p>""", result.output)
38 |
39 |
def test_images_are_written_to_separate_files_if_output_dir_is_set():
    # Runs the CLI with --output-dir and checks that the document is written
    # to "tiny-picture.html" and the picture to "1.png" in that directory,
    # with nothing on stdout or stderr.
    with tempman.create_temp_dir() as temp_dir:
        output_path = os.path.join(temp_dir.path, "tiny-picture.html")
        image_path = os.path.join(temp_dir.path, "1.png")

        docx_path = generate_test_path("tiny-picture.docx")
        result = _local.run(["mammoth", docx_path, "--output-dir", temp_dir.path])
        assert_equal(b"", result.stderr_output)
        assert_equal(b"", result.output)
        with open(output_path) as output_file:
            # NOTE(review): the expected literal below is effectively empty,
            # which looks like extraction damage -- confirm against the
            # original test file.
            assert_equal("""
""", output_file.read())

        # The image file is compared via its base64 encoding.
        with open(image_path, "rb") as image_file:
            assert_equal(_image_base_64, base64.b64encode(image_file.read()))
54 |
55 |
def test_style_map_is_used_if_set():
    # Writes a one-line style map to a temporary file, passes it with
    # --style-map, and checks stdout and stderr of the conversion.
    source_docx = generate_test_path("single-paragraph.docx")
    with tempman.create_temp_dir() as workspace:
        mapping_path = os.path.join(workspace.path, "style-map")
        with open(mapping_path, "w") as mapping_file:
            mapping_file.write("p => span:fresh")

        command = ["mammoth", source_docx, "--style-map", mapping_path]
        completed = _local.run(command)

        assert_equal(b"", completed.stderr_output)
        assert_equal(b"Walking on imported air", completed.output)
65 |
66 |
def test_output_format_markdown_option_generates_markdown_output():
    # With --output-format=markdown the paragraph is printed followed by a
    # blank line, and nothing is written to stderr.
    command = [
        "mammoth",
        generate_test_path("single-paragraph.docx"),
        "--output-format=markdown",
    ]
    completed = _local.run(command)

    assert_equal(b"", completed.stderr_output)
    assert_equal(b"Walking on imported air\n\n", completed.output)
72 |
--------------------------------------------------------------------------------
/tests/transforms_tests.py:
--------------------------------------------------------------------------------
1 | import cobble
2 |
3 | from mammoth import documents, transforms
4 | from mammoth.transforms import get_descendants, get_descendants_of_type, _each_element
5 | from .testing import assert_equal
6 |
7 |
class ParagraphTests(object):
    # transforms.paragraph wraps a function so that it is applied to
    # paragraphs and other elements are returned unchanged.

    def test_paragraph_is_transformed(self):
        replace_with_tab = transforms.paragraph(lambda _: documents.tab())
        transformed = replace_with_tab(documents.paragraph(children=[]))
        assert_equal(documents.tab(), transformed)

    def test_non_paragraph_elements_are_not_transformed(self):
        replace_with_tab = transforms.paragraph(lambda _: documents.tab())
        transformed = replace_with_tab(documents.run(children=[]))
        assert_equal(documents.run(children=[]), transformed)
18 |
19 |
class RunTests(object):
    # transforms.run wraps a function so that it is applied to runs and other
    # elements are returned unchanged.

    def test_run_is_transformed(self):
        replace_with_tab = transforms.run(lambda _: documents.tab())
        transformed = replace_with_tab(documents.run(children=[]))
        assert_equal(documents.tab(), transformed)

    def test_non_paragraph_elements_are_not_transformed(self):
        # Despite its name, this checks that a non-run element (a paragraph)
        # passes through transforms.run untouched.
        replace_with_tab = transforms.run(lambda _: documents.tab())
        transformed = replace_with_tab(documents.paragraph(children=[]))
        assert_equal(documents.paragraph(children=[]), transformed)
30 |
31 |
class EachElementTests(object):
    def test_all_descendants_are_transformed(self):
        # A throwaway node type that records the order in which it was visited.
        @cobble.data
        class Count(documents.HasChildren):
            count = cobble.field()

        leaf = Count(count=None, children=[])
        middle = Count(count=None, children=[leaf])
        root = Count(count=None, children=[middle])

        visited = []

        def set_count(node):
            # The n-th node visited is stamped with n.
            visited.append(node)
            return node.copy(count=len(visited))

        result = _each_element(set_count)(root)

        # The expected counts show that the innermost node is transformed
        # first and the root last.
        expected = Count(count=3, children=[
            Count(count=2, children=[
                Count(count=1, children=[]),
            ]),
        ])
        assert_equal(expected, result)
56 |
57 |
class GetDescendantsTests(object):
    def test_returns_nothing_if_element_type_has_no_children(self):
        childless = documents.tab()
        assert_equal([], get_descendants(childless))

    def test_returns_nothing_if_element_has_empty_children(self):
        empty_paragraph = documents.paragraph(children=[])
        assert_equal([], get_descendants(empty_paragraph))

    def test_includes_children(self):
        first = documents.text("child 1")
        second = documents.text("child 2")
        children = [first, second]
        parent = documents.paragraph(children=children)
        assert_equal(children, get_descendants(parent))

    def test_includes_indirect_descendants(self):
        # The grandchild is expected before its parent in the result.
        grandchild = documents.text("grandchild")
        child = documents.run(children=[grandchild])
        root = documents.paragraph(children=[child])
        expected = [grandchild, child]
        assert_equal(expected, get_descendants(root))
75 |
76 |
class GetDescendantsOfTypeTests(object):
    def test_filters_descendants_to_type(self):
        # Only the Run child should survive the type filter.
        run = documents.run(children=[])
        tab = documents.tab()
        parent = documents.paragraph(children=[tab, run])
        matching = get_descendants_of_type(parent, documents.Run)
        assert_equal([run], matching)
83 |
--------------------------------------------------------------------------------
/tests/styles/parser/tokeniser_tests.py:
--------------------------------------------------------------------------------
1 | from precisely import assert_that, has_attrs, is_sequence
2 |
3 | from mammoth.styles.parser.tokeniser import tokenise
4 |
5 |
def test_unknown_tokens_are_tokenised():
    # A character with no other meaning becomes a single "unknown" token.
    source = "~"
    assert_tokens(source, is_token("unknown", source))
8 |
9 |
def test_empty_string_is_tokenised_to_end_of_file_token():
    # No tokens are expected besides the "end" token that assert_tokens
    # appends itself.
    no_tokens = ()
    assert_tokens("", *no_tokens)
12 |
13 |
def test_whitespace_is_tokenised():
    # Mixed spaces and tabs form one "whitespace" token.
    source = " \t\t "
    assert_tokens(source, is_token("whitespace", source))
16 |
17 |
def test_identifiers_are_tokenised():
    # A plain word is a single "identifier" token.
    source = "Overture"
    assert_tokens(source, is_token("identifier", source))
20 |
21 |
def test_escape_sequences_in_identifiers_are_tokenised():
    # A backslash-escaped colon is kept, escape included, in one identifier.
    source = r"\:"
    assert_tokens(source, is_token("identifier", source))
24 |
25 |
def test_integers_are_tokenised():
    # A run of digits is a single "integer" token.
    source = "123"
    assert_tokens(source, is_token("integer", source))
28 |
29 |
def test_strings_are_tokenised():
    # The token value keeps the surrounding single quotes.
    source = "'Tristan'"
    assert_tokens(source, is_token("string", source))
32 |
33 |
def test_escape_sequences_in_strings_are_tokenised():
    # An escaped quote does not terminate the string token.
    source = r"'Tristan\''"
    assert_tokens(source, is_token("string", source))
36 |
37 |
def test_unterminated_strings_are_tokenised():
    # A string with no closing quote gets its own token type.
    source = "'Tristan"
    assert_tokens(source, is_token("unterminated string", source))
40 |
41 |
def test_arrows_are_tokenised():
    # Two adjacent arrows are two separate "=>" symbol tokens.
    expected = [is_token("symbol", "=>"), is_token("symbol", "=>")]
    assert_tokens("=>=>", *expected)
44 |
45 |
def test_dots_are_tokenised():
    # A dot is a "symbol" token.
    source = "."
    assert_tokens(source, is_token("symbol", source))
48 |
49 |
def test_colons_are_tokenised():
    # Each colon is its own symbol token.
    expected = [is_token("symbol", ":"), is_token("symbol", ":")]
    assert_tokens("::", *expected)
52 |
53 |
def test_greater_thans_are_tokenised():
    # Each ">" is its own symbol token.
    expected = [is_token("symbol", ">"), is_token("symbol", ">")]
    assert_tokens(">>", *expected)
56 |
57 |
def test_equals_are_tokenised():
    # Each "=" is its own symbol token.
    expected = [is_token("symbol", "="), is_token("symbol", "=")]
    assert_tokens("==", *expected)
60 |
61 |
def test_open_parens_are_tokenised():
    # Each "(" is its own symbol token.
    expected = [is_token("symbol", "("), is_token("symbol", "(")]
    assert_tokens("((", *expected)
64 |
65 |
def test_close_parens_are_tokenised():
    # Each ")" is its own symbol token.
    expected = [is_token("symbol", ")"), is_token("symbol", ")")]
    assert_tokens("))", *expected)
68 |
69 |
def test_open_square_brackets_are_tokenised():
    # Each "[" is its own symbol token.
    expected = [is_token("symbol", "["), is_token("symbol", "[")]
    assert_tokens("[[", *expected)
72 |
73 |
def test_close_square_brackets_are_tokenised():
    # Each "]" is its own symbol token.
    expected = [is_token("symbol", "]"), is_token("symbol", "]")]
    assert_tokens("]]", *expected)
76 |
77 |
def test_choices_are_tokenised():
    # Each "|" is its own symbol token.
    expected = [is_token("symbol", "|"), is_token("symbol", "|")]
    assert_tokens("||", *expected)
80 |
81 |
def test_bangs_are_tokenised():
    # Each "!" is its own symbol token.
    expected = [is_token("symbol", "!"), is_token("symbol", "!")]
    assert_tokens("!!", *expected)
84 |
85 |
def test_can_tokenise_multiple_tokens():
    # Words separated by single spaces alternate identifier and whitespace
    # tokens.
    expected = [
        is_token("identifier", "The"),
        is_token("whitespace", " "),
        is_token("identifier", "Magic"),
        is_token("whitespace", " "),
        is_token("identifier", "Position"),
    ]
    assert_tokens("The Magic Position", *expected)
94 |
95 |
def assert_tokens(string, *expected):
    # Asserts that tokenising `string` yields exactly the `expected` token
    # matchers followed by an "end" token with an empty value.
    end_of_input = is_token("end", "")
    matchers = list(expected) + [end_of_input]
    assert_that(tokenise(string), is_sequence(*matchers))
103 |
104 |
def is_token(token_type, value):
    # Builds a matcher for an object whose `type` and `value` attributes have
    # the given values.
    expected_attrs = {"type": token_type, "value": value}
    return has_attrs(**expected_attrs)
110 |
--------------------------------------------------------------------------------
/mammoth/cli.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import io
3 | import os
4 | import shutil
5 | import sys
6 |
7 | import mammoth
8 | from . import writers
9 |
10 |
def main():
    """Command-line entry point: convert a .docx file and write the result.

    Reads the optional style map file, converts the document at ``args.path``
    and writes every conversion message to stderr. Without ``--output-dir``
    the result goes to the output path (or stdout when that is not set) and
    no custom image converter is used. With ``--output-dir`` images are saved
    as separate files by ``ImageWriter`` and the document is written into that
    directory under a name derived from the input file name.
    """
    args = _parse_args()

    if args.style_map is None:
        style_map = None
    else:
        with open(args.style_map) as style_map_fileobj:
            style_map = style_map_fileobj.read()

    with open(args.path, "rb") as docx_fileobj:
        if args.output_dir is None:
            convert_image = None
            output_path = args.output
        else:
            convert_image = mammoth.images.img_element(ImageWriter(args.output_dir))
            output_path = os.path.join(args.output_dir, _output_filename(args.path))

        result = mammoth.convert(
            docx_fileobj,
            style_map=style_map,
            convert_image=convert_image,
            output_format=args.output_format,
        )
        for message in result.messages:
            sys.stderr.write(message.message)
            sys.stderr.write("\n")

        _write_output(output_path, result.value)


def _output_filename(docx_path):
    """Return the output file name for ``docx_path``: its base name with the
    final extension replaced by ``.html``.

    ``os.path.splitext`` is used instead of ``rpartition(".")`` because the
    latter yields an empty stem for names without a dot, which produced an
    output file called just ``.html``.
    """
    stem = os.path.splitext(os.path.basename(docx_path))[0]
    return "{0}.html".format(stem)
40 |
41 |
class ImageWriter(object):
    """Callable that saves each image it is given into an output directory.

    Files are named ``<n>.<ext>``, where ``n`` counts up from 1 and ``ext`` is
    the part of the image's content type after the ``/``.
    """

    def __init__(self, output_dir):
        self._output_dir = output_dir
        self._image_number = 1

    def __call__(self, element):
        """Copy the image's bytes to the next numbered file.

        Returns a dict whose ``src`` entry is the written file name, relative
        to the output directory.
        """
        _, _, extension = element.content_type.partition("/")
        image_filename = "{0}.{1}".format(self._image_number, extension)
        destination = os.path.join(self._output_dir, image_filename)

        with open(destination, "wb") as image_dest, element.open() as image_source:
            shutil.copyfileobj(image_source, image_dest)

        self._image_number += 1
        return {"src": image_filename}
57 |
58 |
def _write_output(path, contents):
    """Write ``contents`` as UTF-8 to ``path``, or to stdout if ``path`` is None."""
    if path is not None:
        with io.open(path, "w", encoding="utf-8") as fileobj:
            fileobj.write(contents)
        return

    # Bytes are written directly: on Python 2 sys.stdout is used as-is, on
    # later versions its underlying ``buffer`` is used.
    if sys.version_info[0] <= 2:
        byte_stream = sys.stdout
    else:
        byte_stream = sys.stdout.buffer

    encoded = contents.encode("utf-8")
    byte_stream.write(encoded)
    byte_stream.flush()
71 |
72 |
def _parse_args():
    """Define the command-line interface and parse the process arguments.

    Arguments: a required positional ``path``; an optional positional
    ``output`` that is mutually exclusive with ``--output-dir``;
    ``--output-format``, restricted to the names returned by
    ``writers.formats()``; and ``--style-map``.

    Returns the namespace produced by ``ArgumentParser.parse_args()``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "path",
        metavar="docx-path",
        help="Path to the .docx file to convert.")

    output_group = parser.add_mutually_exclusive_group()
    output_group.add_argument(
        "output",
        nargs="?",
        metavar="output-path",
        help="Output path for the generated document. Images will be stored inline in the output document. Output is written to stdout if not set.")
    output_group.add_argument(
        "--output-dir",
        help="Output directory for generated HTML and images. Images will be stored in separate files. Mutually exclusive with output-path.")

    parser.add_argument(
        "--output-format",
        required=False,
        choices=writers.formats(),
        help="Output format.")
    parser.add_argument(
        "--style-map",
        required=False,
        # Help text typo fixed ("containg" -> "containing").
        help="File containing a style map.")
    return parser.parse_args()
100 |
101 |
# Run the command-line interface when this module is executed as a script.
if __name__ == "__main__":
    main()
104 |
105 |
--------------------------------------------------------------------------------
/mammoth/options.py:
--------------------------------------------------------------------------------
1 | from .styles.parser import read_style_mapping
2 | from . import lists, results
3 |
4 |
def read_options(options):
    """Resolve the style-map related entries of ``options``.

    Pops ``style_map``, ``embedded_style_map`` and
    ``include_default_style_map`` from ``options`` (the dict is modified in
    place), parses both style map texts, and stores the combined mappings
    back under ``style_map``: custom mappings first, then embedded ones, then
    the defaults unless disabled. ``ignore_empty_paragraphs`` defaults to
    True when absent.

    Returns the result of parsing the style maps with its value replaced by
    the updated ``options`` dict.
    """
    custom_text = options.pop("style_map", "") or ""
    embedded_text = options.pop("embedded_style_map", "") or ""
    use_default_mappings = options.pop("include_default_style_map", True)

    parsed = results.combine([
        _read_style_map(custom_text),
        _read_style_map(embedded_text),
    ])
    custom_mappings, embedded_mappings = parsed.value

    combined_mappings = custom_mappings + embedded_mappings
    if use_default_mappings:
        combined_mappings += _default_style_map

    options.setdefault("ignore_empty_paragraphs", True)
    options["style_map"] = combined_mappings
    return parsed.map(lambda _: options)
24 |
25 |
def _read_style_map(style_text):
    """Parse style map text, one mapping per non-empty, non-comment line.

    Returns the combined result of parsing each line, with falsy entries
    removed from its value.
    """
    cleaned_lines = (_get_line(raw_line) for raw_line in style_text.split("\n"))
    mapping_lines = [line for line in cleaned_lines if line]

    def drop_missing(style_mappings):
        return lists.filter(None, style_mappings)

    parsed_lines = lists.map(read_style_mapping, mapping_lines)
    return results.combine(parsed_lines).map(drop_missing)
30 |
31 |
def _get_line(line):
    """Return ``line`` stripped of surrounding whitespace, or None if the
    stripped line is a ``#`` comment."""
    stripped = line.strip()
    return None if stripped.startswith("#") else stripped
38 |
39 |
# Built-in style map, parsed once at import time. read_options appends these
# mappings after any custom and embedded mappings unless the caller disables
# them with include_default_style_map. Lines starting with "#" inside the
# text are comments and are dropped by _get_line.
_default_style_map_result = _read_style_map("""
p.Heading1 => h1:fresh
p.Heading2 => h2:fresh
p.Heading3 => h3:fresh
p.Heading4 => h4:fresh
p.Heading5 => h5:fresh
p.Heading6 => h6:fresh
p[style-name='Heading 1'] => h1:fresh
p[style-name='Heading 2'] => h2:fresh
p[style-name='Heading 3'] => h3:fresh
p[style-name='Heading 4'] => h4:fresh
p[style-name='Heading 5'] => h5:fresh
p[style-name='Heading 6'] => h6:fresh
p[style-name='heading 1'] => h1:fresh
p[style-name='heading 2'] => h2:fresh
p[style-name='heading 3'] => h3:fresh
p[style-name='heading 4'] => h4:fresh
p[style-name='heading 5'] => h5:fresh
p[style-name='heading 6'] => h6:fresh

# Apple Pages
p.Heading => h1:fresh
p[style-name='Heading'] => h1:fresh

r[style-name='Strong'] => strong

p[style-name='footnote text'] => p:fresh
r[style-name='footnote reference'] =>
p[style-name='endnote text'] => p:fresh
r[style-name='endnote reference'] =>
p[style-name='annotation text'] => p:fresh
r[style-name='annotation reference'] =>

# LibreOffice
p[style-name='Footnote'] => p:fresh
r[style-name='Footnote anchor'] =>
p[style-name='Endnote'] => p:fresh
r[style-name='Endnote anchor'] =>

p:unordered-list(1) => ul > li:fresh
p:unordered-list(2) => ul|ol > li > ul > li:fresh
p:unordered-list(3) => ul|ol > li > ul|ol > li > ul > li:fresh
p:unordered-list(4) => ul|ol > li > ul|ol > li > ul|ol > li > ul > li:fresh
p:unordered-list(5) => ul|ol > li > ul|ol > li > ul|ol > li > ul|ol > li > ul > li:fresh
p:ordered-list(1) => ol > li:fresh
p:ordered-list(2) => ul|ol > li > ol > li:fresh
p:ordered-list(3) => ul|ol > li > ul|ol > li > ol > li:fresh
p:ordered-list(4) => ul|ol > li > ul|ol > li > ul|ol > li > ol > li:fresh
p:ordered-list(5) => ul|ol > li > ul|ol > li > ul|ol > li > ul|ol > li > ol > li:fresh

r[style-name='Hyperlink'] =>

p[style-name='Normal'] => p:fresh

# Apple Pages
p.Body => p:fresh
p[style-name='Body'] => p:fresh
""")


# Sanity check that the built-in style map parsed without any messages.
assert not _default_style_map_result.messages
_default_style_map = _default_style_map_result.value
102 |
--------------------------------------------------------------------------------
/mammoth/docx/xmlparser.py:
--------------------------------------------------------------------------------
1 | import xml.dom.minidom
2 |
3 | import cobble
4 |
5 |
@cobble.data
class XmlElement(object):
    """An XML element with a name, an attribute mapping and child nodes."""

    name = cobble.field()
    attributes = cobble.field()
    children = cobble.field()

    def find_child_or_null(self, name):
        """Return the first child element called ``name``, or the shared null
        element when there is none, so lookups can be chained."""
        return self.find_child(name) or null_xml_element

    def find_child(self, name):
        """Return the first child element called ``name``, or None."""
        for child in self.children:
            if isinstance(child, XmlElement) and child.name == name:
                return child
        return None

    def find_children(self, name):
        """Return an XmlElementList of all child elements called ``name``.

        The matches are collected into a list rather than a lazy ``filter``
        object: on Python 3 ``filter`` returns a one-shot iterator, so the
        returned XmlElementList would appear empty on a second iteration.
        """
        return XmlElementList([
            child
            for child in self.children
            if child.node_type == node_types.element and child.name == name
        ])
26 |
27 |
class XmlElementList(object):
    """A re-iterable collection of elements that supports ``find_children``
    across all of its members."""

    def __init__(self, elements):
        # Copy into a list so the collection can be iterated more than once
        # even when built from a one-shot iterator (for example the result of
        # ``filter`` on Python 3).
        self._elements = list(elements)

    def __iter__(self):
        return iter(self._elements)

    def find_children(self, name):
        """Return an XmlElementList of every child called ``name`` of every
        element in this list, in order."""
        return XmlElementList([
            child
            for element in self._elements
            for child in element.find_children(name)
        ])
41 |
42 |
class NullXmlElement(object):
    """Null object standing in for a missing element.

    It has no attributes and no children, and every lookup on it comes back
    empty, so chained lookups do not need None checks.
    """

    attributes = {}
    children = []

    def find_child_or_null(self, name):
        return self

    def find_child(self, name):
        return None

    def find_children(self, name):
        # Mirrors XmlElement.find_children so a chained lookup on a missing
        # element yields an empty list instead of raising AttributeError.
        return XmlElementList([])


# Shared instance returned by XmlElement.find_child_or_null.
null_xml_element = NullXmlElement()
55 |
56 |
@cobble.data
class XmlText(object):
    # A text node; parse_xml stores the DOM node's nodeValue here.
    value = cobble.field()
61 |
def element(name, attributes=None, children=None):
    """Create an XmlElement; a falsy ``attributes`` or ``children`` is replaced
    by a fresh empty dict or list."""
    if not attributes:
        attributes = {}
    if not children:
        children = []
    return XmlElement(name, attributes, children)

text = XmlText
66 |
67 |
class node_types(object):
    # Numeric tags used to tell node classes apart; presumably chosen to
    # mirror the DOM nodeType constants -- confirm.
    element = 1
    text = 3


# Tag each node class so callers can test `node.node_type` instead of using
# isinstance.
XmlElement.node_type = node_types.element
XmlText.node_type = node_types.text
75 |
76 |
77 |
def parse_xml(fileobj, namespace_mapping=None):
    """Parse XML from ``fileobj`` into a tree of XmlElement and XmlText nodes.

    ``namespace_mapping`` is an optional iterable of ``(prefix, uri)`` pairs.
    Names in a mapped namespace are rendered as ``prefix:local``, names in an
    unmapped namespace as ``{uri}local``, and names with no namespace as the
    bare local name. Namespace declaration attributes are dropped, and nodes
    that are neither elements nor text are skipped.

    NOTE(review): the Python documentation warns that xml.dom.minidom is not
    secure against maliciously constructed data; confirm that inputs are
    trusted.
    """
    namespace_prefixes = {}
    if namespace_mapping is not None:
        for prefix, uri in namespace_mapping:
            namespace_prefixes[uri] = prefix

    def convert_name(node):
        uri = node.namespaceURI
        if uri is None:
            return node.localName

        prefix = namespace_prefixes.get(uri)
        if prefix is None:
            return "{%s}%s" % (uri, node.localName)
        return "%s:%s" % (prefix, node.localName)

    def convert_element(element):
        converted_name = convert_name(element)

        converted_attributes = {}
        for attribute in element.attributes.values():
            # Skip xmlns declarations: they are not real attributes.
            if attribute.namespaceURI != "http://www.w3.org/2000/xmlns/":
                converted_attributes[convert_name(attribute)] = attribute.value

        converted_children = [
            converted
            for converted in map(convert_node, element.childNodes)
            if converted is not None
        ]

        return XmlElement(converted_name, converted_attributes, converted_children)

    def convert_node(node):
        node_type = node.nodeType
        if node_type == xml.dom.Node.ELEMENT_NODE:
            return convert_element(node)
        if node_type == xml.dom.Node.TEXT_NODE:
            return XmlText(node.nodeValue)
        return None

    document = xml.dom.minidom.parse(fileobj)
    return convert_node(document.documentElement)
122 |
--------------------------------------------------------------------------------
/mammoth/styles/parser/html_path_parser.py:
--------------------------------------------------------------------------------
1 | import cobble
2 |
3 | from ... import html_paths
4 | from .tokeniser import TokenType
5 | from .token_parser import parse_identifier, parse_string
6 |
7 |
@cobble.data
class _AttributeOrClassName(object):
    # One parsed "[name='value']" attribute or ".class-name" suffix of an HTML
    # path element.

    # Attribute name ("class" for class-name suffixes).
    name = cobble.field()
    # Attribute value, or the class name.
    value = cobble.field()
    # True for class names, which are space-appended to any existing value of
    # the same name; False for attributes, which overwrite it.
    append = cobble.field()
13 |
14 |
def parse_html_path(tokens):
    """Parse an HTML path: either ``!`` (ignore) or a sequence of elements."""
    is_ignored = tokens.try_skip(TokenType.SYMBOL, "!")
    if is_ignored:
        return html_paths.ignore
    return html_paths.path(_parse_html_path_elements(tokens))
20 |
21 |
def _parse_html_path_elements(tokens):
    """Parse zero or more path elements separated by `` > ``."""
    # NOTE(review): the nesting of the loop inside the identifier check is
    # assumed from the code's shape; the source's indentation was lost.
    child_separator = (
        (TokenType.WHITESPACE, None),
        (TokenType.SYMBOL, ">"),
    )
    path_elements = []

    if tokens.peek_token_type() == TokenType.IDENTIFIER:
        path_elements.append(_parse_element(tokens))

        while tokens.try_skip_many(child_separator):
            # Whitespace is required after the ">" as well.
            tokens.skip(TokenType.WHITESPACE)
            path_elements.append(_parse_element(tokens))

    return path_elements
33 |
34 |
def _parse_element(tokens):
    """Parse one path element: tag names, attributes/classes, ``:fresh`` and
    ``:separator(...)``, in that order."""
    tag_names = _parse_tag_names(tokens)
    entries = _parse_attribute_or_class_names(tokens)
    fresh = _parse_is_fresh(tokens)
    separator = _parse_separator(tokens)

    attributes = {}
    for entry in entries:
        existing = attributes.get(entry.name)
        if entry.append and existing:
            # Class names accumulate, separated by spaces.
            attributes[entry.name] = existing + " " + entry.value
        else:
            attributes[entry.name] = entry.value

    return html_paths.element(
        tag_names,
        attributes=attributes,
        fresh=fresh,
        separator=separator,
    )
54 |
55 |
def _parse_tag_names(tokens):
    """Parse one or more tag names separated by ``|``."""
    names = []
    while True:
        names.append(parse_identifier(tokens))
        if not tokens.try_skip(TokenType.SYMBOL, "|"):
            return names
63 |
64 |
def _parse_attribute_or_class_names(tokens):
    """Parse attributes and class names until neither comes next."""
    parsed = []
    entry = _try_parse_attribute_or_class_name(tokens)
    while entry is not None:
        parsed.append(entry)
        entry = _try_parse_attribute_or_class_name(tokens)
    return parsed
76 |
77 |
def _try_parse_attribute_or_class_name(tokens):
    """Parse an attribute (next symbol ``[``) or a class name (next symbol
    ``.``); return None if neither is next."""
    parsers_by_symbol = (
        ("[", _parse_attribute),
        (".", _parse_class_name),
    )
    for symbol, parse in parsers_by_symbol:
        if tokens.is_next(TokenType.SYMBOL, symbol):
            return parse(tokens)
    return None
85 |
86 |
def _parse_attribute(tokens):
    """Parse ``[name='value']`` into a non-appending entry."""
    tokens.skip(TokenType.SYMBOL, "[")
    attribute_name = parse_identifier(tokens)
    tokens.skip(TokenType.SYMBOL, "=")
    attribute_value = parse_string(tokens)
    tokens.skip(TokenType.SYMBOL, "]")
    return _AttributeOrClassName(
        name=attribute_name,
        value=attribute_value,
        append=False,
    )
94 |
95 |
def _parse_class_name(tokens):
    """Parse ``.name`` into an appending entry for the ``class`` attribute."""
    tokens.skip(TokenType.SYMBOL, ".")
    return _AttributeOrClassName(
        name="class",
        value=parse_identifier(tokens),
        append=True,
    )
100 |
101 |
def _parse_is_fresh(tokens):
    """Try to consume ``:fresh``; return the result of that attempt."""
    fresh_marker = (
        (TokenType.SYMBOL, ":"),
        (TokenType.IDENTIFIER, "fresh"),
    )
    return tokens.try_skip_many(fresh_marker)
107 |
108 |
def _parse_separator(tokens):
    """Parse an optional ``:separator('...')``; return the string or None."""
    separator_marker = (
        (TokenType.SYMBOL, ":"),
        (TokenType.IDENTIFIER, "separator"),
    )
    if not tokens.try_skip_many(separator_marker):
        return None

    tokens.skip(TokenType.SYMBOL, "(")
    separator = parse_string(tokens)
    tokens.skip(TokenType.SYMBOL, ")")
    return separator
121 |
--------------------------------------------------------------------------------
/mammoth/html/__init__.py:
--------------------------------------------------------------------------------
1 | from ..lists import flat_map
2 | from .nodes import TextNode, Tag, Element, ForceWrite, NodeVisitor
3 |
4 |
def text(value):
    """Create a text node holding ``value``."""
    node = TextNode(value)
    return node
7 |
8 |
def tag(tag_names, attributes=None, collapsible=None, separator=None):
    """Create a Tag.

    A ``tag_names`` value that is not a list is wrapped in one, ``attributes``
    defaults to an empty dict and ``collapsible`` is coerced to a bool.
    """
    names = tag_names if isinstance(tag_names, list) else [tag_names]
    attrs = {} if attributes is None else attributes
    return Tag(
        tag_names=names,
        attributes=attrs,
        collapsible=bool(collapsible),
        separator=separator,
    )
15 |
16 |
def element(tag_names, attributes=None, children=None, collapsible=None, separator=None):
    """Create an Element with the given tag settings; ``children`` defaults to
    an empty list."""
    element_tag = tag(
        tag_names=tag_names,
        attributes=attributes,
        collapsible=collapsible,
        separator=separator,
    )
    return Element(element_tag, [] if children is None else children)
23 |
24 |
def collapsible_element(tag_names, attributes=None, children=None):
    """Create an Element whose tag is marked collapsible."""
    return element(
        tag_names,
        attributes=attributes,
        children=children,
        collapsible=True,
    )
27 |
28 |
# Shared ForceWrite node. StripEmpty keeps it while stripping, and the node
# writer emits nothing for it.
force_write = ForceWrite()
30 |
31 |
def strip_empty(nodes):
    """Return ``nodes`` with empty text nodes and empty non-void elements
    removed, recursively."""
    return flat_map(lambda node: StripEmpty().visit(node), nodes)
34 |
35 |
def _strip_empty_node(node):
    """Return the list of nodes that ``node`` becomes after stripping."""
    visitor = StripEmpty()
    return visitor.visit(node)
38 |
39 |
class StripEmpty(NodeVisitor):
    """Visitor that maps each node to a list: empty to drop it, one item to
    keep it."""

    def visit_text_node(self, node):
        # Text with an empty value is dropped.
        return [node] if node.value else []

    def visit_element(self, element):
        # Strip the children first; an element survives if anything is left
        # inside it or if it is a void element.
        remaining_children = strip_empty(element.children)
        if remaining_children or element.is_void():
            return [Element(element.tag, remaining_children)]
        return []

    def visit_force_write(self, node):
        # Always kept.
        return [node]
56 |
57 |
def collapse(nodes):
    """Return a new list in which each collapsible element has been merged
    into the preceding element when the two match."""
    result = []
    for node in nodes:
        candidate = _collapse_node(node)
        if not _try_collapse(result, candidate):
            result.append(candidate)
    return result
65 |
class _CollapseNode(NodeVisitor):
    """Visitor that collapses the children of elements and returns every
    other node unchanged."""

    def visit_text_node(self, node):
        return node

    def visit_element(self, element):
        collapsed_children = collapse(element.children)
        return Element(element.tag, collapsed_children)

    def visit_force_write(self, node):
        return node

_collapse_node = _CollapseNode().visit
77 |
78 |
def _collapsing_add(collapsed, node):
    """Collapse ``node`` internally, then merge it into the last item of
    ``collapsed`` or, failing that, append it."""
    candidate = _collapse_node(node)
    merged = _try_collapse(collapsed, candidate)
    if merged:
        return
    collapsed.append(candidate)
83 |
def _try_collapse(collapsed, node):
    """Try to merge ``node`` into the last item of ``collapsed``.

    Merging requires both to be elements, ``node`` to be collapsible and the
    two to match. On success any separator is added as text first, then the
    children of ``node`` are added to the last item, and True is returned;
    otherwise ``collapsed`` is untouched and False is returned.
    """
    if not collapsed:
        return False

    previous = collapsed[-1]
    both_elements = isinstance(previous, Element) and isinstance(node, Element)
    if not (both_elements and node.collapsible and _is_match(previous, node)):
        return False

    if node.separator:
        previous.children.append(text(node.separator))

    for child in node.children:
        _collapsing_add(previous.children, child)

    return True
105 |
def _is_match(first, second):
    """Return whether ``first``'s tag name is among ``second``'s tag names and
    both have equal attributes."""
    if first.tag_name not in second.tag_names:
        return False
    return first.attributes == second.attributes
108 |
109 |
def write(writer, nodes):
    """Emit every node in ``nodes`` through ``writer``."""
    _NodeWriter(writer).visit_all(nodes)
113 |
114 |
class _NodeWriter(NodeVisitor):
    """Visitor that emits nodes through a writer object."""

    def __init__(self, writer):
        self._writer = writer

    def visit_text_node(self, node):
        self._writer.text(node.value)

    def visit_element(self, element):
        # Void elements are written self-closing and their children are not
        # visited.
        if element.is_void():
            self._writer.self_closing(element.tag_name, element.attributes)
            return

        self._writer.start(element.tag_name, element.attributes)
        self.visit_all(element.children)
        self._writer.end(element.tag_name)

    def visit_force_write(self, element):
        # Nothing is written for a force-write node.
        pass

    def visit_all(self, nodes):
        for node in nodes:
            self.visit(node)
136 |
--------------------------------------------------------------------------------
/mammoth/docx/styles_xml.py:
--------------------------------------------------------------------------------
1 | import collections
2 |
3 |
class Styles(object):
    """Lookup of style definitions by style ID, kept per style type."""

    @staticmethod
    def create(paragraph_styles=None, character_styles=None, table_styles=None, numbering_styles=None):
        """Build a Styles, using an empty dict for every argument left as None."""
        def or_empty(styles):
            return {} if styles is None else styles

        return Styles(
            paragraph_styles=or_empty(paragraph_styles),
            character_styles=or_empty(character_styles),
            table_styles=or_empty(table_styles),
            numbering_styles=or_empty(numbering_styles),
        )

    def __init__(self, paragraph_styles, character_styles, table_styles, numbering_styles):
        self._paragraph_styles = paragraph_styles
        self._character_styles = character_styles
        self._table_styles = table_styles
        self._numbering_styles = numbering_styles

    # Each finder returns the style stored under ``style_id`` or None.

    def find_paragraph_style_by_id(self, style_id):
        return self._paragraph_styles.get(style_id)

    def find_character_style_by_id(self, style_id):
        return self._character_styles.get(style_id)

    def find_table_style_by_id(self, style_id):
        return self._table_styles.get(style_id)

    def find_numbering_style_by_id(self, style_id):
        return self._numbering_styles.get(style_id)


# Styles instance with no definitions of any type.
Styles.EMPTY = Styles.create()
48 |
49 |
def read_styles_xml_element(element):
    """Read every ``w:style`` child of ``element`` into a Styles lookup.

    Styles are grouped by their ``w:type`` attribute; definitions whose type
    is not paragraph, character, table or numbering are read but not stored.
    """
    styles_by_type = {
        "paragraph": {},
        "character": {},
        "table": {},
        "numbering": {},
    }

    for style_element in element.find_children("w:style"):
        style_type = style_element.attributes["w:type"]
        if style_type == "numbering":
            read_style = _read_numbering_style_element
        else:
            read_style = _read_style_element
        style = read_style(style_element)

        same_type_styles = styles_by_type.get(style_type)
        if same_type_styles is None:
            continue

        # Per 17.7.4.17 style (Style Definition) of ECMA-376 4th edition Part 1:
        #
        # > If multiple style definitions each declare the same value for their
        # > styleId, then the first such instance shall keep its current
        # > identifier with all other instances being reassigned in any manner
        # > desired.
        #
        # For the purpose of conversion, there's no point holding onto styles
        # with reassigned style IDs, so only the first definition is kept.
        same_type_styles.setdefault(style.style_id, style)

    return Styles(
        paragraph_styles=styles_by_type["paragraph"],
        character_styles=styles_by_type["character"],
        table_styles=styles_by_type["table"],
        numbering_styles=styles_by_type["numbering"],
    )
90 |
91 |
Style = collections.namedtuple("Style", ["style_id", "name"])


def _read_style_element(element):
    """Read a non-numbering ``w:style`` element: its ID and the ``w:val`` of
    its ``w:name`` child (None when absent)."""
    name_element = element.find_child_or_null("w:name")
    return Style(
        style_id=_read_style_id(element),
        name=name_element.attributes.get("w:val"),
    )
99 |
100 |
NumberingStyle = collections.namedtuple("NumberingStyle", ["style_id", "num_id"])


def _read_numbering_style_element(element):
    """Read a numbering ``w:style`` element: its ID and the ``w:val`` found at
    ``w:pPr/w:numPr/w:numId`` (None when any step is absent)."""
    style_id = _read_style_id(element)

    # Each missing step falls back to the null element, so the final lookup
    # just yields None.
    num_id_element = element
    for child_name in ("w:pPr", "w:numPr", "w:numId"):
        num_id_element = num_id_element.find_child_or_null(child_name)

    return NumberingStyle(
        style_id=style_id,
        num_id=num_id_element.attributes.get("w:val"),
    )
114 |
115 |
def _read_style_id(element):
    """Return the element's ``w:styleId`` attribute; raises KeyError if absent."""
    attributes = element.attributes
    return attributes["w:styleId"]
118 |
--------------------------------------------------------------------------------
/tests/html/collapse_tests.py:
--------------------------------------------------------------------------------
1 | from mammoth import html
2 | from ..testing import assert_equal
3 |
4 |
def test_collapsing_does_nothing_to_single_text_node():
    # A lone text node comes back unchanged.
    collapsed = html.collapse([html.text("Bluebells")])
    assert_equal(collapsed, [html.text("Bluebells")])
9 |
10 |
def test_consecutive_fresh_elements_are_not_collapsed():
    # Elements that are not collapsible stay separate even when identical.
    collapsed = html.collapse([html.element("p"), html.element("p")])
    assert_equal(collapsed, [html.element("p"), html.element("p")])
15 |
16 |
def test_consecutive_collapsible_elements_are_collapsed_if_they_have_the_same_tag_and_attributes():
    # Two matching collapsible paragraphs merge into one holding both texts.
    expected = [
        html.collapsible_element("p", {}, [html.text("One"), html.text("Two")]),
    ]
    collapsed = html.collapse([
        html.collapsible_element("p", {}, [html.text("One")]),
        html.collapsible_element("p", {}, [html.text("Two")]),
    ])
    assert_equal(expected, collapsed)
24 |
25 |
def test_elements_with_different_tag_names_are_not_collapsed():
    # A "div" cannot merge into a preceding "p".
    expected = [
        html.collapsible_element("p", {}, [html.text("One")]),
        html.collapsible_element("div", {}, [html.text("Two")]),
    ]
    collapsed = html.collapse([
        html.collapsible_element("p", {}, [html.text("One")]),
        html.collapsible_element("div", {}, [html.text("Two")]),
    ])
    assert_equal(expected, collapsed)
37 |
38 |
def test_elements_with_different_attributes_are_not_collapsed():
    # Same tag name but different attributes: no merge.
    expected = [
        html.collapsible_element("p", {"id": "a"}, [html.text("One")]),
        html.collapsible_element("p", {}, [html.text("Two")]),
    ]
    collapsed = html.collapse([
        html.collapsible_element("p", {"id": "a"}, [html.text("One")]),
        html.collapsible_element("p", {}, [html.text("Two")]),
    ])
    assert_equal(expected, collapsed)
50 |
51 |
def test_children_of_collapsed_element_can_collapse_with_children_of_previous_element():
    # Merging two blockquotes also merges their matching inner paragraphs.
    merged_paragraph = html.collapsible_element("p", {}, [
        html.text("One"),
        html.text("Two"),
    ])
    expected = [
        html.collapsible_element("blockquote", {}, [merged_paragraph]),
    ]

    collapsed = html.collapse([
        html.collapsible_element("blockquote", {}, [
            html.collapsible_element("p", {}, [html.text("One")]),
        ]),
        html.collapsible_element("blockquote", {}, [
            html.collapsible_element("p", {}, [html.text("Two")]),
        ]),
    ])

    assert_equal(expected, collapsed)
71 |
72 |
def test_collapsible_element_can_collapse_into_previous_fresh_element():
    """A collapsible element merges into a preceding html.element of the same tag."""
    expected = [html.element("p", {}, [html.text("One"), html.text("Two")])]
    collapsed = html.collapse([
        html.element("p", {}, [html.text("One")]),
        html.collapsible_element("p", {}, [html.text("Two")]),
    ])
    assert_equal(expected, collapsed)
80 |
81 |
def test_element_with_choice_of_tag_names_can_collapse_into_previous_element_if_it_has_one_of_those_tag_names_as_its_main_tag_name():
    """An element offering several tag names merges only in the order checked below."""
    # ["ul", "ol"] following "ol": the pair collapses into a single "ol".
    expected_merged = [html.collapsible_element(["ol"])]
    merged = html.collapse([
        html.collapsible_element("ol"),
        html.collapsible_element(["ul", "ol"]),
    ])
    assert_equal(expected_merged, merged)

    # "ol" following ["ul", "ol"]: both elements are kept.
    expected_separate = [
        html.collapsible_element(["ul", "ol"]),
        html.collapsible_element("ol"),
    ]
    separate = html.collapse([
        html.collapsible_element(["ul", "ol"]),
        html.collapsible_element("ol"),
    ])
    assert_equal(expected_separate, separate)
99 |
100 |
def test_when_separator_is_present_then_separator_is_prepended_to_collapsed_element():
    """The separator text sits between the existing and the merged-in children."""
    expected_children = [
        html.text("Hello"),
        html.text("\n"),
        html.text(" the"),
        html.text("re"),
    ]
    expected = [html.element("pre", collapsible=False, children=expected_children)]

    first = html.element("pre", collapsible=False, children=[html.text("Hello")])
    second = html.element(
        "pre",
        collapsible=True,
        separator="\n",
        children=[html.text(" the"), html.text("re")],
    )
    collapsed = html.collapse([first, second])

    assert_equal(expected, collapsed)
116 |
--------------------------------------------------------------------------------
/mammoth/docx/numbering_xml.py:
--------------------------------------------------------------------------------
1 | import cobble
2 |
3 | from ..documents import numbering_level
4 | from .styles_xml import Styles
5 |
6 |
def read_numbering_xml_element(element, styles):
    """Build a Numbering from a numbering XML element and the document's styles.

    The w:abstractNum children are read first, then the w:num children.
    """
    return Numbering(
        abstract_nums=_read_abstract_nums(element),
        nums=_read_nums(element),
        styles=styles,
    )
11 |
12 |
def _read_abstract_nums(element):
    """Return a dict mapping abstract num id to _AbstractNum for each w:abstractNum child."""
    abstract_nums = {}
    for abstract_num_element in element.find_children("w:abstractNum"):
        abstract_num_id, abstract_num = _read_abstract_num(abstract_num_element)
        # Later definitions with the same id replace earlier ones.
        abstract_nums[abstract_num_id] = abstract_num
    return abstract_nums
16 |
17 |
def _read_abstract_num(element):
    """Read one w:abstractNum element into an ``(id, _AbstractNum)`` pair."""
    abstract_num_id = element.attributes.get("w:abstractNumId")
    levels = _read_abstract_num_levels(element)
    style_link_element = element.find_child_or_null("w:numStyleLink")
    abstract_num = _AbstractNum(
        levels=levels,
        num_style_link=style_link_element.attributes.get("w:val"),
    )
    return abstract_num_id, abstract_num
23 |
24 |
@cobble.data
class _AbstractNum(object):
    """Parsed form of a w:abstractNum element."""

    # Dict of level index to _AbstractNumLevel, as built by
    # _read_abstract_num_levels.
    levels = cobble.field()
    # The w:val of the w:numStyleLink child, or None when absent.
    num_style_link = cobble.field()
29 |
30 |
@cobble.data
class _AbstractNumLevel(object):
    """Parsed form of a w:lvl element inside an abstract numbering definition."""

    # The w:ilvl attribute as read from the XML (not converted to an int).
    level_index = cobble.field()
    # False when the w:numFmt value is "bullet", True otherwise.
    is_ordered = cobble.field()
    # The w:val of the w:pStyle child, or None when absent.
    paragraph_style_id = cobble.field()
36 |
37 |
def _read_abstract_num_levels(element):
    """Return a dict of level index to _AbstractNumLevel for the w:lvl children."""
    indexed_levels = {}

    # Some malformed documents define numbering levels without an index, and
    # reference the numbering using a w:numPr element without a w:ilvl child.
    # Such a level is treated as level "0", but only if no explicitly indexed
    # level already claims that index.
    fallback_level = None

    for lvl_element in element.find_children("w:lvl"):
        parsed = _read_abstract_num_level(lvl_element)
        if parsed.level_index is not None:
            indexed_levels[parsed.level_index] = parsed
            continue
        parsed.level_index = "0"
        # The last index-less level wins if there are several.
        fallback_level = parsed

    if fallback_level is not None:
        indexed_levels.setdefault(fallback_level.level_index, fallback_level)

    return indexed_levels
58 |
59 |
def _read_abstract_num_level(element):
    """Read a single w:lvl element into an _AbstractNumLevel."""

    def child_val(tag_name):
        # Value of the w:val attribute on the named child, or None.
        return element.find_child_or_null(tag_name).attributes.get("w:val")

    level_index = element.attributes.get("w:ilvl")
    # Every number format other than "bullet" counts as ordered.
    is_ordered = child_val("w:numFmt") != "bullet"
    paragraph_style_id = child_val("w:pStyle")
    return _AbstractNumLevel(
        level_index=level_index,
        is_ordered=is_ordered,
        paragraph_style_id=paragraph_style_id,
    )
70 |
71 |
def _read_nums(element):
    """Return a dict mapping num id to _Num for each w:num child of ``element``."""
    nums = {}
    for num_element in element.find_children("w:num"):
        num_id, num = _read_num(num_element)
        nums[num_id] = num
    return nums
78 |
79 |
def _read_num(element):
    """Read one w:num element into a ``(num_id, _Num)`` pair.

    Raises KeyError if the w:abstractNumId child has no w:val attribute.
    """
    abstract_num_id_element = element.find_child_or_null("w:abstractNumId")
    num = _Num(abstract_num_id=abstract_num_id_element.attributes["w:val"])
    return element.attributes.get("w:numId"), num
84 |
85 |
@cobble.data
class _Num(object):
    """Parsed form of a w:num element."""

    # The w:val of the w:abstractNumId child; used as the key into the
    # abstract numbering definitions.
    abstract_num_id = cobble.field()
89 |
90 |
class Numbering(object):
    """Looks up numbering levels from parsed numbering definitions.

    ``abstract_nums`` maps abstract num id to an object with ``levels`` and
    ``num_style_link``; ``nums`` maps num id to an object with
    ``abstract_num_id``; ``styles`` provides ``find_numbering_style_by_id``.
    """

    def __init__(self, abstract_nums, nums, styles):
        self._abstract_nums = abstract_nums
        # Index every level that names a paragraph style, so paragraphs can be
        # matched to a numbering level by style id alone.
        self._levels_by_paragraph_style_id = dict(
            (level.paragraph_style_id, self._to_numbering_level(level))
            for abstract_num in abstract_nums.values()
            for level in abstract_num.levels.values()
            if level.paragraph_style_id is not None
        )
        self._nums = nums
        self._styles = styles

    def find_level(self, num_id, level):
        """Return the numbering level for ``num_id`` at index ``level``, or None.

        When the abstract numbering links to a numbering style, the lookup
        continues with that style's num id. None is returned when any step of
        the chain is missing (unknown num, abstract num, style or level) or
        when the chain of style links loops back on itself.
        """
        visited_num_ids = set()
        while num_id not in visited_num_ids:
            visited_num_ids.add(num_id)

            num = self._nums.get(num_id)
            if num is None:
                return None

            abstract_num = self._abstract_nums.get(num.abstract_num_id)
            if abstract_num is None:
                return None

            if abstract_num.num_style_link is None:
                return self._to_numbering_level(abstract_num.levels.get(level))

            style = self._styles.find_numbering_style_by_id(abstract_num.num_style_link)
            if style is None:
                # The link points at a style that does not exist.
                return None
            num_id = style.num_id

        # A num id was revisited: the style links form a cycle.
        return None

    def find_level_by_paragraph_style_id(self, style_id):
        """Return the numbering level tied to a paragraph style id, or None."""
        return self._levels_by_paragraph_style_id.get(style_id)

    def _to_numbering_level(self, abstract_num_level):
        """Convert a parsed level to a ``numbering_level``, passing None through."""
        if abstract_num_level is None:
            return None
        else:
            return numbering_level(
                level_index=abstract_num_level.level_index,
                is_ordered=abstract_num_level.is_ordered,
            )
128 |
129 |
# Shared instance with no numbering definitions, backed by the empty style set.
Numbering.EMPTY = Numbering(abstract_nums={}, nums={}, styles=Styles.EMPTY)
131 |
--------------------------------------------------------------------------------
/mammoth/styles/parser/document_matcher_parser.py:
--------------------------------------------------------------------------------
1 | from ... import documents, document_matchers
2 | from .errors import LineParseError
3 | from .tokeniser import TokenType
4 | from .token_parser import try_parse_class_name, parse_string
5 |
6 |
def parse_document_matcher(tokens):
    """Parse a document matcher from ``tokens`` and return it.

    The leading identifier selects the kind of matcher. Paragraph, run and
    table matchers may carry a style id and a style name (paragraphs also a
    numbering); highlight and br delegate to their own parsers; the rest are
    fixed matchers from ``document_matchers``.

    Raises LineParseError when the leading identifier is not recognised.
    """
    identifier = TokenType.IDENTIFIER

    if tokens.try_skip(identifier, "p"):
        return document_matchers.paragraph(
            style_id=try_parse_class_name(tokens),
            style_name=_parse_style_name(tokens),
            numbering=_parse_numbering(tokens),
        )

    if tokens.try_skip(identifier, "r"):
        return document_matchers.run(
            style_id=try_parse_class_name(tokens),
            style_name=_parse_style_name(tokens),
        )

    if tokens.try_skip(identifier, "table"):
        return document_matchers.table(
            style_id=try_parse_class_name(tokens),
            style_name=_parse_style_name(tokens),
        )

    # Matchers that take no further arguments, tried in a fixed order.
    fixed_matchers = (
        ("b", document_matchers.bold),
        ("i", document_matchers.italic),
        ("u", document_matchers.underline),
        ("strike", document_matchers.strikethrough),
        ("all-caps", document_matchers.all_caps),
        ("small-caps", document_matchers.small_caps),
    )
    for name, matcher in fixed_matchers:
        if tokens.try_skip(identifier, name):
            return matcher

    if tokens.try_skip(identifier, "highlight"):
        return _parse_highlight(tokens)

    if tokens.try_skip(identifier, "comment-reference"):
        return document_matchers.comment_reference

    if tokens.try_skip(identifier, "br"):
        return _parse_break(tokens)

    unrecognised = tokens.next_value(identifier)
    raise LineParseError("Unrecognised document element: {0}".format(unrecognised))
66 |
def _parse_style_name(tokens):
    """Parse an optional ``[style-name<op>'...']`` clause.

    Returns the string matcher for the clause, or None when the next token is
    not an opening square bracket.
    """
    if not tokens.try_skip(TokenType.SYMBOL, "["):
        return None

    tokens.skip(TokenType.IDENTIFIER, "style-name")
    matcher = _parse_string_matcher(tokens)
    tokens.skip(TokenType.SYMBOL, "]")
    return matcher
75 |
76 |
def _parse_string_matcher(tokens):
    """Parse ``='...'`` (exact match) or ``^='...'`` (prefix match).

    Raises LineParseError for any other operator.
    """
    if tokens.try_skip(TokenType.SYMBOL, "="):
        return document_matchers.equal_to(parse_string(tokens))

    if tokens.try_skip(TokenType.SYMBOL, "^="):
        return document_matchers.starts_with(parse_string(tokens))

    unrecognised = tokens.next_value()
    raise LineParseError("Unrecognised string matcher: {0}".format(unrecognised))
84 |
def _parse_numbering(tokens):
    """Parse an optional ``:ordered-list(N)`` or ``:unordered-list(N)`` suffix.

    Returns a numbering level, or None when the next token is not a colon.
    """
    if not tokens.try_skip(TokenType.SYMBOL, ":"):
        return None

    is_ordered = _parse_list_type(tokens)
    tokens.skip(TokenType.SYMBOL, "(")
    written_level = int(tokens.next_value(TokenType.INTEGER))
    tokens.skip(TokenType.SYMBOL, ")")
    # The written level is one higher than the level passed on.
    return documents.numbering_level(written_level - 1, is_ordered=is_ordered)
92 |
93 |
def _parse_list_type(tokens):
    """Read a list-type identifier; True for ordered, False for unordered.

    Raises LineParseError for any other identifier.
    """
    ordered_by_list_type = {
        "ordered-list": True,
        "unordered-list": False,
    }
    list_type = tokens.next_value(TokenType.IDENTIFIER)
    if list_type in ordered_by_list_type:
        return ordered_by_list_type[list_type]
    raise LineParseError("Unrecognised list type: {0}".format(list_type))
102 |
103 |
def _parse_highlight(tokens):
    """Parse the remainder of a highlight matcher, with an optional ``[color='...']``."""
    color = None
    if tokens.try_skip(TokenType.SYMBOL, "["):
        tokens.skip(TokenType.IDENTIFIER, "color")
        tokens.skip(TokenType.SYMBOL, "=")
        color = parse_string(tokens)
        tokens.skip(TokenType.SYMBOL, "]")

    return document_matchers.highlight(color=color)
114 |
115 |
def _parse_break(tokens):
    """Parse the mandatory ``[type='...']`` clause of a br matcher.

    Returns the line, page or column break matcher.
    Raises LineParseError for any other break type.
    """
    tokens.skip(TokenType.SYMBOL, "[")
    tokens.skip(TokenType.IDENTIFIER, "type")
    tokens.skip(TokenType.SYMBOL, "=")
    type_name = parse_string(tokens)
    tokens.skip(TokenType.SYMBOL, "]")

    matchers_by_type = {
        "line": document_matchers.line_break,
        "page": document_matchers.page_break,
        "column": document_matchers.column_break,
    }
    if type_name in matchers_by_type:
        return matchers_by_type[type_name]
    raise LineParseError("Unrecognised break type: {0}".format(type_name))
131 |
--------------------------------------------------------------------------------
/tests/docx/style_map_tests.py:
--------------------------------------------------------------------------------
1 | import io
2 | from zipfile import ZipFile
3 |
4 | from mammoth.docx.style_map import write_style_map, read_style_map
5 | from mammoth.zips import open_zip
6 | from mammoth.docx import xmlparser as xml
7 | from ..testing import assert_equal
8 |
9 |
def test_reading_embedded_style_map_on_document_without_embedded_style_map_returns_none():
    """A docx that never had a style map written reads back as None."""
    style_map = read_style_map(_normal_docx())
    assert_equal(None, style_map)
13 |
14 |
def test_writing_style_map_preserves_unrelated_files():
    """The placeholder entry is still readable after a style map is written."""
    docx = _normal_docx()
    write_style_map(docx, "p => h1")
    with open_zip(docx, "r") as zip_file:
        placeholder = zip_file.read_str("placeholder")
        assert_equal("placeholder", placeholder)
20 |
def test_embedded_style_map_can_be_read_after_being_written():
    """write_style_map followed by read_style_map round-trips the text."""
    docx = _normal_docx()
    write_style_map(docx, "p => h1")
    style_map = read_style_map(docx)
    assert_equal("p => h1", style_map)
25 |
26 |
def test_embedded_style_map_is_written_to_separate_file():
    """The style map text is stored in the "mammoth/style-map" zip entry."""
    docx = _normal_docx()
    write_style_map(docx, "p => h1")
    with open_zip(docx, "r") as zip_file:
        stored = zip_file.read_str("mammoth/style-map")
        assert_equal("p => h1", stored)
32 |
33 |
def test_embedded_style_map_is_referenced_in_relationships():
    """Writing a style map adds its relationship to document.xml.rels."""
    docx = _normal_docx()
    write_style_map(docx, "p => h1")
    relationships = _read_relationships_xml(docx)
    assert_equal(expected_relationships_xml, relationships)
38 |
def test_embedded_style_map_has_override_content_type_in_content_types_xml():
    """Writing a style map adds an Override entry to [Content_Types].xml."""
    docx = _normal_docx()
    write_style_map(docx, "p => h1")
    content_types = _read_content_types_xml(docx)
    assert_equal(expected_content_types_xml, content_types)
43 |
44 |
def test_can_overwrite_existing_style_map():
    """A second write replaces the first without duplicating any entries."""
    docx = _normal_docx()
    for style_map in ("p => h1", "p => h2"):
        write_style_map(docx, style_map)

    with open_zip(docx, "r") as zip_file:
        assert_equal("p => h2", read_style_map(docx))
        entry_names = zip_file._zip_file.namelist()
        _assert_no_duplicates(entry_names)
        assert_equal(expected_relationships_xml, _read_relationships_xml(docx))
        assert_equal(expected_content_types_xml, _read_content_types_xml(docx))
54 |
55 |
def _read_relationships_xml(fileobj):
    """Parse word/_rels/document.xml.rels from the zip, using the "r" prefix."""
    namespaces = [("r", "http://schemas.openxmlformats.org/package/2006/relationships")]
    with open_zip(fileobj, "r") as zip_file:
        contents = zip_file.read_str("word/_rels/document.xml.rels")
        return xml.parse_xml(io.StringIO(contents), namespaces)
62 |
63 |
def _read_content_types_xml(fileobj):
    """Parse [Content_Types].xml from the zip, using the "ct" prefix."""
    namespaces = [("ct", "http://schemas.openxmlformats.org/package/2006/content-types")]
    with open_zip(fileobj, "r") as zip_file:
        contents = zip_file.read_str("[Content_Types].xml")
        return xml.parse_xml(io.StringIO(contents), namespaces)
70 |
71 |
# Relationships part of the fixture docx before any style map is written: a
# single settings relationship, which expected_relationships_xml lists ahead
# of the style-map relationship.
original_relationships_xml = (
    '<?xml version="1.0" encoding="UTF-8" standalone="yes"?>'
    '<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">'
    '<Relationship Id="rId3" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/settings" Target="settings.xml"/>'
    '</Relationships>'
)
76 |
# Relationships expected after a style map has been written: the settings
# relationship followed by the style-map relationship.
expected_relationships_xml = xml.element("r:Relationships", {}, [
    xml.element("r:Relationship", {"Id": "rId3", "Type": "http://schemas.openxmlformats.org/officeDocument/2006/relationships/settings", "Target": "settings.xml"}),
    xml.element("r:Relationship", {"Id": "rMammothStyleMap", "Type": "http://schemas.zwobble.org/mammoth/style-map", "Target": "/mammoth/style-map"}),
])
81 |
# Content types part of the fixture docx before any style map is written: a
# single png default, which expected_content_types_xml lists ahead of the
# style-map override.
original_content_types_xml = (
    '<?xml version="1.0" encoding="UTF-8" standalone="yes"?>'
    '<Types xmlns="http://schemas.openxmlformats.org/package/2006/content-types">'
    '<Default Extension="png" ContentType="image/png"/>'
    '</Types>'
)
87 |
# Content types expected after a style map has been written: the png default
# followed by the override for the style-map part.
expected_content_types_xml = xml.element("ct:Types", {}, [
    xml.element("ct:Default", {"Extension": "png", "ContentType": "image/png"}),
    xml.element("ct:Override", {"PartName": "/mammoth/style-map", "ContentType": "text/prs.mammoth.style-map"}),
])
92 |
93 |
def _normal_docx():
    """Return a BytesIO holding a minimal zip used as the docx fixture.

    The archive contains a placeholder entry plus the original relationships
    and content-types parts; it has no embedded style map.
    """
    fileobj = io.BytesIO()
    # The context manager closes the archive (writing the central directory)
    # even if one of the writes fails.
    with ZipFile(fileobj, "w") as zip_file:
        zip_file.writestr("placeholder", "placeholder")
        zip_file.writestr("word/_rels/document.xml.rels", original_relationships_xml)
        zip_file.writestr("[Content_Types].xml", original_content_types_xml)
    return fileobj
105 |
106 |
def _assert_no_duplicates(values):
    """Raise AssertionError if any value occurs more than once in ``values``.

    The error is raised explicitly rather than through an ``assert``
    statement, so the check still runs when Python is started with ``-O``.
    """
    from collections import Counter

    for value, count in Counter(values).items():
        if count != 1:
            raise AssertionError("{0} has count of {1}".format(value, count))
114 |
--------------------------------------------------------------------------------
/tests/docx/styles_xml_tests.py:
--------------------------------------------------------------------------------
1 | from mammoth.docx.xmlparser import element as xml_element
2 | from mammoth.docx.styles_xml import read_styles_xml_element
3 | from ..testing import assert_equal
4 |
5 |
def test_paragraph_style_is_null_if_no_style_with_that_id_exists():
    """Looking up a paragraph style in an empty w:styles element gives None."""
    styles = read_styles_xml_element(xml_element("w:styles"))
    found = styles.find_paragraph_style_by_id("Heading1")
    assert_equal(None, found)
10 |
11 |
def test_paragraph_style_can_be_found_by_id():
    """A paragraph style is returned for its own style id."""
    style_elements = [_paragraph_style_element("Heading1", "Heading 1")]
    styles = read_styles_xml_element(xml_element("w:styles", {}, style_elements))
    found = styles.find_paragraph_style_by_id("Heading1")
    assert_equal("Heading1", found.style_id)
21 |
22 |
def test_character_style_can_be_found_by_id():
    """A character style is returned for its own style id."""
    style_elements = [_character_style_element("Heading1Char", "Heading 1 Char")]
    styles = read_styles_xml_element(xml_element("w:styles", {}, style_elements))
    found = styles.find_character_style_by_id("Heading1Char")
    assert_equal("Heading1Char", found.style_id)
32 |
33 |
def test_table_style_can_be_found_by_id():
    """A table style is returned for its own style id."""
    style_elements = [_table_style_element("TableNormal", "Normal Table")]
    styles = read_styles_xml_element(xml_element("w:styles", {}, style_elements))
    found = styles.find_table_style_by_id("TableNormal")
    assert_equal("TableNormal", found.style_id)
43 |
44 |
def test_paragraph_and_character_styles_are_distinct():
    """A style id of one type is not found through the other type's lookup."""
    style_elements = [
        _paragraph_style_element("Heading1", "Heading 1"),
        _character_style_element("Heading1Char", "Heading 1 Char"),
    ]
    styles = read_styles_xml_element(xml_element("w:styles", {}, style_elements))
    assert_equal(None, styles.find_character_style_by_id("Heading1"))
    assert_equal(None, styles.find_paragraph_style_by_id("Heading1Char"))
53 |
54 |
def test_styles_include_names():
    """The w:name value is exposed as the style's name."""
    style_elements = [_paragraph_style_element("Heading1", "Heading 1")]
    styles = read_styles_xml_element(xml_element("w:styles", {}, style_elements))
    found = styles.find_paragraph_style_by_id("Heading1")
    assert_equal("Heading 1", found.name)
64 |
65 |
def test_style_name_is_none_if_name_element_does_not_exist():
    """Styles without a w:name child have a name of None."""
    style_elements = [
        _style_without_name_element("paragraph", "Heading1"),
        _style_without_name_element("character", "Heading1Char"),
    ]
    styles = read_styles_xml_element(xml_element("w:styles", {}, style_elements))
    assert_equal(None, styles.find_paragraph_style_by_id("Heading1").name)
    assert_equal(None, styles.find_character_style_by_id("Heading1Char").name)
74 |
75 |
def test_numbering_style_is_none_if_no_style_with_that_id_exists():
    """Looking up a numbering style among no styles gives None."""
    styles = read_styles_xml_element(xml_element("w:styles", {}, []))
    found = styles.find_numbering_style_by_id("List1")
    assert_equal(None, found)
80 |
81 |
def test_numbering_style_has_none_num_id_if_style_has_no_paragraph_properties():
    """A numbering style with no children has a num_id of None."""
    bare_style = xml_element("w:style", {"w:type": "numbering", "w:styleId": "List1"})
    styles = read_styles_xml_element(xml_element("w:styles", {}, [bare_style]))
    found = styles.find_numbering_style_by_id("List1")
    assert_equal(None, found.num_id)
88 |
89 |
def test_numbering_style_has_num_id_read_from_paragraph_properties():
    """num_id comes from w:pPr/w:numPr/w:numId inside the style element."""
    num_pr = xml_element("w:numPr", {}, [
        xml_element("w:numId", {"w:val": "42"}),
    ])
    paragraph_properties = xml_element("w:pPr", {}, [num_pr])
    style = xml_element(
        "w:style",
        {"w:type": "numbering", "w:styleId": "List1"},
        [paragraph_properties],
    )
    styles = read_styles_xml_element(xml_element("w:styles", {}, [style]))
    assert_equal("42", styles.find_numbering_style_by_id("List1").num_id)
102 |
103 |
def test_when_multiple_style_elements_have_same_style_id_then_only_first_element_is_used():
    """With a duplicated style id, the first style element wins."""
    style_elements = [
        _table_style_element("TableNormal", "Normal Table"),
        _table_style_element("TableNormal", "Table Normal"),
    ]
    styles = read_styles_xml_element(xml_element("w:styles", {}, style_elements))
    found = styles.find_table_style_by_id("TableNormal")
    assert_equal("Normal Table", found.name)
114 |
115 |
def _paragraph_style_element(style_id, name):
    """Build a named w:style element of type "paragraph"."""
    return _style_element(element_type="paragraph", style_id=style_id, name=name)
118 |
def _character_style_element(style_id, name):
    """Build a named w:style element of type "character"."""
    return _style_element(element_type="character", style_id=style_id, name=name)
121 |
def _table_style_element(style_id, name):
    """Build a named w:style element of type "table"."""
    return _style_element(element_type="table", style_id=style_id, name=name)
124 |
def _style_element(element_type, style_id, name):
    """Build a w:style element whose only child is a w:name carrying ``name``."""
    name_element = xml_element("w:name", {"w:val": name}, [])
    return _style_element_with_children(element_type, style_id, [name_element])
128 |
def _style_without_name_element(element_type, style_id):
    """Build a w:style element with no children, and so no w:name."""
    no_children = []
    return _style_element_with_children(element_type, style_id, no_children)
131 |
def _style_element_with_children(element_type, style_id, children):
    """Build a w:style element with the given type, style id and children."""
    return xml_element(
        "w:style",
        {"w:type": element_type, "w:styleId": style_id},
        children,
    )
135 |
--------------------------------------------------------------------------------
/tests/styles/parser/document_matcher_parser_tests.py:
--------------------------------------------------------------------------------
1 | from mammoth import documents, document_matchers
2 | from mammoth.styles.parser.document_matcher_parser import parse_document_matcher
3 | from mammoth.styles.parser.errors import LineParseError
4 | from mammoth.styles.parser.tokeniser import tokenise
5 | from mammoth.styles.parser.token_iterator import TokenIterator
6 | from ...testing import assert_equal, assert_raises
7 |
8 |
def test_unrecognised_document_element_raises_error():
    """An unknown leading identifier is reported by name."""
    def parse():
        return read_document_matcher("x")

    error = assert_raises(LineParseError, parse)
    assert_equal("Unrecognised document element: x", str(error))
12 |
13 |
def test_reads_plain_paragraph():
    """"p" parses to an unconstrained paragraph matcher."""
    expected = document_matchers.paragraph()
    assert_equal(expected, read_document_matcher("p"))
19 |
20 |
def test_reads_paragraph_with_style_id():
    """"p.Heading1" sets the paragraph matcher's style id."""
    expected = document_matchers.paragraph(style_id="Heading1")
    assert_equal(expected, read_document_matcher("p.Heading1"))
26 |
27 |
def test_reads_paragraph_with_exact_style_name():
    """``[style-name='...']`` yields an equal_to style-name matcher."""
    style_name = document_matchers.equal_to("Heading 1")
    expected = document_matchers.paragraph(style_name=style_name)
    assert_equal(expected, read_document_matcher("p[style-name='Heading 1']"))
33 |
34 |
def test_reads_paragraph_with_style_name_prefix():
    """``[style-name^='...']`` yields a starts_with style-name matcher."""
    style_name = document_matchers.starts_with("Heading")
    expected = document_matchers.paragraph(style_name=style_name)
    assert_equal(expected, read_document_matcher("p[style-name^='Heading']"))
40 |
41 |
def test_unrecognised_string_matcher_raises_error():
    """An unsupported style-name operator is reported in the error message."""
    def parse():
        return read_document_matcher("p[style-name*='Heading']")

    error = assert_raises(LineParseError, parse)
    assert_equal("Unrecognised string matcher: *", str(error))
45 |
46 |
def test_reads_paragraph_ordered_list():
    """":ordered-list(2)" parses to an ordered numbering at level 1."""
    numbering = documents.numbering_level(1, is_ordered=True)
    expected = document_matchers.paragraph(numbering=numbering)
    assert_equal(expected, read_document_matcher("p:ordered-list(2)"))
52 |
53 |
def test_reads_paragraph_unordered_list():
    """":unordered-list(2)" parses to an unordered numbering at level 1."""
    numbering = documents.numbering_level(1, is_ordered=False)
    expected = document_matchers.paragraph(numbering=numbering)
    assert_equal(expected, read_document_matcher("p:unordered-list(2)"))
59 |
60 |
def test_unrecognised_list_type_raises_error():
    """An unknown list type after the colon is reported by name."""
    def parse():
        return read_document_matcher("p:blah")

    error = assert_raises(LineParseError, parse)
    assert_equal("Unrecognised list type: blah", str(error))
64 |
65 |
def test_reads_plain_run():
    """"r" parses to an unconstrained run matcher."""
    expected = document_matchers.run()
    assert_equal(expected, read_document_matcher("r"))
71 |
72 |
def test_reads_run_with_style_id():
    """"r.Emphasis" sets the run matcher's style id."""
    expected = document_matchers.run(style_id="Emphasis")
    assert_equal(expected, read_document_matcher("r.Emphasis"))
78 |
79 |
def test_reads_run_with_style_name():
    """A run accepts the same ``[style-name='...']`` clause as a paragraph."""
    style_name = document_matchers.equal_to("Emphasis")
    expected = document_matchers.run(style_name=style_name)
    assert_equal(expected, read_document_matcher("r[style-name='Emphasis']"))
85 |
86 |
def test_reads_plain_table():
    """"table" parses to an unconstrained table matcher."""
    expected = document_matchers.table()
    assert_equal(expected, read_document_matcher("table"))
92 |
93 |
def test_reads_table_with_style_id():
    """"table.TableNormal" sets the table matcher's style id."""
    expected = document_matchers.table(style_id="TableNormal")
    assert_equal(expected, read_document_matcher("table.TableNormal"))
99 |
100 |
def test_reads_table_with_style_name():
    """A table accepts the ``[style-name='...']`` clause."""
    style_name = document_matchers.equal_to("Normal Table")
    expected = document_matchers.table(style_name=style_name)
    assert_equal(expected, read_document_matcher("table[style-name='Normal Table']"))
106 |
107 |
def test_reads_bold():
    """"b" parses to the bold matcher."""
    parsed = read_document_matcher("b")
    assert_equal(document_matchers.bold, parsed)
113 |
def test_reads_italic():
    """"i" parses to the italic matcher."""
    parsed = read_document_matcher("i")
    assert_equal(document_matchers.italic, parsed)
119 |
def test_reads_underline():
    """"u" parses to the underline matcher."""
    parsed = read_document_matcher("u")
    assert_equal(document_matchers.underline, parsed)
125 |
def test_reads_strikethrough():
    """"strike" parses to the strikethrough matcher."""
    parsed = read_document_matcher("strike")
    assert_equal(document_matchers.strikethrough, parsed)
131 |
def test_reads_all_caps():
    """"all-caps" parses to the all_caps matcher."""
    parsed = read_document_matcher("all-caps")
    assert_equal(document_matchers.all_caps, parsed)
137 |
def test_reads_small_caps():
    """"small-caps" parses to the small_caps matcher."""
    parsed = read_document_matcher("small-caps")
    assert_equal(document_matchers.small_caps, parsed)
143 |
def test_reads_highlight_without_color():
    """"highlight" alone parses to a highlight matcher with default arguments."""
    expected = document_matchers.highlight()
    assert_equal(expected, read_document_matcher("highlight"))
149 |
def test_reads_highlight_with_color():
    """``highlight[color='yellow']`` carries the color through to the matcher."""
    expected = document_matchers.highlight(color="yellow")
    assert_equal(expected, read_document_matcher("highlight[color='yellow']"))
155 |
def test_reads_comment_reference():
    """"comment-reference" parses to the comment_reference matcher."""
    parsed = read_document_matcher("comment-reference")
    assert_equal(document_matchers.comment_reference, parsed)
161 |
def test_reads_line_breaks():
    """``br[type='line']`` parses to the line_break matcher."""
    parsed = read_document_matcher("br[type='line']")
    assert_equal(document_matchers.line_break, parsed)
167 |
def test_reads_page_breaks():
    """``br[type='page']`` parses to the page_break matcher."""
    parsed = read_document_matcher("br[type='page']")
    assert_equal(document_matchers.page_break, parsed)
173 |
def test_reads_column_breaks():
    """``br[type='column']`` parses to the column_break matcher."""
    parsed = read_document_matcher("br[type='column']")
    assert_equal(document_matchers.column_break, parsed)
179 |
180 |
def test_unrecognised_break_type_raises_error():
    """An unknown break type is reported by name."""
    def parse():
        return read_document_matcher("br[type='unknownBreakType']")

    error = assert_raises(LineParseError, parse)
    assert_equal("Unrecognised break type: unknownBreakType", str(error))
184 |
185 |
def read_document_matcher(string):
    """Tokenise ``string`` and parse it as a document matcher."""
    tokens = TokenIterator(tokenise(string))
    return parse_document_matcher(tokens)
188 |
--------------------------------------------------------------------------------
/tests/writers/markdown_tests.py:
--------------------------------------------------------------------------------
1 | from __future__ import unicode_literals
2 |
3 | from mammoth.writers.markdown import MarkdownWriter
4 | from ..testing import assert_equal
5 |
6 |
def test_special_markdown_characters_are_escaped():
    """Backslash and asterisk in text are each backslash-escaped."""
    writer = _create_writer()
    writer.text(r"\*")
    markdown = writer.as_string()
    assert_equal(r"\\\*", markdown)
11 |
12 |
def test_unrecognised_elements_are_treated_as_normal_text():
    """An unknown tag adds nothing around its text."""
    writer = _create_writer()
    writer.start("blah")
    writer.text("Hello")
    writer.end("blah")
    markdown = writer.as_string()
    assert_equal("Hello", markdown)
19 |
20 |
def test_paragraphs_are_terminated_with_double_new_line():
    """A "p" element ends with two newlines."""
    writer = _create_writer()
    writer.start("p")
    writer.text("Hello")
    writer.end("p")
    markdown = writer.as_string()
    assert_equal("Hello\n\n", markdown)
27 |
28 |
def test_h1_elements_are_converted_to_heading_with_leading_hash():
    """An "h1" element becomes a single-hash heading."""
    writer = _create_writer()
    writer.start("h1")
    writer.text("Hello")
    writer.end("h1")
    markdown = writer.as_string()
    assert_equal("# Hello\n\n", markdown)
35 |
36 |
def test_h6_elements_are_converted_to_heading_with_six_leading_hashes():
    """An "h6" element becomes a six-hash heading."""
    writer = _create_writer()
    writer.start("h6")
    writer.text("Hello")
    writer.end("h6")
    markdown = writer.as_string()
    assert_equal("###### Hello\n\n", markdown)
43 |
44 |
def test_br_is_written_as_two_spaces_followed_by_newline():
    """A "br" element is written as two trailing spaces and a newline.

    The expected string carries both spaces named in the test title.
    """
    writer = _create_writer()
    writer.text("Hello")
    writer.self_closing("br")
    assert_equal("Hello  \n", writer.as_string())
50 |
51 |
def test_strong_text_is_surrounded_by_two_underscores():
    """A "strong" element wraps its text in double underscores."""
    writer = _create_writer()
    writer.text("Hello ")
    writer.start("strong")
    writer.text("World")
    writer.end("strong")
    markdown = writer.as_string()
    assert_equal("Hello __World__", markdown)
59 |
60 |
def test_emphasised_text_is_surrounded_by_one_asterix():
    """An "em" element wraps its text in single asterisks."""
    writer = _create_writer()
    writer.text("Hello ")
    writer.start("em")
    writer.text("World")
    writer.end("em")
    markdown = writer.as_string()
    assert_equal("Hello *World*", markdown)
68 |
69 |
def test_anchor_tags_are_written_as_hyperlinks():
    """An "a" element with an href becomes ``[text](href)``."""
    writer = _create_writer()
    writer.start("a", {"href": "http://example.com"})
    writer.text("Hello")
    writer.end("a")
    markdown = writer.as_string()
    assert_equal("[Hello](http://example.com)", markdown)
76 |
77 |
def test_anchor_tags_without_href_attribute_are_treated_as_ordinary_text():
    """An "a" element with no href contributes only its text."""
    writer = _create_writer()
    writer.start("a")
    writer.text("Hello")
    writer.end("a")
    markdown = writer.as_string()
    assert_equal("Hello", markdown)
84 |
85 |
def test_elements_with_ids_have_anchor_tags_with_ids_appended_to_start_of_markdown_element():
    """An element with an id gets an anchor tag appended to its Markdown start.

    For a heading, the anchor follows the "# " prefix.
    """
    writer = _create_writer()
    writer.start("h1", {"id": "start"})
    writer.text("Hello")
    writer.end("h1")
    assert_equal('# <a id="start"></a>Hello\n\n', writer.as_string())
92 |
93 |
def test_links_have_anchors_before_opening_square_bracket():
    """For a link with an id, the anchor tag precedes the opening "["."""
    writer = _create_writer()
    writer.start("a", {"href": "http://example.com", "id": "start"})
    writer.text("Hello")
    writer.end("a")
    assert_equal('<a id="start"></a>[Hello](http://example.com)', writer.as_string())
100 |
101 |
def test_image_elements_are_written_as_markdown_images():
    """An "img" with src and alt is written as ``![alt](src)``."""
    writer = _create_writer()
    writer.self_closing("img", {"src": "http://example.com/image.jpg", "alt": "Alt Text"})
    assert_equal("![Alt Text](http://example.com/image.jpg)", writer.as_string())
106 |
107 |
def test_images_are_written_even_if_they_dont_have_alt_text():
    """An "img" with only a src is written with empty alt text."""
    writer = _create_writer()
    writer.self_closing("img", {"src": "http://example.com/image.jpg"})
    assert_equal("![](http://example.com/image.jpg)", writer.as_string())
112 |
113 |
def test_images_are_written_even_if_they_dont_have_a_src_attribute():
    """An "img" with only alt text is written with an empty target."""
    writer = _create_writer()
    writer.self_closing("img", {"alt": "Alt Text"})
    markdown = writer.as_string()
    assert_equal("![Alt Text]()", markdown)
118 |
119 |
def test_image_elements_are_ignored_if_they_have_no_src_and_no_alt_text():
    """An "img" with neither src nor alt produces no output."""
    writer = _create_writer()
    writer.self_closing("img")
    markdown = writer.as_string()
    assert_equal("", markdown)
124 |
125 |
def test_list_item_outside_of_list_is_treated_as_unordered_list():
    """A bare "li" is written with a hyphen bullet."""
    writer = _create_writer()
    writer.start("li")
    writer.text("Fruit")
    writer.end("li")
    markdown = writer.as_string()
    assert_equal("- Fruit\n", markdown)
132 |
133 |
def test_ol_element_is_written_as_ordered_list_with_sequential_numbering():
    """Items of an "ol" are numbered 1, 2, ... in order."""
    writer = _create_writer()
    writer.start("ol")
    for item_text in ("Fruit", "Condiments"):
        writer.start("li")
        writer.text(item_text)
        writer.end("li")
    writer.end("ol")
    markdown = writer.as_string()
    assert_equal("1. Fruit\n2. Condiments\n\n", markdown)
145 |
146 |
def test_ul_element_is_written_as_unordered_list_using_hyphens_as_bullets():
    """Items of a "ul" each start with a hyphen."""
    writer = _create_writer()
    writer.start("ul")
    for item_text in ("Fruit", "Condiments"):
        writer.start("li")
        writer.text(item_text)
        writer.end("li")
    writer.end("ul")
    markdown = writer.as_string()
    assert_equal("- Fruit\n- Condiments\n\n", markdown)
158 |
159 |
def test_numbering_is_separate_for_nested_list_and_parent_list():
    """A nested "ol" restarts at 1 and the outer list resumes its own count."""
    writer = _create_writer()
    writer.start("ol")

    # First outer item, containing the nested list.
    writer.start("li")
    writer.text("Fruit")
    writer.start("ol")
    for nested_text in ("Apple", "Banana"):
        writer.start("li")
        writer.text(nested_text)
        writer.end("li")
    writer.end("ol")
    writer.end("li")

    # Second outer item.
    writer.start("li")
    writer.text("Condiments")
    writer.end("li")
    writer.end("ol")

    expected = "1. Fruit\n\t1. Apple\n\t2. Banana\n2. Condiments\n\n"
    assert_equal(expected, writer.as_string())
181 |
182 |
183 |
def _create_writer():
    """Return a fresh MarkdownWriter for a single test."""
    writer = MarkdownWriter()
    return writer
186 |
--------------------------------------------------------------------------------
/mammoth/writers/markdown.py:
--------------------------------------------------------------------------------
1 | from __future__ import unicode_literals
2 |
3 | from .abc import Writer
4 |
5 | import re
6 |
7 |
8 | class _WriterOutput(object):
9 | def __init__(self, start, end=None, generate_end=None, anchor_position=None):
10 | if generate_end is None:
11 | generate_end = _constant(end)
12 |
13 | self.start = start
14 | self.generate_end = generate_end
15 | self.anchor_position = anchor_position
16 |
17 |
18 | def _constant(value):
19 | def get():
20 | return value
21 |
22 | return get
23 |
24 |
25 | class _MarkdownState(object):
26 | def __init__(self):
27 | self._list_state_stack = []
28 | self.list_state = None
29 | self.list_item_has_closed = False
30 |
31 | def update_list_state(self, list_state):
32 | self._list_state_stack.append(self.list_state)
33 | self.list_state = list_state
34 |
35 | def pop_list_state(self):
36 | self.list_state = self._list_state_stack.pop()
37 |
38 |
39 | class _MarkdownListState(object):
40 | def __init__(self, ordered, indentation):
41 | self.ordered = ordered
42 | self.count = 0
43 | self.indentation = indentation
44 |
45 |
def _symmetric_wrapped(end):
    """Return a ``_Wrapped`` writer using ``end`` as both opening and closing text."""
    marker = end
    return _Wrapped(marker, marker)
48 |
49 |
50 | class _Wrapped(object):
51 | def __init__(self, start, end):
52 | self._start = start
53 | self._end = end
54 |
55 | def __call__(self, attributes, markdown_state):
56 | return _WriterOutput(self._start, self._end)
57 |
58 |
def _hyperlink(attributes, markdown_state):
    """Writer for ``a`` elements.

    With a non-empty ``href`` the content is wrapped as ``[content](href)`` and
    the anchor position is ``"before"``; otherwise the default (empty) output
    is returned.
    """
    target = attributes.get("href", "")
    if not target:
        return _default_output

    closing = "]({0})".format(target)
    return _WriterOutput("[", closing, anchor_position="before")
68 |
69 |
def _image(attributes, markdown_state):
    """Writer for ``img`` elements.

    When either ``src`` or ``alt`` is present, the element is written as a
    Markdown image, ``![alt](src)``, with no closing text. When both are
    missing or empty, the default (empty) output is returned.
    """
    src = attributes.get("src", "")
    alt_text = attributes.get("alt", "")
    if src or alt_text:
        # The template previously had no placeholders, so the formatted
        # result was always the empty string and the image was dropped.
        return _WriterOutput("![{0}]({1})".format(alt_text, src), "")
    else:
        return _default_output
77 |
78 |
79 | def _list(ordered):
80 | def call(attributes, markdown_state):
81 | if markdown_state.list_state is None:
82 | start = ""
83 | end_text = "\n"
84 | indentation = 0
85 | else:
86 | start = "\n"
87 | end_text = ""
88 | indentation = markdown_state.list_state.indentation + 1
89 |
90 | def generate_end():
91 | markdown_state.pop_list_state()
92 | return end_text
93 |
94 | markdown_state.update_list_state(_MarkdownListState(
95 | ordered=ordered,
96 | indentation=indentation,
97 | ))
98 |
99 | return _WriterOutput(start, generate_end=generate_end)
100 |
101 | return call
102 |
103 |
def _list_item(attributes, markdown_state):
    """Writer for ``li`` elements.

    Writes the indentation (one tab per nesting level) and the bullet: the
    running item number for ordered lists, a hyphen otherwise. An item
    outside any list is treated as belonging to an unindented unordered list.
    """
    markdown_state.list_item_has_closed = False

    current = markdown_state.list_state or _MarkdownListState(ordered=False, indentation=0)
    current.count += 1

    bullet = "{0}.".format(current.count) if current.ordered else "-"
    prefix = ("\t" * current.indentation) + bullet + " "

    def generate_end():
        # Only the first item to close after the flag was reset writes the
        # newline; subsequent closes write nothing until another item starts.
        if not markdown_state.list_item_has_closed:
            markdown_state.list_item_has_closed = True
            return "\n"
        return ""

    return _WriterOutput(start=prefix, generate_end=generate_end)
126 |
127 |
def _init_writers():
    """Build the mapping from element name to the writer for that element."""
    heading_writers = dict(
        ("h{0}".format(level), _Wrapped("#" * level + " ", "\n\n"))
        for level in range(1, 7)
    )

    writers = {
        "p": _Wrapped("", "\n\n"),
        "br": _Wrapped("", "  \n"),
        "strong": _symmetric_wrapped("__"),
        "em": _symmetric_wrapped("*"),
        "a": _hyperlink,
        "img": _image,
        "ol": _list(ordered=True),
        "ul": _list(ordered=False),
        "li": _list_item,
    }
    writers.update(heading_writers)
    return writers
145 |
146 |
# Element name -> writer callable, built once at import time.
_writers = _init_writers()
# Shared output that writes nothing on either start or end.
_default_output = _WriterOutput("", "")
149 |
def _default_writer(attributes, markdown_state):
    """Writer used for elements with no registered writer: emits nothing."""
    return _default_output
152 |
153 |
class MarkdownWriter(Writer):
    """Writer that renders start/text/end element events as Markdown.

    Output is accumulated as a list of string fragments and joined by
    ``as_string``. Elements without a registered writer contribute no markup
    of their own, but their text content is still written.
    """

    def __init__(self):
        self._fragments = []
        # Stack of callables, one per open element, producing its closing text.
        self._element_stack = []
        self._markdown_state = _MarkdownState()

    def text(self, text):
        """Append ``text`` with Markdown special characters escaped."""
        self._fragments.append(_escape_markdown(text))

    def start(self, name, attributes=None):
        """Open element ``name``, writing its opening text and any anchor."""
        if attributes is None:
            attributes = {}

        output = _writers.get(name, _default_writer)(attributes, self._markdown_state)
        self._element_stack.append(output.generate_end)

        # The anchor for the element's id goes before the opening text when the
        # writer asks for that, and after it otherwise.
        anchor_before_start = output.anchor_position == "before"
        if anchor_before_start:
            self._write_anchor(attributes)

        self._fragments.append(output.start)

        if not anchor_before_start:
            self._write_anchor(attributes)

    def end(self, name):
        """Close the most recently opened element.

        ``name`` is not checked against the element that was opened.
        """
        end = self._element_stack.pop()
        output = end()
        self._fragments.append(output)

    def self_closing(self, name, attributes=None):
        """Write an element with no content."""
        self.start(name, attributes)
        self.end(name)

    def append(self, other):
        """Append ``other`` verbatim, without escaping."""
        self._fragments.append(other)

    def as_string(self):
        """Return everything written so far as a single string."""
        return "".join(self._fragments)

    def _write_anchor(self, attributes):
        """Write an HTML anchor for the element's ``id`` attribute, if any."""
        html_id = attributes.get("id")
        if html_id:
            # The template previously had no placeholder, so the id was
            # silently dropped and nothing could link to the element.
            self._fragments.append('<a id="{0}"></a>'.format(html_id))
200 |
201 |
202 | def _escape_markdown(value):
203 | return re.sub(r"([\`\*_\{\}\[\]\(\)\#\+\-\.\!])", r"\\\1", re.sub("\\\\", "\\\\\\\\", value))
204 |
--------------------------------------------------------------------------------
/mammoth/docx/__init__.py:
--------------------------------------------------------------------------------
1 | from functools import partial
2 | import os
3 |
4 | import cobble
5 |
6 | from .. import results, lists, zips
7 | from .document_xml import read_document_xml_element
8 | from .content_types_xml import empty_content_types, read_content_types_xml_element
9 | from .relationships_xml import read_relationships_xml_element, Relationships
10 | from .numbering_xml import read_numbering_xml_element, Numbering
11 | from .styles_xml import read_styles_xml_element, Styles
12 | from .notes_xml import read_endnotes_xml_element, read_footnotes_xml_element
13 | from .comments_xml import read_comments_xml_element
14 | from .files import Files
15 | from . import body_xml, office_xml
16 | from ..zips import open_zip
17 |
18 |
# Successful result holding an empty list; used as the default when an
# optional part (notes, comments) is absent from the package.
_empty_result = results.success([])
20 |
21 |
def read(fileobj, external_file_access=False):
    """Read a .docx file object into a result wrapping the parsed document.

    Notes and comments are read first because the main document is read with
    them as inputs. ``external_file_access`` is forwarded to the ``Files``
    helper used by the body reader.
    """
    zip_file = open_zip(fileobj, "r")
    part_paths = _find_part_paths(zip_file)
    source_path = getattr(fileobj, "name", None)
    read_part_with_body = _part_with_body_reader(
        source_path,
        zip_file,
        part_paths=part_paths,
        external_file_access=external_file_access,
    )

    def read_document_with(referents):
        return _read_document(
            zip_file,
            read_part_with_body,
            notes=referents[0],
            comments=referents[1],
            part_paths=part_paths,
        )

    referents_result = results.combine([
        _read_notes(read_part_with_body, part_paths),
        _read_comments(read_part_with_body, part_paths),
    ])
    return referents_result.bind(read_document_with)
38 |
39 |
@cobble.data
class _PartPaths(object):
    """Paths within the zip of the parts that are read from a document."""
    # Path of the main document part.
    main_document = cobble.field()
    # Paths of the optional parts; each falls back to "word/<name>.xml" when
    # no matching relationship target exists in the zip.
    comments = cobble.field()
    endnotes = cobble.field()
    footnotes = cobble.field()
    numbering = cobble.field()
    styles = cobble.field()
48 |
49 |
def _find_part_paths(zip_file):
    """Locate the main document and its related parts inside ``zip_file``.

    The main document is found through the package relationships; the other
    parts are found through the main document's own relationships, resolved
    relative to the main document's directory.
    """
    package_relationships = _read_relationships(zip_file, "_rels/.rels")
    document_filename = _find_document_filename(zip_file, package_relationships)

    document_relationships_path = _find_relationships_path_for(document_filename)
    document_relationships = _read_relationships(zip_file, document_relationships_path)

    relationship_type_prefix = "http://schemas.openxmlformats.org/officeDocument/2006/relationships/"

    def find(name):
        return _find_part_path(
            zip_file=zip_file,
            relationships=document_relationships,
            relationship_type=relationship_type_prefix + name,
            fallback_path="word/{0}.xml".format(name),
            base_path=zips.split_path(document_filename)[0],
        )

    part_names = ("comments", "endnotes", "footnotes", "numbering", "styles")
    located = dict((part_name, find(part_name)) for part_name in part_names)

    return _PartPaths(main_document=document_filename, **located)
76 |
77 |
def _find_document_filename(zip_file, relationships):
    """Return the path of the main document part.

    Raises:
        IOError: if the resolved path does not exist in ``zip_file``.
    """
    candidate = _find_part_path(
        zip_file,
        relationships,
        relationship_type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument",
        base_path="",
        fallback_path="word/document.xml",
    )
    if not zip_file.exists(candidate):
        raise IOError("Could not find main document part. Are you sure this is a valid .docx file?")

    return candidate
90 |
91 |
def _find_part_path(zip_file, relationships, relationship_type, base_path, fallback_path):
    """Return the first relationship target of the given type present in the zip.

    Targets are joined onto ``base_path`` and stripped of leading slashes
    before being looked up. ``fallback_path`` is returned, without checking
    that it exists, when no target is present.
    """
    candidates = [
        zips.join_path(base_path, target).lstrip("/")
        for target in relationships.find_targets_by_type(relationship_type)
    ]
    existing = [candidate for candidate in candidates if zip_file.exists(candidate)]
    if existing:
        return existing[0]
    return fallback_path
102 |
103 |
def _read_notes(read_part_with_body, part_paths):
    """Read footnotes and endnotes into a single combined result.

    A missing part contributes an empty result.
    """
    def read_notes_part(path, read_element):
        return read_part_with_body(
            path,
            lambda root, body_reader: read_element(root, body_reader=body_reader),
            default=_empty_result,
        )

    footnotes = read_notes_part(part_paths.footnotes, read_footnotes_xml_element)
    endnotes = read_notes_part(part_paths.endnotes, read_endnotes_xml_element)

    combined = results.combine([footnotes, endnotes])
    return combined.map(lists.flatten)
117 |
118 |
def _read_comments(read_part_with_body, part_paths):
    """Read the comments part, or return an empty result when it is absent."""
    def read_comments_root(root, body_reader):
        return read_comments_xml_element(root, body_reader=body_reader)

    return read_part_with_body(
        part_paths.comments,
        read_comments_root,
        default=_empty_result,
    )
125 |
126 |
def _read_document(zip_file, read_part_with_body, notes, comments, part_paths):
    """Read the main document part, attaching the already-read notes and comments.

    ``zip_file`` is accepted but not used here; the part is read through
    ``read_part_with_body``.
    """
    read_document_root = partial(
        read_document_xml_element,
        notes=notes,
        comments=comments,
    )
    return read_part_with_body(part_paths.main_document, read_document_root)
136 |
137 |
def _part_with_body_reader(document_path, zip_file, part_paths, external_file_access):
    """Build a function that reads a part needing a body reader.

    Content types, styles and numbering are read once up front and shared by
    every part; relationships are read per part, since each part has its own
    relationships file.

    The returned ``read_part(name, reader, default=...)`` calls ``reader`` with
    the part's root element and a ``body_reader`` keyword argument. When
    ``default`` is given it is returned if the part does not exist; otherwise
    the part is read unconditionally.
    """
    content_types = _try_read_entry_or_default(
        zip_file,
        "[Content_Types].xml",
        read_content_types_xml_element,
        default=empty_content_types,
    )
    styles = _try_read_entry_or_default(
        zip_file,
        part_paths.styles,
        read_styles_xml_element,
        default=Styles.EMPTY,
    )
    numbering = _try_read_entry_or_default(
        zip_file,
        part_paths.numbering,
        lambda element: read_numbering_xml_element(element, styles=styles),
        default=Numbering.EMPTY,
    )

    base_directory = None if document_path is None else os.path.dirname(document_path)
    files = Files(base_directory, external_file_access=external_file_access)

    def read_part(name, reader, default=_undefined):
        relationships_path = _find_relationships_path_for(name)
        relationships = _read_relationships(zip_file, relationships_path)

        body_reader = body_xml.reader(
            numbering=numbering,
            content_types=content_types,
            relationships=relationships,
            styles=styles,
            docx_file=zip_file,
            files=files,
        )
        reader_with_body = partial(reader, body_reader=body_reader)

        if default is _undefined:
            return _read_entry(zip_file, name, reader_with_body)
        return _try_read_entry_or_default(zip_file, name, reader_with_body, default=default)

    return read_part
183 |
184 |
185 |
def _find_relationships_path_for(name):
    """Return the path of the relationships file for the part at ``name``.

    For ``<dir>/<file>`` this is ``<dir>/_rels/<file>.rels``.
    """
    directory, filename = zips.split_path(name)
    relationships_filename = filename + ".rels"
    return zips.join_path(directory, "_rels", relationships_filename)
189 |
190 |
def _read_relationships(zip_file, name):
    """Read the relationships file at ``name``, or return empty relationships."""
    relationships = _try_read_entry_or_default(
        zip_file,
        name,
        read_relationships_xml_element,
        default=Relationships.EMPTY,
    )
    return relationships
198 |
199 | def _try_read_entry_or_default(zip_file, name, reader, default):
200 | if zip_file.exists(name):
201 | return _read_entry(zip_file, name, reader)
202 | else:
203 | return default
204 |
205 |
def _read_entry(zip_file, name, reader):
    """Parse the entry ``name`` as XML and return ``reader`` applied to its root."""
    with zip_file.open(name) as entry:
        root = office_xml.read(entry)
        # The reader runs while the entry is still open.
        return reader(root)
209 |
210 |
# Sentinel meaning "no default supplied"; compared by identity so that any
# value, including None, can be passed as an explicit default.
_undefined = object()
212 |
--------------------------------------------------------------------------------
/tests/docx/numbering_xml_tests.py:
--------------------------------------------------------------------------------
1 | from mammoth.docx.xmlparser import element as xml_element
2 | from mammoth.docx.numbering_xml import read_numbering_xml_element
3 | from mammoth.docx.styles_xml import NumberingStyle, Styles
4 | from ..testing import assert_equal
5 |
6 |
def test_find_level_returns_none_if_num_with_id_cannot_be_found():
    """Looking up a level in an empty numbering part yields ``None``."""
    empty_numbering = _read_numbering_xml_element(xml_element("w:numbering"))
    found_level = empty_numbering.find_level("47", "0")
    assert_equal(None, found_level)
10 |
11 |
# Shared fixture: num "47" refers to abstract num "42", which defines level
# "0" as a bullet list and level "1" as a decimal list.
_sample_numbering_xml = xml_element("w:numbering", {}, [
    xml_element("w:abstractNum", {"w:abstractNumId": "42"}, [
        xml_element("w:lvl", {"w:ilvl": "0"}, [
            xml_element("w:numFmt", {"w:val": "bullet"})
        ]),
        xml_element("w:lvl", {"w:ilvl": "1"}, [
            xml_element("w:numFmt", {"w:val": "decimal"})
        ])
    ]),
    xml_element("w:num", {"w:numId": "47"}, [
        xml_element("w:abstractNumId", {"w:val": "42"})
    ])
])
25 |
26 |
def test_level_includes_level_index():
    """Each level found reports the index it was looked up by."""
    numbering = _read_numbering_xml_element(_sample_numbering_xml)
    for level_index in ("0", "1"):
        found_level = numbering.find_level("47", level_index)
        assert_equal(level_index, found_level.level_index)
31 |
32 |
def test_list_is_not_ordered_if_formatted_as_bullet():
    """A level whose number format is ``bullet`` is unordered."""
    numbering = _read_numbering_xml_element(_sample_numbering_xml)
    bullet_level = numbering.find_level("47", "0")
    assert_equal(False, bullet_level.is_ordered)
36 |
37 |
def test_list_is_ordered_if_formatted_as_decimal():
    """A level whose number format is ``decimal`` is ordered."""
    numbering = _read_numbering_xml_element(_sample_numbering_xml)
    decimal_level = numbering.find_level("47", "1")
    assert_equal(True, decimal_level.is_ordered)
41 |
42 |
def test_list_is_ordered_if_there_is_no_explicit_format():
    """A level with no ``w:numFmt`` child is treated as ordered."""
    abstract_num = xml_element("w:abstractNum", {"w:abstractNumId": "42"}, [
        xml_element("w:lvl", {"w:ilvl": "0"}),
    ])
    num = xml_element("w:num", {"w:numId": "47"}, [
        xml_element("w:abstractNumId", {"w:val": "42"})
    ])
    element = xml_element("w:numbering", {}, [abstract_num, num])

    numbering = _read_numbering_xml_element(element)

    level = numbering.find_level("47", "0")
    assert_equal(True, level.is_ordered)
56 |
57 |
def test_find_level_returns_none_if_level_cannot_be_found():
    """A known num with an undefined level index yields ``None``."""
    numbering = _read_numbering_xml_element(_sample_numbering_xml)
    missing_level = numbering.find_level("47", "2")
    assert_equal(None, missing_level)
61 |
62 |
def test_num_referencing_non_existent_abstract_num_is_ignored():
    """A num pointing at an undefined abstract num yields no levels."""
    dangling_num = xml_element("w:num", {"w:numId": "47"}, [
        xml_element("w:abstractNumId", {"w:val": "42"})
    ])
    element = xml_element("w:numbering", {}, [dangling_num])

    numbering = _read_numbering_xml_element(element)

    found_level = numbering.find_level("47", "0")
    assert_equal(None, found_level)
73 |
74 |
def test_given_no_other_levels_with_index_of_0_when_level_is_missing_ilvl_then_level_index_is_0():
    """A lone level without ``w:ilvl`` is found under index "0"."""
    level_without_index = xml_element("w:lvl", {}, [
        xml_element("w:numFmt", {"w:val": "decimal"}),
    ])
    abstract_num = xml_element("w:abstractNum", {"w:abstractNumId": "42"}, [
        level_without_index,
    ])
    num = xml_element("w:num", {"w:numId": "47"}, [
        xml_element("w:abstractNumId", {"w:val": "42"})
    ])
    element = xml_element("w:numbering", {}, [abstract_num, num])

    numbering = _read_numbering_xml_element(element)

    level = numbering.find_level("47", "0")
    assert_equal(True, level.is_ordered)
90 |
91 |
def test_given_previous_other_level_with_index_of_0_when_level_is_missing_ilvl_then_level_is_ignored():
    """An explicit level "0" wins over a later level that has no ``w:ilvl``."""
    explicit_level = xml_element("w:lvl", {"w:ilvl": "0"}, [
        xml_element("w:numFmt", {"w:val": "bullet"}),
    ])
    level_without_index = xml_element("w:lvl", {}, [
        xml_element("w:numFmt", {"w:val": "decimal"}),
    ])
    abstract_num = xml_element("w:abstractNum", {"w:abstractNumId": "42"}, [
        explicit_level,
        level_without_index,
    ])
    num = xml_element("w:num", {"w:numId": "47"}, [
        xml_element("w:abstractNumId", {"w:val": "42"})
    ])
    element = xml_element("w:numbering", {}, [abstract_num, num])

    numbering = _read_numbering_xml_element(element)

    level = numbering.find_level("47", "0")
    assert_equal(False, level.is_ordered)
110 |
111 |
def test_given_subsequent_other_level_with_index_of_0_when_level_is_missing_ilvl_then_level_is_ignored():
    """An explicit level "0" wins over an earlier level that has no ``w:ilvl``."""
    level_without_index = xml_element("w:lvl", {}, [
        xml_element("w:numFmt", {"w:val": "decimal"}),
    ])
    explicit_level = xml_element("w:lvl", {"w:ilvl": "0"}, [
        xml_element("w:numFmt", {"w:val": "bullet"}),
    ])
    abstract_num = xml_element("w:abstractNum", {"w:abstractNumId": "42"}, [
        level_without_index,
        explicit_level,
    ])
    num = xml_element("w:num", {"w:numId": "47"}, [
        xml_element("w:abstractNumId", {"w:val": "42"})
    ])
    element = xml_element("w:numbering", {}, [abstract_num, num])

    numbering = _read_numbering_xml_element(element)

    level = numbering.find_level("47", "0")
    assert_equal(False, level.is_ordered)
130 |
131 |
def test_when_abstract_num_has_num_style_link_then_style_is_used_to_find_num():
    """An abstract num with ``w:numStyleLink`` takes its levels from the linked style's num."""
    abstract_num_with_levels = xml_element("w:abstractNum", {"w:abstractNumId": "100"}, [
        xml_element("w:lvl", {"w:ilvl": "0"}, [
            xml_element("w:numFmt", {"w:val": "decimal"}),
        ]),
    ])
    abstract_num_with_style_link = xml_element("w:abstractNum", {"w:abstractNumId": "101"}, [
        xml_element("w:numStyleLink", {"w:val": "List1"}),
    ])
    num_with_levels = xml_element("w:num", {"w:numId": "200"}, [
        xml_element("w:abstractNumId", {"w:val": "100"}),
    ])
    num_with_style_link = xml_element("w:num", {"w:numId": "201"}, [
        xml_element("w:abstractNumId", {"w:val": "101"}),
    ])
    element = xml_element("w:numbering", {}, [
        abstract_num_with_levels,
        abstract_num_with_style_link,
        num_with_levels,
        num_with_style_link,
    ])
    styles = Styles.create(numbering_styles={
        "List1": NumberingStyle(style_id="List1", num_id="200"),
    })

    numbering = _read_numbering_xml_element(element, styles=styles)

    level = numbering.find_level("201", "0")
    assert_equal(True, level.is_ordered)
155 |
156 |
# See: 17.9.23 pStyle (Paragraph Style's Associated Numbering Level) in ECMA-376, 4th Edition
def test_numbering_level_can_be_found_by_paragraph_style_id():
    """A level carrying ``w:pStyle`` can be looked up by that paragraph style ID."""
    abstract_num_without_style = xml_element("w:abstractNum", {"w:abstractNumId": "42"}, [
        xml_element("w:lvl", {"w:ilvl": "0"}, [
            xml_element("w:numFmt", {"w:val": "bullet"}),
        ]),
    ])
    abstract_num_with_style = xml_element("w:abstractNum", {"w:abstractNumId": "43"}, [
        xml_element("w:lvl", {"w:ilvl": "0"}, [
            xml_element("w:pStyle", {"w:val": "List"}),
            xml_element("w:numFmt", {"w:val": "decimal"}),
        ]),
    ])
    element = xml_element("w:numbering", {}, [
        abstract_num_without_style,
        abstract_num_with_style,
    ])

    numbering = _read_numbering_xml_element(element)

    styled_level = numbering.find_level_by_paragraph_style_id("List")
    assert_equal(True, styled_level.is_ordered)
    unknown_level = numbering.find_level_by_paragraph_style_id("Paragraph")
    assert_equal(None, unknown_level)
177 |
178 |
def _read_numbering_xml_element(element, styles=None):
    """Read ``element`` as a numbering part, defaulting to empty styles."""
    effective_styles = Styles.EMPTY if styles is None else styles
    return read_numbering_xml_element(element, styles=effective_styles)
184 |
--------------------------------------------------------------------------------
/mammoth/documents.py:
--------------------------------------------------------------------------------
1 | import cobble
2 |
3 |
class Element(object):
    """Base class for document elements."""

    def copy(self, **kwargs):
        """Return ``cobble.copy(self, **kwargs)``; the keyword arguments name the fields to change."""
        return cobble.copy(self, **kwargs)
7 |
8 |
class HasChildren(Element):
    """Base class for elements that have a ``children`` field."""
    children = cobble.field()
11 |
12 |
@cobble.data
class Document(HasChildren):
    """A document: ``children`` (inherited) plus its notes and comments."""
    notes = cobble.field()
    comments = cobble.field()
17 |
@cobble.data
class Paragraph(HasChildren):
    """A paragraph: ``children`` plus style, numbering, alignment and indent."""
    style_id = cobble.field()
    style_name = cobble.field()
    numbering = cobble.field()
    alignment = cobble.field()
    indent = cobble.field()
25 |
26 |
@cobble.data
class ParagraphIndent(object):
    """Indentation of a paragraph; created by ``paragraph_indent``."""
    start = cobble.field()
    end = cobble.field()
    first_line = cobble.field()
    hanging = cobble.field()
33 |
34 |
@cobble.data
class Indent(object):
    """Indentation expressed with ``left``/``right`` rather than ``start``/``end``.

    NOTE(review): nothing in this module references this class; confirm
    whether it is still used elsewhere.
    """
    left = cobble.field()
    right = cobble.field()
    first_line = cobble.field()
    hanging = cobble.field()
41 |
42 |
@cobble.data
class Run(HasChildren):
    """A run: ``children`` plus its style and character formatting.

    Instances are normally created with ``run``, which coerces the ``is_*``
    flags to ``bool`` and defaults ``vertical_alignment`` to baseline.
    """
    style_id = cobble.field()
    style_name = cobble.field()
    is_bold = cobble.field()
    is_italic = cobble.field()
    is_underline = cobble.field()
    is_strikethrough = cobble.field()
    is_all_caps = cobble.field()
    is_small_caps = cobble.field()
    vertical_alignment = cobble.field()
    font = cobble.field()
    font_size = cobble.field()
    highlight = cobble.field()
57 |
@cobble.data
class Text(Element):
    """A piece of text, held in ``value``."""
    value = cobble.field()
61 |
@cobble.data
class Hyperlink(HasChildren):
    """A hyperlink: ``children`` plus ``href``, ``anchor`` and ``target_frame``."""
    href = cobble.field()
    anchor = cobble.field()
    target_frame = cobble.field()
67 |
@cobble.data
class Checkbox(Element):
    """A checkbox with a ``checked`` field."""
    checked = cobble.field()

# Lower-case factory alias for Checkbox.
checkbox = Checkbox
73 |
@cobble.data
class Table(HasChildren):
    """A table: ``children`` plus the table style's ID and name."""
    style_id = cobble.field()
    style_name = cobble.field()
78 |
@cobble.data
class TableRow(HasChildren):
    """A table row: ``children`` plus an ``is_header`` flag."""
    is_header = cobble.field()
82 |
@cobble.data
class TableCell(HasChildren):
    """A table cell: ``children`` plus ``colspan`` and ``rowspan``."""
    colspan = cobble.field()
    rowspan = cobble.field()
87 |
@cobble.data
class TableCellUnmerged:
    """A table cell that also carries a ``vmerge`` field.

    This class does not derive from ``Element``; it declares ``children``
    and ``copy`` itself.
    """
    children = cobble.field()
    colspan = cobble.field()
    rowspan = cobble.field()
    vmerge = cobble.field()

    def _accept1(self, visitor, arg0):
        # Dispatch to the visitor's table-cell method, so visitors handle
        # this class through ``visit_table_cell``.
        return visitor.visit_table_cell(self, arg0)

    def copy(self, **kwargs):
        """Return ``cobble.copy(self, **kwargs)``; the keyword arguments name the fields to change."""
        return cobble.copy(self, **kwargs)
100 |
@cobble.data
class Break(Element):
    """A break, distinguished by ``break_type``."""
    break_type = cobble.field()

# Shared instances for the three break types used in this module.
line_break = Break("line")
page_break = Break("page")
column_break = Break("column")
108 |
109 |
@cobble.data
class Tab(Element):
    """A tab; it has no fields."""
    pass
113 |
114 |
@cobble.data
class Image(Element):
    """An image: its alt text, content type and an ``open`` field."""
    alt_text = cobble.field()
    content_type = cobble.field()
    # presumably a callable that opens the image data — verify against callers
    open = cobble.field()
120 |
121 |
def document(children, notes=None, comments=None):
    """Create a ``Document``, defaulting to empty notes and no comments."""
    effective_notes = Notes({}) if notes is None else notes
    effective_comments = [] if comments is None else comments
    return Document(children, effective_notes, comments=effective_comments)
128 |
def paragraph(children, style_id=None, style_name=None, numbering=None, alignment=None, indent=None):
    """Create a ``Paragraph``; a missing ``indent`` becomes an all-``None`` indent."""
    effective_indent = paragraph_indent() if indent is None else indent
    return Paragraph(
        children,
        style_id,
        style_name,
        numbering,
        alignment=alignment,
        indent=effective_indent,
    )
134 |
def paragraph_indent(start=None, end=None, first_line=None, hanging=None):
    """Create a ``ParagraphIndent``; every component defaults to ``None``."""
    return ParagraphIndent(start=start, end=end, first_line=first_line, hanging=hanging)
137 |
def run(
    children,
    style_id=None,
    style_name=None,
    is_bold=None,
    is_italic=None,
    is_underline=None,
    is_strikethrough=None,
    is_all_caps=None,
    is_small_caps=None,
    vertical_alignment=None,
    font=None,
    font_size=None,
    highlight=None,
):
    """Create a ``Run``.

    The ``is_*`` flags are coerced to ``bool`` (so ``None`` becomes ``False``)
    and a missing ``vertical_alignment`` becomes ``VerticalAlignment.baseline``.
    """
    effective_alignment = (
        VerticalAlignment.baseline
        if vertical_alignment is None
        else vertical_alignment
    )

    return Run(
        children=children,
        style_id=style_id,
        style_name=style_name,
        is_bold=bool(is_bold),
        is_italic=bool(is_italic),
        is_underline=bool(is_underline),
        is_strikethrough=bool(is_strikethrough),
        is_all_caps=bool(is_all_caps),
        is_small_caps=bool(is_small_caps),
        vertical_alignment=effective_alignment,
        font=font,
        font_size=font_size,
        highlight=highlight,
    )
170 |
class VerticalAlignment(object):
    """String constants for a run's vertical alignment."""
    baseline = "baseline"
    superscript = "superscript"
    subscript = "subscript"
175 |
# Lower-case factory alias for Text.
text = Text

# Single shared Tab instance, returned by every call to tab().
_tab = Tab()

def tab():
    """Return the shared ``Tab`` instance."""
    return _tab


# Lower-case factory alias for Image.
image = Image
185 |
def hyperlink(children, href=None, anchor=None, target_frame=None):
    """Create a ``Hyperlink``; ``href``, ``anchor`` and ``target_frame`` default to ``None``."""
    return Hyperlink(href=href, anchor=anchor, target_frame=target_frame, children=children)
188 |
189 |
@cobble.data
class Bookmark(Element):
    """A bookmark identified by ``name``."""
    name = cobble.field()

# Lower-case factory alias for Bookmark.
bookmark = Bookmark
195 |
196 |
def table(children, style_id=None, style_name=None):
    """Create a ``Table``; the style ID and name default to ``None``."""
    return Table(children=children, style_id=style_id, style_name=style_name)
199 |
def table_row(children, is_header=None):
    """Create a ``TableRow``; ``is_header`` is coerced to ``bool``."""
    header_flag = bool(is_header)
    return TableRow(children=children, is_header=header_flag)
202 |
def table_cell(children, colspan=None, rowspan=None):
    """Create a ``TableCell``; a missing span defaults to 1."""
    effective_colspan = 1 if colspan is None else colspan
    effective_rowspan = 1 if rowspan is None else rowspan
    return TableCell(
        children=children,
        colspan=effective_colspan,
        rowspan=effective_rowspan,
    )
209 |
def table_cell_unmerged(children, colspan, rowspan, vmerge):
    """Create a ``TableCellUnmerged``; all arguments are required and passed through."""
    return TableCellUnmerged(children=children, colspan=colspan, rowspan=rowspan, vmerge=vmerge)
212 |
def numbering_level(level_index, is_ordered):
    """Create a numbering level, storing the index as ``str`` and the flag as ``bool``."""
    index_text = str(level_index)
    ordered_flag = bool(is_ordered)
    return _NumberingLevel(index_text, ordered_flag)
215 |
@cobble.data
class _NumberingLevel(object):
    """A numbering level; created by ``numbering_level``."""
    # Stored as a string by numbering_level().
    level_index = cobble.field()
    is_ordered = cobble.field()
220 |
@cobble.data
class Note(Element):
    """A note; ``Notes`` keys notes by ``(note_type, note_id)``."""
    note_type = cobble.field()
    note_id = cobble.field()
    body = cobble.field()


# Lower-case factory alias for Note.
note = Note
229 |
230 |
class Notes(object):
    """Collection of notes keyed by ``(note_type, note_id)``."""

    def __init__(self, notes):
        self._notes = notes

    def find_note(self, note_type, note_id):
        """Return the note stored under ``(note_type, note_id)``.

        The underlying mapping is indexed directly, so a missing key raises
        whatever that mapping raises (``KeyError`` for a ``dict``).
        """
        key = (note_type, note_id)
        return self._notes[key]

    def resolve(self, reference):
        """Return the note that ``reference`` points at."""
        return self.find_note(reference.note_type, reference.note_id)

    def __eq__(self, other):
        if not isinstance(other, Notes):
            return False
        return self._notes == other._notes

    def __ne__(self, other):
        return not (self == other)
246 |
def notes(notes_list):
    """Create a ``Notes`` collection from an iterable of notes.

    When two notes share a key, the later one replaces the earlier one.
    """
    notes_by_key = {}
    for note in notes_list:
        notes_by_key[_note_key(note)] = note
    return Notes(notes_by_key)
252 |
253 | def _note_key(note):
254 | return (note.note_type, note.note_id)
255 |
@cobble.data
class NoteReference(Element):
    """A reference to a note, resolvable with ``Notes.resolve``."""
    note_type = cobble.field()
    note_id = cobble.field()

# Lower-case factory alias for NoteReference.
note_reference = NoteReference
262 |
263 |
@cobble.data
class Comment(object):
    """A comment: its ID, body, and the author's name and initials."""
    comment_id = cobble.field()
    body = cobble.field()
    author_name = cobble.field()
    author_initials = cobble.field()
270 |
def comment(comment_id, body, author_name=None, author_initials=None):
    """Create a ``Comment``; the author details default to ``None``."""
    fields = dict(
        comment_id=comment_id,
        body=body,
        author_name=author_name,
        author_initials=author_initials,
    )
    return Comment(**fields)
278 |
@cobble.data
class CommentReference(Element):
    """A reference to a comment, identified by ``comment_id``."""
    comment_id = cobble.field()

# Lower-case factory alias for CommentReference.
comment_reference = CommentReference
284 |
def element_visitor(args):
    """Return ``cobble.visitor(Element, args=args)``, a visitor base for ``Element`` subclasses."""
    return cobble.visitor(Element, args=args)
287 |
--------------------------------------------------------------------------------
/tests/docx/docx_tests.py:
--------------------------------------------------------------------------------
1 | import io
2 | import textwrap
3 | import zipfile
4 |
5 | from mammoth import docx, documents, zips
6 | from ..testing import assert_equal, assert_raises, generate_test_path
7 |
8 |
class ReadTests(object):
    """Tests for ``docx.read`` against files from the test-data directory."""

    def test_can_read_document_with_single_paragraph_with_single_run_of_text(self):
        docx_path = generate_test_path("single-paragraph.docx")
        with open(docx_path, "rb") as fileobj:
            result = docx.read(fileobj=fileobj)

            only_run = documents.run([
                documents.text("Walking on imported air")
            ])
            expected_document = documents.document([
                documents.paragraph([only_run])
            ])
            assert_equal(expected_document, result.value)
21 |
22 |
# Prefix -> namespace URI for package relationship XML.
# NOTE(review): not referenced by any test visible in this module.
_relationship_namespaces = {
    "r": "http://schemas.openxmlformats.org/package/2006/relationships",
}
26 |
27 |
def test_main_document_is_found_using_package_relationships():
    """The main document is read from the path named by the package relationships.

    The document lives at the non-default path ``word/document2.xml``, so it
    can only be found by following the officeDocument relationship in
    ``_rels/.rels``.
    """
    # The XML literals were blank, which made both parts unparseable; they are
    # reconstructed here from what the assertion below requires.
    fileobj = _create_zip({
        "word/document2.xml": textwrap.dedent("""\
            <?xml version="1.0" encoding="utf-8" standalone="yes"?>
            <w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
                <w:body>
                    <w:p>
                        <w:r>
                            <w:t>Hello.</w:t>
                        </w:r>
                    </w:p>
                </w:body>
            </w:document>
        """),
        "_rels/.rels": textwrap.dedent("""\
            <?xml version="1.0" encoding="utf-8"?>
            <Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
                <Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument" Target="/word/document2.xml"/>
            </Relationships>
        """),
    })
    result = docx.read(fileobj=fileobj)
    expected_document = documents.document([
        documents.paragraph([
            documents.run([
                documents.text("Hello.")
            ])
        ])
    ])
    assert_equal(expected_document, result.value)
58 |
59 |
def test_error_is_raised_when_main_document_part_does_not_exist():
    """Reading fails with ``IOError`` when the main document part is not in the zip."""
    # The relationships XML literal was blank; it is reconstructed so that it
    # names a main document that the zip does not contain.
    fileobj = _create_zip({
        "_rels/.rels": textwrap.dedent("""\
            <?xml version="1.0" encoding="utf-8"?>
            <Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
                <Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument" Target="/word/document.xml"/>
            </Relationships>
        """),
    })
    error = assert_raises(IOError, lambda: docx.read(fileobj=fileobj))
    assert_equal(
        "Could not find main document part. Are you sure this is a valid .docx file?",
        str(error),
    )
74 |
class PartPathsTests(object):
    """Tests for how ``docx._find_part_paths`` locates each part of the package.

    The relationships XML literals were blank, which made those parts
    unparseable; they are reconstructed here from what each assertion
    requires.
    """

    def test_main_document_part_is_found_using_package_relationships(self):
        fileobj = _create_zip({
            "word/document2.xml": " ",
            "_rels/.rels": textwrap.dedent("""\
                <?xml version="1.0" encoding="utf-8"?>
                <Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
                    <Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument" Target="/word/document2.xml"/>
                </Relationships>
            """),
        })
        part_paths = self._find_part_paths(fileobj)
        assert_equal("word/document2.xml", part_paths.main_document)

    def test_when_relationship_for_main_document_cannot_be_found_then_fallback_is_used(self):
        fileobj = _create_zip({
            "word/document.xml": " ",
        })
        part_paths = self._find_part_paths(fileobj)
        assert_equal("word/document.xml", part_paths.main_document)

    def test_comments_part_is_found_using_main_document_relationships(self):
        self._assert_path_is_found_using_main_document_relationships("comments")

    def test_when_relationship_for_comments_cannot_be_found_then_fallback_is_used(self):
        self._assert_when_relationship_for_part_cannot_be_found_then_fallback_is_used("comments")

    def test_endnotes_part_is_found_using_main_document_relationships(self):
        self._assert_path_is_found_using_main_document_relationships("endnotes")

    def test_when_relationship_for_endnotes_cannot_be_found_then_fallback_is_used(self):
        self._assert_when_relationship_for_part_cannot_be_found_then_fallback_is_used("endnotes")

    def test_footnotes_part_is_found_using_main_document_relationships(self):
        self._assert_path_is_found_using_main_document_relationships("footnotes")

    def test_when_relationship_for_footnotes_cannot_be_found_then_fallback_is_used(self):
        self._assert_when_relationship_for_part_cannot_be_found_then_fallback_is_used("footnotes")

    def test_numbering_part_is_found_using_main_document_relationships(self):
        self._assert_path_is_found_using_main_document_relationships("numbering")

    def test_when_relationship_for_numbering_cannot_be_found_then_fallback_is_used(self):
        self._assert_when_relationship_for_part_cannot_be_found_then_fallback_is_used("numbering")

    def test_styles_part_is_found_using_main_document_relationships(self):
        self._assert_path_is_found_using_main_document_relationships("styles")

    def test_when_relationship_for_styles_cannot_be_found_then_fallback_is_used(self):
        self._assert_when_relationship_for_part_cannot_be_found_then_fallback_is_used("styles")

    def _assert_path_is_found_using_main_document_relationships(self, name):
        # The part is stored at a non-default path, so it can only be found by
        # following the main document's relationship of type ``name``.
        fileobj = _create_zip({
            "_rels/.rels": textwrap.dedent("""\
                <?xml version="1.0" encoding="utf-8"?>
                <Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
                    <Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument" Target="/word/document.xml"/>
                </Relationships>
            """),
            "word/document.xml": " ",
            "word/_rels/document.xml.rels": textwrap.dedent("""\
                <?xml version="1.0" encoding="utf-8"?>
                <Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
                    <Relationship Id="rId2" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/{name}" Target="target-path.xml"/>
                </Relationships>
            """.format(name=name)),
            "word/target-path.xml": " "
        })
        part_paths = self._find_part_paths(fileobj)
        assert_equal("word/target-path.xml", getattr(part_paths, name))

    def _assert_when_relationship_for_part_cannot_be_found_then_fallback_is_used(self, name):
        # There is no relationships file for the main document, so the path
        # falls back to ``word/<name>.xml``.
        fileobj = _create_zip({
            "_rels/.rels": textwrap.dedent("""\
                <?xml version="1.0" encoding="utf-8"?>
                <Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
                    <Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument" Target="/word/document.xml"/>
                </Relationships>
            """),
            "word/document.xml": " ",
        })
        part_paths = self._find_part_paths(fileobj)
        assert_equal("word/{0}.xml".format(name), getattr(part_paths, name))

    def _find_part_paths(self, fileobj):
        return docx._find_part_paths(zips.open_zip(fileobj, "r"))
162 |
163 |
def _create_zip(files):
    """Build an in-memory zip archive from a mapping of member names to contents.

    :param files: mapping whose keys are archive member names and whose values
        are the contents written for each member via ``ZipFile.writestr``.
    :return: an ``io.BytesIO`` holding the finished archive, positioned at the
        start so it can be read immediately.
    """
    fileobj = io.BytesIO()

    # The context manager closes the archive even if a write raises, replacing
    # the manual try/finally around zip_file.close().
    with zipfile.ZipFile(fileobj, "w") as zip_file:
        for name, contents in files.items():
            zip_file.writestr(name, contents)

    # Rewind so callers read the archive from the beginning.
    fileobj.seek(0)
    return fileobj
176 |
--------------------------------------------------------------------------------
/NEWS:
--------------------------------------------------------------------------------
1 | # 1.12.0
2 |
3 | * Handle hyperlinked wp:anchor and wp:inline elements.
4 |
5 | # 1.11.0
6 |
7 | * Ignore style definitions using a style ID that has already been used.
8 |
9 | * Fix conversion of unmerged table cells.
10 |
11 | * Disable external file accesses by default. External file access can be enabled
12 | using the external_file_access argument.
13 |
14 | * Handle numbering levels defined without an index.
15 |
16 | # 1.10.0
17 |
18 | * Add "Heading" and "Body" styles, as found in documents created by Apple Pages,
19 | to the default style map.
20 |
21 | * Handle structured document tags representing checkboxes wrapped in other
22 | elements, such as table cells. Previously, the wrapping elements would have
23 | been ignored.
24 |
25 | * Ignore deleted table rows.
26 |
27 | * Add notes on security.
28 |
29 | # 1.9.1
30 |
31 | * Ignore AlternateContent elements when there is no Fallback element.
32 |
33 | # 1.9.0
34 |
35 | * Detect checkboxes, both as complex fields and structured document tags, and
36 | convert them to checkbox inputs.
37 |
38 | * Ignore AlternateContent elements when there is no Fallback element.
39 |
40 | # 1.8.0
41 |
42 | * Add style mapping for highlights.
43 |
44 | # 1.7.1
45 |
46 | * Switch the precedence of numbering properties in paragraph properties and the
47 | numbering in paragraph styles so that the numbering properties in paragraph
48 | properties take precedence.
49 |
50 | # 1.7.0
51 |
52 | * Support attributes in HTML paths in style mappings.
53 |
54 | * Improve error message when failing to find the body element in a document.
55 |
56 | * Drop support for Python 2.7, Python 3.5 and Python 3.6.
57 |
58 | * Add support for the strict document format.
59 |
60 | # 1.6.0
61 |
62 | * Support merged paragraphs when revisions are tracked.
63 |
64 | # 1.5.1
65 |
66 | * Add a pyproject.toml to add an explicit build dependency on setuptools.
67 |
68 | # 1.5.0
69 |
70 | * Only use the alt text of image elements as a fallback. If an alt attribute is
71 | returned from the function passed to mammoth.images.img_element, that value
72 | will now be preferred to the alt text of the image element.
73 |
74 | # 1.4.19
75 |
76 | * Ignore w:u elements when w:val is missing.
77 |
78 | # 1.4.18
79 |
80 | * Emit warning instead of throwing exception when image file cannot be found for
81 | a:blip elements.
82 |
83 | # 1.4.17
84 |
85 | * When extracting raw text, convert tab elements to tab characters.
86 |
87 | * Handle internal hyperlinks created with complex fields.
88 |
89 | # 1.4.16
90 |
91 | * Handle w:num with invalid w:abstractNumId.
92 |
93 | # 1.4.15
94 |
95 | * Convert symbols in supported fonts to corresponding Unicode characters.
96 |
97 | # 1.4.14
98 |
99 | * Support numbering defined by paragraph style.
100 |
101 | # 1.4.13
102 |
103 | * Add style mapping for all caps.
104 |
105 | # 1.4.12
106 |
107 | * Handle underline elements where w:val is "none".
108 |
109 | # 1.4.11
110 |
111 | * Read font size for runs.
112 | * Support soft hyphens.
113 |
114 | # 1.4.10
115 |
116 | * Update supported Python versions to 2.7 and 3.4 to 3.8.
117 |
118 | # 1.4.9
119 |
120 | * Improve list support by following w:numStyleLink in w:abstractNum.
121 |
122 | # 1.4.8
123 |
124 | * Preserve empty table rows.
125 |
126 | # 1.4.7
127 |
128 | * Always write files as UTF-8 in the CLI.
129 |
130 | # 1.4.6
131 |
132 | * Fix: default style mappings caused footnotes, endnotes and comments
133 | containing multiple paragraphs to be converted into a single paragraph.
134 |
135 | # 1.4.5
136 |
137 | * Read the children of v:rect elements.
138 |
139 | # 1.4.4
140 |
141 | * Parse paragraph indents.
142 |
143 | * Read part paths using relationships. This improves support for documents
144 | created by Word Online.
145 |
146 | # 1.4.3
147 |
148 | * Add style mapping for small caps.
149 |
150 | * Add style mapping for tables.
151 |
152 | # 1.4.2
153 |
154 | * Read children of v:group elements.
155 |
156 | # 1.4.1
157 |
158 | * Read w:noBreakHyphen elements as non-breaking hyphen characters.
159 |
160 | # 1.4.0
161 |
162 | * Extract the default data URI image converter to the images module.
163 |
164 | * Add anchor on hyperlinks as fragment if present.
165 |
166 | * Convert target frames on hyperlinks to targets on anchors.
167 |
168 | * Detect header rows in tables and convert to thead > tr > th.
169 |
170 | # 1.3.5
171 |
172 | * Handle complex fields that do not have a "separate" fldChar.
173 |
174 | # 1.3.4
175 |
176 | * Add transforms.run.
177 |
178 | # 1.3.3
179 |
180 | * Read children of w:object elements.
181 |
182 | * Add support for document transforms.
183 |
184 | # 1.3.2
185 |
186 | * Handle hyperlinks created with complex fields.
187 |
188 | # 1.3.1
189 |
190 | * Handle absolute paths within zip files. This should fix an issue where some
191 | images within a document couldn't be found.
192 |
193 | # 1.3.0
194 |
195 | * Allow style names to be mapped by prefix. For instance:
196 |
197 | r[style-name^='Code '] => code
198 |
199 | * Add default style mappings for Heading 5 and Heading 6.
200 |
201 | * Allow escape sequences in style IDs, style names and CSS class names.
202 |
203 | * Allow a separator to be specified when HTML elements are collapsed.
204 |
205 | * Add include_embedded_style_map argument to allow embedded style maps to be
206 | disabled.
207 |
208 | * Include embedded styles when explicit style map is passed.
209 |
210 | # 1.2.2
211 |
212 | * Ignore bold, italic, underline and strikethrough elements that have a value of
213 | false or 0.
214 |
215 | # 1.2.1
216 |
217 | * Ignore v:imagedata elements without a relationship ID, emitting a warning.
218 |
219 | # 1.2.0
220 |
221 | * Use alt text title as alt text for images when the alt text description is
222 | blank or missing.
223 |
224 | # 1.1.1
225 |
226 | * Handle comments without author initials.
227 |
228 | * Change numbering of comments to be global rather than per-user to match the
229 | behaviour of Word.
230 |
231 | # 1.1.0
232 |
233 | * Add support for comments.
234 |
235 | # 1.0.4
236 |
237 | * Add support for w:sdt elements. This allows the bodies of content controls,
238 | such as bibliographies, to be converted.
239 |
240 | # 1.0.3
241 |
242 | * Add support for table cells spanning multiple rows.
243 |
244 | # 1.0.2
245 |
246 | * Add support for table cells spanning multiple columns.
247 |
248 | # 1.0.1
249 |
250 | * Improve script installation on Windows by using entry_points instead of
251 | scripts in setup.py.
252 |
253 | # 1.0.0
254 |
255 | * Remove deprecated convert_underline argument.
256 |
257 | * Officially support ID prefixes.
258 |
259 | * Generated IDs no longer insert a hyphen after the ID prefix.
260 |
261 | * The default ID prefix is now the empty string rather than a random number
262 | followed by a hyphen.
263 |
264 | * Rename mammoth.images.inline to mammoth.images.img_element to better reflect
265 | its behaviour.
266 |
267 | # 0.3.31
268 |
269 | * Improve collapsing of similar non-fresh HTML elements.
270 |
271 | # 0.3.30
272 |
273 | * Allow bold and italic style mappings to be configured.
274 |
275 | # 0.3.29
276 |
277 | * Handle references to missing styles when reading documents.
278 |
279 | # 0.3.28
280 |
281 | * Improve support for lists made in LibreOffice. Specifically, this changes the
282 | default style mapping for paragraphs with a style of "Normal" to have the
283 | lowest precedence.
284 |
285 | # 0.3.27
286 |
287 | * Handle XML where the child nodes of an element contain text nodes.
288 |
289 | # 0.3.26
290 |
291 | * Always use mc:Fallback when reading mc:AlternateContent elements.
292 |
293 | # 0.3.25
294 |
295 | * Remove duplicate messages from results.
296 |
297 | * Read v:imagedata with r:id attribute.
298 |
299 | * Read children of v:roundrect.
300 |
301 | * Ignore office-word:wrap, v:shadow and v:shapetype.
302 |
303 | # 0.3.24
304 |
305 | * Continue with warning if external images cannot be found.
306 |
307 | * Add support for embedded style maps.
308 |
309 | # 0.3.23
310 |
311 | * Fix Python 3 support.
312 |
313 | # 0.3.22
314 |
315 | * Generate warnings for not-understood style mappings and continue, rather than
316 | stopping with an error.
317 |
318 | * Support file objects without a name attribute again (broken since 0.3.20).
319 |
320 | # 0.3.21
321 |
322 | * Ignore w:numPr elements without w:numId or w:ilvl children.
323 |
324 | # 0.3.20
325 |
326 | * Add support for linked images.
327 |
328 | # 0.3.19
329 |
330 | * Fix: cannot extract raw text from elements without children
331 |
332 | # 0.3.18
333 |
334 | * Support links and images in footnotes and endnotes.
335 |
336 | # 0.3.17
337 |
338 | * Add support for underlines in style map.
339 |
340 | * Add support for strikethrough.
341 |
342 | # 0.3.16
343 |
344 | * Add basic support for text boxes. The contents of the text box are treated as
345 | a separate paragraph that appears after the paragraph containing the text box.
346 |
347 | # 0.3.15
348 |
349 | * Support styles defined without a name
350 |
351 | # 0.3.14
352 |
353 | * Add ignore_empty_paragraphs option, which defaults to True.
354 |
355 | # 0.3.13
356 |
357 | * Always use forward slashes in ZIP paths. This should fix image handling on
358 | Windows.
359 |
360 | # 0.3.12
361 |
362 | * Make style names case-insensitive in style mappings. This should make style
363 | mappings easier to write, especially since Microsoft Word sometimes represents
364 | style names in the UI differently from in the style definition. For instance,
365 | the style displayed in Word as "Heading 1" has a style name of "heading 1".
366 | This hopefully shouldn't cause an issue for anyone, but if you were relying
367 | on case-sensitivity, please do get in touch.
368 |
369 | # 0.3.11
370 |
371 | * Add support for hyperlinks to bookmarks in the same document.
372 |
373 | # 0.3.10
374 |
375 | * Add basic support for Markdown. Not all features are currently supported.
376 |
377 | # 0.3.9
378 |
379 | * Add default style mappings for builtin footnote and endnote styles in
380 | Microsoft Word and LibreOffice.
381 |
382 | * Allow style mappings with a zero-element HTML path.
383 |
384 | * Emit warnings when image types are unlikely to be supported by web browsers.
385 |
386 | # 0.3.8
387 |
388 | * Add support for endnotes.
389 |
390 | # 0.3.7
391 |
392 | * Add support for superscript and subscript text.
393 |
394 | # 0.3.6
395 |
396 | * Add support for footnotes.
397 |
398 | # 0.3.5
399 |
400 | * Add support for line breaks.
401 |
402 | # 0.3.4
403 |
404 | * Add optional underline conversion.
405 |
406 | # 0.3.3
407 |
408 | * Add `mammoth.images.inline`, and document custom image conversion.
409 |
410 | # 0.3.2
411 |
412 | * Add the function `mammoth.extract_raw_text`.
413 |
414 | # 0.3.1
415 |
416 | * Add support for tables
417 |
418 | # 0.3.0
419 |
420 | * Rename --styles CLI argument to --style-map.
421 |
422 | * Rename styles argument in convert_to_html to style_map.
423 |
424 | * Allow paragraphs and runs to be matched by style name. For instance, to match
425 | a paragraph with the style name `Heading 1`:
426 |
427 | p[style-name='Heading 1']
428 |
--------------------------------------------------------------------------------