├── tests ├── __init__.py ├── docx │ ├── __init__.py │ ├── uris_tests.py │ ├── document_matchers.py │ ├── notes_xml_tests.py │ ├── office_xml_tests.py │ ├── comments_xml_tests.py │ ├── document_xml_tests.py │ ├── relationships_xml_tests.py │ ├── content_types_xml_tests.py │ ├── files_tests.py │ ├── xmlparser_tests.py │ ├── style_map_tests.py │ ├── styles_xml_tests.py │ ├── numbering_xml_tests.py │ └── docx_tests.py ├── html │ ├── __init__.py │ ├── strip_empty_tests.py │ └── collapse_tests.py ├── styles │ ├── __init__.py │ ├── parser │ │ ├── __init__.py │ │ ├── style_mapping_parser_tests.py │ │ ├── token_parser_tests.py │ │ ├── html_path_parser_tests.py │ │ ├── tokeniser_tests.py │ │ └── document_matcher_parser_tests.py │ └── document_matcher_tests.py ├── writers │ ├── __init__.py │ └── markdown_tests.py ├── test-data │ ├── empty.docx │ ├── tables.docx │ ├── comments.docx │ ├── endnotes.docx │ ├── footnotes.docx │ ├── text-box.docx │ ├── underline.docx │ ├── utf8-bom.docx │ ├── simple-list.docx │ ├── tiny-picture.docx │ ├── tiny-picture.png │ ├── strict-format.docx │ ├── strikethrough.docx │ ├── external-picture.docx │ ├── single-paragraph.docx │ ├── embedded-style-map.docx │ ├── footnote-hyperlink.docx │ ├── tiny-picture-target-base-relative.docx │ ├── hyperlinks │ │ └── word │ │ │ ├── _rels │ │ │ └── document.xml.rels │ │ │ └── document.xml │ └── simple │ │ └── word │ │ └── document.xml ├── conftest.py ├── lists_tests.py ├── testing.py ├── zips_tests.py ├── options_tests.py ├── raw_text_tests.py ├── images_tests.py ├── cli_tests.py └── transforms_tests.py ├── setup.cfg ├── mammoth ├── styles │ ├── parser │ │ ├── errors.py │ │ ├── __init__.py │ │ ├── style_mapping_parser.py │ │ ├── token_parser.py │ │ ├── tokeniser.py │ │ ├── token_iterator.py │ │ ├── html_path_parser.py │ │ └── document_matcher_parser.py │ └── __init__.py ├── underline.py ├── docx │ ├── uris.py │ ├── complex_fields.py │ ├── document_xml.py │ ├── comments_xml.py │ ├── notes_xml.py │ ├── 
relationships_xml.py │ ├── files.py │ ├── content_types_xml.py │ ├── office_xml.py │ ├── style_map.py │ ├── xmlparser.py │ ├── styles_xml.py │ ├── numbering_xml.py │ └── __init__.py ├── writers │ ├── __init__.py │ ├── abc.py │ ├── html.py │ └── markdown.py ├── raw_text.py ├── images.py ├── lists.py ├── results.py ├── html │ ├── nodes.py │ └── __init__.py ├── html_paths.py ├── transforms.py ├── __init__.py ├── zips.py ├── document_matchers.py ├── cli.py ├── options.py └── documents.py ├── .gitignore ├── pyproject.toml ├── test-requirements.txt ├── .github ├── pull_request_template.md ├── ISSUE_TEMPLATE.md └── workflows │ └── tests.yml ├── tox.ini ├── makefile ├── LICENSE ├── setup.py ├── recipes └── wmf_images.py └── NEWS /tests/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /tests/docx/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/html/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /tests/styles/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/styles/parser/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/writers/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | 
class LineParseError(Exception):
    """Error signalling that a line of a style map could not be parsed."""
-------------------------------------------------------------------------------- /tests/test-data/footnotes.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mwilliamson/python-mammoth/HEAD/tests/test-data/footnotes.docx -------------------------------------------------------------------------------- /tests/test-data/text-box.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mwilliamson/python-mammoth/HEAD/tests/test-data/text-box.docx -------------------------------------------------------------------------------- /tests/test-data/underline.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mwilliamson/python-mammoth/HEAD/tests/test-data/underline.docx -------------------------------------------------------------------------------- /tests/test-data/utf8-bom.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mwilliamson/python-mammoth/HEAD/tests/test-data/utf8-bom.docx -------------------------------------------------------------------------------- /tests/test-data/simple-list.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mwilliamson/python-mammoth/HEAD/tests/test-data/simple-list.docx -------------------------------------------------------------------------------- /tests/test-data/tiny-picture.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mwilliamson/python-mammoth/HEAD/tests/test-data/tiny-picture.docx -------------------------------------------------------------------------------- /tests/test-data/tiny-picture.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/mwilliamson/python-mammoth/HEAD/tests/test-data/tiny-picture.png -------------------------------------------------------------------------------- /tests/test-data/strict-format.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mwilliamson/python-mammoth/HEAD/tests/test-data/strict-format.docx -------------------------------------------------------------------------------- /tests/test-data/strikethrough.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mwilliamson/python-mammoth/HEAD/tests/test-data/strikethrough.docx -------------------------------------------------------------------------------- /tests/test-data/external-picture.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mwilliamson/python-mammoth/HEAD/tests/test-data/external-picture.docx -------------------------------------------------------------------------------- /tests/test-data/single-paragraph.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mwilliamson/python-mammoth/HEAD/tests/test-data/single-paragraph.docx -------------------------------------------------------------------------------- /test-requirements.txt: -------------------------------------------------------------------------------- 1 | funk>=0.4,<0.5 2 | pytest 3 | precisely==0.1.3 4 | pyflakes==2.4.0 5 | spur.local>=0.3.7,<0.4 6 | tempman>=0.1.2,<0.2 7 | -------------------------------------------------------------------------------- /tests/test-data/embedded-style-map.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mwilliamson/python-mammoth/HEAD/tests/test-data/embedded-style-map.docx 
@pytest.fixture(name="mocks")
def _fixture_mocks():
    """Provide a fresh ``funk.Mocks`` object to tests under the fixture name ``mocks``."""
    mocks = funk.Mocks()
    # Hand the object to the requesting test; the code after the yield runs
    # once that test has finished.
    yield mocks
    # NOTE(review): presumably checks that the expectations set on the mocks
    # were satisfied -- confirm against the funk documentation.
    mocks.verify()
def style(document_matcher, html_path):
    """Build a ``Style`` pairing a document matcher with an HTML path."""
    return Style(document_matcher=document_matcher, html_path=html_path)
def replace_fragment(uri, fragment):
    """Return *uri* with its fragment replaced by *fragment*.

    Everything from the first ``#`` onwards is dropped before the new
    fragment is appended; a URI without a ``#`` simply gains one.
    """
    without_fragment, _, _ = uri.partition("#")
    return without_fragment + "#" + fragment
def assert_raises(exception, func):
    """Call *func* and return the *exception* instance it raises.

    Args:
        exception: Exception class (or tuple of classes) expected from *func*.
        func: Zero-argument callable to invoke.

    Returns:
        The caught exception instance, so callers can inspect it further.

    Raises:
        AssertionError: If *func* returns without raising *exception*.
    """
    try:
        func()
    except exception as error:
        return error
    else:
        # Raised outside the ``try`` so the failure cannot be swallowed by the
        # handler above when ``exception`` is AssertionError or one of its
        # base classes; an explicit raise also survives ``python -O``.
        raise AssertionError("Expected " + exception.__name__)
def read_style_mapping(string):
    """Parse a single style mapping string into a result.

    On success the result wraps the parsed style mapping. If parsing raises
    ``LineParseError``, the result's value is ``None`` and it carries one
    warning that quotes the offending string.
    """
    try:
        style = parse_style_mapping(TokenIterator(tokenise(string)))
    except LineParseError:
        message = "Did not understand this style mapping, so ignored it: " + string
        return results.Result(None, [results.warning(message)])
    else:
        return results.success(style)
class Writer(abc.ABC):
    """Abstract base class declaring the methods every writer must provide.

    The class derives from ``abc.ABC`` so that the ``@abc.abstractmethod``
    markers are actually enforced: the Python 2 style
    ``__metaclass__ = abc.ABCMeta`` attribute is ignored on Python 3, which
    left the abstract methods unenforced. Subclasses must override every
    method below before they can be instantiated.
    """

    @abc.abstractmethod
    def text(self, text):
        pass

    @abc.abstractmethod
    def start(self, name, attributes=None):
        pass

    @abc.abstractmethod
    def end(self, name):
        pass

    @abc.abstractmethod
    def self_closing(self, name, attributes=None):
        pass

    @abc.abstractmethod
    def append(self, html):
        pass

    @abc.abstractmethod
    def as_string(self):
        pass
def read_document_xml_element(element, body_reader, notes=None, comments=None):
    """Read the ``w:body`` child of *element* into a document result.

    Args:
        element: XML element searched for a ``w:body`` child.
        body_reader: Object whose ``read_all`` is applied to the body's children.
        notes: Notes to attach to the document; defaults to an empty list.
        comments: Comments to attach to the document; defaults to an empty list.

    Returns:
        The result of ``body_reader.read_all`` mapped to a document built
        from the children it read.

    Raises:
        ValueError: If *element* has no ``w:body`` child.
    """
    notes = [] if notes is None else notes
    comments = [] if comments is None else comments

    body_element = element.find_child("w:body")
    if body_element is None:
        raise ValueError("Could not find the body element: are you sure this is a docx file?")

    def to_document(children):
        return documents.document(
            children,
            notes=documents.notes(notes),
            comments=comments,
        )

    return body_reader.read_all(body_element.children).map(to_document)
def img_element(func):
    """Wrap *func* as an image converter that emits a single ``img`` element.

    *func* is called with the image and must return a dict of attributes.
    Those attributes are merged over an ``alt`` attribute, which is only
    present when the image's ``alt_text`` is truthy.
    """
    def convert_image(image):
        alt_text = image.alt_text
        attributes = {"alt": alt_text} if alt_text else {}
        # Attributes from func take precedence over the default alt text.
        attributes.update(func(image))
        return [html.element("img", attributes)]

    return convert_image
def find_index(predicate, values):
    """Return the index of the first value satisfying *predicate*, or None."""
    matching_positions = (
        position
        for position, candidate in enumerate(values)
        if predicate(candidate)
    )
    return next(matching_positions, None)
def _decode_escape_sequence(match):
    """Return the character denoted by the escape code captured in *match*.

    ``n``, ``r`` and ``t`` decode to line feed, carriage return and tab;
    any other escaped character decodes to itself.
    """
    special_characters = {"n": "\n", "r": "\r", "t": "\t"}
    code = match.group(1)
    return special_characters.get(code, code)
def combine(results):
    """Merge several results into one.

    The combined value is the list of the individual values, in order, and
    the combined messages are the individual messages concatenated in order.
    """
    values, messages = [], []
    # Single pass so that one-shot iterables of results are supported.
    for result in results:
        values.append(result.value)
        messages.extend(result.messages)

    return Result(values, messages)
def _read_notes(note_type, element, body_reader):
    """Read the ``w:<note_type>`` children of *element* into a combined result.

    Children whose ``w:type`` attribute is ``continuationSeparator`` or
    ``separator`` are skipped. Each remaining child is read with
    *body_reader* and turned into a note carrying *note_type*, the child's
    ``w:id`` attribute and the body that was read.
    """
    separator_types = ("continuationSeparator", "separator")

    def is_note(candidate):
        return candidate.attributes.get("w:type") not in separator_types

    def read_note(note_element):
        def to_note(body):
            return documents.note(
                note_type=note_type,
                note_id=note_element.attributes["w:id"],
                body=body,
            )

        return body_reader.read_all(note_element.children).map(to_note)

    note_elements = lists.filter(is_note, element.find_children("w:" + note_type))
    return results.combine(lists.map(read_note, note_elements))
def test_when_joining_paths_then_absolute_paths_ignore_earlier_paths():
    """An absolute part discards every part that comes before it."""
    cases = [
        ("/b", ("a", "/b")),
        ("/b/c", ("a", "/b", "c")),
        ("/b", ("/a", "/b")),
        ("/a", ("/a",)),
    ]
    for expected, parts in cases:
        assert_equal(expected, zips.join_path(*parts))
def test_escape_sequences_in_identifiers_are_decoded():
    """An escaped colon inside an identifier token is parsed as a plain colon."""
    tokens = TokenIterator([Token(0, TokenType.IDENTIFIER, r"\:")])
    parsed = parse_identifier(tokens)
    assert_equal(":", parsed)
@cobble.data
class Element(Node):
    """An HTML element node: a tag description plus a list of child nodes.

    The tag-level attributes (names, attributes, collapsibility, separator)
    are exposed as read-only properties that delegate to ``self.tag``.
    """

    tag = cobble.field()
    children = cobble.field()

    @property
    def tag_name(self):
        """The primary tag name, as reported by the underlying tag."""
        return self.tag.tag_name

    @property
    def tag_names(self):
        """All tag names held by the underlying tag."""
        return self.tag.tag_names

    @property
    def attributes(self):
        """The attributes of the underlying tag."""
        return self.tag.attributes

    @property
    def collapsible(self):
        """The ``collapsible`` flag of the underlying tag."""
        return self.tag.collapsible

    @property
    def separator(self):
        """The ``separator`` of the underlying tag."""
        return self.tag.separator

    # Tag names that count as void when the element has no children.
    _VOID_TAG_NAMES = {"br", "hr", "img", "input"}

    def is_void(self):
        """Return True for a childless element whose tag name is a void tag."""
        if self.children:
            return False
        return self.tag_name in self._VOID_TAG_NAMES
class Relationships(object):
    """Lookup table over a collection of relationships.

    Each relationship must expose ``relationship_id``, ``target`` and ``type``
    attributes. Targets can be fetched by relationship ID or listed by
    relationship type.
    """

    def __init__(self, relationships):
        """Index ``relationships``, which may be any iterable.

        Both indexes are built in a single pass, so a one-shot iterable such
        as a generator is indexed completely. When an ID occurs more than
        once, the last occurrence wins.
        """
        self._targets_by_id = {}
        self._targets_by_type = collections.defaultdict(list)
        for relationship in relationships:
            self._targets_by_id[relationship.relationship_id] = relationship.target
            self._targets_by_type[relationship.type].append(relationship.target)

    def find_target_by_relationship_id(self, key):
        """Return the target for relationship ID ``key``.

        Raises:
            KeyError: if no relationship has that ID.
        """
        return self._targets_by_id[key]

    def find_targets_by_type(self, relationship_type):
        """Return a list of the targets of type ``relationship_type``.

        Targets are listed in the order they were supplied; the list is empty
        when the type is unknown.
        """
        # Use .get() rather than indexing: indexing a defaultdict would insert
        # an empty entry for every unknown type that is queried, which grows
        # shared instances without bound. A copy is returned so callers cannot
        # modify the index.
        return list(self._targets_by_type.get(relationship_type, ()))
def _escape_html(text):
    """Escape ``text`` for use as HTML text or a double-quoted attribute value.

    ``&``, ``<`` and ``>`` are escaped by ``xml.sax.saxutils.escape``; the
    extra entity mapping also replaces double quotes, which is required
    because attribute values are written between double quotes.
    """
    return escape(text, {'"': "&quot;"})
12 | 13 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 14 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 15 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 16 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR 17 | ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 18 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 19 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 20 | ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 21 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 22 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 23 | -------------------------------------------------------------------------------- /mammoth/html_paths.py: -------------------------------------------------------------------------------- 1 | import cobble 2 | 3 | from . 
def element(names, attributes=None, class_names=None, fresh=None, separator=None):
    """Build an ``HtmlPathElement`` describing one element of an HTML path.

    Args:
        names: tag names, passed to ``html.tag`` as ``tag_names``.
        attributes: optional mapping of attribute names to values. It is
            copied, so the caller's mapping is never modified.
        class_names: optional list of class names; when non-empty they are
            joined with spaces and stored as the ``class`` attribute.
        fresh: when truthy the tag is created as non-collapsible.
        separator: passed through to ``html.tag`` unchanged.
    """
    # Copy before adding "class": writing into the caller's dict would leak
    # the class attribute into any other use of that same dict.
    attributes = {} if attributes is None else dict(attributes)
    if class_names:
        attributes["class"] = " ".join(class_names)

    return HtmlPathElement(html.tag(
        tag_names=names,
        attributes=attributes,
        # ``fresh`` defaults to None, which negates to True just like False.
        collapsible=not fresh,
        separator=separator,
    ))
def read(fname):
    """Return the text of ``fname``, resolved relative to this file's directory."""
    path = os.path.join(os.path.dirname(__file__), fname)
    # A context manager closes the handle deterministically instead of leaving
    # it to garbage collection.
    with open(path) as fileobj:
        return fileobj.read()
def test_text_element_is_converted_to_text_content():
    """Extracting raw text from a text element returns its value unchanged."""
    extracted = extract_raw_text_from_element(documents.Text("Hello."))
    assert_equal("Hello.", extracted)
def test_paragraphs_are_terminated_with_newlines():
    """A paragraph's raw text is its children's text followed by two newlines."""
    texts = [
        documents.Text("Hello "),
        documents.Text("world."),
    ]
    paragraph = documents.paragraph(children=texts)

    assert_equal("Hello world.\n\n", extract_raw_text_from_element(paragraph))
def _each_element(transform_element):
    """Lift ``transform_element`` to a transform over a whole element tree.

    The returned function transforms the children of an element first, copies
    the element with the transformed children, and only then applies
    ``transform_element`` to that copy. Elements that are neither
    ``documents.HasChildren`` nor ``documents.TableCellUnmerged`` are passed
    to ``transform_element`` directly.
    """
    def transform_element_and_children(element):
        is_container = isinstance(
            element,
            (documents.HasChildren, documents.TableCellUnmerged),
        )
        if not is_container:
            return transform_element(element)

        transformed_children = [
            transform_element_and_children(child)
            for child in element.children
        ]
        return transform_element(element.copy(children=transformed_children))

    return transform_element_and_children
def test_empty_children_are_removed():
    """Empty children are dropped while non-empty siblings are kept."""
    # Expected value first, then the actual value, matching the argument order
    # used by the other tests in this module.
    assert_equal(
        [html.element("ul", {}, [
            html.element("li", {}, [html.text("H")])
        ])],
        html.strip_empty([html.element("ul", {}, [
            html.element("li", {}, [html.text("")]),
            html.element("li", {}, [html.text("H")]),
        ])]))
def convert(
    fileobj,
    transform_document=None,
    id_prefix=None,
    include_embedded_style_map=_undefined,
    external_file_access=_undefined,
    **kwargs
):
    """Convert the docx document in ``fileobj`` and return the result.

    Args:
        fileobj: file object holding the document.
        transform_document: optional function applied to the document that was
            read, before conversion; defaults to the identity function.
        id_prefix: passed through to the converter.
        include_embedded_style_map: when left unset or truthy, the style map
            read from ``fileobj`` is passed on as the ``embedded_style_map``
            option.
        external_file_access: passed to ``docx.read``; treated as False when
            left unset.
        **kwargs: remaining options, interpreted by ``options.read_options``.
    """
    def identity(document):
        return document

    if include_embedded_style_map is _undefined:
        include_embedded_style_map = True
    if external_file_access is _undefined:
        external_file_access = False
    apply_transform = identity if transform_document is None else transform_document

    if include_embedded_style_map:
        kwargs["embedded_style_map"] = read_style_map(fileobj)

    def convert_with_options(convert_options):
        def render(document):
            return conversion.convert_document_element_to_html(
                document,
                id_prefix=id_prefix,
                **convert_options
            )

        read_result = docx.read(fileobj, external_file_access=external_file_access)
        return read_result.map(apply_transform).bind(render)

    return options.read_options(kwargs).bind(convert_with_options)
class _ContentTypes(object):
    """Resolve the content type of a part from overrides, defaults or extension."""

    # Lower-case image file extensions mapped to the subtype used after "image/".
    _image_content_types = {
        "png": "png",
        "gif": "gif",
        "jpeg": "jpeg",
        "jpg": "jpeg",
        "tif": "tiff",
        "tiff": "tiff",
        "bmp": "bmp",
    }

    def __init__(self, extension_defaults, overrides):
        self._extension_defaults = extension_defaults
        self._overrides = overrides

    def find_content_type(self, path):
        """Return the content type for ``path``, or None when it is unknown.

        Lookup order: an explicit override for the path, then the default
        registered for the path's extension, then the built-in image types
        (matched on the lower-cased extension).
        """
        overrides = self._overrides
        if path in overrides:
            return overrides[path]

        extension = _get_extension(path)
        declared_type = self._extension_defaults.get(extension)
        if declared_type is not None:
            return declared_type

        image_subtype = self._image_content_types.get(extension.lower())
        if image_subtype is None:
            return None
        return "image/" + image_subtype
def regex_tokeniser(rules):
    """Build a tokenising function from ``(token_type, regex)`` rules.

    Each regex may be a pattern string or an object with a ``match`` method.
    Rules are tried in order at the current position and the first one that
    matches produces the token. A final catch-all rule yields an "unknown"
    token for any single character matched by ``.``. The returned token list
    always ends with a ``TokenType.END`` token.
    """
    compiled_rules = [
        (token_type, _to_regex(pattern))
        for token_type, pattern in rules
    ]
    compiled_rules.append(("unknown", re.compile(".")))

    def first_match(value, position):
        # Returns (token_type, match) for the first matching rule, else None.
        for token_type, pattern in compiled_rules:
            found = pattern.match(value, position)
            if found is not None:
                return token_type, found
        return None

    def tokenise(value):
        tokens = []
        position = 0
        while position < len(value):
            hit = first_match(value, position)
            if hit is None:
                # Should be impossible
                raise Exception("Remaining: " + value[position:])
            token_type, found = hit
            tokens.append(Token(position, token_type, found.group(0)))
            position = found.end()

        tokens.append(Token(position, TokenType.END, ""))
        return tokens

    return tokenise
def test_when_optional_attributes_of_comment_are_not_blank_then_they_are_read():
    """Non-blank author and initials attributes are read onto the comment."""
    comment_element = xml_element(
        "w:comment",
        {"w:id": "1", "w:author": "The Piemaker", "w:initials": "TP"},
        [],
    )
    comments_element = xml_element("w:comments", {}, [comment_element])

    comments = read_comments_xml_element(comments_element, body_reader=body_xml.reader())

    # Unpacking also checks that exactly one comment was read.
    [comment] = comments.value
    assert_equal("The Piemaker", comment.author_name)
    assert_equal("TP", comment.author_initials)
def _collapse_alternate_content(node):
    """Replace ``mc:AlternateContent`` elements with their fallback content.

    Returns the list of nodes that should take the place of ``node``:

    * a node that is not an ``XmlElement`` is returned unchanged;
    * an ``mc:AlternateContent`` element is replaced by the collapsed children
      of its ``mc:Fallback`` child (``find_child_or_null`` supplies the
      stand-in when there is no fallback);
    * any other element has its children collapsed in place and is returned
      as the only item.
    """
    if not isinstance(node, XmlElement):
        return [node]

    if node.name == "mc:AlternateContent":
        fallback = node.find_child_or_null("mc:Fallback")
        # Collapse the fallback's children as well, so that alternate content
        # nested inside a fallback does not survive in the tree.
        return flat_map(_collapse_alternate_content, fallback.children)

    node.children = flat_map(_collapse_alternate_content, node.children)
    return [node]
class TokenIterator(object):
    """Cursor over a sequence of tokens.

    Each token is expected to expose ``type`` and ``value`` attributes.  The
    iterator keeps a single index into ``tokens``; the ``try_*`` and
    ``is_next`` methods only probe, while ``next_value`` and ``skip`` consume
    a token or raise ``LineParseError``.
    """

    def __init__(self, tokens):
        self._tokens = tokens
        self._index = 0

    def peek_token_type(self):
        """Return the type of the current token without consuming it.

        Raises ``LineParseError`` if every token has already been consumed.
        """
        return self._current(None).type

    def next_value(self, token_type=None):
        """Consume the current token and return its value.

        If ``token_type`` is given, the current token must have that type,
        otherwise ``LineParseError`` is raised and nothing is consumed.
        """
        return self._next(token_type).value

    def _next(self, token_type=None):
        token = self._current(token_type)
        if token_type is None or token.type == token_type:
            self._index += 1
            return token
        raise self._unexpected_token_type(token_type, token)

    def skip(self, token_type, token_value=None):
        """Consume the current token, which must match, and return True.

        Raises ``LineParseError`` (consuming nothing) if the current token
        does not match ``token_type`` / ``token_value``.
        """
        token = self._current(token_type)
        if self._matches(token, token_type, token_value):
            self._index += 1
            return True
        raise self._unexpected_token_type(token_type, token)

    def try_skip(self, token_type, token_value=None):
        """Consume the current token if it matches; return whether it did."""
        if self.is_next(token_type, token_value):
            self._index += 1
            return True
        return False

    def try_skip_many(self, tokens):
        """Consume a run of tokens if all of them match, else consume nothing.

        ``tokens`` is an iterable of ``(token_type, token_value)`` pairs where
        a ``token_value`` of ``None`` matches any value.  Returns True when the
        whole run matched; returns False (leaving the position untouched) on
        the first mismatch or if the input runs out first.
        """
        # Work on a local index and only commit it once everything matched,
        # so that a partial match never moves the iterator.
        index = self._index
        for token_type, token_value in tokens:
            if index >= len(self._tokens):
                return False
            if not self._matches(self._tokens[index], token_type, token_value):
                return False
            index += 1

        self._index = index
        return True

    def is_next(self, token_type, token_value=None):
        """Return whether the current token matches, without consuming it.

        Returns False when there are no tokens left.
        """
        if self._index >= len(self._tokens):
            return False
        return self._matches(self._tokens[self._index], token_type, token_value)

    @staticmethod
    def _matches(token, token_type, token_value):
        # A token_value of None acts as a wildcard for the value.
        return token.type == token_type and (token_value is None or token.value == token_value)

    def _current(self, expected_type):
        # Turn running off the end of the token list into a parse error
        # rather than an IndexError.
        if self._index >= len(self._tokens):
            if expected_type is None:
                raise LineParseError("Unexpected end of input")
            raise LineParseError(
                "Expected token of type {0} but reached end of input".format(expected_type)
            )
        return self._tokens[self._index]

    def _unexpected_token_type(self, token_type, token):
        # Built and returned (rather than raised) so callers can write
        # ``raise self._unexpected_token_type(...)``.
        return LineParseError(
            "Expected token of type {0} but was of type {1} {2!r}".format(
                token_type, token.type, token.value
            )
        )
def open_zip(fileobj, mode):
    """Open ``fileobj`` as a zip archive and wrap it in a ``_Zip`` helper.

    ``mode`` is passed straight through to ``zipfile.ZipFile``.
    """
    return _Zip(ZipFile(fileobj, mode))


class _Zip(object):
    """Thin context-manager wrapper around a ``ZipFile``."""

    def __init__(self, zip_file):
        self._zip_file = zip_file

    def __enter__(self):
        return self

    def __exit__(self, *args):
        self._zip_file.close()

    def open(self, name):
        """Open the member ``name``; the result closes the member on exit."""
        return contextlib.closing(self._zip_file.open(name))

    def exists(self, name):
        """Return whether the archive has a member called ``name``."""
        try:
            self._zip_file.getinfo(name)
            return True
        except KeyError:
            return False

    def read_str(self, name):
        """Read the member ``name`` and decode it as UTF-8 text."""
        return self._zip_file.read(name).decode("utf8")


def update_zip(fileobj, files):
    """Rewrite the zip archive in ``fileobj`` in place.

    ``files`` maps member names to their new contents.  Members of the
    existing archive that are not in ``files`` are copied across unchanged;
    names in ``files`` that are not yet in the archive are added.  Existing
    members keep their original order and new members are appended after
    them.

    ``fileobj`` must be readable, writable and seekable.
    """
    source = ZipFile(fileobj, "r")
    try:
        destination_fileobj = io.BytesIO()
        destination = ZipFile(destination_fileobj, "w")
        try:
            # Deduplicate while preserving order so the output is
            # deterministic rather than following set iteration order.
            names = []
            seen = set()
            for name in list(source.namelist()) + list(files.keys()):
                if name not in seen:
                    seen.add(name)
                    names.append(name)

            for name in names:
                if name in files:
                    contents = files[name]
                else:
                    contents = source.read(name)
                destination.writestr(name, contents)
        finally:
            destination.close()
    finally:
        source.close()

    fileobj.seek(0)
    destination_fileobj.seek(0)
    shutil.copyfileobj(destination_fileobj, fileobj)
    # The new archive may be shorter than the old one.  Without truncating,
    # the tail of the old archive (including its end-of-central-directory
    # record) would remain and the file would no longer be a valid zip.
    fileobj.truncate()


def split_path(path):
    """Split ``path`` at its last ``/`` into ``(directory, basename)``.

    A path without any ``/`` yields an empty directory part.
    """
    parts = path.rsplit("/", 1)
    if len(parts) == 1:
        return ("", path)
    else:
        return tuple(parts)


def join_path(*args):
    """Join path segments with ``/``, ignoring empty segments.

    A segment starting with ``/`` is treated as absolute and discards every
    segment that came before it.
    """
    non_empty_paths = list(filter(None, args))

    relevant_paths = []
    for path in non_empty_paths:
        if path.startswith("/"):
            relevant_paths = [path]
        else:
            relevant_paths.append(path)

    return "/".join(relevant_paths)
mammoth.docx.document_xml import read_document_xml_element 6 | from mammoth.docx import body_xml 7 | from ..testing import assert_equal 8 | 9 | 10 | def test_when_body_element_is_present_then_body_is_read(): 11 | text_xml = xml_element("w:t", {}, [xml_text("Hello!")]) 12 | run_xml = xml_element("w:r", {}, [text_xml]) 13 | paragraph_xml = xml_element("w:p", {}, [run_xml]) 14 | body_xml = xml_element("w:body", {}, [paragraph_xml]) 15 | document_xml = xml_element("w:document", {}, [body_xml]) 16 | 17 | document = _read_and_get_document_xml_element(document_xml) 18 | 19 | assert_equal( 20 | documents.document([documents.paragraph([documents.run([documents.text("Hello!")])])]), 21 | document 22 | ) 23 | 24 | 25 | def test_when_body_element_is_not_present_then_error_is_raised(): 26 | paragraph_xml = xml_element("w:p", {}, []) 27 | body_xml = xml_element("w:body2", {}, [paragraph_xml]) 28 | document_xml = xml_element("w:document", {}, [body_xml]) 29 | 30 | error = pytest.raises(ValueError, lambda: _read_and_get_document_xml_element(document_xml)) 31 | 32 | assert_equal(str(error.value), "Could not find the body element: are you sure this is a docx file?") 33 | 34 | 35 | def test_footnotes_of_document_are_read(): 36 | notes = [documents.note("footnote", "4", [documents.paragraph([])])] 37 | 38 | body_xml = xml_element("w:body") 39 | document_xml = xml_element("w:document", {}, [body_xml]) 40 | 41 | document = _read_and_get_document_xml_element(document_xml, notes=notes) 42 | footnote = document.notes.find_note("footnote", "4") 43 | assert_equal("4", footnote.note_id) 44 | assert isinstance(footnote.body[0], documents.Paragraph) 45 | 46 | 47 | def _read_and_get_document_xml_element(*args, **kwargs): 48 | body_reader = body_xml.reader() 49 | result = read_document_xml_element(*args, body_reader=body_reader, **kwargs) 50 | assert_equal([], result.messages) 51 | return result.value 52 | -------------------------------------------------------------------------------- 
/mammoth/document_matchers.py: -------------------------------------------------------------------------------- 1 | import collections 2 | 3 | import cobble 4 | 5 | 6 | def paragraph(style_id=None, style_name=None, numbering=None): 7 | return ParagraphMatcher(style_id, style_name, numbering) 8 | 9 | 10 | ParagraphMatcher = collections.namedtuple("ParagraphMatcher", ["style_id", "style_name", "numbering"]) 11 | ParagraphMatcher.element_type = "paragraph" 12 | 13 | 14 | def run(style_id=None, style_name=None): 15 | return RunMatcher(style_id, style_name) 16 | 17 | 18 | RunMatcher = collections.namedtuple("RunMatcher", ["style_id", "style_name"]) 19 | RunMatcher.element_type = "run" 20 | 21 | 22 | def table(style_id=None, style_name=None): 23 | return TableMatcher(style_id, style_name) 24 | 25 | 26 | TableMatcher = collections.namedtuple("TableMatcher", ["style_id", "style_name"]) 27 | TableMatcher.element_type = "table" 28 | 29 | 30 | class bold(object): 31 | element_type = "bold" 32 | 33 | 34 | class italic(object): 35 | element_type = "italic" 36 | 37 | 38 | class underline(object): 39 | element_type = "underline" 40 | 41 | 42 | class strikethrough(object): 43 | element_type = "strikethrough" 44 | 45 | 46 | class all_caps(object): 47 | element_type = "all_caps" 48 | 49 | 50 | class small_caps(object): 51 | element_type = "small_caps" 52 | 53 | 54 | def highlight(color=None): 55 | return HighlightMatcher(color=color) 56 | 57 | 58 | HighlightMatcher = collections.namedtuple("HighlightMatcher", ["color"]) 59 | HighlightMatcher.element_type = "highlight" 60 | 61 | class comment_reference(object): 62 | element_type = "comment_reference" 63 | 64 | 65 | BreakMatcher = collections.namedtuple("BreakMatcher", ["break_type"]) 66 | BreakMatcher.element_type = "break" 67 | 68 | 69 | line_break = BreakMatcher("line") 70 | page_break = BreakMatcher("page") 71 | column_break = BreakMatcher("column") 72 | 73 | 74 | def equal_to(value): 75 | return StringMatcher(_operator_equal_to, 
value) 76 | 77 | 78 | def _operator_equal_to(first, second): 79 | return first.upper() == second.upper() 80 | 81 | 82 | def starts_with(value): 83 | return StringMatcher(_operator_starts_with, value) 84 | 85 | def _operator_starts_with(first, second): 86 | return second.upper().startswith(first.upper()) 87 | 88 | 89 | @cobble.data 90 | class StringMatcher(object): 91 | operator = cobble.field() 92 | value = cobble.field() 93 | 94 | def matches(self, other): 95 | return self.operator(self.value, other) 96 | -------------------------------------------------------------------------------- /tests/images_tests.py: -------------------------------------------------------------------------------- 1 | import io 2 | 3 | from precisely import assert_that, has_attrs, is_sequence 4 | 5 | import mammoth 6 | 7 | 8 | def test_inline_is_available_as_alias_of_img_element(): 9 | assert mammoth.images.inline is mammoth.images.img_element 10 | 11 | 12 | def test_data_uri_encodes_images_in_base64(): 13 | image_bytes = b"abc" 14 | image = mammoth.documents.Image( 15 | alt_text=None, 16 | content_type="image/jpeg", 17 | open=lambda: io.BytesIO(image_bytes), 18 | ) 19 | 20 | result = mammoth.images.data_uri(image) 21 | 22 | assert_that(result, is_sequence( 23 | has_attrs(attributes={"src": ""}), 24 | )) 25 | 26 | 27 | class ImgElementTests: 28 | def test_when_element_does_not_have_alt_text_then_alt_attribute_is_not_set(self): 29 | image_bytes = b"abc" 30 | image = mammoth.documents.Image( 31 | alt_text=None, 32 | content_type="image/jpeg", 33 | open=lambda: io.BytesIO(image_bytes), 34 | ) 35 | 36 | @mammoth.images.img_element 37 | def convert_image(image): 38 | return {"src": ""} 39 | 40 | result = convert_image(image) 41 | 42 | assert_that(result, is_sequence( 43 | has_attrs(attributes={"src": ""}), 44 | )) 45 | 46 | def test_when_element_se_alt_text_then_alt_attribute_is_set(self): 47 | image_bytes = b"abc" 48 | image = mammoth.documents.Image( 49 | alt_text="", 50 | 
content_type="image/jpeg", 51 | open=lambda: io.BytesIO(image_bytes), 52 | ) 53 | 54 | @mammoth.images.img_element 55 | def convert_image(image): 56 | return {"src": ""} 57 | 58 | result = convert_image(image) 59 | 60 | assert_that(result, is_sequence( 61 | has_attrs(attributes={"alt": "", "src": ""}), 62 | )) 63 | 64 | def test_image_alt_text_can_be_overridden_by_alt_attribute_returned_from_function(self): 65 | image_bytes = b"abc" 66 | image = mammoth.documents.Image( 67 | alt_text="", 68 | content_type="image/jpeg", 69 | open=lambda: io.BytesIO(image_bytes), 70 | ) 71 | 72 | @mammoth.images.img_element 73 | def convert_image(image): 74 | return {"alt": "", "src": ""} 75 | 76 | result = convert_image(image) 77 | 78 | assert_that(result, is_sequence( 79 | has_attrs(attributes={"alt": "", "src": ""}), 80 | )) 81 | -------------------------------------------------------------------------------- /tests/docx/relationships_xml_tests.py: -------------------------------------------------------------------------------- 1 | from mammoth.docx.xmlparser import element as xml_element 2 | from mammoth.docx.relationships_xml import read_relationships_xml_element 3 | from ..testing import assert_equal 4 | 5 | 6 | def test_relationship_targets_can_be_found_by_id(): 7 | element = xml_element("relationships:Relationships", {}, [ 8 | xml_element("relationships:Relationship", { 9 | "Id": "rId8", 10 | "Type": "http://schemas.openxmlformats.org/officeDocument/2006/relationships/hyperlink", 11 | "Target": "http://example.com", 12 | }), 13 | xml_element("relationships:Relationship", { 14 | "Id": "rId2", 15 | "Type": "http://schemas.openxmlformats.org/officeDocument/2006/relationships/hyperlink", 16 | "Target": "http://example.net", 17 | }), 18 | ]) 19 | relationships = read_relationships_xml_element(element) 20 | assert_equal( 21 | "http://example.com", 22 | relationships.find_target_by_relationship_id("rId8"), 23 | ) 24 | 25 | 26 | def test_relationship_targets_can_be_found_by_type(): 
27 | element = xml_element("relationships:Relationships", {}, [ 28 | xml_element("relationships:Relationship", { 29 | "Id": "rId2", 30 | "Target": "docProps/core.xml", 31 | "Type": "http://schemas.openxmlformats.org/package/2006/relationships/metadata/core-properties", 32 | }), 33 | xml_element("relationships:Relationship", { 34 | "Id": "rId1", 35 | "Target": "word/document.xml", 36 | "Type": "http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument", 37 | }), 38 | xml_element("relationships:Relationship", { 39 | "Id": "rId3", 40 | "Target": "word/document2.xml", 41 | "Type": "http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument", 42 | }), 43 | ]) 44 | relationships = read_relationships_xml_element(element) 45 | assert_equal( 46 | ["word/document.xml", "word/document2.xml"], 47 | relationships.find_targets_by_type("http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument"), 48 | ) 49 | 50 | 51 | def test_when_there_are_no_relationships_of_requested_type_then_empty_list_is_returned(): 52 | element = xml_element("relationships:Relationships", {}, []) 53 | relationships = read_relationships_xml_element(element) 54 | assert_equal( 55 | [], 56 | relationships.find_targets_by_type("http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument"), 57 | ) 58 | -------------------------------------------------------------------------------- /tests/docx/content_types_xml_tests.py: -------------------------------------------------------------------------------- 1 | from mammoth.docx.xmlparser import element as xml_element 2 | from mammoth.docx.content_types_xml import read_content_types_xml_element 3 | from ..testing import assert_equal 4 | 5 | 6 | def test_content_type_is_based_on_default_for_extension_if_there_is_no_override(): 7 | element = xml_element("content-types:Types", {}, [ 8 | xml_element("content-types:Default", { 9 | "Extension": "png", 10 | "ContentType": 
"image/png", 11 | }) 12 | ]) 13 | content_types = read_content_types_xml_element(element) 14 | assert_equal( 15 | "image/png", 16 | content_types.find_content_type("word/media/hat.png"), 17 | ) 18 | 19 | 20 | def test_content_type_is_based_on_override_if_present(): 21 | element = xml_element("content-types:Types", {}, [ 22 | xml_element("content-types:Default", { 23 | "Extension": "png", 24 | "ContentType": "image/png", 25 | }), 26 | xml_element("content-types:Override", { 27 | "PartName": "/word/media/hat.png", 28 | "ContentType": "image/hat" 29 | }), 30 | ]) 31 | content_types = read_content_types_xml_element(element) 32 | assert_equal( 33 | "image/hat", 34 | content_types.find_content_type("word/media/hat.png"), 35 | ) 36 | 37 | 38 | def test_fallback_content_types_have_common_image_types(): 39 | element = xml_element("content-types:Types", {}, []) 40 | content_types = read_content_types_xml_element(element) 41 | assert_equal( 42 | "image/png", 43 | content_types.find_content_type("word/media/hat.png"), 44 | ) 45 | assert_equal( 46 | "image/gif", 47 | content_types.find_content_type("word/media/hat.gif"), 48 | ) 49 | assert_equal( 50 | "image/jpeg", 51 | content_types.find_content_type("word/media/hat.jpg"), 52 | ) 53 | assert_equal( 54 | "image/jpeg", 55 | content_types.find_content_type("word/media/hat.jpeg"), 56 | ) 57 | assert_equal( 58 | "image/bmp", 59 | content_types.find_content_type("word/media/hat.bmp"), 60 | ) 61 | assert_equal( 62 | "image/tiff", 63 | content_types.find_content_type("word/media/hat.tif"), 64 | ) 65 | assert_equal( 66 | "image/tiff", 67 | content_types.find_content_type("word/media/hat.tiff"), 68 | ) 69 | 70 | 71 | def test_fallback_content_types_are_case_insensitive(): 72 | element = xml_element("content-types:Types", {}, []) 73 | content_types = read_content_types_xml_element(element) 74 | assert_equal( 75 | "image/png", 76 | content_types.find_content_type("word/media/hat.PnG"), 77 | ) 78 | 
-------------------------------------------------------------------------------- /tests/styles/parser/html_path_parser_tests.py: -------------------------------------------------------------------------------- 1 | from mammoth import html_paths 2 | from mammoth.styles.parser.html_path_parser import parse_html_path 3 | from mammoth.styles.parser.tokeniser import tokenise 4 | from mammoth.styles.parser.token_iterator import TokenIterator 5 | from ...testing import assert_equal 6 | 7 | 8 | def test_can_read_empty_path(): 9 | assert_equal( 10 | html_paths.empty, 11 | read_html_path("") 12 | ) 13 | 14 | def test_can_read_single_element(): 15 | assert_equal( 16 | html_paths.path([html_paths.element(["p"])]), 17 | read_html_path("p") 18 | ) 19 | 20 | 21 | def test_can_read_choice_of_two_elements(): 22 | assert_equal( 23 | html_paths.path([html_paths.element(["ul", "ol"])]), 24 | read_html_path("ul|ol") 25 | ) 26 | 27 | 28 | def test_can_read_choice_of_three_elements(): 29 | assert_equal( 30 | html_paths.path([html_paths.element(["ul", "ol", "p"])]), 31 | read_html_path("ul|ol|p") 32 | ) 33 | 34 | 35 | def test_can_read_nested_elements(): 36 | assert_equal( 37 | html_paths.path([html_paths.element(["ul"]), html_paths.element(["li"])]), 38 | read_html_path("ul > li") 39 | ) 40 | 41 | 42 | def test_can_read_class_on_element(): 43 | assert_equal( 44 | html_paths.path([html_paths.element(["p"], class_names=["tip"])]), 45 | read_html_path("p.tip") 46 | ) 47 | 48 | 49 | def test_can_read_multiple_classes_on_element(): 50 | assert_equal( 51 | html_paths.path([html_paths.element(["p"], class_names=["tip", "help"])]), 52 | read_html_path("p.tip.help") 53 | ) 54 | 55 | 56 | def test_can_read_attribute_on_element(): 57 | assert_equal( 58 | html_paths.path([html_paths.element(["p"], attributes={"lang": "fr"})]), 59 | read_html_path("p[lang='fr']") 60 | ) 61 | 62 | 63 | def test_can_read_multiple_attributes_on_element(): 64 | assert_equal( 65 | 
html_paths.path([html_paths.element(["p"], attributes={"lang": "fr", "data-x": "y"})]), 66 | read_html_path("p[lang='fr'][data-x='y']") 67 | ) 68 | 69 | 70 | def test_can_read_when_element_must_be_fresh(): 71 | assert_equal( 72 | html_paths.path([html_paths.element(["p"], fresh=True)]), 73 | read_html_path("p:fresh") 74 | ) 75 | 76 | 77 | def test_can_read_separator_for_elements(): 78 | assert_equal( 79 | html_paths.path([html_paths.element(["p"], separator="x")]), 80 | read_html_path("p:separator('x')") 81 | ) 82 | 83 | 84 | def test_can_read_ignore_element(): 85 | assert_equal( 86 | html_paths.ignore, 87 | read_html_path("!") 88 | ) 89 | 90 | def read_html_path(string): 91 | return parse_html_path(TokenIterator(tokenise(string))) 92 | -------------------------------------------------------------------------------- /tests/docx/files_tests.py: -------------------------------------------------------------------------------- 1 | from mammoth.docx.files import ExternalFileAccessIsDisabledError, Files, InvalidFileReferenceError 2 | from ..testing import generate_test_path, assert_equal, assert_raises 3 | 4 | 5 | def test_when_external_file_access_is_disabled_then_opening_file_raises_error(): 6 | files = Files(None, external_file_access=False) 7 | error = assert_raises(ExternalFileAccessIsDisabledError, lambda: files.open("/tmp/image.png")) 8 | expected_message = ( 9 | "could not open external image '/tmp/image.png', external file access is disabled" 10 | ) 11 | assert_equal(expected_message, str(error)) 12 | 13 | 14 | def test_can_open_files_with_file_uri(): 15 | path = generate_test_path("tiny-picture.png") 16 | files = Files(None, external_file_access=True) 17 | with files.open("file:///" + path) as image_file: 18 | contents = image_file.read() 19 | assert_equal(bytes, type(contents)) 20 | with open(path, "rb") as source_file: 21 | assert_equal(source_file.read(), contents) 22 | 23 | 24 | def test_can_open_files_with_relative_uri(): 25 | files = 
Files(generate_test_path(""), external_file_access=True) 26 | with files.open("tiny-picture.png") as image_file: 27 | contents = image_file.read() 28 | assert_equal(bytes, type(contents)) 29 | with open(generate_test_path("tiny-picture.png"), "rb") as source_file: 30 | assert_equal(source_file.read(), contents) 31 | 32 | 33 | def test_given_base_is_not_set_when_opening_relative_uri_then_error_is_raised(): 34 | files = Files(None, external_file_access=True) 35 | error = assert_raises(InvalidFileReferenceError, lambda: files.open("not-a-real-file.png")) 36 | expected_message = ( 37 | "could not find external image 'not-a-real-file.png', fileobj has no name" 38 | ) 39 | assert_equal(expected_message, str(error)) 40 | 41 | 42 | def test_error_is_raised_if_relative_uri_cannot_be_opened(): 43 | files = Files("/tmp", external_file_access=True) 44 | error = assert_raises(InvalidFileReferenceError, lambda: files.open("not-a-real-file.png")) 45 | expected_message = ( 46 | "could not open external image: 'not-a-real-file.png' (document directory: '/tmp')\n" + 47 | "[Errno 2] No such file or directory: '/tmp/not-a-real-file.png'" 48 | ) 49 | assert_equal(expected_message, str(error)) 50 | 51 | 52 | def test_error_is_raised_if_file_uri_cannot_be_opened(): 53 | files = Files("/tmp", external_file_access=True) 54 | error = assert_raises(InvalidFileReferenceError, lambda: files.open("file:///not-a-real-file.png")) 55 | expected_message = "could not open external image: 'file:///not-a-real-file.png' (document directory: '/tmp')\n" 56 | assert str(error).startswith(expected_message) 57 | -------------------------------------------------------------------------------- /mammoth/docx/style_map.py: -------------------------------------------------------------------------------- 1 | from xml.etree import ElementTree 2 | 3 | from ..zips import open_zip, update_zip 4 | 5 | 6 | _style_map_path = "mammoth/style-map" 7 | _style_map_absolute_path = "/" + _style_map_path 8 | _relationships_path 
= "word/_rels/document.xml.rels" 9 | _content_types_path = "[Content_Types].xml" 10 | 11 | 12 | def write_style_map(fileobj, style_map): 13 | with open_zip(fileobj, "r") as zip_file: 14 | relationships_xml = _generate_relationships_xml(zip_file.read_str(_relationships_path)) 15 | content_types_xml = _generate_content_types_xml(zip_file.read_str(_content_types_path)) 16 | 17 | update_zip(fileobj, { 18 | _style_map_path: style_map.encode("utf8"), 19 | _relationships_path: relationships_xml, 20 | _content_types_path: content_types_xml, 21 | }) 22 | 23 | def _generate_relationships_xml(relationships_xml): 24 | schema = "http://schemas.zwobble.org/mammoth/style-map" 25 | relationships_uri = "http://schemas.openxmlformats.org/package/2006/relationships" 26 | relationship_element_name = "{" + relationships_uri + "}Relationship" 27 | 28 | relationships = ElementTree.fromstring(relationships_xml) 29 | _add_or_update_element(relationships, relationship_element_name, "Id", { 30 | "Id": "rMammothStyleMap", 31 | "Type": schema, 32 | "Target": _style_map_absolute_path, 33 | }) 34 | 35 | return ElementTree.tostring(relationships, "UTF-8") 36 | 37 | 38 | def _generate_content_types_xml(content_types_xml): 39 | content_types_uri = "http://schemas.openxmlformats.org/package/2006/content-types" 40 | override_name = "{" + content_types_uri + "}Override" 41 | 42 | types = ElementTree.fromstring(content_types_xml) 43 | _add_or_update_element(types, override_name, "PartName", { 44 | "PartName": _style_map_absolute_path, 45 | "ContentType": "text/prs.mammoth.style-map", 46 | }) 47 | 48 | return ElementTree.tostring(types, "UTF-8") 49 | 50 | 51 | def _add_or_update_element(parent, name, identifying_attribute, attributes): 52 | existing_child = _find_child(parent, name, identifying_attribute, attributes) 53 | if existing_child is None: 54 | ElementTree.SubElement(parent, name, attributes) 55 | else: 56 | existing_child.attrib = attributes 57 | 58 | 59 | def _find_child(parent, name, 
identifying_attribute, attributes): 60 | for element in parent.iter(): 61 | if element.tag == name and element.get(identifying_attribute) == attributes.get(identifying_attribute): 62 | return element 63 | 64 | 65 | def read_style_map(fileobj): 66 | with open_zip(fileobj, "r") as zip_file: 67 | if zip_file.exists(_style_map_path): 68 | return zip_file.read_str(_style_map_path) 69 | 70 | 71 | -------------------------------------------------------------------------------- /tests/docx/xmlparser_tests.py: -------------------------------------------------------------------------------- 1 | import io 2 | 3 | from mammoth.docx.xmlparser import parse_xml, element as xml_element, text as xml_text 4 | from ..testing import assert_equal 5 | 6 | 7 | def test_can_parse_self_closing_element(): 8 | xml = _parse_xml_string(b"") 9 | assert_equal(xml_element("body", {}, []), xml) 10 | 11 | 12 | def test_can_parse_empty_element_with_separate_closing_tag(): 13 | xml = _parse_xml_string(b"") 14 | assert_equal(xml_element("body", {}, []), xml) 15 | 16 | 17 | def test_can_parse_attributes_of_tag(): 18 | xml = _parse_xml_string(b"") 19 | assert_equal(xml_element("body", {"name": "bob"}, []), xml) 20 | 21 | 22 | def test_can_parse_text_element(): 23 | xml = _parse_xml_string(b"Hello!") 24 | assert_equal(xml_element("body", {}, [xml_text("Hello!")]), xml) 25 | 26 | 27 | def test_can_parse_text_element_before_new_tag(): 28 | xml = _parse_xml_string(b"Hello!
") 29 | assert_equal(xml_element("body", {}, [xml_text("Hello!"), xml_element("br", {}, [])]), xml) 30 | 31 | 32 | def test_can_parse_element_with_children(): 33 | xml = _parse_xml_string(b"") 34 | assert_equal([xml_element("a", {}, []), xml_element("b", {}, [])], xml.children) 35 | 36 | 37 | def test_unmapped_namespaces_uris_are_included_in_braces_as_prefix(): 38 | xml = _parse_xml_string(b'') 39 | assert_equal("{word}body", xml.name) 40 | 41 | 42 | def test_mapped_namespaces_uris_are_translated_using_namespace_map(): 43 | xml = _parse_xml_string(b'', [("x", "word")]) 44 | assert_equal("x:body", xml.name) 45 | 46 | 47 | def test_namespace_of_attributes_is_mapped_to_prefix(): 48 | xml = _parse_xml_string(b'', [("x", "word")]) 49 | assert_equal("Hello!", xml.attributes["x:val"]) 50 | 51 | 52 | def test_whitespace_between_xml_declaration_and_root_tag_is_ignored(): 53 | xml = _parse_xml_string(b'\n') 54 | assert_equal("body", xml.name) 55 | 56 | 57 | class FindChildTests(object): 58 | def test_returns_none_if_no_children(self): 59 | xml = xml_element("a") 60 | assert_equal(None, xml.find_child("b")) 61 | 62 | def test_returns_none_if_no_matching_children(self): 63 | xml = xml_element("a", {}, [xml_element("c")]) 64 | assert_equal(None, xml.find_child("b")) 65 | 66 | def test_returns_first_matching_child(self): 67 | xml = xml_element("a", {}, [xml_element("b", {"id": 1}), xml_element("b", {"id": 2})]) 68 | assert_equal(1, xml.find_child("b").attributes["id"]) 69 | 70 | def test_ignores_text_nodes(self): 71 | xml = xml_element("a", {}, [xml_text("Hello!")]) 72 | assert_equal(None, xml.find_child("b")) 73 | 74 | 75 | def _parse_xml_string(string, namespace_mapping=None): 76 | return parse_xml(io.BytesIO(string), namespace_mapping) 77 | -------------------------------------------------------------------------------- /recipes/wmf_images.py: -------------------------------------------------------------------------------- 1 | import io 2 | import os 3 | import shutil 4 | 
import subprocess
import tempfile


# An example of how to use LibreOffice and ImageMagick to convert WMF images to
# PNGs.
#
# libreoffice_wmf_conversion uses LibreOffice to convert the image to a PNG.
# This normally creates an image with a large amount of padding, so
# imagemagick_trim can be used to trim the image.
#
# The image can then be converted using a normal image handler, such as
# mammoth.images.data_uri.
#
# Example usage:
#
#     def convert_image(image):
#         image = libreoffice_wmf_conversion(image, post_process=imagemagick_trim)
#         return mammoth.images.data_uri(image)
#
#     with open("document.docx", "rb") as fileobj:
#         result = mammoth.convert_to_html(fileobj, convert_image=convert_image)


# Content types handled by libreoffice_wmf_conversion, mapped to the file
# extension given to the temporary input file.
_wmf_extensions = {
    "image/x-wmf": ".wmf",
    "image/x-emf": ".emf",
}


def libreoffice_wmf_conversion(image, post_process=None):
    """Convert a WMF/EMF image to a PNG by running LibreOffice.

    ``image`` must expose ``content_type``, ``open()`` and ``copy(...)``.
    Images whose content type is not listed in ``_wmf_extensions`` are
    returned unchanged, and ``post_process`` is not applied to them.
    Otherwise the converted PNG copy of the image is passed through
    ``post_process`` (identity by default) and the result is returned.

    Raises ``subprocess.CalledProcessError`` if LibreOffice exits with a
    non-zero status.
    """
    if post_process is None:
        post_process = lambda x: x

    wmf_extension = _wmf_extensions.get(image.content_type)
    if wmf_extension is None:
        return image
    else:
        temporary_directory = tempfile.mkdtemp()
        try:
            input_path = os.path.join(temporary_directory, "image" + wmf_extension)
            with io.open(input_path, "wb") as input_fileobj:
                with image.open() as image_fileobj:
                    shutil.copyfileobj(image_fileobj, input_fileobj)

            output_path = os.path.join(temporary_directory, "image.png")
            subprocess.check_call([
                "libreoffice",
                "--headless",
                "--convert-to",
                "png",
                input_path,
                "--outdir",
                temporary_directory,
            ])

            # Read the PNG into memory so the temporary directory can be
            # removed before the caller opens the image.
            with io.open(output_path, "rb") as output_fileobj:
                output = output_fileobj.read()

            def open_image():
                return io.BytesIO(output)

            return post_process(image.copy(
                content_type="image/png",
                open=open_image,
            ))
        finally:
            shutil.rmtree(temporary_directory)


def imagemagick_trim(image):
    """Trim the padding around ``image`` using ImageMagick's ``convert``.

    Returns a copy of ``image`` whose ``open`` yields the trimmed bytes.
    Raises ``subprocess.CalledProcessError`` if ``convert`` exits with a
    non-zero status.
    """
    command = ["convert", "-", "-trim", "-"]

    # Read the input before starting the process so that a failure to open
    # the image never leaves a child process behind.
    with image.open() as image_fileobj:
        image_bytes = image_fileobj.read()

    process = subprocess.Popen(command, stdin=subprocess.PIPE, stdout=subprocess.PIPE)
    try:
        # Let communicate() feed stdin and drain stdout together, instead of
        # writing to the pipe by hand before reading any output.
        output, _ = process.communicate(image_bytes)
    except BaseException:
        # Deliberately broad: the child must be reaped even on
        # KeyboardInterrupt, and the exception is always re-raised.
        process.kill()
        process.wait()
        raise

    return_code = process.returncode
    if return_code:
        raise subprocess.CalledProcessError(return_code, command)

    def open_image():
        return io.BytesIO(output)

    return image.copy(open=open_image)

Walking on imported air

", result.output) 18 | 19 | 20 | def test_html_is_written_to_file_if_output_file_is_set(): 21 | with tempman.create_temp_dir() as temp_dir: 22 | output_path = os.path.join(temp_dir.path, "output.html") 23 | docx_path = generate_test_path("single-paragraph.docx") 24 | result = _local.run(["mammoth", docx_path, output_path]) 25 | assert_equal(b"", result.stderr_output) 26 | assert_equal(b"", result.output) 27 | with open(output_path) as output_file: 28 | assert_equal("

Walking on imported air

", output_file.read()) 29 | 30 | 31 | _image_base_64 = b"iVBORw0KGgoAAAANSUhEUgAAAAoAAAAKCAIAAAACUFjqAAAAAXNSR0IArs4c6QAAAAlwSFlzAAAOvgAADr4B6kKxwAAAABNJREFUKFNj/M+ADzDhlWUYqdIAQSwBE8U+X40AAAAASUVORK5CYII=" 32 | 33 | 34 | def test_inline_images_are_included_in_output_if_writing_to_single_file(): 35 | docx_path = generate_test_path("tiny-picture.docx") 36 | result = _local.run(["mammoth", docx_path]) 37 | assert_equal(b"""

""", result.output) 38 | 39 | 40 | def test_images_are_written_to_separate_files_if_output_dir_is_set(): 41 | with tempman.create_temp_dir() as temp_dir: 42 | output_path = os.path.join(temp_dir.path, "tiny-picture.html") 43 | image_path = os.path.join(temp_dir.path, "1.png") 44 | 45 | docx_path = generate_test_path("tiny-picture.docx") 46 | result = _local.run(["mammoth", docx_path, "--output-dir", temp_dir.path]) 47 | assert_equal(b"", result.stderr_output) 48 | assert_equal(b"", result.output) 49 | with open(output_path) as output_file: 50 | assert_equal("""

""", output_file.read()) 51 | 52 | with open(image_path, "rb") as image_file: 53 | assert_equal(_image_base_64, base64.b64encode(image_file.read())) 54 | 55 | 56 | def test_style_map_is_used_if_set(): 57 | with tempman.create_temp_dir() as temp_dir: 58 | docx_path = generate_test_path("single-paragraph.docx") 59 | style_map_path = os.path.join(temp_dir.path, "style-map") 60 | with open(style_map_path, "w") as style_map_file: 61 | style_map_file.write("p => span:fresh") 62 | result = _local.run(["mammoth", docx_path, "--style-map", style_map_path]) 63 | assert_equal(b"", result.stderr_output) 64 | assert_equal(b"Walking on imported air", result.output) 65 | 66 | 67 | def test_output_format_markdown_option_generates_markdown_output(): 68 | docx_path = generate_test_path("single-paragraph.docx") 69 | result = _local.run(["mammoth", docx_path, "--output-format=markdown"]) 70 | assert_equal(b"", result.stderr_output) 71 | assert_equal(b"Walking on imported air\n\n", result.output) 72 | -------------------------------------------------------------------------------- /tests/transforms_tests.py: -------------------------------------------------------------------------------- 1 | import cobble 2 | 3 | from mammoth import documents, transforms 4 | from mammoth.transforms import get_descendants, get_descendants_of_type, _each_element 5 | from .testing import assert_equal 6 | 7 | 8 | class ParagraphTests(object): 9 | def test_paragraph_is_transformed(self): 10 | paragraph = documents.paragraph(children=[]) 11 | result = transforms.paragraph(lambda _: documents.tab())(paragraph) 12 | assert_equal(documents.tab(), result) 13 | 14 | def test_non_paragraph_elements_are_not_transformed(self): 15 | run = documents.run(children=[]) 16 | result = transforms.paragraph(lambda _: documents.tab())(run) 17 | assert_equal(documents.run(children=[]), result) 18 | 19 | 20 | class RunTests(object): 21 | def test_run_is_transformed(self): 22 | run = documents.run(children=[]) 23 | result = 
transforms.run(lambda _: documents.tab())(run) 24 | assert_equal(documents.tab(), result) 25 | 26 | def test_non_paragraph_elements_are_not_transformed(self): 27 | paragraph = documents.paragraph(children=[]) 28 | result = transforms.run(lambda _: documents.tab())(paragraph) 29 | assert_equal(documents.paragraph(children=[]), result) 30 | 31 | 32 | class EachElementTests(object): 33 | def test_all_descendants_are_transformed(self): 34 | @cobble.data 35 | class Count(documents.HasChildren): 36 | count = cobble.field() 37 | 38 | root = Count(count=None, children=[ 39 | Count(count=None, children=[ 40 | Count(count=None, children=[]), 41 | ]), 42 | ]) 43 | 44 | current_count = [0] 45 | def set_count(node): 46 | current_count[0] += 1 47 | return node.copy(count=current_count[0]) 48 | 49 | result = _each_element(set_count)(root) 50 | 51 | assert_equal(Count(count=3, children=[ 52 | Count(count=2, children=[ 53 | Count(count=1, children=[]), 54 | ]), 55 | ]), result) 56 | 57 | 58 | class GetDescendantsTests(object): 59 | def test_returns_nothing_if_element_type_has_no_children(self): 60 | assert_equal([], get_descendants(documents.tab())) 61 | 62 | def test_returns_nothing_if_element_has_empty_children(self): 63 | assert_equal([], get_descendants(documents.paragraph(children=[]))) 64 | 65 | def test_includes_children(self): 66 | children = [documents.text("child 1"), documents.text("child 2")] 67 | element = documents.paragraph(children=children) 68 | assert_equal(children, get_descendants(element)) 69 | 70 | def test_includes_indirect_descendants(self): 71 | grandchild = documents.text("grandchild") 72 | child = documents.run(children=[grandchild]) 73 | element = documents.paragraph(children=[child]) 74 | assert_equal([grandchild, child], get_descendants(element)) 75 | 76 | 77 | class GetDescendantsOfTypeTests(object): 78 | def test_filters_descendants_to_type(self): 79 | tab = documents.tab() 80 | run = documents.run(children=[]) 81 | element = 
documents.paragraph(children=[tab, run]) 82 | assert_equal([run], get_descendants_of_type(element, documents.Run)) 83 | -------------------------------------------------------------------------------- /tests/styles/parser/tokeniser_tests.py: -------------------------------------------------------------------------------- 1 | from precisely import assert_that, has_attrs, is_sequence 2 | 3 | from mammoth.styles.parser.tokeniser import tokenise 4 | 5 | 6 | def test_unknown_tokens_are_tokenised(): 7 | assert_tokens("~", is_token("unknown", "~")) 8 | 9 | 10 | def test_empty_string_is_tokenised_to_end_of_file_token(): 11 | assert_tokens("") 12 | 13 | 14 | def test_whitespace_is_tokenised(): 15 | assert_tokens(" \t\t ", is_token("whitespace", " \t\t ")) 16 | 17 | 18 | def test_identifiers_are_tokenised(): 19 | assert_tokens("Overture", is_token("identifier", "Overture")) 20 | 21 | 22 | def test_escape_sequences_in_identifiers_are_tokenised(): 23 | assert_tokens(r"\:", is_token("identifier", r"\:")) 24 | 25 | 26 | def test_integers_are_tokenised(): 27 | assert_tokens("123", is_token("integer", "123")) 28 | 29 | 30 | def test_strings_are_tokenised(): 31 | assert_tokens("'Tristan'", is_token("string", "'Tristan'")) 32 | 33 | 34 | def test_escape_sequences_in_strings_are_tokenised(): 35 | assert_tokens(r"'Tristan\''", is_token("string", r"'Tristan\''")) 36 | 37 | 38 | def test_unterminated_strings_are_tokenised(): 39 | assert_tokens("'Tristan", is_token("unterminated string", "'Tristan")) 40 | 41 | 42 | def test_arrows_are_tokenised(): 43 | assert_tokens("=>=>", is_token("symbol", "=>"), is_token("symbol", "=>")) 44 | 45 | 46 | def test_dots_are_tokenised(): 47 | assert_tokens(".", is_token("symbol", ".")) 48 | 49 | 50 | def test_colons_are_tokenised(): 51 | assert_tokens("::", is_token("symbol", ":"), is_token("symbol", ":")) 52 | 53 | 54 | def test_greater_thans_are_tokenised(): 55 | assert_tokens(">>", is_token("symbol", ">"), is_token("symbol", ">")) 56 | 57 | 58 | def 
def assert_tokens(string, *expected):
    """Assert that tokenising *string* yields *expected* followed by the end token.

    Every tokenised string finishes with an "end" token whose value is empty,
    so callers only list the matchers for the tokens that precede it.
    """
    expected_tokens = list(expected) + [is_token("end", "")]
    assert_that(tokenise(string), is_sequence(*expected_tokens))
class ImageWriter(object):
    """Callable that saves each image it is given as a file in one directory.

    Files are named ``<n>.<subtype>``, where ``n`` counts up from 1 and
    ``subtype`` is the part of the image's content type after the ``/``.
    """

    def __init__(self, output_dir):
        self._output_dir = output_dir
        self._image_number = 1

    def __call__(self, element):
        """Write the image to disk and return the attributes that refer to it.

        The returned dictionary has a single ``src`` entry holding the bare
        file name (not the full path) of the file that was written.
        """
        _, _, subtype = element.content_type.partition("/")
        filename = "{0}.{1}".format(self._image_number, subtype)
        destination_path = os.path.join(self._output_dir, filename)

        with open(destination_path, "wb") as destination:
            with element.open() as source:
                shutil.copyfileobj(source, destination)

        # Only advance the counter once the file has been written in full.
        self._image_number += 1

        return {"src": filename}
.docx file to convert.") 79 | 80 | output_group = parser.add_mutually_exclusive_group() 81 | output_group.add_argument( 82 | "output", 83 | nargs="?", 84 | metavar="output-path", 85 | help="Output path for the generated document. Images will be stored inline in the output document. Output is written to stdout if not set.") 86 | output_group.add_argument( 87 | "--output-dir", 88 | help="Output directory for generated HTML and images. Images will be stored in separate files. Mutually exclusive with output-path.") 89 | 90 | parser.add_argument( 91 | "--output-format", 92 | required=False, 93 | choices=writers.formats(), 94 | help="Output format.") 95 | parser.add_argument( 96 | "--style-map", 97 | required=False, 98 | help="File containg a style map.") 99 | return parser.parse_args() 100 | 101 | 102 | if __name__ == "__main__": 103 | main() 104 | 105 | -------------------------------------------------------------------------------- /mammoth/options.py: -------------------------------------------------------------------------------- 1 | from .styles.parser import read_style_mapping 2 | from . 
import lists, results 3 | 4 | 5 | def read_options(options): 6 | custom_style_map_text = options.pop("style_map", "") or "" 7 | embedded_style_map_text = options.pop("embedded_style_map", "") or "" 8 | include_default_style_map = options.pop("include_default_style_map", True) 9 | 10 | read_style_map_result = results.combine([ 11 | _read_style_map(custom_style_map_text), 12 | _read_style_map(embedded_style_map_text), 13 | ]) 14 | 15 | custom_style_map, embedded_style_map = read_style_map_result.value 16 | style_map = custom_style_map + embedded_style_map 17 | 18 | if include_default_style_map: 19 | style_map += _default_style_map 20 | 21 | options["ignore_empty_paragraphs"] = options.get("ignore_empty_paragraphs", True) 22 | options["style_map"] = style_map 23 | return read_style_map_result.map(lambda _: options) 24 | 25 | 26 | def _read_style_map(style_text): 27 | lines = filter(None, map(_get_line, style_text.split("\n"))) 28 | return results.combine(lists.map(read_style_mapping, lines)) \ 29 | .map(lambda style_mappings: lists.filter(None, style_mappings)) 30 | 31 | 32 | def _get_line(line): 33 | line = line.strip() 34 | if line.startswith("#"): 35 | return None 36 | else: 37 | return line 38 | 39 | 40 | _default_style_map_result = _read_style_map(""" 41 | p.Heading1 => h1:fresh 42 | p.Heading2 => h2:fresh 43 | p.Heading3 => h3:fresh 44 | p.Heading4 => h4:fresh 45 | p.Heading5 => h5:fresh 46 | p.Heading6 => h6:fresh 47 | p[style-name='Heading 1'] => h1:fresh 48 | p[style-name='Heading 2'] => h2:fresh 49 | p[style-name='Heading 3'] => h3:fresh 50 | p[style-name='Heading 4'] => h4:fresh 51 | p[style-name='Heading 5'] => h5:fresh 52 | p[style-name='Heading 6'] => h6:fresh 53 | p[style-name='heading 1'] => h1:fresh 54 | p[style-name='heading 2'] => h2:fresh 55 | p[style-name='heading 3'] => h3:fresh 56 | p[style-name='heading 4'] => h4:fresh 57 | p[style-name='heading 5'] => h5:fresh 58 | p[style-name='heading 6'] => h6:fresh 59 | 60 | # Apple Pages 61 | p.Heading => 
h1:fresh 62 | p[style-name='Heading'] => h1:fresh 63 | 64 | r[style-name='Strong'] => strong 65 | 66 | p[style-name='footnote text'] => p:fresh 67 | r[style-name='footnote reference'] => 68 | p[style-name='endnote text'] => p:fresh 69 | r[style-name='endnote reference'] => 70 | p[style-name='annotation text'] => p:fresh 71 | r[style-name='annotation reference'] => 72 | 73 | # LibreOffice 74 | p[style-name='Footnote'] => p:fresh 75 | r[style-name='Footnote anchor'] => 76 | p[style-name='Endnote'] => p:fresh 77 | r[style-name='Endnote anchor'] => 78 | 79 | p:unordered-list(1) => ul > li:fresh 80 | p:unordered-list(2) => ul|ol > li > ul > li:fresh 81 | p:unordered-list(3) => ul|ol > li > ul|ol > li > ul > li:fresh 82 | p:unordered-list(4) => ul|ol > li > ul|ol > li > ul|ol > li > ul > li:fresh 83 | p:unordered-list(5) => ul|ol > li > ul|ol > li > ul|ol > li > ul|ol > li > ul > li:fresh 84 | p:ordered-list(1) => ol > li:fresh 85 | p:ordered-list(2) => ul|ol > li > ol > li:fresh 86 | p:ordered-list(3) => ul|ol > li > ul|ol > li > ol > li:fresh 87 | p:ordered-list(4) => ul|ol > li > ul|ol > li > ul|ol > li > ol > li:fresh 88 | p:ordered-list(5) => ul|ol > li > ul|ol > li > ul|ol > li > ul|ol > li > ol > li:fresh 89 | 90 | r[style-name='Hyperlink'] => 91 | 92 | p[style-name='Normal'] => p:fresh 93 | 94 | # Apple Pages 95 | p.Body => p:fresh 96 | p[style-name='Body'] => p:fresh 97 | """) 98 | 99 | 100 | assert not _default_style_map_result.messages 101 | _default_style_map = _default_style_map_result.value 102 | -------------------------------------------------------------------------------- /mammoth/docx/xmlparser.py: -------------------------------------------------------------------------------- 1 | import xml.dom.minidom 2 | 3 | import cobble 4 | 5 | 6 | @cobble.data 7 | class XmlElement(object): 8 | name = cobble.field() 9 | attributes = cobble.field() 10 | children = cobble.field() 11 | 12 | def find_child_or_null(self, name): 13 | return self.find_child(name) or 
class XmlElementList(object):
    """A re-iterable collection of XML elements.

    Supports iteration and ``find_children``, which gathers the matching
    children of every element in the list into a new list.
    """

    def __init__(self, elements):
        # Callers may hand over a lazy iterable (for instance the result of
        # ``filter`` on Python 3), which can be consumed only once. Copy it
        # into a list so that iterating this object repeatedly, or iterating
        # and then calling find_children, always sees every element.
        self._elements = list(elements)

    def __iter__(self):
        return iter(self._elements)

    def find_children(self, name):
        """Return the children called *name* of all elements, in order."""
        return XmlElementList([
            child
            for element in self._elements
            for child in element.find_children(name)
        ])
96 | 97 | converted_attributes = dict( 98 | (convert_name(attribute), attribute.value) 99 | for attribute in element.attributes.values() 100 | if attribute.namespaceURI != "http://www.w3.org/2000/xmlns/" 101 | ) 102 | 103 | converted_children = [] 104 | for child_node in element.childNodes: 105 | converted_child_node = convert_node(child_node) 106 | if converted_child_node is not None: 107 | converted_children.append(converted_child_node) 108 | 109 | return XmlElement(converted_name, converted_attributes, converted_children) 110 | 111 | def convert_name(node): 112 | if node.namespaceURI is None: 113 | return node.localName 114 | else: 115 | prefix = namespace_prefixes.get(node.namespaceURI) 116 | if prefix is None: 117 | return "{%s}%s" % (node.namespaceURI, node.localName) 118 | else: 119 | return "%s:%s" % (prefix, node.localName) 120 | 121 | return convert_node(document.documentElement) 122 | -------------------------------------------------------------------------------- /mammoth/styles/parser/html_path_parser.py: -------------------------------------------------------------------------------- 1 | import cobble 2 | 3 | from ... 
def _parse_element(tokens):
    """Parse one element of an HTML path from *tokens*.

    The pieces are read in a fixed order: tag names, then attributes and
    class names, then the optional fresh marker, then the optional separator.
    """
    tag_names = _parse_tag_names(tokens)
    parsed_attributes = _parse_attribute_or_class_names(tokens)
    is_fresh = _parse_is_fresh(tokens)
    separator = _parse_separator(tokens)

    attributes = {}
    for entry in parsed_attributes:
        existing_value = attributes.get(entry.name)
        if entry.append and existing_value:
            # Appending entries accumulate as a space-separated list.
            attributes[entry.name] = existing_value + " " + entry.value
        else:
            # Anything else replaces whatever value was there before.
            attributes[entry.name] = entry.value

    return html_paths.element(
        tag_names,
        attributes=attributes,
        fresh=is_fresh,
        separator=separator,
    )
def _parse_separator(tokens):
    """Parse an optional ``:separator(<string>)`` suffix.

    Returns the parsed string when the suffix is present, otherwise None
    (in which case only the failed look-ahead has been attempted).
    """
    separator_keyword = (
        (TokenType.SYMBOL, ":"),
        (TokenType.IDENTIFIER, "separator"),
    )
    if not tokens.try_skip_many(separator_keyword):
        return None

    tokens.skip(TokenType.SYMBOL, "(")
    separator = parse_string(tokens)
    tokens.skip(TokenType.SYMBOL, ")")
    return separator
class StripEmpty(NodeVisitor):
    """Visitor that removes nodes with no content.

    Every visit method returns a list: empty when the node is dropped,
    otherwise holding the single node that replaces it.
    """

    def visit_text_node(self, node):
        # Text nodes survive only if they have a non-empty value.
        return [node] if node.value else []

    def visit_element(self, element):
        remaining_children = strip_empty(element.children)
        # An element is kept if it still has children after stripping, or if
        # it is a void element, which never has children in the first place.
        if remaining_children or element.is_void():
            return [Element(element.tag, remaining_children)]
        return []

    def visit_force_write(self, node):
        # Force-write markers are always kept.
        return [node]
class Styles(object):
    """Style lookup tables, one per style type, each keyed by style ID."""

    @staticmethod
    def create(paragraph_styles=None, character_styles=None, table_styles=None, numbering_styles=None):
        """Build a Styles, using an empty table for every argument left as None."""
        def or_empty(styles_by_id):
            return {} if styles_by_id is None else styles_by_id

        return Styles(
            paragraph_styles=or_empty(paragraph_styles),
            character_styles=or_empty(character_styles),
            table_styles=or_empty(table_styles),
            numbering_styles=or_empty(numbering_styles),
        )

    def __init__(self, paragraph_styles, character_styles, table_styles, numbering_styles):
        self._paragraph_styles = paragraph_styles
        self._character_styles = character_styles
        self._table_styles = table_styles
        self._numbering_styles = numbering_styles

    # Each finder returns the style stored under style_id, or None if absent.

    def find_paragraph_style_by_id(self, style_id):
        return self._paragraph_styles.get(style_id)

    def find_character_style_by_id(self, style_id):
        return self._character_styles.get(style_id)

    def find_table_style_by_id(self, style_id):
        return self._table_styles.get(style_id)

    def find_numbering_style_by_id(self, style_id):
        return self._numbering_styles.get(style_id)
def _read_numbering_style_element(element):
    """Read a numbering style definition into a NumberingStyle.

    The numbering ID is the ``w:val`` attribute found at
    ``w:pPr/w:numPr/w:numId``; it is None when any step of that path, or the
    attribute itself, is missing.
    """
    style_id = _read_style_id(element)

    paragraph_properties = element.find_child_or_null("w:pPr")
    numbering_properties = paragraph_properties.find_child_or_null("w:numPr")
    num_id_element = numbering_properties.find_child_or_null("w:numId")

    return NumberingStyle(
        style_id=style_id,
        num_id=num_id_element.attributes.get("w:val"),
    )
html.collapsible_element("p", {}, [html.text("One")]), 22 | html.collapsible_element("p", {}, [html.text("Two")]) 23 | ])) 24 | 25 | 26 | def test_elements_with_different_tag_names_are_not_collapsed(): 27 | assert_equal( 28 | [ 29 | html.collapsible_element("p", {}, [html.text("One")]), 30 | html.collapsible_element("div", {}, [html.text("Two")]) 31 | ], 32 | 33 | html.collapse([ 34 | html.collapsible_element("p", {}, [html.text("One")]), 35 | html.collapsible_element("div", {}, [html.text("Two")]) 36 | ])) 37 | 38 | 39 | def test_elements_with_different_attributes_are_not_collapsed(): 40 | assert_equal( 41 | [ 42 | html.collapsible_element("p", {"id": "a"}, [html.text("One")]), 43 | html.collapsible_element("p", {}, [html.text("Two")]) 44 | ], 45 | 46 | html.collapse([ 47 | html.collapsible_element("p", {"id": "a"}, [html.text("One")]), 48 | html.collapsible_element("p", {}, [html.text("Two")]) 49 | ])) 50 | 51 | 52 | def test_children_of_collapsed_element_can_collapse_with_children_of_previous_element(): 53 | assert_equal( 54 | [ 55 | html.collapsible_element("blockquote", {}, [ 56 | html.collapsible_element("p", {}, [ 57 | html.text("One"), 58 | html.text("Two") 59 | ]) 60 | ]), 61 | ], 62 | 63 | html.collapse([ 64 | html.collapsible_element("blockquote", {}, [ 65 | html.collapsible_element("p", {}, [html.text("One")]) 66 | ]), 67 | html.collapsible_element("blockquote", {}, [ 68 | html.collapsible_element("p", {}, [html.text("Two")]) 69 | ]), 70 | ])) 71 | 72 | 73 | def test_collapsible_element_can_collapse_into_previous_fresh_element(): 74 | assert_equal( 75 | [html.element("p", {}, [html.text("One"), html.text("Two")])], 76 | html.collapse([ 77 | html.element("p", {}, [html.text("One")]), 78 | html.collapsible_element("p", {}, [html.text("Two")]) 79 | ])) 80 | 81 | 82 | def test_element_with_choice_of_tag_names_can_collapse_into_previous_element_if_it_has_one_of_those_tag_names_as_its_main_tag_name(): 83 | assert_equal( 84 | 
def _read_abstract_nums(element):
    """Read every ``w:abstractNum`` child of *element* into a dictionary.

    Each child is converted by ``_read_abstract_num`` into a key/value pair;
    the pairs become the entries of the returned dictionary.
    """
    return dict(
        _read_abstract_num(abstract_num_element)
        for abstract_num_element in element.find_children("w:abstractNum")
    )
_AbstractNum(object): 27 | levels = cobble.field() 28 | num_style_link = cobble.field() 29 | 30 | 31 | @cobble.data 32 | class _AbstractNumLevel(object): 33 | level_index = cobble.field() 34 | is_ordered = cobble.field() 35 | paragraph_style_id = cobble.field() 36 | 37 | 38 | def _read_abstract_num_levels(element): 39 | levels = {} 40 | 41 | # Some malformed documents define numbering levels without an index, and 42 | # reference the numbering using a w:numPr element without a w:ilvl child. 43 | # To handle such cases, we assume a level of 0 as a fallback. 44 | level_without_index = None 45 | 46 | for level_element in element.find_children("w:lvl"): 47 | level = _read_abstract_num_level(level_element) 48 | if level.level_index is None: 49 | level.level_index = "0" 50 | level_without_index = level 51 | else: 52 | levels[level.level_index] = level 53 | 54 | if level_without_index is not None and level_without_index.level_index not in levels: 55 | levels[level_without_index.level_index] = level_without_index 56 | 57 | return levels 58 | 59 | 60 | def _read_abstract_num_level(element): 61 | level_index = element.attributes.get("w:ilvl") 62 | num_fmt = element.find_child_or_null("w:numFmt").attributes.get("w:val") 63 | is_ordered = num_fmt != "bullet" 64 | paragraph_style_id = element.find_child_or_null("w:pStyle").attributes.get("w:val") 65 | return _AbstractNumLevel( 66 | level_index=level_index, 67 | is_ordered=is_ordered, 68 | paragraph_style_id=paragraph_style_id, 69 | ) 70 | 71 | 72 | def _read_nums(element): 73 | num_elements = element.find_children("w:num") 74 | return dict( 75 | _read_num(num_element) 76 | for num_element in num_elements 77 | ) 78 | 79 | 80 | def _read_num(element): 81 | num_id = element.attributes.get("w:numId") 82 | abstract_num_id = element.find_child_or_null("w:abstractNumId").attributes["w:val"] 83 | return num_id, _Num(abstract_num_id=abstract_num_id) 84 | 85 | 86 | @cobble.data 87 | class _Num(object): 88 | abstract_num_id = 
class Numbering(object):
    """Lookup of list levels by numbering ID or by paragraph style ID.

    ``abstract_nums`` maps abstract numbering IDs to objects exposing
    ``levels`` (a dict of levels keyed by level index) and
    ``num_style_link``. ``nums`` maps numbering IDs to objects exposing
    ``abstract_num_id``. ``styles`` is queried through
    ``find_numbering_style_by_id`` to follow numbering style links.
    """

    def __init__(self, abstract_nums, nums, styles):
        self._abstract_nums = abstract_nums
        # Index every level that names a paragraph style so that it can be
        # found from the style ID alone.
        self._levels_by_paragraph_style_id = dict(
            (level.paragraph_style_id, self._to_numbering_level(level))
            for abstract_num in abstract_nums.values()
            for level in abstract_num.levels.values()
            if level.paragraph_style_id is not None
        )
        self._nums = nums
        self._styles = styles

    def find_level(self, num_id, level):
        """Return the numbering level for ``num_id`` at index ``level``.

        When the abstract numbering definition links to a numbering style,
        the lookup continues with that style's numbering ID.

        Returns ``None`` when the numbering, its abstract definition, the
        linked style or the level cannot be found, and also when the style
        links lead back to a numbering ID that was already visited.
        """
        visited_num_ids = set()

        # Follow style links iteratively; the visited set stops a cycle of
        # links from looping forever.
        while num_id not in visited_num_ids:
            visited_num_ids.add(num_id)

            num = self._nums.get(num_id)
            if num is None:
                return None

            abstract_num = self._abstract_nums.get(num.abstract_num_id)
            if abstract_num is None:
                return None

            if abstract_num.num_style_link is None:
                return self._to_numbering_level(abstract_num.levels.get(level))

            style = self._styles.find_numbering_style_by_id(abstract_num.num_style_link)
            if style is None:
                # The link names a style that does not exist.
                return None
            num_id = style.num_id

        return None

    def find_level_by_paragraph_style_id(self, style_id):
        """Return the level bound to paragraph style ``style_id``, or ``None``."""
        return self._levels_by_paragraph_style_id.get(style_id)

    def _to_numbering_level(self, abstract_num_level):
        """Convert an abstract level to a ``numbering_level``; ``None`` passes through."""
        if abstract_num_level is None:
            return None
        else:
            return numbering_level(
                level_index=abstract_num_level.level_index,
                is_ordered=abstract_num_level.is_ordered,
            )
def parse_document_matcher(tokens):
    """Parse the document matcher at the current position of ``tokens``.

    The leading identifier selects the kind of matcher. Paragraphs, runs
    and tables may be followed by a class name and a ``[style-name...]``
    clause; paragraphs may additionally carry a list clause.

    Raises ``LineParseError`` when the identifier is not a recognised
    document element.
    """
    def parse_paragraph(tokens):
        return document_matchers.paragraph(
            style_id=try_parse_class_name(tokens),
            style_name=_parse_style_name(tokens),
            numbering=_parse_numbering(tokens),
        )

    def parse_run(tokens):
        return document_matchers.run(
            style_id=try_parse_class_name(tokens),
            style_name=_parse_style_name(tokens),
        )

    def parse_table(tokens):
        return document_matchers.table(
            style_id=try_parse_class_name(tokens),
            style_name=_parse_style_name(tokens),
        )

    # Tried in this order; the first identifier that can be skipped wins.
    element_parsers = (
        ("p", parse_paragraph),
        ("r", parse_run),
        ("table", parse_table),
        ("b", lambda _: document_matchers.bold),
        ("i", lambda _: document_matchers.italic),
        ("u", lambda _: document_matchers.underline),
        ("strike", lambda _: document_matchers.strikethrough),
        ("all-caps", lambda _: document_matchers.all_caps),
        ("small-caps", lambda _: document_matchers.small_caps),
        ("highlight", _parse_highlight),
        ("comment-reference", lambda _: document_matchers.comment_reference),
        ("br", _parse_break),
    )

    for identifier, parse_element in element_parsers:
        if tokens.try_skip(TokenType.IDENTIFIER, identifier):
            return parse_element(tokens)

    raise LineParseError("Unrecognised document element: {0}".format(tokens.next_value(TokenType.IDENTIFIER)))
tokens.skip(TokenType.SYMBOL, "]"); 122 | 123 | if type_name == "line": 124 | return document_matchers.line_break 125 | elif type_name == "page": 126 | return document_matchers.page_break 127 | elif type_name == "column": 128 | return document_matchers.column_break 129 | else: 130 | raise LineParseError("Unrecognised break type: {0}".format(type_name)) 131 | -------------------------------------------------------------------------------- /tests/docx/style_map_tests.py: -------------------------------------------------------------------------------- 1 | import io 2 | from zipfile import ZipFile 3 | 4 | from mammoth.docx.style_map import write_style_map, read_style_map 5 | from mammoth.zips import open_zip 6 | from mammoth.docx import xmlparser as xml 7 | from ..testing import assert_equal 8 | 9 | 10 | def test_reading_embedded_style_map_on_document_without_embedded_style_map_returns_none(): 11 | fileobj = _normal_docx() 12 | assert_equal(None, read_style_map(fileobj)) 13 | 14 | 15 | def test_writing_style_map_preserves_unrelated_files(): 16 | fileobj = _normal_docx() 17 | write_style_map(fileobj, "p => h1") 18 | with open_zip(fileobj, "r") as zip_file: 19 | assert_equal("placeholder", zip_file.read_str("placeholder")) 20 | 21 | def test_embedded_style_map_can_be_read_after_being_written(): 22 | fileobj = _normal_docx() 23 | write_style_map(fileobj, "p => h1") 24 | assert_equal("p => h1", read_style_map(fileobj)) 25 | 26 | 27 | def test_embedded_style_map_is_written_to_separate_file(): 28 | fileobj = _normal_docx() 29 | write_style_map(fileobj, "p => h1") 30 | with open_zip(fileobj, "r") as zip_file: 31 | assert_equal("p => h1", zip_file.read_str("mammoth/style-map")) 32 | 33 | 34 | def test_embedded_style_map_is_referenced_in_relationships(): 35 | fileobj = _normal_docx() 36 | write_style_map(fileobj, "p => h1") 37 | assert_equal(expected_relationships_xml, _read_relationships_xml(fileobj)) 38 | 39 | def 
test_embedded_style_map_has_override_content_type_in_content_types_xml(): 40 | fileobj = _normal_docx() 41 | write_style_map(fileobj, "p => h1") 42 | assert_equal(expected_content_types_xml, _read_content_types_xml(fileobj)) 43 | 44 | 45 | def test_can_overwrite_existing_style_map(): 46 | fileobj = _normal_docx() 47 | write_style_map(fileobj, "p => h1") 48 | write_style_map(fileobj, "p => h2") 49 | with open_zip(fileobj, "r") as zip_file: 50 | assert_equal("p => h2", read_style_map(fileobj)) 51 | _assert_no_duplicates(zip_file._zip_file.namelist()) 52 | assert_equal(expected_relationships_xml, _read_relationships_xml(fileobj)) 53 | assert_equal(expected_content_types_xml, _read_content_types_xml(fileobj)) 54 | 55 | 56 | def _read_relationships_xml(fileobj): 57 | with open_zip(fileobj, "r") as zip_file: 58 | return xml.parse_xml( 59 | io.StringIO(zip_file.read_str("word/_rels/document.xml.rels")), 60 | [("r", "http://schemas.openxmlformats.org/package/2006/relationships")], 61 | ) 62 | 63 | 64 | def _read_content_types_xml(fileobj): 65 | with open_zip(fileobj, "r") as zip_file: 66 | return xml.parse_xml( 67 | io.StringIO(zip_file.read_str("[Content_Types].xml")), 68 | [("ct", "http://schemas.openxmlformats.org/package/2006/content-types")], 69 | ) 70 | 71 | 72 | original_relationships_xml = ('' + 73 | '' + 74 | '' + 75 | '') 76 | 77 | expected_relationships_xml = xml.element("r:Relationships", {}, [ 78 | xml.element("r:Relationship", {"Id": "rId3", "Type": "http://schemas.openxmlformats.org/officeDocument/2006/relationships/settings", "Target": "settings.xml"}), 79 | xml.element("r:Relationship", {"Id": "rMammothStyleMap", "Type": "http://schemas.zwobble.org/mammoth/style-map", "Target": "/mammoth/style-map"}), 80 | ]) 81 | 82 | original_content_types_xml = ('' + 83 | '' + 84 | '' + 85 | '' 86 | ) 87 | 88 | expected_content_types_xml = xml.element("ct:Types", {}, [ 89 | xml.element("ct:Default", {"Extension": "png", "ContentType": "image/png"}), 90 | 
xml.element("ct:Override", {"PartName": "/mammoth/style-map", "ContentType": "text/prs.mammoth.style-map"}), 91 | ]) 92 | 93 | 94 | def _normal_docx(): 95 | fileobj = io.BytesIO() 96 | zip_file = ZipFile(fileobj, "w") 97 | try: 98 | zip_file.writestr("placeholder", "placeholder") 99 | zip_file.writestr("word/_rels/document.xml.rels", original_relationships_xml) 100 | zip_file.writestr("[Content_Types].xml", original_content_types_xml) 101 | expected_relationships_xml 102 | finally: 103 | zip_file.close() 104 | return fileobj 105 | 106 | 107 | def _assert_no_duplicates(values): 108 | counts = {} 109 | for value in values: 110 | counts[value] = counts.get(value, 0) + 1 111 | for value, count in counts.items(): 112 | if count != 1: 113 | assert False, "{0} has count of {1}".format(value, count) 114 | -------------------------------------------------------------------------------- /tests/docx/styles_xml_tests.py: -------------------------------------------------------------------------------- 1 | from mammoth.docx.xmlparser import element as xml_element 2 | from mammoth.docx.styles_xml import read_styles_xml_element 3 | from ..testing import assert_equal 4 | 5 | 6 | def test_paragraph_style_is_null_if_no_style_with_that_id_exists(): 7 | element = xml_element("w:styles") 8 | styles = read_styles_xml_element(element) 9 | assert_equal(None, styles.find_paragraph_style_by_id("Heading1")) 10 | 11 | 12 | def test_paragraph_style_can_be_found_by_id(): 13 | element = xml_element("w:styles", {}, [ 14 | _paragraph_style_element("Heading1", "Heading 1"), 15 | ]) 16 | styles = read_styles_xml_element(element) 17 | assert_equal( 18 | "Heading1", 19 | styles.find_paragraph_style_by_id("Heading1").style_id 20 | ) 21 | 22 | 23 | def test_character_style_can_be_found_by_id(): 24 | element = xml_element("w:styles", {}, [ 25 | _character_style_element("Heading1Char", "Heading 1 Char"), 26 | ]) 27 | styles = read_styles_xml_element(element) 28 | assert_equal( 29 | "Heading1Char", 30 | 
styles.find_character_style_by_id("Heading1Char").style_id 31 | ) 32 | 33 | 34 | def test_table_style_can_be_found_by_id(): 35 | element = xml_element("w:styles", {}, [ 36 | _table_style_element("TableNormal", "Normal Table"), 37 | ]) 38 | styles = read_styles_xml_element(element) 39 | assert_equal( 40 | "TableNormal", 41 | styles.find_table_style_by_id("TableNormal").style_id 42 | ) 43 | 44 | 45 | def test_paragraph_and_character_styles_are_distinct(): 46 | element = xml_element("w:styles", {}, [ 47 | _paragraph_style_element("Heading1", "Heading 1"), 48 | _character_style_element("Heading1Char", "Heading 1 Char"), 49 | ]) 50 | styles = read_styles_xml_element(element) 51 | assert_equal(None, styles.find_character_style_by_id("Heading1")) 52 | assert_equal(None, styles.find_paragraph_style_by_id("Heading1Char")) 53 | 54 | 55 | def test_styles_include_names(): 56 | element = xml_element("w:styles", {}, [ 57 | _paragraph_style_element("Heading1", "Heading 1"), 58 | ]) 59 | styles = read_styles_xml_element(element) 60 | assert_equal( 61 | "Heading 1", 62 | styles.find_paragraph_style_by_id("Heading1").name 63 | ) 64 | 65 | 66 | def test_style_name_is_none_if_name_element_does_not_exist(): 67 | element = xml_element("w:styles", {}, [ 68 | _style_without_name_element("paragraph", "Heading1"), 69 | _style_without_name_element("character", "Heading1Char") 70 | ]) 71 | styles = read_styles_xml_element(element) 72 | assert_equal(None, styles.find_paragraph_style_by_id("Heading1").name) 73 | assert_equal(None, styles.find_character_style_by_id("Heading1Char").name) 74 | 75 | 76 | def test_numbering_style_is_none_if_no_style_with_that_id_exists(): 77 | element = xml_element("w:styles", {}, []) 78 | styles = read_styles_xml_element(element) 79 | assert_equal(None, styles.find_numbering_style_by_id("List1")) 80 | 81 | 82 | def test_numbering_style_has_none_num_id_if_style_has_no_paragraph_properties(): 83 | element = xml_element("w:styles", {}, [ 84 | xml_element("w:style", 
{"w:type": "numbering", "w:styleId": "List1"}), 85 | ]) 86 | styles = read_styles_xml_element(element) 87 | assert_equal(None, styles.find_numbering_style_by_id("List1").num_id) 88 | 89 | 90 | def test_numbering_style_has_num_id_read_from_paragraph_properties(): 91 | element = xml_element("w:styles", {}, [ 92 | xml_element("w:style", {"w:type": "numbering", "w:styleId": "List1"}, [ 93 | xml_element("w:pPr", {}, [ 94 | xml_element("w:numPr", {}, [ 95 | xml_element("w:numId", {"w:val": "42"}) 96 | ]), 97 | ]), 98 | ]), 99 | ]) 100 | styles = read_styles_xml_element(element) 101 | assert_equal("42", styles.find_numbering_style_by_id("List1").num_id) 102 | 103 | 104 | def test_when_multiple_style_elements_have_same_style_id_then_only_first_element_is_used(): 105 | element = xml_element("w:styles", {}, [ 106 | _table_style_element("TableNormal", "Normal Table"), 107 | _table_style_element("TableNormal", "Table Normal"), 108 | ]) 109 | styles = read_styles_xml_element(element) 110 | assert_equal( 111 | "Normal Table", 112 | styles.find_table_style_by_id("TableNormal").name 113 | ) 114 | 115 | 116 | def _paragraph_style_element(style_id, name): 117 | return _style_element("paragraph", style_id, name) 118 | 119 | def _character_style_element(style_id, name): 120 | return _style_element("character", style_id, name) 121 | 122 | def _table_style_element(style_id, name): 123 | return _style_element("table", style_id, name) 124 | 125 | def _style_element(element_type, style_id, name): 126 | children = [xml_element("w:name", {"w:val": name}, [])] 127 | return _style_element_with_children(element_type, style_id, children) 128 | 129 | def _style_without_name_element(element_type, style_id): 130 | return _style_element_with_children(element_type, style_id, []) 131 | 132 | def _style_element_with_children(element_type, style_id, children): 133 | attributes = {"w:type": element_type, "w:styleId": style_id} 134 | return xml_element("w:style", attributes, children) 135 | 
-------------------------------------------------------------------------------- /tests/styles/parser/document_matcher_parser_tests.py: -------------------------------------------------------------------------------- 1 | from mammoth import documents, document_matchers 2 | from mammoth.styles.parser.document_matcher_parser import parse_document_matcher 3 | from mammoth.styles.parser.errors import LineParseError 4 | from mammoth.styles.parser.tokeniser import tokenise 5 | from mammoth.styles.parser.token_iterator import TokenIterator 6 | from ...testing import assert_equal, assert_raises 7 | 8 | 9 | def test_unrecognised_document_element_raises_error(): 10 | error = assert_raises(LineParseError, lambda: read_document_matcher("x")) 11 | assert_equal("Unrecognised document element: x", str(error)) 12 | 13 | 14 | def test_reads_plain_paragraph(): 15 | assert_equal( 16 | document_matchers.paragraph(), 17 | read_document_matcher("p") 18 | ) 19 | 20 | 21 | def test_reads_paragraph_with_style_id(): 22 | assert_equal( 23 | document_matchers.paragraph(style_id="Heading1"), 24 | read_document_matcher("p.Heading1") 25 | ) 26 | 27 | 28 | def test_reads_paragraph_with_exact_style_name(): 29 | assert_equal( 30 | document_matchers.paragraph(style_name=document_matchers.equal_to("Heading 1")), 31 | read_document_matcher("p[style-name='Heading 1']") 32 | ) 33 | 34 | 35 | def test_reads_paragraph_with_style_name_prefix(): 36 | assert_equal( 37 | document_matchers.paragraph(style_name=document_matchers.starts_with("Heading")), 38 | read_document_matcher("p[style-name^='Heading']") 39 | ) 40 | 41 | 42 | def test_unrecognised_string_matcher_raises_error(): 43 | error = assert_raises(LineParseError, lambda: read_document_matcher("p[style-name*='Heading']")) 44 | assert_equal("Unrecognised string matcher: *", str(error)) 45 | 46 | 47 | def test_reads_paragraph_ordered_list(): 48 | assert_equal( 49 | document_matchers.paragraph(numbering=documents.numbering_level(1, is_ordered=True)), 50 | 
read_document_matcher("p:ordered-list(2)") 51 | ) 52 | 53 | 54 | def test_reads_paragraph_unordered_list(): 55 | assert_equal( 56 | document_matchers.paragraph(numbering=documents.numbering_level(1, is_ordered=False)), 57 | read_document_matcher("p:unordered-list(2)") 58 | ) 59 | 60 | 61 | def test_unrecognised_list_type_raises_error(): 62 | error = assert_raises(LineParseError, lambda: read_document_matcher("p:blah")) 63 | assert_equal("Unrecognised list type: blah", str(error)) 64 | 65 | 66 | def test_reads_plain_run(): 67 | assert_equal( 68 | document_matchers.run(), 69 | read_document_matcher("r") 70 | ) 71 | 72 | 73 | def test_reads_run_with_style_id(): 74 | assert_equal( 75 | document_matchers.run(style_id="Emphasis"), 76 | read_document_matcher("r.Emphasis") 77 | ) 78 | 79 | 80 | def test_reads_run_with_style_name(): 81 | assert_equal( 82 | document_matchers.run(style_name=document_matchers.equal_to("Emphasis")), 83 | read_document_matcher("r[style-name='Emphasis']") 84 | ) 85 | 86 | 87 | def test_reads_plain_table(): 88 | assert_equal( 89 | document_matchers.table(), 90 | read_document_matcher("table") 91 | ) 92 | 93 | 94 | def test_reads_table_with_style_id(): 95 | assert_equal( 96 | document_matchers.table(style_id="TableNormal"), 97 | read_document_matcher("table.TableNormal") 98 | ) 99 | 100 | 101 | def test_reads_table_with_style_name(): 102 | assert_equal( 103 | document_matchers.table(style_name=document_matchers.equal_to("Normal Table")), 104 | read_document_matcher("table[style-name='Normal Table']") 105 | ) 106 | 107 | 108 | def test_reads_bold(): 109 | assert_equal( 110 | document_matchers.bold, 111 | read_document_matcher("b") 112 | ) 113 | 114 | def test_reads_italic(): 115 | assert_equal( 116 | document_matchers.italic, 117 | read_document_matcher("i") 118 | ) 119 | 120 | def test_reads_underline(): 121 | assert_equal( 122 | document_matchers.underline, 123 | read_document_matcher("u") 124 | ) 125 | 126 | def test_reads_strikethrough(): 127 | 
assert_equal( 128 | document_matchers.strikethrough, 129 | read_document_matcher("strike") 130 | ) 131 | 132 | def test_reads_all_caps(): 133 | assert_equal( 134 | document_matchers.all_caps, 135 | read_document_matcher("all-caps") 136 | ) 137 | 138 | def test_reads_small_caps(): 139 | assert_equal( 140 | document_matchers.small_caps, 141 | read_document_matcher("small-caps") 142 | ) 143 | 144 | def test_reads_highlight_without_color(): 145 | assert_equal( 146 | document_matchers.highlight(), 147 | read_document_matcher("highlight") 148 | ) 149 | 150 | def test_reads_highlight_with_color(): 151 | assert_equal( 152 | document_matchers.highlight(color="yellow"), 153 | read_document_matcher("highlight[color='yellow']") 154 | ) 155 | 156 | def test_reads_comment_reference(): 157 | assert_equal( 158 | document_matchers.comment_reference, 159 | read_document_matcher("comment-reference") 160 | ) 161 | 162 | def test_reads_line_breaks(): 163 | assert_equal( 164 | document_matchers.line_break, 165 | read_document_matcher("br[type='line']"), 166 | ) 167 | 168 | def test_reads_page_breaks(): 169 | assert_equal( 170 | document_matchers.page_break, 171 | read_document_matcher("br[type='page']"), 172 | ) 173 | 174 | def test_reads_column_breaks(): 175 | assert_equal( 176 | document_matchers.column_break, 177 | read_document_matcher("br[type='column']"), 178 | ) 179 | 180 | 181 | def test_unrecognised_break_type_raises_error(): 182 | error = assert_raises(LineParseError, lambda: read_document_matcher("br[type='unknownBreakType']")) 183 | assert_equal("Unrecognised break type: unknownBreakType", str(error)) 184 | 185 | 186 | def read_document_matcher(string): 187 | return parse_document_matcher(TokenIterator(tokenise(string))) 188 | -------------------------------------------------------------------------------- /tests/writers/markdown_tests.py: -------------------------------------------------------------------------------- 1 | from __future__ import unicode_literals 2 | 3 | from 
mammoth.writers.markdown import MarkdownWriter 4 | from ..testing import assert_equal 5 | 6 | 7 | def test_special_markdown_characters_are_escaped(): 8 | writer = _create_writer() 9 | writer.text(r"\*") 10 | assert_equal(r"\\\*", writer.as_string()) 11 | 12 | 13 | def test_unrecognised_elements_are_treated_as_normal_text(): 14 | writer = _create_writer() 15 | writer.start("blah"); 16 | writer.text("Hello"); 17 | writer.end("blah"); 18 | assert_equal("Hello", writer.as_string()) 19 | 20 | 21 | def test_paragraphs_are_terminated_with_double_new_line(): 22 | writer = _create_writer() 23 | writer.start("p"); 24 | writer.text("Hello"); 25 | writer.end("p"); 26 | assert_equal("Hello\n\n", writer.as_string()) 27 | 28 | 29 | def test_h1_elements_are_converted_to_heading_with_leading_hash(): 30 | writer = _create_writer() 31 | writer.start("h1"); 32 | writer.text("Hello"); 33 | writer.end("h1"); 34 | assert_equal("# Hello\n\n", writer.as_string()) 35 | 36 | 37 | def test_h6_elements_are_converted_to_heading_with_six_leading_hashes(): 38 | writer = _create_writer() 39 | writer.start("h6"); 40 | writer.text("Hello"); 41 | writer.end("h6"); 42 | assert_equal("###### Hello\n\n", writer.as_string()) 43 | 44 | 45 | def test_br_is_written_as_two_spaces_followed_by_newline(): 46 | writer = _create_writer() 47 | writer.text("Hello"); 48 | writer.self_closing("br"); 49 | assert_equal("Hello \n", writer.as_string()) 50 | 51 | 52 | def test_strong_text_is_surrounded_by_two_underscores(): 53 | writer = _create_writer() 54 | writer.text("Hello "); 55 | writer.start("strong"); 56 | writer.text("World") 57 | writer.end("strong") 58 | assert_equal("Hello __World__", writer.as_string()) 59 | 60 | 61 | def test_emphasised_text_is_surrounded_by_one_asterix(): 62 | writer = _create_writer() 63 | writer.text("Hello "); 64 | writer.start("em"); 65 | writer.text("World") 66 | writer.end("em") 67 | assert_equal("Hello *World*", writer.as_string()) 68 | 69 | 70 | def 
test_anchor_tags_are_written_as_hyperlinks(): 71 | writer = _create_writer() 72 | writer.start("a", {"href": "http://example.com"}); 73 | writer.text("Hello"); 74 | writer.end("a"); 75 | assert_equal("[Hello](http://example.com)", writer.as_string()) 76 | 77 | 78 | def test_anchor_tags_without_href_attribute_are_treated_as_ordinary_text(): 79 | writer = _create_writer() 80 | writer.start("a"); 81 | writer.text("Hello"); 82 | writer.end("a"); 83 | assert_equal("Hello", writer.as_string()) 84 | 85 | 86 | def test_elements_with_ids_have_anchor_tags_with_ids_appended_to_start_of_markdown_element(): 87 | writer = _create_writer() 88 | writer.start("h1", {"id": "start"}) 89 | writer.text("Hello") 90 | writer.end("h1") 91 | assert_equal('#
Hello\n\n', writer.as_string()) 92 | 93 | 94 | def test_links_have_anchors_before_opening_square_bracket(): 95 | writer = _create_writer() 96 | writer.start("a", {"href": "http://example.com", "id": "start"}) 97 | writer.text("Hello") 98 | writer.end("a") 99 | assert_equal('[Hello](http://example.com)', writer.as_string()) 100 | 101 | 102 | def test_image_elements_are_written_as_markdown_images(): 103 | writer = _create_writer() 104 | writer.self_closing("img", {"src": "http://example.com/image.jpg", "alt": "Alt Text"}) 105 | assert_equal("![Alt Text](http://example.com/image.jpg)", writer.as_string()) 106 | 107 | 108 | def test_images_are_written_even_if_they_dont_have_alt_text(): 109 | writer = _create_writer() 110 | writer.self_closing("img", {"src": "http://example.com/image.jpg"}) 111 | assert_equal("![](http://example.com/image.jpg)", writer.as_string()) 112 | 113 | 114 | def test_images_are_written_even_if_they_dont_have_a_src_attribute(): 115 | writer = _create_writer() 116 | writer.self_closing("img", {"alt": "Alt Text"}) 117 | assert_equal("![Alt Text]()", writer.as_string()) 118 | 119 | 120 | def test_image_elements_are_ignored_if_they_have_no_src_and_no_alt_text(): 121 | writer = _create_writer() 122 | writer.self_closing("img") 123 | assert_equal("", writer.as_string()) 124 | 125 | 126 | def test_list_item_outside_of_list_is_treated_as_unordered_list(): 127 | writer = _create_writer() 128 | writer.start("li") 129 | writer.text("Fruit") 130 | writer.end("li") 131 | assert_equal("- Fruit\n", writer.as_string()) 132 | 133 | 134 | def test_ol_element_is_written_as_ordered_list_with_sequential_numbering(): 135 | writer = _create_writer() 136 | writer.start("ol") 137 | writer.start("li") 138 | writer.text("Fruit") 139 | writer.end("li") 140 | writer.start("li") 141 | writer.text("Condiments") 142 | writer.end("li") 143 | writer.end("ol") 144 | assert_equal("1. Fruit\n2. 
Condiments\n\n", writer.as_string()) 145 | 146 | 147 | def test_ul_element_is_written_as_unordered_list_using_hyphens_as_bullets(): 148 | writer = _create_writer() 149 | writer.start("ul") 150 | writer.start("li") 151 | writer.text("Fruit") 152 | writer.end("li") 153 | writer.start("li") 154 | writer.text("Condiments") 155 | writer.end("li") 156 | writer.end("ul") 157 | assert_equal("- Fruit\n- Condiments\n\n", writer.as_string()) 158 | 159 | 160 | def test_numbering_is_separate_for_nested_list_and_parent_list(): 161 | writer = _create_writer() 162 | writer.start("ol") 163 | 164 | writer.start("li") 165 | writer.text("Fruit") 166 | writer.start("ol") 167 | writer.start("li") 168 | writer.text("Apple") 169 | writer.end("li") 170 | writer.start("li") 171 | writer.text("Banana") 172 | writer.end("li") 173 | writer.end("ol") 174 | writer.end("li") 175 | 176 | writer.start("li") 177 | writer.text("Condiments") 178 | writer.end("li") 179 | writer.end("ol") 180 | assert_equal("1. Fruit\n\t1. Apple\n\t2. Banana\n2. 
Condiments\n\n", writer.as_string()) 181 | 182 | 183 | 184 | def _create_writer(): 185 | return MarkdownWriter() 186 | -------------------------------------------------------------------------------- /mammoth/writers/markdown.py: -------------------------------------------------------------------------------- 1 | from __future__ import unicode_literals 2 | 3 | from .abc import Writer 4 | 5 | import re 6 | 7 | 8 | class _WriterOutput(object): 9 | def __init__(self, start, end=None, generate_end=None, anchor_position=None): 10 | if generate_end is None: 11 | generate_end = _constant(end) 12 | 13 | self.start = start 14 | self.generate_end = generate_end 15 | self.anchor_position = anchor_position 16 | 17 | 18 | def _constant(value): 19 | def get(): 20 | return value 21 | 22 | return get 23 | 24 | 25 | class _MarkdownState(object): 26 | def __init__(self): 27 | self._list_state_stack = [] 28 | self.list_state = None 29 | self.list_item_has_closed = False 30 | 31 | def update_list_state(self, list_state): 32 | self._list_state_stack.append(self.list_state) 33 | self.list_state = list_state 34 | 35 | def pop_list_state(self): 36 | self.list_state = self._list_state_stack.pop() 37 | 38 | 39 | class _MarkdownListState(object): 40 | def __init__(self, ordered, indentation): 41 | self.ordered = ordered 42 | self.count = 0 43 | self.indentation = indentation 44 | 45 | 46 | def _symmetric_wrapped(end): 47 | return _Wrapped(end, end) 48 | 49 | 50 | class _Wrapped(object): 51 | def __init__(self, start, end): 52 | self._start = start 53 | self._end = end 54 | 55 | def __call__(self, attributes, markdown_state): 56 | return _WriterOutput(self._start, self._end) 57 | 58 | 59 | def _hyperlink(attributes, markdown_state): 60 | href = attributes.get("href", "") 61 | if href: 62 | return _WriterOutput( 63 | "[", "]({0})".format(href), 64 | anchor_position="before", 65 | ) 66 | else: 67 | return _default_output 68 | 69 | 70 | def _image(attributes, markdown_state): 71 | src = 
def _list_item(attributes, markdown_state):
    """Build the writer output for an ``li`` element.

    The item starts with one tab per indentation level of the current
    list, followed by ``N.`` for ordered lists or ``-`` otherwise, and a
    space. An item outside any list is treated as belonging to an
    unordered list with no indentation.

    Closing an item emits a newline only if no other item has closed
    since the most recent item was opened; otherwise it emits nothing.
    """
    markdown_state.list_item_has_closed = False

    current_list = markdown_state.list_state
    if not current_list:
        current_list = _MarkdownListState(ordered=False, indentation=0)
    current_list.count += 1

    marker = "{0}.".format(current_list.count) if current_list.ordered else "-"
    prefix = "".join(["\t" * current_list.indentation, marker, " "])

    def close_item():
        if not markdown_state.list_item_has_closed:
            markdown_state.list_item_has_closed = True
            return "\n"
        return ""

    return _WriterOutput(start=prefix, generate_end=close_item)
_default_output = _WriterOutput("", "") 149 | 150 | def _default_writer(attributes, markdown_state): 151 | return _default_output 152 | 153 | 154 | class MarkdownWriter(Writer): 155 | def __init__(self): 156 | self._fragments = [] 157 | self._element_stack = [] 158 | self._markdown_state = _MarkdownState() 159 | 160 | def text(self, text): 161 | self._fragments.append(_escape_markdown(text)) 162 | 163 | def start(self, name, attributes=None): 164 | if attributes is None: 165 | attributes = {} 166 | 167 | output = _writers.get(name, _default_writer)(attributes, self._markdown_state) 168 | self._element_stack.append(output.generate_end) 169 | 170 | anchor_before_start = output.anchor_position == "before" 171 | if anchor_before_start: 172 | self._write_anchor(attributes) 173 | 174 | self._fragments.append(output.start) 175 | 176 | if not anchor_before_start: 177 | self._write_anchor(attributes) 178 | 179 | 180 | 181 | def end(self, name): 182 | end = self._element_stack.pop() 183 | output = end() 184 | self._fragments.append(output) 185 | 186 | def self_closing(self, name, attributes=None): 187 | self.start(name, attributes) 188 | self.end(name) 189 | 190 | def append(self, other): 191 | self._fragments.append(other) 192 | 193 | def as_string(self): 194 | return "".join(self._fragments) 195 | 196 | def _write_anchor(self, attributes): 197 | html_id = attributes.get("id") 198 | if html_id: 199 | self._fragments.append(''.format(html_id)) 200 | 201 | 202 | def _escape_markdown(value): 203 | return re.sub(r"([\`\*_\{\}\[\]\(\)\#\+\-\.\!])", r"\\\1", re.sub("\\\\", "\\\\\\\\", value)) 204 | -------------------------------------------------------------------------------- /mammoth/docx/__init__.py: -------------------------------------------------------------------------------- 1 | from functools import partial 2 | import os 3 | 4 | import cobble 5 | 6 | from .. 
def read(fileobj, external_file_access=False):
    """Read a .docx file object into a result wrapping a document.

    Notes and comments are read first because the main document part is
    read with both of them passed in.

    :param fileobj: file-like object containing the .docx zip archive. Its
        ``name`` attribute, when present, is passed on as the document path.
    :param external_file_access: forwarded to the part reader, which hands
        it to ``Files``; off by default.
    :return: the result produced by reading the main document part, bound
        onto the combined result of reading notes and comments.
    """
    zip_file = open_zip(fileobj, "r")
    part_paths = _find_part_paths(zip_file)
    document_path = getattr(fileobj, "name", None)
    read_part_with_body = _part_with_body_reader(
        document_path,
        zip_file,
        part_paths=part_paths,
        external_file_access=external_file_access,
    )

    def read_document_with(referents):
        # referents[0] holds the notes, referents[1] the comments, matching
        # the order in which they are combined below.
        return _read_document(
            zip_file,
            read_part_with_body,
            notes=referents[0],
            comments=referents[1],
            part_paths=part_paths,
        )

    referents_result = results.combine([
        _read_notes(read_part_with_body, part_paths),
        _read_comments(read_part_with_body, part_paths),
    ])
    return referents_result.bind(read_document_with)
relationships=document_relationships, 63 | relationship_type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/" + name, 64 | fallback_path="word/{0}.xml".format(name), 65 | base_path=zips.split_path(document_filename)[0], 66 | ) 67 | 68 | return _PartPaths( 69 | main_document=document_filename, 70 | comments=find("comments"), 71 | endnotes=find("endnotes"), 72 | footnotes=find("footnotes"), 73 | numbering=find("numbering"), 74 | styles=find("styles"), 75 | ) 76 | 77 | 78 | def _find_document_filename(zip_file, relationships): 79 | path = _find_part_path( 80 | zip_file, 81 | relationships, 82 | relationship_type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument", 83 | base_path="", 84 | fallback_path="word/document.xml", 85 | ) 86 | if zip_file.exists(path): 87 | return path 88 | else: 89 | raise IOError("Could not find main document part. Are you sure this is a valid .docx file?") 90 | 91 | 92 | def _find_part_path(zip_file, relationships, relationship_type, base_path, fallback_path): 93 | targets = [ 94 | zips.join_path(base_path, target).lstrip("/") 95 | for target in relationships.find_targets_by_type(relationship_type) 96 | ] 97 | valid_targets = list(filter(lambda target: zip_file.exists(target), targets)) 98 | if len(valid_targets) == 0: 99 | return fallback_path 100 | else: 101 | return valid_targets[0] 102 | 103 | 104 | def _read_notes(read_part_with_body, part_paths): 105 | footnotes = read_part_with_body( 106 | part_paths.footnotes, 107 | lambda root, body_reader: read_footnotes_xml_element(root, body_reader=body_reader), 108 | default=_empty_result, 109 | ) 110 | endnotes = read_part_with_body( 111 | part_paths.endnotes, 112 | lambda root, body_reader: read_endnotes_xml_element(root, body_reader=body_reader), 113 | default=_empty_result, 114 | ) 115 | 116 | return results.combine([footnotes, endnotes]).map(lists.flatten) 117 | 118 | 119 | def _read_comments(read_part_with_body, part_paths): 120 | 
def _part_with_body_reader(document_path, zip_file, part_paths, external_file_access):
    """Build a function that reads a part of the package with a body reader.

    Content types, styles and numbering are read once up front (each falls
    back to an empty value when its entry is missing from the archive) and
    are shared by every part read through the returned function.

    :param document_path: path of the .docx on disk, or ``None``; its
        directory is handed to ``Files``.
    :param zip_file: opened archive.
    :param part_paths: provides the ``styles`` and ``numbering`` part paths.
    :param external_file_access: forwarded to ``Files``.
    :return: ``read_part(name, reader, default=...)``. ``reader`` is called
        with the parsed XML and a ``body_reader`` keyword argument. When
        ``default`` is given it is returned if the part does not exist.
    """
    content_types = _try_read_entry_or_default(
        zip_file,
        "[Content_Types].xml",
        read_content_types_xml_element,
        empty_content_types,
    )
    styles = _try_read_entry_or_default(
        zip_file,
        part_paths.styles,
        read_styles_xml_element,
        Styles.EMPTY,
    )

    def read_numbering(element):
        # Numbering is read with the styles read just above.
        return read_numbering_xml_element(element, styles=styles)

    numbering = _try_read_entry_or_default(
        zip_file,
        part_paths.numbering,
        read_numbering,
        default=Numbering.EMPTY,
    )

    if document_path is None:
        base_directory = None
    else:
        base_directory = os.path.dirname(document_path)
    files = Files(base_directory, external_file_access=external_file_access)

    def read_part(name, reader, default=_undefined):
        # Relationships are looked up per part, so each part gets its own
        # body reader.
        part_relationships = _read_relationships(zip_file, _find_relationships_path_for(name))
        reader_with_body = partial(
            reader,
            body_reader=body_xml.reader(
                numbering=numbering,
                content_types=content_types,
                relationships=part_relationships,
                styles=styles,
                docx_file=zip_file,
                files=files,
            ),
        )

        if default is _undefined:
            return _read_entry(zip_file, name, reader_with_body)
        return _try_read_entry_or_default(zip_file, name, reader_with_body, default=default)

    return read_part
| return zips.join_path(dirname, "_rels", basename + ".rels") 189 | 190 | 191 | def _read_relationships(zip_file, name): 192 | return _try_read_entry_or_default( 193 | zip_file, 194 | name, 195 | read_relationships_xml_element, 196 | default=Relationships.EMPTY, 197 | ) 198 | 199 | def _try_read_entry_or_default(zip_file, name, reader, default): 200 | if zip_file.exists(name): 201 | return _read_entry(zip_file, name, reader) 202 | else: 203 | return default 204 | 205 | 206 | def _read_entry(zip_file, name, reader): 207 | with zip_file.open(name) as fileobj: 208 | return reader(office_xml.read(fileobj)) 209 | 210 | 211 | _undefined = object() 212 | -------------------------------------------------------------------------------- /tests/docx/numbering_xml_tests.py: -------------------------------------------------------------------------------- 1 | from mammoth.docx.xmlparser import element as xml_element 2 | from mammoth.docx.numbering_xml import read_numbering_xml_element 3 | from mammoth.docx.styles_xml import NumberingStyle, Styles 4 | from ..testing import assert_equal 5 | 6 | 7 | def test_find_level_returns_none_if_num_with_id_cannot_be_found(): 8 | numbering = _read_numbering_xml_element(xml_element("w:numbering")) 9 | assert_equal(None, numbering.find_level("47", "0")) 10 | 11 | 12 | _sample_numbering_xml = xml_element("w:numbering", {}, [ 13 | xml_element("w:abstractNum", {"w:abstractNumId": "42"}, [ 14 | xml_element("w:lvl", {"w:ilvl": "0"}, [ 15 | xml_element("w:numFmt", {"w:val": "bullet"}) 16 | ]), 17 | xml_element("w:lvl", {"w:ilvl": "1"}, [ 18 | xml_element("w:numFmt", {"w:val": "decimal"}) 19 | ]) 20 | ]), 21 | xml_element("w:num", {"w:numId": "47"}, [ 22 | xml_element("w:abstractNumId", {"w:val": "42"}) 23 | ]) 24 | ]) 25 | 26 | 27 | def test_level_includes_level_index(): 28 | numbering = _read_numbering_xml_element(_sample_numbering_xml) 29 | assert_equal("0", numbering.find_level("47", "0").level_index) 30 | assert_equal("1", 
def test_list_is_ordered_if_there_is_no_explicit_format():
    # The only level has no w:numFmt child, so no format is stated.
    level_without_format = xml_element("w:lvl", {"w:ilvl": "0"})
    abstract_num = xml_element("w:abstractNum", {"w:abstractNumId": "42"}, [
        level_without_format,
    ])
    num = xml_element("w:num", {"w:numId": "47"}, [
        xml_element("w:abstractNumId", {"w:val": "42"})
    ])

    numbering = _read_numbering_xml_element(
        xml_element("w:numbering", {}, [abstract_num, num])
    )

    level = numbering.find_level("47", "0")
    assert_equal(True, level.is_ordered)
def test_when_abstract_num_has_num_style_link_then_style_is_used_to_find_num():
    # Chain under test: num 201 -> abstractNum 101 -> numStyleLink "List1"
    # -> numbering style "List1" (num_id 200) -> num 200 -> abstractNum 100,
    # which is the only abstractNum that actually defines a level.
    numbering = _read_numbering_xml_element(
        xml_element("w:numbering", {}, [
            # Defines level 0 as decimal.
            xml_element("w:abstractNum", {"w:abstractNumId": "100"}, [
                xml_element("w:lvl", {"w:ilvl": "0"}, [
                    xml_element("w:numFmt", {"w:val": "decimal"}),
                ]),
            ]),
            # Defines no levels of its own; only links to the style "List1".
            xml_element("w:abstractNum", {"w:abstractNumId": "101"}, [
                xml_element("w:numStyleLink", {"w:val": "List1"}),
            ]),
            xml_element("w:num", {"w:numId": "200"}, [
                xml_element("w:abstractNumId", {"w:val": "100"}),
            ]),
            xml_element("w:num", {"w:numId": "201"}, [
                xml_element("w:abstractNumId", {"w:val": "101"}),
            ])
        ]),
        styles=Styles.create(numbering_styles={
            "List1": NumberingStyle(style_id="List1", num_id="200"),
        }),
    )
    # Looking up num 201 must yield the decimal level reached via the style.
    assert_equal(True, numbering.find_level("201", "0").is_ordered)
@cobble.data
class Run(HasChildren):
    """A run of content within a paragraph, together with its formatting.

    Inherits the ``children`` field from ``HasChildren``. The module-level
    ``run()`` factory is the convenient way to build instances: it supplies
    defaults for every field other than ``children``.
    """
    # Identifier and display name of the run's style; ``run()`` defaults
    # both to None.
    style_id = cobble.field()
    style_name = cobble.field()
    # Formatting flags; ``run()`` coerces each of these with ``bool()``.
    is_bold = cobble.field()
    is_italic = cobble.field()
    is_underline = cobble.field()
    is_strikethrough = cobble.field()
    is_all_caps = cobble.field()
    is_small_caps = cobble.field()
    # ``run()`` substitutes ``VerticalAlignment.baseline`` when this is None.
    vertical_alignment = cobble.field()
    # ``run()`` defaults the remaining fields to None.
    font = cobble.field()
    font_size = cobble.field()
    highlight = cobble.field()
def document(children, notes=None, comments=None):
    """Create a ``Document``, filling in empty notes and comments.

    :param children: child elements of the document.
    :param notes: notes collection; an empty ``Notes`` is used when None.
    :param comments: list of comments; a new empty list is used when None.
    """
    resolved_notes = Notes({}) if notes is None else notes
    resolved_comments = [] if comments is None else comments
    return Document(children, resolved_notes, comments=resolved_comments)
font_size=font_size, 168 | highlight=highlight, 169 | ) 170 | 171 | class VerticalAlignment(object): 172 | baseline = "baseline" 173 | superscript = "superscript" 174 | subscript = "subscript" 175 | 176 | text = Text 177 | 178 | _tab = Tab() 179 | 180 | def tab(): 181 | return _tab 182 | 183 | 184 | image = Image 185 | 186 | def hyperlink(children, href=None, anchor=None, target_frame=None): 187 | return Hyperlink(href=href, anchor=anchor, target_frame=target_frame, children=children) 188 | 189 | 190 | @cobble.data 191 | class Bookmark(Element): 192 | name = cobble.field() 193 | 194 | bookmark = Bookmark 195 | 196 | 197 | def table(children, style_id=None, style_name=None): 198 | return Table(children=children, style_id=style_id, style_name=style_name) 199 | 200 | def table_row(children, is_header=None): 201 | return TableRow(children=children, is_header=bool(is_header)) 202 | 203 | def table_cell(children, colspan=None, rowspan=None): 204 | if colspan is None: 205 | colspan = 1 206 | if rowspan is None: 207 | rowspan = 1 208 | return TableCell(children=children, colspan=colspan, rowspan=rowspan) 209 | 210 | def table_cell_unmerged(children, colspan, rowspan, vmerge): 211 | return TableCellUnmerged(children=children, colspan=colspan, rowspan=rowspan, vmerge=vmerge) 212 | 213 | def numbering_level(level_index, is_ordered): 214 | return _NumberingLevel(str(level_index), bool(is_ordered)) 215 | 216 | @cobble.data 217 | class _NumberingLevel(object): 218 | level_index = cobble.field() 219 | is_ordered = cobble.field() 220 | 221 | @cobble.data 222 | class Note(Element): 223 | note_type = cobble.field() 224 | note_id = cobble.field() 225 | body = cobble.field() 226 | 227 | 228 | note = Note 229 | 230 | 231 | class Notes(object): 232 | def __init__(self, notes): 233 | self._notes = notes 234 | 235 | def find_note(self, note_type, note_id): 236 | return self._notes[(note_type, note_id)] 237 | 238 | def resolve(self, reference): 239 | return 
self.find_note(reference.note_type, reference.note_id) 240 | 241 | def __eq__(self, other): 242 | return isinstance(other, Notes) and self._notes == other._notes 243 | 244 | def __ne__(self, other): 245 | return not (self == other) 246 | 247 | def notes(notes_list): 248 | return Notes(dict( 249 | (_note_key(note), note) 250 | for note in notes_list 251 | )) 252 | 253 | def _note_key(note): 254 | return (note.note_type, note.note_id) 255 | 256 | @cobble.data 257 | class NoteReference(Element): 258 | note_type = cobble.field() 259 | note_id = cobble.field() 260 | 261 | note_reference = NoteReference 262 | 263 | 264 | @cobble.data 265 | class Comment(object): 266 | comment_id = cobble.field() 267 | body = cobble.field() 268 | author_name = cobble.field() 269 | author_initials = cobble.field() 270 | 271 | def comment(comment_id, body, author_name=None, author_initials=None): 272 | return Comment( 273 | comment_id=comment_id, 274 | body=body, 275 | author_name=author_name, 276 | author_initials=author_initials, 277 | ) 278 | 279 | @cobble.data 280 | class CommentReference(Element): 281 | comment_id = cobble.field() 282 | 283 | comment_reference = CommentReference 284 | 285 | def element_visitor(args): 286 | return cobble.visitor(Element, args=args) 287 | -------------------------------------------------------------------------------- /tests/docx/docx_tests.py: -------------------------------------------------------------------------------- 1 | import io 2 | import textwrap 3 | import zipfile 4 | 5 | from mammoth import docx, documents, zips 6 | from ..testing import assert_equal, assert_raises, generate_test_path 7 | 8 | 9 | class ReadTests(object): 10 | def test_can_read_document_with_single_paragraph_with_single_run_of_text(self): 11 | with open(generate_test_path("single-paragraph.docx"), "rb") as fileobj: 12 | result = docx.read(fileobj=fileobj) 13 | expected_document = documents.document([ 14 | documents.paragraph([ 15 | documents.run([ 16 | documents.text("Walking 
def test_main_document_is_found_using_package_relationships():
    # The main document deliberately lives at a non-default path
    # (word/document2.xml), so it can only be located through the
    # officeDocument relationship declared in the package-level _rels/.rels.
    # Both fixtures must be well-formed XML for the reader to parse them.
    fileobj = _create_zip({
        "word/document2.xml": textwrap.dedent("""\
            <?xml version="1.0" encoding="utf-8" ?>
            <w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
                <w:body>
                    <w:p>
                        <w:r>
                            <w:t>Hello.</w:t>
                        </w:r>
                    </w:p>
                </w:body>
            </w:document>
        """),
        "_rels/.rels": textwrap.dedent("""\
            <?xml version="1.0" encoding="utf-8"?>
            <Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
                <Relationship Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument" Target="/word/document2.xml" Id="rId1"/>
            </Relationships>
        """),
    })
    result = docx.read(fileobj=fileobj)
    expected_document = documents.document([
        documents.paragraph([
            documents.run([
                documents.text("Hello.")
            ])
        ])
    ])
    assert_equal(expected_document, result.value)
Are you sure this is a valid .docx file?", 72 | str(error), 73 | ) 74 | 75 | class PartPathsTests(object): 76 | def test_main_document_part_is_found_using_package_relationships(self): 77 | fileobj = _create_zip({ 78 | "word/document2.xml": " ", 79 | "_rels/.rels": textwrap.dedent("""\ 80 | 81 | 82 | 83 | 84 | """), 85 | }) 86 | part_paths = self._find_part_paths(fileobj) 87 | assert_equal("word/document2.xml", part_paths.main_document) 88 | 89 | def test_when_relationship_for_main_document_cannot_be_found_then_fallback_is_used(self): 90 | fileobj = _create_zip({ 91 | "word/document.xml": " ", 92 | }) 93 | part_paths = self._find_part_paths(fileobj) 94 | assert_equal("word/document.xml", part_paths.main_document) 95 | 96 | def test_comments_part_is_found_using_main_document_relationships(self): 97 | self._assert_path_is_found_using_main_document_relationships("comments") 98 | 99 | def test_when_relationship_for_comments_cannot_be_found_then_fallback_is_used(self): 100 | self._assert_when_relationship_for_part_cannot_be_found_then_fallback_is_used("comments") 101 | 102 | def test_endnotes_part_is_found_using_main_document_relationships(self): 103 | self._assert_path_is_found_using_main_document_relationships("endnotes") 104 | 105 | def test_when_relationship_for_endnotes_cannot_be_found_then_fallback_is_used(self): 106 | self._assert_when_relationship_for_part_cannot_be_found_then_fallback_is_used("endnotes") 107 | 108 | def test_footnotes_part_is_found_using_main_document_relationships(self): 109 | self._assert_path_is_found_using_main_document_relationships("footnotes") 110 | 111 | def test_when_relationship_for_footnotes_cannot_be_found_then_fallback_is_used(self): 112 | self._assert_when_relationship_for_part_cannot_be_found_then_fallback_is_used("footnotes") 113 | 114 | def test_numbering_part_is_found_using_main_document_relationships(self): 115 | self._assert_path_is_found_using_main_document_relationships("numbering") 116 | 117 | def 
test_when_relationship_for_numbering_cannot_be_found_then_fallback_is_used(self): 118 | self._assert_when_relationship_for_part_cannot_be_found_then_fallback_is_used("numbering") 119 | 120 | def test_styles_part_is_found_using_main_document_relationships(self): 121 | self._assert_path_is_found_using_main_document_relationships("styles") 122 | 123 | def test_when_relationship_for_styles_cannot_be_found_then_fallback_is_used(self): 124 | self._assert_when_relationship_for_part_cannot_be_found_then_fallback_is_used("styles") 125 | 126 | def _assert_path_is_found_using_main_document_relationships(self, name): 127 | fileobj = _create_zip({ 128 | "_rels/.rels": textwrap.dedent("""\ 129 | 130 | 131 | 132 | 133 | """), 134 | "word/document.xml": " ", 135 | "word/_rels/document.xml.rels": textwrap.dedent("""\ 136 | 137 | 138 | 139 | 140 | """.format(name=name)), 141 | "word/target-path.xml": " " 142 | }) 143 | part_paths = self._find_part_paths(fileobj) 144 | assert_equal("word/target-path.xml", getattr(part_paths, name)) 145 | 146 | def _assert_when_relationship_for_part_cannot_be_found_then_fallback_is_used(self, name): 147 | fileobj = _create_zip({ 148 | "_rels/.rels": textwrap.dedent("""\ 149 | 150 | 151 | 152 | 153 | """), 154 | "word/document.xml": " ", 155 | }) 156 | part_paths = self._find_part_paths(fileobj) 157 | assert_equal("word/{0}.xml".format(name), getattr(part_paths, name)) 158 | 159 | 160 | def _find_part_paths(self, fileobj): 161 | return docx._find_part_paths(zips.open_zip(fileobj, "r")) 162 | 163 | 164 | def _create_zip(files): 165 | fileobj = io.BytesIO() 166 | 167 | zip_file = zipfile.ZipFile(fileobj, "w") 168 | try: 169 | for name, contents in files.items(): 170 | zip_file.writestr(name, contents) 171 | finally: 172 | zip_file.close() 173 | 174 | fileobj.seek(0) 175 | return fileobj 176 | -------------------------------------------------------------------------------- /NEWS: 
-------------------------------------------------------------------------------- 1 | # 1.12.0 2 | 3 | * Handle hyperlinked wp:anchor and wp:inline elements. 4 | 5 | # 1.11.0 6 | 7 | * Ignore style definitions using a style ID that has already been used. 8 | 9 | * Fix conversion of unmerged table cells. 10 | 11 | * Disable external file accesses by default. External file access can be enabled 12 | using the external_file_access argument. 13 | 14 | * Handle numbering levels defined without an index. 15 | 16 | # 1.10.0 17 | 18 | * Add "Heading" and "Body" styles, as found in documents created by Apple Pages, 19 | to the default style map. 20 | 21 | * Handle structured document tags representing checkboxes wrapped in other 22 | elements, such as table cells. Previously, the wrapping elements would have 23 | been ignored. 24 | 25 | * Ignore deleted table rows. 26 | 27 | * Add notes on security. 28 | 29 | # 1.9.1 30 | 31 | * Ignore AlternateContent elements when there is no Fallback element. 32 | 33 | # 1.9.0 34 | 35 | * Detect checkboxes, both as complex fields and structured document tags, and 36 | convert them to checkbox inputs. 37 | 38 | * Ignore AlternateContent elements when there is no Fallback element. 39 | 40 | # 1.8.0 41 | 42 | * Add style mapping for highlights. 43 | 44 | # 1.7.1 45 | 46 | * Switch the precedence of numbering properties in paragraph properties and the 47 | numbering in paragraph styles so that the numbering properties in paragraph 48 | properties takes precedence. 49 | 50 | # 1.7.0 51 | 52 | * Support attributes in HTML paths in style mappings. 53 | 54 | * Improve error message when failing to find the body element in a document. 55 | 56 | * Drop support for Python 2.7, Python 3.5 and Python 3.6. 57 | 58 | * Add support for the strict document format. 59 | 60 | # 1.6.0 61 | 62 | * Support merged paragraphs when revisions are tracked. 63 | 64 | # 1.5.1 65 | 66 | * Add a pyproject.toml to add an explicit build dependency on setuptools. 
67 | 68 | # 1.5.0 69 | 70 | * Only use the alt text of image elements as a fallback. If an alt attribute is 71 | returned from the function passed to mammoth.images.img_element, that value 72 | will now be preferred to the alt text of the image element. 73 | 74 | # 1.4.19 75 | 76 | * Ignore w:u elements when w:val is missing. 77 | 78 | # 1.4.18 79 | 80 | * Emit warning instead of throwing exception when image file cannot be found for 81 | a:blip elements. 82 | 83 | # 1.4.17 84 | 85 | * When extracting raw text, convert tab elements to tab characters. 86 | 87 | * Handle internal hyperlinks created with complex fields. 88 | 89 | # 1.4.16 90 | 91 | * Handle w:num with invalid w:abstractNumId. 92 | 93 | # 1.4.15 94 | 95 | * Convert symbols in supported fonts to corresponding Unicode characters. 96 | 97 | # 1.4.14 98 | 99 | * Support numbering defined by paragraph style. 100 | 101 | # 1.4.13 102 | 103 | * Add style mapping for all caps. 104 | 105 | # 1.4.12 106 | 107 | * Handle underline elements where w:val is "none". 108 | 109 | # 1.4.11 110 | 111 | * Read font size for runs. 112 | * Support soft hyphens. 113 | 114 | # 1.4.10 115 | 116 | * Update supported Python versions to 2.7 and 3.4 to 3.8. 117 | 118 | # 1.4.9 119 | 120 | * Improve list support by following w:numStyleLink in w:abstractNum. 121 | 122 | # 1.4.8 123 | 124 | * Preserve empty table rows. 125 | 126 | # 1.4.7 127 | 128 | * Always write files as UTF-8 in the CLI. 129 | 130 | # 1.4.6 131 | 132 | * Fix: default style mappings caused footnotes, endnotes and comments 133 | containing multiple paragraphs to be converted into a single paragraph. 134 | 135 | # 1.4.5 136 | 137 | * Read the children of v:rect elements. 138 | 139 | # 1.4.4 140 | 141 | * Parse paragraph indents. 142 | 143 | * Read part paths using relationships. This improves support for documents 144 | created by Word Online. 145 | 146 | # 1.4.3 147 | 148 | * Add style mapping for small caps. 149 | 150 | * Add style mapping for tables. 
151 | 152 | # 1.4.2 153 | 154 | * Read children of v:group elements. 155 | 156 | # 1.4.1 157 | 158 | * Read w:noBreakHyphen elements as non-breaking hyphen characters. 159 | 160 | # 1.4.0 161 | 162 | * Extract the default data URI image converter to the images module. 163 | 164 | * Add anchor on hyperlinks as fragment if present. 165 | 166 | * Convert target frames on hyperlinks to targets on anchors. 167 | 168 | * Detect header rows in tables and convert to thead > tr > th. 169 | 170 | # 1.3.5 171 | 172 | * Handle complex fields that do not have a "separate" fldChar. 173 | 174 | # 1.3.4 175 | 176 | * Add transforms.run. 177 | 178 | # 1.3.3 179 | 180 | * Read children of w:object elements. 181 | 182 | * Add support for document transforms. 183 | 184 | # 1.3.2 185 | 186 | * Handle hyperlinks created with complex fields. 187 | 188 | # 1.3.1 189 | 190 | * Handle absolute paths within zip files. This should fix an issue where some 191 | images within a document couldn't be found. 192 | 193 | # 1.3.0 194 | 195 | * Allow style names to be mapped by prefix. For instance: 196 | 197 | r[style-name^='Code '] => code 198 | 199 | * Add default style mappings for Heading 5 and Heading 6. 200 | 201 | * Allow escape sequences in style IDs, style names and CSS class names. 202 | 203 | * Allow a separator to be specified when HTML elements are collapsed. 204 | 205 | * Add include_embedded_style_map argument to allow embedded style maps to be 206 | disabled. 207 | 208 | * Include embedded styles when explicit style map is passed. 209 | 210 | # 1.2.2 211 | 212 | * Ignore bold, italic, underline and strikethrough elements that have a value of 213 | false or 0. 214 | 215 | # 1.2.1 216 | 217 | * Ignore v:imagedata elements without relationship ID with warning. 218 | 219 | # 1.2.0 220 | 221 | * Use alt text title as alt text for images when the alt text description is 222 | blank or missing. 223 | 224 | # 1.1.1 225 | 226 | * Handle comments without author initials. 
227 | 228 | * Change numbering of comments to be global rather than per-user to match the 229 | behaviour of Word. 230 | 231 | # 1.1.0 232 | 233 | * Add support for comments. 234 | 235 | # 1.0.4 236 | 237 | * Add support for w:sdt elements. This allows the bodies of content controls, 238 | such as bibliographies, to be converted. 239 | 240 | # 1.0.3 241 | 242 | * Add support for table cells spanning multiple rows. 243 | 244 | # 1.0.2 245 | 246 | * Add support for table cells spanning multiple columns. 247 | 248 | # 1.0.1 249 | 250 | * Improve script installation on Windows by using entry_points instead of 251 | scripts in setup.py. 252 | 253 | # 1.0.0 254 | 255 | * Remove deprecated convert_underline argument. 256 | 257 | * Officially support ID prefixes. 258 | 259 | * Generated IDs no longer insert a hyphen after the ID prefix. 260 | 261 | * The default ID prefix is now the empty string rather than a random number 262 | followed by a hyphen. 263 | 264 | * Rename mammoth.images.inline to mammoth.images.img_element to better reflect 265 | its behaviour. 266 | 267 | # 0.3.31 268 | 269 | * Improve collapsing of similar non-fresh HTML elements. 270 | 271 | # 0.3.30 272 | 273 | * Allow bold and italic style mappings to be configured. 274 | 275 | # 0.3.29 276 | 277 | * Handle references to missing styles when reading documents. 278 | 279 | # 0.3.28 280 | 281 | * Improve support for lists made in LibreOffice. Specifically, this changes the 282 | default style mapping for paragraphs with a style of "Normal" to have the 283 | lowest precedence. 284 | 285 | # 0.3.27 286 | 287 | * Handle XML where the child nodes of an element contain text nodes. 288 | 289 | # 0.3.26 290 | 291 | * Always use mc:Fallback when reading mc:AlternateContent elements. 292 | 293 | # 0.3.25 294 | 295 | * Remove duplicate messages from results. 296 | 297 | * Read v:imagedata with r:id attribute. 298 | 299 | * Read children of v:roundrect. 
300 | 301 | * Ignore office-word:wrap, v:shadow and v:shapetype. 302 | 303 | # 0.3.24 304 | 305 | * Continue with warning if external images cannot be found. 306 | 307 | * Add support for embedded style maps. 308 | 309 | # 0.3.23 310 | 311 | * Fix Python 3 support. 312 | 313 | # 0.3.22 314 | 315 | * Generate warnings for not-understood style mappings and continue, rather than 316 | stopping with an error. 317 | 318 | * Support file objects without a name attribute again (broken since 0.3.20). 319 | 320 | # 0.3.21 321 | 322 | * Ignore w:numPr elements without w:numId or w:ilvl children. 323 | 324 | # 0.3.20 325 | 326 | * Add support for linked images. 327 | 328 | # 0.3.19 329 | 330 | * Fix: cannot extract raw text from elements without children. 331 | 332 | # 0.3.18 333 | 334 | * Support links and images in footnotes and endnotes. 335 | 336 | # 0.3.17 337 | 338 | * Add support for underlines in style map. 339 | 340 | * Add support for strikethrough. 341 | 342 | # 0.3.16 343 | 344 | * Add basic support for text boxes. The contents of the text box are treated as 345 | a separate paragraph that appears after the paragraph containing the text box. 346 | 347 | # 0.3.15 348 | 349 | * Support styles defined without a name. 350 | 351 | # 0.3.14 352 | 353 | * Add ignore_empty_paragraphs option, which defaults to True. 354 | 355 | # 0.3.13 356 | 357 | * Always use forward slashes in ZIP paths. This should fix image handling on 358 | Windows. 359 | 360 | # 0.3.12 361 | 362 | * Make style names case-insensitive in style mappings. This should make style 363 | mappings easier to write, especially since Microsoft Word sometimes represents 364 | style names in the UI differently from in the style definition. For instance, 365 | the style displayed in Word as "Heading 1" has a style name of "heading 1". 366 | This hopefully shouldn't cause an issue for anyone, but if you were relying 367 | on case-sensitivity, please do get in touch. 
368 | 369 | # 0.3.11 370 | 371 | * Add support for hyperlinks to bookmarks in the same document. 372 | 373 | # 0.3.10 374 | 375 | * Add basic support for Markdown. Not all features are currently supported. 376 | 377 | # 0.3.9 378 | 379 | * Add default style mappings for built-in footnote and endnote styles in 380 | Microsoft Word and LibreOffice. 381 | 382 | * Allow style mappings with a zero-element HTML path. 383 | 384 | * Emit warnings when image types are unlikely to be supported by web browsers. 385 | 386 | # 0.3.8 387 | 388 | * Add support for endnotes. 389 | 390 | # 0.3.7 391 | 392 | * Add support for superscript and subscript text. 393 | 394 | # 0.3.6 395 | 396 | * Add support for footnotes. 397 | 398 | # 0.3.5 399 | 400 | * Add support for line breaks. 401 | 402 | # 0.3.4 403 | 404 | * Add optional underline conversion. 405 | 406 | # 0.3.3 407 | 408 | * Add `mammoth.images.inline`, and document custom image conversion. 409 | 410 | # 0.3.2 411 | 412 | * Add the function `mammoth.extract_raw_text`. 413 | 414 | # 0.3.1 415 | 416 | * Add support for tables. 417 | 418 | # 0.3.0 419 | 420 | * Rename --styles CLI argument to --style-map. 421 | 422 | * Rename styles argument in convert_to_html to style_map. 423 | 424 | * Allow paragraphs and runs to be matched by style name. For instance, to match 425 | a paragraph with the style name `Heading 1`: 426 | 427 | p[style-name='Heading 1'] 428 | --------------------------------------------------------------------------------