├── .github └── workflows │ ├── python-app.yml │ └── python-publish.yml ├── .gitignore ├── LICENSE ├── MANIFEST.in ├── README.rst ├── markdownify ├── __init__.py └── main.py ├── pyproject.toml ├── shell.nix ├── tests ├── __init__.py ├── test_advanced.py ├── test_args.py ├── test_basic.py ├── test_conversions.py ├── test_custom_converter.py ├── test_escaping.py ├── test_lists.py ├── test_tables.py └── utils.py └── tox.ini /.github/workflows/python-app.yml: -------------------------------------------------------------------------------- 1 | # This workflow will install Python dependencies, run tests and lint with a single version of Python 2 | # For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions 3 | 4 | name: Python application 5 | 6 | on: 7 | push: 8 | branches: [ develop ] 9 | pull_request: 10 | branches: [ develop ] 11 | 12 | jobs: 13 | build: 14 | 15 | runs-on: ubuntu-latest 16 | 17 | steps: 18 | - uses: actions/checkout@v2 19 | - name: Set up Python 3.8 20 | uses: actions/setup-python@v2 21 | with: 22 | python-version: 3.8 23 | - name: Install dependencies 24 | run: | 25 | python -m pip install --upgrade pip 26 | pip install --upgrade setuptools setuptools_scm wheel build tox 27 | - name: Lint and test 28 | run: | 29 | tox 30 | - name: Build 31 | run: | 32 | python -m build -nwsx . 
33 | -------------------------------------------------------------------------------- /.github/workflows/python-publish.yml: -------------------------------------------------------------------------------- 1 | # This workflow will upload a Python Package using Twine when a release is created 2 | # For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries 3 | 4 | name: Upload Python Package 5 | 6 | on: 7 | release: 8 | types: [created] 9 | 10 | jobs: 11 | deploy: 12 | 13 | runs-on: ubuntu-latest 14 | 15 | steps: 16 | - uses: actions/checkout@v2 17 | - name: Set up Python 18 | uses: actions/setup-python@v2 19 | with: 20 | python-version: '3.8' 21 | - name: Install dependencies 22 | run: | 23 | python -m pip install --upgrade pip 24 | pip install --upgrade setuptools setuptools_scm wheel build twine 25 | - name: Build and publish 26 | env: 27 | TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }} 28 | TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }} 29 | run: | 30 | python -m build -nwsx . 
31 | twine upload dist/* 32 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | *.egg 3 | .eggs/ 4 | *.egg-info/ 5 | .DS_Store 6 | /.env 7 | /dist 8 | /MANIFEST 9 | /venv 10 | build/ 11 | .vscode/settings.json 12 | .tox/ 13 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright 2012-2018 Matthew Tretter 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include README.rst 2 | prune tests 3 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | |build| |version| |license| |downloads| 2 | 3 | .. |build| image:: https://img.shields.io/github/actions/workflow/status/matthewwithanm/python-markdownify/python-app.yml?branch=develop 4 | :alt: GitHub Workflow Status 5 | :target: https://github.com/matthewwithanm/python-markdownify/actions/workflows/python-app.yml?query=workflow%3A%22Python+application%22 6 | 7 | .. |version| image:: https://img.shields.io/pypi/v/markdownify 8 | :alt: Pypi version 9 | :target: https://pypi.org/project/markdownify/ 10 | 11 | .. |license| image:: https://img.shields.io/pypi/l/markdownify 12 | :alt: License 13 | :target: https://github.com/matthewwithanm/python-markdownify/blob/develop/LICENSE 14 | 15 | .. |downloads| image:: https://pepy.tech/badge/markdownify 16 | :alt: Pypi Downloads 17 | :target: https://pepy.tech/project/markdownify 18 | 19 | Installation 20 | ============ 21 | 22 | ``pip install markdownify`` 23 | 24 | 25 | Usage 26 | ===== 27 | 28 | Convert some HTML to Markdown: 29 | 30 | .. code:: python 31 | 32 | from markdownify import markdownify as md 33 | md('<b>Yay</b> <a href="http://github.com">GitHub</a>') # > '**Yay** [GitHub](http://github.com)' 34 | 35 | Specify tags to exclude: 36 | 37 | .. code:: python 38 | 39 | from markdownify import markdownify as md 40 | md('<b>Yay</b> <a href="http://github.com">GitHub</a>', strip=['a']) # > '**Yay** GitHub' 41 | 42 | \...or specify the tags you want to include: 43 | 44 | .. 
code:: python 45 | 46 | from markdownify import markdownify as md 47 | md('<b>Yay</b> <a href="http://github.com">GitHub</a>', convert=['b']) # > '**Yay** GitHub' 48 | 49 | 50 | Options 51 | ======= 52 | 53 | Markdownify supports the following options: 54 | 55 | strip 56 | A list of tags to strip. This option can't be used with the 57 | ``convert`` option. 58 | 59 | convert 60 | A list of tags to convert. This option can't be used with the 61 | ``strip`` option. 62 | 63 | autolinks 64 | A boolean indicating whether the "automatic link" style should be used when 65 | an ``a`` tag's contents match its href. Defaults to ``True``. 66 | 67 | default_title 68 | A boolean to enable setting the title of a link to its href, if no title is 69 | given. Defaults to ``False``. 70 | 71 | heading_style 72 | Defines how headings should be converted. Accepted values are ``ATX``, 73 | ``ATX_CLOSED``, ``SETEXT``, and ``UNDERLINED`` (which is an alias for 74 | ``SETEXT``). Defaults to ``UNDERLINED``. 75 | 76 | bullets 77 | An iterable (string, list, or tuple) of bullet styles to be used. If the 78 | iterable only contains one item, it will be used regardless of how deeply 79 | lists are nested. Otherwise, the bullet will alternate based on nesting 80 | level. Defaults to ``'*+-'``. 81 | 82 | strong_em_symbol 83 | In markdown, both ``*`` and ``_`` are used to encode **strong** or 84 | *emphasized* texts. Either of these symbols can be chosen by the options 85 | ``ASTERISK`` (default) or ``UNDERSCORE`` respectively. 86 | 87 | sub_symbol, sup_symbol 88 | Define the chars that surround ``<sub>`` and ``<sup>`` text. Defaults to an 89 | empty string, because this is non-standard behavior. Could be something like 90 | ``~`` and ``^`` to result in ``~sub~`` and ``^sup^``. If the value starts 91 | with ``<`` and ends with ``>``, it is treated as an HTML tag and a ``/`` is 92 | inserted after the ``<`` in the string used after the text; this allows 93 | specifying ``<sub>`` to use raw HTML in the output for subscripts, for 94 | example. 
95 | 96 | newline_style 97 | Defines the style of marking linebreaks (``<br>``) in markdown. The default 98 | value ``SPACES`` of this option will adopt the usual two spaces and a newline, 99 | while ``BACKSLASH`` will convert a linebreak to ``\\n`` (a backslash and a 100 | newline). While the latter convention is non-standard, it is commonly 101 | preferred and supported by a lot of interpreters. 102 | 103 | code_language 104 | Defines the language that should be assumed for all ``<pre>`` sections.
105 |   Useful, if all code on a page is in the same programming language and
106 |   should be annotated with `````python`` or similar.
107 |   Defaults to ``''`` (empty string) and can be any string.
108 | 
109 | code_language_callback
110 |   When the HTML code contains ``pre`` tags that in some way provide the code
111 |   language, for example as class, this callback can be used to extract the
112 |   language from the tag and prefix it to the converted ``pre`` tag.
113 |   The callback gets one single argument, a BeautifulSoup_ object, and returns
114 |   a string containing the code language, or ``None``.
115 |   An example to use the class name as code language could be::
116 | 
117 |     def callback(el):
118 |         return el['class'][0] if el.has_attr('class') else None
119 | 
120 |   Defaults to ``None``.
121 | 
122 | escape_asterisks
123 |   If set to ``False``, do not escape ``*`` to ``\*`` in text.
124 |   Defaults to ``True``.
125 | 
126 | escape_underscores
127 |   If set to ``False``, do not escape ``_`` to ``\_`` in text.
128 |   Defaults to ``True``.
129 | 
130 | escape_misc
131 |   If set to ``True``, escape miscellaneous punctuation characters
132 |   that sometimes have Markdown significance in text.
133 |   Defaults to ``False``.
134 | 
135 | keep_inline_images_in
136 |   Images are converted to their alt-text when the images are located inside
137 |   headlines or table cells. If some inline images should be converted to
138 |   markdown images instead, this option can be set to a list of parent tags
139 |   that should be allowed to contain inline images, for example ``['td']``.
140 |   Defaults to an empty list.
141 | 
142 | table_infer_header
143 |   Controls handling of tables with no header row (as indicated by ``<thead>``
144 |   or ``<th>``). When set to ``True``, the first body row is used as the header row.
145 |   Defaults to ``False``, which leaves the header row empty.
146 | 
147 | wrap, wrap_width
148 |   If ``wrap`` is set to ``True``, all text paragraphs are wrapped at
149 |   ``wrap_width`` characters. Defaults to ``False`` and ``80``.
150 |   Use with ``newline_style=BACKSLASH`` to keep line breaks in paragraphs.
151 |   A ``wrap_width`` value of ``None`` reflows lines to unlimited line length.
152 | 
153 | strip_document
154 |   Controls whether leading and/or trailing separation newlines are removed from
155 |   the final converted document. Supported values are ``LSTRIP`` (leading),
156 |   ``RSTRIP`` (trailing), ``STRIP`` (both), and ``None`` (neither). Newlines
157 |   within the document are unaffected.
158 |   Defaults to ``STRIP``.
159 | 
160 | beautiful_soup_parser
161 |   Specify the Beautiful Soup parser to be used for interpreting HTML markup. Parsers such
162 |   as ``html5lib`` or ``lxml``, or even a custom parser, can be used as long as they are
163 |   installed in the execution environment. Defaults to ``html.parser``.
164 | 
165 | .. _BeautifulSoup: https://www.crummy.com/software/BeautifulSoup/
166 | 
167 | Options may be specified as kwargs to the ``markdownify`` function, or as a
168 | nested ``Options`` class in ``MarkdownConverter`` subclasses.
169 | 
170 | 
171 | Converting BeautifulSoup objects
172 | ================================
173 | 
174 | .. code:: python
175 | 
176 |     from markdownify import MarkdownConverter
177 | 
178 |     # Create shorthand method for conversion
179 |     def md(soup, **options):
180 |         return MarkdownConverter(**options).convert_soup(soup)
181 | 
182 | 
183 | Creating Custom Converters
184 | ==========================
185 | 
186 | If you have a special usecase that calls for a special conversion, you can
187 | always inherit from ``MarkdownConverter`` and override the method you want to
188 | change.
189 | The function that handles an HTML tag named ``abc`` is called
190 | ``convert_abc(self, el, text, parent_tags)`` and returns a string
191 | containing the converted HTML tag.
192 | The ``MarkdownConverter`` object will handle the conversion based on the
193 | function names:
194 | 
195 | .. code:: python
196 | 
197 |     from markdownify import MarkdownConverter
198 | 
199 |     class ImageBlockConverter(MarkdownConverter):
200 |         """
201 |         Create a custom MarkdownConverter that adds two newlines after an image
202 |         """
203 |         def convert_img(self, el, text, parent_tags):
204 |             return super().convert_img(el, text, parent_tags) + '\n\n'
205 | 
206 |     # Create shorthand method for conversion
207 |     def md(html, **options):
208 |         return ImageBlockConverter(**options).convert(html)
209 | 
210 | .. code:: python
211 | 
212 |     from markdownify import MarkdownConverter
213 | 
214 |     class IgnoreParagraphsConverter(MarkdownConverter):
215 |         """
216 |         Create a custom MarkdownConverter that ignores paragraphs
217 |         """
218 |         def convert_p(self, el, text, parent_tags):
219 |             return ''
220 | 
221 |     # Create shorthand method for conversion
222 |     def md(html, **options):
223 |         return IgnoreParagraphsConverter(**options).convert(html)
224 | 
225 | 
226 | Command Line Interface
227 | ======================
228 | 
229 | Use ``markdownify example.html > example.md`` or pipe input from stdin
230 | (``cat example.html | markdownify > example.md``).
231 | Call ``markdownify -h`` to see all available options.
232 | They are the same as listed above and take the same arguments.
233 | 
234 | 
235 | Development
236 | ===========
237 | 
238 | To run tests and the linter run ``pip install tox`` once, then ``tox``.
239 | 


--------------------------------------------------------------------------------
/markdownify/__init__.py:
--------------------------------------------------------------------------------
  1 | from bs4 import BeautifulSoup, Comment, Doctype, NavigableString, Tag
  2 | from textwrap import fill
  3 | import re
  4 | import six
  5 | 
  6 | 
  7 | # General-purpose regex patterns
  8 | re_convert_heading = re.compile(r'convert_h(\d+)')
  9 | re_line_with_content = re.compile(r'^(.*)', flags=re.MULTILINE)
 10 | re_whitespace = re.compile(r'[\t ]+')
 11 | re_all_whitespace = re.compile(r'[\t \r\n]+')
 12 | re_newline_whitespace = re.compile(r'[\t \r\n]*[\r\n][\t \r\n]*')
 13 | re_html_heading = re.compile(r'h(\d+)')
 14 | 
 15 | # Pattern for creating convert_ function names from tag names
 16 | re_make_convert_fn_name = re.compile(r'[\[\]:-]')
 17 | 
 18 | # Extract (leading_nl, content, trailing_nl) from a string
 19 | # (functionally equivalent to r'^(\n*)(.*?)(\n*)$', but greedy is faster than reluctant here)
 20 | re_extract_newlines = re.compile(r'^(\n*)((?:.*[^\n])?)(\n*)$', flags=re.DOTALL)
 21 | 
 22 | # Escape miscellaneous special Markdown characters
 23 | re_escape_misc_chars = re.compile(r'([]\\&<`[>~=+|])')
 24 | 
 25 | # Escape sequence of one or more consecutive '-', preceded
 26 | # and followed by whitespace or start/end of fragment, as it
 27 | # might be confused with an underline of a header, or with a
 28 | # list marker
 29 | re_escape_misc_dash_sequences = re.compile(r'(\s|^)(-+(?:\s|$))')
 30 | 
 31 | # Escape sequence of up to six consecutive '#', preceded
 32 | # and followed by whitespace or start/end of fragment, as
 33 | # it might be confused with an ATX heading
 34 | re_escape_misc_hashes = re.compile(r'(\s|^)(#{1,6}(?:\s|$))')
 35 | 
 36 | # Escape '.' or ')' preceded by up to nine digits, as it might be
 37 | # confused with a list item
 38 | re_escape_misc_list_items = re.compile(r'((?:\s|^)[0-9]{1,9})([.)](?:\s|$))')
 39 | 
 40 | # Heading styles
 41 | ATX = 'atx'
 42 | ATX_CLOSED = 'atx_closed'
 43 | UNDERLINED = 'underlined'
 44 | SETEXT = UNDERLINED
 45 | 
 46 | # Newline style
 47 | SPACES = 'spaces'
 48 | BACKSLASH = 'backslash'
 49 | 
 50 | # Strong and emphasis style
 51 | ASTERISK = '*'
 52 | UNDERSCORE = '_'
 53 | 
 54 | # Document strip styles
 55 | LSTRIP = 'lstrip'
 56 | RSTRIP = 'rstrip'
 57 | STRIP = 'strip'
 58 | 
 59 | 
 60 | def chomp(text):
 61 |     """
 62 |     If the text in an inline tag like b, a, or em contains a leading or trailing
 63 |     space, strip the string and return a space as suffix of prefix, if needed.
 64 |     This function is used to prevent conversions like
 65 |          foo => ** foo**
 66 |     """
 67 |     prefix = ' ' if text and text[0] == ' ' else ''
 68 |     suffix = ' ' if text and text[-1] == ' ' else ''
 69 |     text = text.strip()
 70 |     return (prefix, suffix, text)
 71 | 
 72 | 
 73 | def abstract_inline_conversion(markup_fn):
 74 |     """
 75 |     This abstracts all simple inline tags like b, em, del, ...
 76 |     Returns a function that wraps the chomped text in a pair of the string
 77 |     that is returned by markup_fn, with '/' inserted in the string used after
 78 |     the text if it looks like an HTML tag. markup_fn is necessary to allow for
 79 |     references to self.strong_em_symbol etc.
 80 |     """
 81 |     def implementation(self, el, text, parent_tags):
 82 |         markup_prefix = markup_fn(self)
 83 |         if markup_prefix.startswith('<') and markup_prefix.endswith('>'):
 84 |             markup_suffix = '), ignore adjacent whitespace elements.
230 |                     return True
231 |                 elif should_remove_whitespace_outside(el.previous_sibling) or should_remove_whitespace_outside(el.next_sibling):
232 |                     # Outside block elements (including 
), ignore adjacent whitespace elements.
233 |                     return True
234 |                 else:
235 |                     return False
236 |             elif el is None:
237 |                 return True
238 |             else:
239 |                 raise ValueError('Unexpected element type: %s' % type(el))
240 | 
241 |         children_to_convert = [el for el in node.children if not _can_ignore(el)]
242 | 
243 |         # Create a copy of this tag's parent context, then update it to include this tag
244 |         # to propagate down into the children.
245 |         parent_tags_for_children = set(parent_tags)
246 |         parent_tags_for_children.add(node.name)
247 | 
248 |         # if this tag is a heading or table cell, add an '_inline' parent pseudo-tag
249 |         if (
250 |             re_html_heading.match(node.name) is not None  # headings
251 |             or node.name in {'td', 'th'}  # table cells
252 |         ):
253 |             parent_tags_for_children.add('_inline')
254 | 
255 |         # if this tag is a preformatted element, add a '_noformat' parent pseudo-tag
256 |         if node.name in {'pre', 'code', 'kbd', 'samp'}:
257 |             parent_tags_for_children.add('_noformat')
258 | 
259 |         # Convert the children elements into a list of result strings.
260 |         child_strings = [
261 |             self.process_element(el, parent_tags=parent_tags_for_children)
262 |             for el in children_to_convert
263 |         ]
264 | 
265 |         # Remove empty string values.
266 |         child_strings = [s for s in child_strings if s]
267 | 
268 |         # Collapse newlines at child element boundaries, if needed.
269 |         if node.name == 'pre' or node.find_parent('pre'):
270 |             # Inside 
 blocks, do not collapse newlines.
271 |             pass
272 |         else:
273 |             # Collapse newlines at child element boundaries.
274 |             updated_child_strings = ['']  # so the first lookback works
275 |             for child_string in child_strings:
276 |                 # Separate the leading/trailing newlines from the content.
277 |                 leading_nl, content, trailing_nl = re_extract_newlines.match(child_string).groups()
278 | 
279 |                 # If the last child had trailing newlines and this child has leading newlines,
280 |                 # use the larger newline count, limited to 2.
281 |                 if updated_child_strings[-1] and leading_nl:
282 |                     prev_trailing_nl = updated_child_strings.pop()  # will be replaced by the collapsed value
283 |                     num_newlines = min(2, max(len(prev_trailing_nl), len(leading_nl)))
284 |                     leading_nl = '\n' * num_newlines
285 | 
286 |                 # Add the results to the updated child string list.
287 |                 updated_child_strings.extend([leading_nl, content, trailing_nl])
288 | 
289 |             child_strings = updated_child_strings
290 | 
291 |         # Join all child text strings into a single string.
292 |         text = ''.join(child_strings)
293 | 
294 |         # apply this tag's final conversion function
295 |         convert_fn = self.get_conv_fn_cached(node.name)
296 |         if convert_fn is not None:
297 |             text = convert_fn(node, text, parent_tags=parent_tags)
298 | 
299 |         return text
300 | 
301 |     def convert__document_(self, el, text, parent_tags):
302 |         """Final document-level formatting for BeautifulSoup object (node.name == "[document]")"""
303 |         if self.options['strip_document'] == LSTRIP:
304 |             text = text.lstrip('\n')  # remove leading separation newlines
305 |         elif self.options['strip_document'] == RSTRIP:
306 |             text = text.rstrip('\n')  # remove trailing separation newlines
307 |         elif self.options['strip_document'] == STRIP:
308 |             text = text.strip('\n')  # remove leading and trailing separation newlines
309 |         elif self.options['strip_document'] is None:
310 |             pass  # leave leading and trailing separation newlines as-is
311 |         else:
312 |             raise ValueError('Invalid value for strip_document: %s' % self.options['strip_document'])
313 | 
314 |         return text
315 | 
    def process_text(self, el, parent_tags=None):
        """
        Turn a text element into its output string.

        The element is stringified, its whitespace is normalized (unless a
        'pre' parent is present), special characters are escaped (unless the
        '_noformat' pseudo-tag is present), and finally leading/trailing
        whitespace is trimmed where the should_remove_whitespace_* helpers
        (defined elsewhere in this module) say it is insignificant.

        ``parent_tags`` is the set of enclosing tag names / pseudo-tags; when
        omitted, an empty set is used.
        """
        # For the top-level element, initialize the parent context with an empty set.
        if parent_tags is None:
            parent_tags = set()

        text = six.text_type(el) or ''

        # normalize whitespace if we're not inside a preformatted element
        if 'pre' not in parent_tags:
            if self.options['wrap']:
                # when wrapping, every whitespace run (newlines included) becomes one space
                text = re_all_whitespace.sub(' ', text)
            else:
                # otherwise keep line breaks: whitespace runs containing a newline
                # collapse to a single newline, remaining space/tab runs to one space
                text = re_newline_whitespace.sub('\n', text)
                text = re_whitespace.sub(' ', text)

        # escape special characters if we're not inside a preformatted or code element
        if '_noformat' not in parent_tags:
            text = self.escape(text, parent_tags)

        # remove leading whitespace at the start or just after a
        # block-level element; remove trailing whitespace at the end
        # or just before a block-level element.
        if (should_remove_whitespace_outside(el.previous_sibling)
                or (should_remove_whitespace_inside(el.parent)
                    and not el.previous_sibling)):
            text = text.lstrip(' \t\r\n')
        # NOTE(review): the leading side strips only ' \t\r\n', while the bare
        # rstrip() below strips every whitespace character str.rstrip() knows
        # (e.g. a trailing non-breaking space) -- confirm the asymmetry is intended.
        if (should_remove_whitespace_outside(el.next_sibling)
                or (should_remove_whitespace_inside(el.parent)
                    and not el.next_sibling)):
            text = text.rstrip()

        return text
348 | 
349 |     def get_conv_fn_cached(self, tag_name):
350 |         """Given a tag name, return the conversion function using the cache."""
351 |         # If conversion function is not in cache, add it
352 |         if tag_name not in self.convert_fn_cache:
353 |             self.convert_fn_cache[tag_name] = self.get_conv_fn(tag_name)
354 | 
355 |         # Return the cached entry
356 |         return self.convert_fn_cache[tag_name]
357 | 
358 |     def get_conv_fn(self, tag_name):
359 |         """Given a tag name, find and return the conversion function."""
360 |         tag_name = tag_name.lower()
361 | 
362 |         # Handle strip/convert exclusion options
363 |         if not self.should_convert_tag(tag_name):
364 |             return None
365 | 
366 |         # Look for an explicitly defined conversion function by tag name first
367 |         convert_fn_name = "convert_%s" % re_make_convert_fn_name.sub("_", tag_name)
368 |         convert_fn = getattr(self, convert_fn_name, None)
369 |         if convert_fn:
370 |             return convert_fn
371 | 
372 |         # If tag is any heading, handle with convert_hN() function
373 |         match = re_html_heading.match(tag_name)
374 |         if match:
375 |             n = int(match.group(1))  # get value of N from 
376 |             return lambda el, text, parent_tags: self.convert_hN(n, el, text, parent_tags)
377 | 
378 |         # No conversion function was found
379 |         return None
380 | 
381 |     def should_convert_tag(self, tag):
382 |         """Given a tag name, return whether to convert based on strip/convert options."""
383 |         strip = self.options['strip']
384 |         convert = self.options['convert']
385 |         if strip is not None:
386 |             return tag not in strip
387 |         elif convert is not None:
388 |             return tag in convert
389 |         else:
390 |             return True
391 | 
392 |     def escape(self, text, parent_tags):
393 |         if not text:
394 |             return ''
395 |         if self.options['escape_misc']:
396 |             text = re_escape_misc_chars.sub(r'\\\1', text)
397 |             text = re_escape_misc_dash_sequences.sub(r'\1\\\2', text)
398 |             text = re_escape_misc_hashes.sub(r'\1\\\2', text)
399 |             text = re_escape_misc_list_items.sub(r'\1\\\2', text)
400 | 
401 |         if self.options['escape_asterisks']:
402 |             text = text.replace('*', r'\*')
403 |         if self.options['escape_underscores']:
404 |             text = text.replace('_', r'\_')
405 |         return text
406 | 
407 |     def underline(self, text, pad_char):
408 |         text = (text or '').rstrip()
409 |         return '\n\n%s\n%s\n\n' % (text, pad_char * len(text)) if text else ''
410 | 
411 |     def convert_a(self, el, text, parent_tags):
412 |         if '_noformat' in parent_tags:
413 |             return text
414 |         prefix, suffix, text = chomp(text)
415 |         if not text:
416 |             return ''
417 |         href = el.get('href')
418 |         title = el.get('title')
419 |         # For the replacement see #29: text nodes underscores are escaped
420 |         if (self.options['autolinks']
421 |                 and text.replace(r'\_', '_') == href
422 |                 and not title
423 |                 and not self.options['default_title']):
424 |             # Shortcut syntax
425 |             return '<%s>' % href
426 |         if self.options['default_title'] and not title:
427 |             title = href
428 |         title_part = ' "%s"' % title.replace('"', r'\"') if title else ''
429 |         return '%s[%s](%s%s)%s' % (prefix, text, href, title_part, suffix) if href else text
430 | 
431 |     convert_b = abstract_inline_conversion(lambda self: 2 * self.options['strong_em_symbol'])
432 | 
433 |     def convert_blockquote(self, el, text, parent_tags):
434 |         # handle some early-exit scenarios
435 |         text = (text or '').strip(' \t\r\n')
436 |         if '_inline' in parent_tags:
437 |             return ' ' + text + ' '
438 |         if not text:
439 |             return "\n"
440 | 
441 |         # indent lines with blockquote marker
442 |         def _indent_for_blockquote(match):
443 |             line_content = match.group(1)
444 |             return '> ' + line_content if line_content else '>'
445 |         text = re_line_with_content.sub(_indent_for_blockquote, text)
446 | 
447 |         return '\n' + text + '\n\n'
448 | 
449 |     def convert_br(self, el, text, parent_tags):
450 |         if '_inline' in parent_tags:
451 |             return ' '
452 | 
453 |         if self.options['newline_style'].lower() == BACKSLASH:
454 |             return '\\\n'
455 |         else:
456 |             return '  \n'
457 | 
458 |     def convert_code(self, el, text, parent_tags):
459 |         if 'pre' in parent_tags:
460 |             return text
461 |         converter = abstract_inline_conversion(lambda self: '`')
462 |         return converter(self, el, text, parent_tags)
463 | 
464 |     convert_del = abstract_inline_conversion(lambda self: '~~')
465 | 
466 |     def convert_div(self, el, text, parent_tags):
467 |         if '_inline' in parent_tags:
468 |             return ' ' + text.strip() + ' '
469 |         text = text.strip()
470 |         return '\n\n%s\n\n' % text if text else ''
471 | 
472 |     convert_article = convert_div
473 | 
474 |     convert_section = convert_div
475 | 
476 |     convert_em = abstract_inline_conversion(lambda self: self.options['strong_em_symbol'])
477 | 
478 |     convert_kbd = convert_code
479 | 
480 |     def convert_dd(self, el, text, parent_tags):
481 |         text = (text or '').strip()
482 |         if '_inline' in parent_tags:
483 |             return ' ' + text + ' '
484 |         if not text:
485 |             return '\n'
486 | 
487 |         # indent definition content lines by four spaces
488 |         def _indent_for_dd(match):
489 |             line_content = match.group(1)
490 |             return '    ' + line_content if line_content else ''
491 |         text = re_line_with_content.sub(_indent_for_dd, text)
492 | 
493 |         # insert definition marker into first-line indent whitespace
494 |         text = ':' + text[1:]
495 | 
496 |         return '%s\n' % text
497 | 
498 |     # definition lists are formatted as follows:
499 |     #   https://pandoc.org/MANUAL.html#definition-lists
500 |     #   https://michelf.ca/projects/php-markdown/extra/#def-list
501 |     convert_dl = convert_div
502 | 
503 |     def convert_dt(self, el, text, parent_tags):
504 |         # remove newlines from term text
505 |         text = (text or '').strip()
506 |         text = re_all_whitespace.sub(' ', text)
507 |         if '_inline' in parent_tags:
508 |             return ' ' + text + ' '
509 |         if not text:
510 |             return '\n'
511 | 
512 |         # TODO - format consecutive 
elements as directly adjacent lines): 513 | # https://michelf.ca/projects/php-markdown/extra/#def-list 514 | 515 | return '\n\n%s\n' % text 516 | 517 | def convert_hN(self, n, el, text, parent_tags): 518 | # convert_hN() converts tags, where N is any integer 519 | if '_inline' in parent_tags: 520 | return text 521 | 522 | # Markdown does not support heading depths of n > 6 523 | n = max(1, min(6, n)) 524 | 525 | style = self.options['heading_style'].lower() 526 | text = text.strip() 527 | if style == UNDERLINED and n <= 2: 528 | line = '=' if n == 1 else '-' 529 | return self.underline(text, line) 530 | text = re_all_whitespace.sub(' ', text) 531 | hashes = '#' * n 532 | if style == ATX_CLOSED: 533 | return '\n\n%s %s %s\n\n' % (hashes, text, hashes) 534 | return '\n\n%s %s\n\n' % (hashes, text) 535 | 536 | def convert_hr(self, el, text, parent_tags): 537 | return '\n\n---\n\n' 538 | 539 | convert_i = convert_em 540 | 541 | def convert_img(self, el, text, parent_tags): 542 | alt = el.attrs.get('alt', None) or '' 543 | src = el.attrs.get('src', None) or '' 544 | title = el.attrs.get('title', None) or '' 545 | title_part = ' "%s"' % title.replace('"', r'\"') if title else '' 546 | if ('_inline' in parent_tags 547 | and el.parent.name not in self.options['keep_inline_images_in']): 548 | return alt 549 | 550 | return '![%s](%s%s)' % (alt, src, title_part) 551 | 552 | def convert_video(self, el, text, parent_tags): 553 | if ('_inline' in parent_tags 554 | and el.parent.name not in self.options['keep_inline_images_in']): 555 | return text 556 | src = el.attrs.get('src', None) or '' 557 | if not src: 558 | sources = el.find_all('source', attrs={'src': True}) 559 | if sources: 560 | src = sources[0].attrs.get('src', None) or '' 561 | poster = el.attrs.get('poster', None) or '' 562 | if src and poster: 563 | return '[![%s](%s)](%s)' % (text, poster, src) 564 | if src: 565 | return '[%s](%s)' % (text, src) 566 | if poster: 567 | return '![%s](%s)' % (text, poster) 568 | 
return text 569 | 570 | def convert_list(self, el, text, parent_tags): 571 | 572 | # Converting a list to inline is undefined. 573 | # Ignoring inline conversion parents for list. 574 | 575 | before_paragraph = False 576 | next_sibling = _next_block_content_sibling(el) 577 | if next_sibling and next_sibling.name not in ['ul', 'ol']: 578 | before_paragraph = True 579 | if 'li' in parent_tags: 580 | # remove trailing newline if we're in a nested list 581 | return '\n' + text.rstrip() 582 | return '\n\n' + text + ('\n' if before_paragraph else '') 583 | 584 | convert_ul = convert_list 585 | convert_ol = convert_list 586 | 587 | def convert_li(self, el, text, parent_tags): 588 | # handle some early-exit scenarios 589 | text = (text or '').strip() 590 | if not text: 591 | return "\n" 592 | 593 | # determine list item bullet character to use 594 | parent = el.parent 595 | if parent is not None and parent.name == 'ol': 596 | if parent.get("start") and str(parent.get("start")).isnumeric(): 597 | start = int(parent.get("start")) 598 | else: 599 | start = 1 600 | bullet = '%s.' 
% (start + len(el.find_previous_siblings('li'))) 601 | else: 602 | depth = -1 603 | while el: 604 | if el.name == 'ul': 605 | depth += 1 606 | el = el.parent 607 | bullets = self.options['bullets'] 608 | bullet = bullets[depth % len(bullets)] 609 | bullet = bullet + ' ' 610 | bullet_width = len(bullet) 611 | bullet_indent = ' ' * bullet_width 612 | 613 | # indent content lines by bullet width 614 | def _indent_for_li(match): 615 | line_content = match.group(1) 616 | return bullet_indent + line_content if line_content else '' 617 | text = re_line_with_content.sub(_indent_for_li, text) 618 | 619 | # insert bullet into first-line indent whitespace 620 | text = bullet + text[bullet_width:] 621 | 622 | return '%s\n' % text 623 | 624 | def convert_p(self, el, text, parent_tags): 625 | if '_inline' in parent_tags: 626 | return ' ' + text.strip(' \t\r\n') + ' ' 627 | text = text.strip(' \t\r\n') 628 | if self.options['wrap']: 629 | # Preserve newlines (and preceding whitespace) resulting 630 | # from
tags. Newlines in the input have already been 631 | # replaced by spaces. 632 | if self.options['wrap_width'] is not None: 633 | lines = text.split('\n') 634 | new_lines = [] 635 | for line in lines: 636 | line = line.lstrip(' \t\r\n') 637 | line_no_trailing = line.rstrip() 638 | trailing = line[len(line_no_trailing):] 639 | line = fill(line, 640 | width=self.options['wrap_width'], 641 | break_long_words=False, 642 | break_on_hyphens=False) 643 | new_lines.append(line + trailing) 644 | text = '\n'.join(new_lines) 645 | return '\n\n%s\n\n' % text if text else '' 646 | 647 | def convert_pre(self, el, text, parent_tags): 648 | if not text: 649 | return '' 650 | code_language = self.options['code_language'] 651 | 652 | if self.options['code_language_callback']: 653 | code_language = self.options['code_language_callback'](el) or code_language 654 | 655 | return '\n\n```%s\n%s\n```\n\n' % (code_language, text) 656 | 657 | def convert_q(self, el, text, parent_tags): 658 | return '"' + text + '"' 659 | 660 | def convert_script(self, el, text, parent_tags): 661 | return '' 662 | 663 | def convert_style(self, el, text, parent_tags): 664 | return '' 665 | 666 | convert_s = convert_del 667 | 668 | convert_strong = convert_b 669 | 670 | convert_samp = convert_code 671 | 672 | convert_sub = abstract_inline_conversion(lambda self: self.options['sub_symbol']) 673 | 674 | convert_sup = abstract_inline_conversion(lambda self: self.options['sup_symbol']) 675 | 676 | def convert_table(self, el, text, parent_tags): 677 | return '\n\n' + text.strip() + '\n\n' 678 | 679 | def convert_caption(self, el, text, parent_tags): 680 | return text.strip() + '\n\n' 681 | 682 | def convert_figcaption(self, el, text, parent_tags): 683 | return '\n\n' + text.strip() + '\n\n' 684 | 685 | def convert_td(self, el, text, parent_tags): 686 | colspan = 1 687 | if 'colspan' in el.attrs and el['colspan'].isdigit(): 688 | colspan = int(el['colspan']) 689 | return ' ' + text.strip().replace("\n", " ") + ' |' * 
colspan 690 | 691 | def convert_th(self, el, text, parent_tags): 692 | colspan = 1 693 | if 'colspan' in el.attrs and el['colspan'].isdigit(): 694 | colspan = int(el['colspan']) 695 | return ' ' + text.strip().replace("\n", " ") + ' |' * colspan 696 | 697 | def convert_tr(self, el, text, parent_tags): 698 | cells = el.find_all(['td', 'th']) 699 | is_first_row = el.find_previous_sibling() is None 700 | is_headrow = ( 701 | all([cell.name == 'th' for cell in cells]) 702 | or (el.parent.name == 'thead' 703 | # avoid multiple tr in thead 704 | and len(el.parent.find_all('tr')) == 1) 705 | ) 706 | is_head_row_missing = ( 707 | (is_first_row and not el.parent.name == 'tbody') 708 | or (is_first_row and el.parent.name == 'tbody' and len(el.parent.parent.find_all(['thead'])) < 1) 709 | ) 710 | overline = '' 711 | underline = '' 712 | full_colspan = 0 713 | for cell in cells: 714 | if 'colspan' in cell.attrs and cell['colspan'].isdigit(): 715 | full_colspan += int(cell["colspan"]) 716 | else: 717 | full_colspan += 1 718 | if ((is_headrow 719 | or (is_head_row_missing 720 | and self.options['table_infer_header'])) 721 | and is_first_row): 722 | # first row and: 723 | # - is headline or 724 | # - headline is missing and header inference is enabled 725 | # print headline underline 726 | underline += '| ' + ' | '.join(['---'] * full_colspan) + ' |' + '\n' 727 | elif ((is_head_row_missing 728 | and not self.options['table_infer_header']) 729 | or (is_first_row 730 | and (el.parent.name == 'table' 731 | or (el.parent.name == 'tbody' 732 | and not el.parent.find_previous_sibling())))): 733 | # headline is missing and header inference is disabled or: 734 | # first row, not headline, and: 735 | # - the parent is table or 736 | # - the parent is tbody at the beginning of a table. 
737 | # print empty headline above this row 738 | overline += '| ' + ' | '.join([''] * full_colspan) + ' |' + '\n' 739 | overline += '| ' + ' | '.join(['---'] * full_colspan) + ' |' + '\n' 740 | return overline + '|' + text + '\n' + underline 741 | 742 | 743 | def markdownify(html, **options): 744 | return MarkdownConverter(**options).convert(html) 745 | -------------------------------------------------------------------------------- /markdownify/main.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import argparse 4 | import sys 5 | 6 | from markdownify import markdownify, ATX, ATX_CLOSED, UNDERLINED, \ 7 | SPACES, BACKSLASH, ASTERISK, UNDERSCORE 8 | 9 | 10 | def main(argv=sys.argv[1:]): 11 | parser = argparse.ArgumentParser( 12 | prog='markdownify', 13 | description='Converts html to markdown.', 14 | ) 15 | 16 | parser.add_argument('html', nargs='?', type=argparse.FileType('r'), 17 | default=sys.stdin, 18 | help="The html file to convert. Defaults to STDIN if not " 19 | "provided.") 20 | parser.add_argument('-s', '--strip', nargs='*', 21 | help="A list of tags to strip. This option can't be used with " 22 | "the --convert option.") 23 | parser.add_argument('-c', '--convert', nargs='*', 24 | help="A list of tags to convert. 
This option can't be used with " 25 | "the --strip option.") 26 | parser.add_argument('-a', '--autolinks', action='store_true', 27 | help="A boolean indicating whether the 'automatic link' style " 28 | "should be used when a 'a' tag's contents match its href.") 29 | parser.add_argument('--default-title', action='store_false', 30 | help="A boolean to enable setting the title of a link to its " 31 | "href, if no title is given.") 32 | parser.add_argument('--heading-style', default=UNDERLINED, 33 | choices=(ATX, ATX_CLOSED, UNDERLINED), 34 | help="Defines how headings should be converted.") 35 | parser.add_argument('-b', '--bullets', default='*+-', 36 | help="A string of bullet styles to use; the bullet will " 37 | "alternate based on nesting level.") 38 | parser.add_argument('--strong-em-symbol', default=ASTERISK, 39 | choices=(ASTERISK, UNDERSCORE), 40 | help="Use * or _ to convert strong and italics text"), 41 | parser.add_argument('--sub-symbol', default='', 42 | help="Define the chars that surround '<sub>'.") 43 | parser.add_argument('--sup-symbol', default='', 44 | help="Define the chars that surround '<sup>'.") 45 | parser.add_argument('--newline-style', default=SPACES, 46 | choices=(SPACES, BACKSLASH), 47 | help="Defines the style of <br>
conversions: two spaces " 48 | "or backslash at the end of the line that should break.") 49 | parser.add_argument('--code-language', default='', 50 | help="Defines the language that should be assumed for all " 51 | "'<pre>' sections.")
52 |     parser.add_argument('--no-escape-asterisks', dest='escape_asterisks',
53 |                         action='store_false',
54 |                         help="Do not escape '*' to '\\*' in text.")
55 |     parser.add_argument('--no-escape-underscores', dest='escape_underscores',
56 |                         action='store_false',
57 |                         help="Do not escape '_' to '\\_' in text.")
58 |     parser.add_argument('-i', '--keep-inline-images-in',
59 |                         default=[],
60 |                         nargs='*',
61 |                         help="Images are converted to their alt-text when the images are "
62 |                         "located inside headlines or table cells. If some inline images "
63 |                         "should be converted to markdown images instead, this option can "
64 |                         "be set to a list of parent tags that should be allowed to "
65 |                         "contain inline images.")
66 |     parser.add_argument('--table-infer-header', dest='table_infer_header',
67 |                         action='store_true',
68 |                         help="When a table has no header row (as indicated by '<thead>' "
69 |                         "or '<th>'), use the first body row as the header row.")
70 |     parser.add_argument('-w', '--wrap', action='store_true',
71 |                         help="Wrap all text paragraphs at --wrap-width characters.")
72 |     parser.add_argument('--wrap-width', type=int, default=80)
73 |     parser.add_argument('-p', '--beautiful-soup-parser',
74 |                         dest='beautiful_soup_parser',
75 |                         default='html.parser',
76 |                         help="Specify the Beautiful Soup parser to be used for interpreting HTML markup. Parsers such "
77 |                              "as html5lib, lxml or even a custom parser as long as it is installed on the execution "
78 |                              "environment.")
79 | 
80 |     args = parser.parse_args(argv)
81 |     print(markdownify(**vars(args)))
82 | 
83 | 
84 | if __name__ == '__main__':
85 |     main()
86 | 


--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
 1 | [build-system]
 2 | requires = ["setuptools>=61.2", "setuptools_scm[toml]>=3.4.3"]
 3 | build-backend = "setuptools.build_meta"
 4 | 
 5 | [project]
 6 | name = "markdownify"
 7 | version = "1.1.0"
 8 | authors = [{name = "Matthew Tretter", email = "m@tthewwithanm.com"}]
 9 | description = "Convert HTML to markdown."
10 | readme = "README.rst"
11 | classifiers = [
12 |     "Environment :: Web Environment",
13 |     "Framework :: Django",
14 |     "Intended Audience :: Developers",
15 |     "License :: OSI Approved :: MIT License",
16 |     "Operating System :: OS Independent",
17 |     "Programming Language :: Python :: 2.5",
18 |     "Programming Language :: Python :: 2.6",
19 |     "Programming Language :: Python :: 2.7",
20 |     "Programming Language :: Python :: 3.6",
21 |     "Programming Language :: Python :: 3.7",
22 |     "Programming Language :: Python :: 3.8",
23 |     "Topic :: Utilities",
24 | ]
25 | dependencies = [
26 |     "beautifulsoup4>=4.9,<5",
27 |     "six>=1.15,<2"
28 | ]
29 | 
30 | [project.urls]
31 | Homepage = "http://github.com/matthewwithanm/python-markdownify"
32 | Download = "http://github.com/matthewwithanm/python-markdownify/tarball/master"
33 | 
34 | [project.scripts]
35 | markdownify = "markdownify.main:main"
36 | 
37 | [tool.setuptools]
38 | zip-safe = false
39 | include-package-data = true
40 | 
41 | [tool.setuptools.packages.find]
42 | include = ["markdownify", "markdownify.*"]
43 | namespaces = false
44 | 
45 | [tool.setuptools_scm]
46 | 


--------------------------------------------------------------------------------
/shell.nix:
--------------------------------------------------------------------------------
 1 | { pkgs ? import <nixpkgs> {} }:
 2 | pkgs.mkShell {
 3 |   name = "python-shell";
 4 |   buildInputs = with pkgs; [
 5 |     python38
 6 |     python38Packages.tox
 7 |     python38Packages.setuptools
 8 |     python38Packages.virtualenv
 9 |   ];
10 | }
11 | 


--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/matthewwithanm/python-markdownify/016251e915a4cb44b2f21a94db85c733e12a665a/tests/__init__.py


--------------------------------------------------------------------------------
/tests/test_advanced.py:
--------------------------------------------------------------------------------
 1 | from .utils import md
 2 | 
 3 | 
 4 | def test_chomp():
 5 |     assert md(' <b></b> ') == '  '
 6 |     assert md(' <b> </b> ') == '  '
 7 |     assert md(' <b>  </b> ') == '  '
 8 |     assert md(' <b>   </b> ') == '  '
 9 |     assert md(' <b>s </b> ') == ' **s**  '
10 |     assert md(' <b> s</b> ') == '  **s** '
11 |     assert md(' <b> s </b> ') == '  **s**  '
12 |     assert md(' <b>  s  </b> ') == '  **s**  '
13 | 
14 | 
15 | def test_nested():
16 |     text = md('

This is an example link.

') 17 | assert text == '\n\nThis is an [example link](http://example.com/).\n\n' 18 | 19 | 20 | def test_ignore_comments(): 21 | text = md("") 22 | assert text == "" 23 | 24 | 25 | def test_ignore_comments_with_other_tags(): 26 | text = md("example link") 27 | assert text == "[example link](http://example.com/)" 28 | 29 | 30 | def test_code_with_tricky_content(): 31 | assert md('>') == "`>`" 32 | assert md('/home/username') == "`/home/`**username**" 33 | assert md('First line blah blah
blah blah
second line') \ 34 | == "First line `blah blah \nblah blah` second line" 35 | 36 | 37 | def test_special_tags(): 38 | assert md('') == '' 39 | assert md('') == 'foobar' 40 | -------------------------------------------------------------------------------- /tests/test_args.py: -------------------------------------------------------------------------------- 1 | """ 2 | Test whitelisting/blacklisting of specific tags. 3 | 4 | """ 5 | from markdownify import markdownify, LSTRIP, RSTRIP, STRIP 6 | from .utils import md 7 | 8 | 9 | def test_strip(): 10 | text = md('Some Text', strip=['a']) 11 | assert text == 'Some Text' 12 | 13 | 14 | def test_do_not_strip(): 15 | text = md('Some Text', strip=[]) 16 | assert text == '[Some Text](https://github.com/matthewwithanm)' 17 | 18 | 19 | def test_convert(): 20 | text = md('Some Text', convert=['a']) 21 | assert text == '[Some Text](https://github.com/matthewwithanm)' 22 | 23 | 24 | def test_do_not_convert(): 25 | text = md('Some Text', convert=[]) 26 | assert text == 'Some Text' 27 | 28 | 29 | def test_strip_document(): 30 | assert markdownify("

Hello

") == "Hello" # test default of STRIP 31 | assert markdownify("

Hello

", strip_document=LSTRIP) == "Hello\n\n" 32 | assert markdownify("

Hello

", strip_document=RSTRIP) == "\n\nHello" 33 | assert markdownify("

Hello

", strip_document=STRIP) == "Hello" 34 | assert markdownify("

Hello

", strip_document=None) == "\n\nHello\n\n" 35 | -------------------------------------------------------------------------------- /tests/test_basic.py: -------------------------------------------------------------------------------- 1 | from .utils import md 2 | 3 | 4 | def test_single_tag(): 5 | assert md('Hello') == 'Hello' 6 | 7 | 8 | def test_soup(): 9 | assert md('
Hello
') == '\n\nHello\n\n' 10 | 11 | 12 | def test_whitespace(): 13 | assert md(' a b \t\t c ') == ' a b c ' 14 | assert md(' a b \n\n c ') == ' a b\nc ' 15 | -------------------------------------------------------------------------------- /tests/test_conversions.py: -------------------------------------------------------------------------------- 1 | from markdownify import ATX, ATX_CLOSED, BACKSLASH, SPACES, UNDERSCORE 2 | from .utils import md 3 | 4 | 5 | def inline_tests(tag, markup): 6 | # test template for different inline tags 7 | assert md(f'<{tag}>Hello') == f'{markup}Hello{markup}' 8 | assert md(f'foo <{tag}>Hello bar') == f'foo {markup}Hello{markup} bar' 9 | assert md(f'foo<{tag}> Hello bar') == f'foo {markup}Hello{markup} bar' 10 | assert md(f'foo <{tag}>Hello bar') == f'foo {markup}Hello{markup} bar' 11 | assert md(f'foo <{tag}> bar') in ['foo bar', 'foo bar'] # Either is OK 12 | 13 | 14 | def test_a(): 15 | assert md('Google') == '[Google](https://google.com)' 16 | assert md('https://google.com') == '' 17 | assert md('https://community.kde.org/Get_Involved') == '' 18 | assert md('https://community.kde.org/Get_Involved', autolinks=False) == '[https://community.kde.org/Get\\_Involved](https://community.kde.org/Get_Involved)' 19 | 20 | 21 | def test_a_spaces(): 22 | assert md('foo Google bar') == 'foo [Google](http://google.com) bar' 23 | assert md('foo Google bar') == 'foo [Google](http://google.com) bar' 24 | assert md('foo Google bar') == 'foo [Google](http://google.com) bar' 25 | assert md('foo bar') == 'foo bar' 26 | 27 | 28 | def test_a_with_title(): 29 | text = md('Google') 30 | assert text == r'[Google](http://google.com "The \"Goog\"")' 31 | assert md('https://google.com', default_title=True) == '[https://google.com](https://google.com "https://google.com")' 32 | 33 | 34 | def test_a_shortcut(): 35 | text = md('http://google.com') 36 | assert text == '' 37 | 38 | 39 | def test_a_no_autolinks(): 40 | assert md('https://google.com', autolinks=False) == 
'[https://google.com](https://google.com)' 41 | 42 | 43 | def test_a_in_code(): 44 | assert md('Google') == '`Google`' 45 | assert md('
Google
') == '\n\n```\nGoogle\n```\n\n' 46 | 47 | 48 | def test_b(): 49 | assert md('Hello') == '**Hello**' 50 | 51 | 52 | def test_b_spaces(): 53 | assert md('foo Hello bar') == 'foo **Hello** bar' 54 | assert md('foo Hello bar') == 'foo **Hello** bar' 55 | assert md('foo Hello bar') == 'foo **Hello** bar' 56 | assert md('foo bar') == 'foo bar' 57 | 58 | 59 | def test_blockquote(): 60 | assert md('
Hello
') == '\n> Hello\n\n' 61 | assert md('
\nHello\n
') == '\n> Hello\n\n' 62 | assert md('
 Hello
') == '\n> \u00a0Hello\n\n' 63 | 64 | 65 | def test_blockquote_with_nested_paragraph(): 66 | assert md('

Hello

') == '\n> Hello\n\n' 67 | assert md('

Hello

Hello again

') == '\n> Hello\n>\n> Hello again\n\n' 68 | 69 | 70 | def test_blockquote_with_paragraph(): 71 | assert md('
Hello

handsome

') == '\n> Hello\n\nhandsome\n\n' 72 | 73 | 74 | def test_blockquote_nested(): 75 | text = md('
And she was like
Hello
') 76 | assert text == '\n> And she was like\n> > Hello\n\n' 77 | 78 | 79 | def test_br(): 80 | assert md('a
b
c') == 'a \nb \nc' 81 | assert md('a
b
c', newline_style=BACKSLASH) == 'a\\\nb\\\nc' 82 | assert md('

foo
bar

', heading_style=ATX) == '\n\n# foo bar\n\n' 83 | assert md('foo
bar', heading_style=ATX) == ' foo bar |' 84 | 85 | 86 | def test_code(): 87 | inline_tests('code', '`') 88 | assert md('*this_should_not_escape*') == '`*this_should_not_escape*`' 89 | assert md('*this_should_not_escape*') == '`*this_should_not_escape*`' 90 | assert md('*this_should_not_escape*') == '`*this_should_not_escape*`' 91 | assert md('*this_should_not_escape*') == '`*this_should_not_escape*`' 92 | assert md('this should\t\tnormalize') == '`this should normalize`' 93 | assert md('this should\t\tnormalize') == '`this should normalize`' 94 | assert md('foobarbaz') == '`foobarbaz`' 95 | assert md('foobarbaz') == '`foobarbaz`' 96 | assert md('foo bar baz') == '`foo bar baz`' 97 | assert md('foo bar baz') == '`foo bar baz`' 98 | assert md('foo bar baz') == '`foo bar baz`' 99 | assert md('foo bar baz') == '`foo bar baz`' 100 | assert md('foo bar baz') == '`foo bar baz`' 101 | assert md('foo bar baz') == '`foo bar baz`' 102 | assert md('foobarbaz', sup_symbol='^') == '`foobarbaz`' 103 | assert md('foobarbaz', sub_symbol='^') == '`foobarbaz`' 104 | 105 | 106 | def test_dl(): 107 | assert md('
term
definition
') == '\n\nterm\n: definition\n\n' 108 | assert md('

te

rm

definition
') == '\n\nte rm\n: definition\n\n' 109 | assert md('
term

definition-p1

definition-p2

') == '\n\nterm\n: definition-p1\n\n definition-p2\n\n' 110 | assert md('
term

definition 1

definition 2

') == '\n\nterm\n: definition 1\n: definition 2\n\n' 111 | assert md('
term 1
definition 1
term 2
definition 2
') == '\n\nterm 1\n: definition 1\n\nterm 2\n: definition 2\n\n' 112 | assert md('
term

line 1

line 2

') == '\n\nterm\n: > line 1\n >\n > line 2\n\n' 113 | assert md('
term
  1. 1

    • 2a
    • 2b
  2. 3

') == '\n\nterm\n: 1. 1\n\n * 2a\n * 2b\n 2. 3\n\n' 114 | 115 | 116 | def test_del(): 117 | inline_tests('del', '~~') 118 | 119 | 120 | def test_div_section_article(): 121 | for tag in ['div', 'section', 'article']: 122 | assert md(f'<{tag}>456') == '\n\n456\n\n' 123 | assert md(f'123<{tag}>456789') == '123\n\n456\n\n789' 124 | assert md(f'123<{tag}>\n 456 \n789') == '123\n\n456\n\n789' 125 | assert md(f'123<{tag}>

456

789') == '123\n\n456\n\n789' 126 | assert md(f'123<{tag}>\n

456

\n789') == '123\n\n456\n\n789' 127 | assert md(f'123<{tag}>
4 5 6
789') == '123\n\n```\n4 5 6\n```\n\n789' 128 | assert md(f'123<{tag}>\n
4 5 6
\n789') == '123\n\n```\n4 5 6\n```\n\n789' 129 | assert md(f'123<{tag}>4\n5\n6789') == '123\n\n4\n5\n6\n\n789' 130 | assert md(f'123<{tag}>\n4\n5\n6\n789') == '123\n\n4\n5\n6\n\n789' 131 | assert md(f'123<{tag}>\n

\n4\n5\n6\n

\n789') == '123\n\n4\n5\n6\n\n789' 132 | assert md(f'<{tag}>

title

body', heading_style=ATX) == '\n\n# title\n\nbody\n\n' 133 | 134 | 135 | def test_em(): 136 | inline_tests('em', '*') 137 | 138 | 139 | def test_figcaption(): 140 | assert (md("TEXT
\nCaption\n
SPAN
") == "TEXT\n\nCaption\n\nSPAN") 141 | assert (md("
SPAN
\nCaption\n
TEXT") == "SPAN\n\nCaption\n\nTEXT") 142 | 143 | 144 | def test_header_with_space(): 145 | assert md('

\n\nHello

') == '\n\n### Hello\n\n' 146 | assert md('

Hello\n\n\nWorld

') == '\n\n### Hello World\n\n' 147 | assert md('

\n\nHello

') == '\n\n#### Hello\n\n' 148 | assert md('
\n\nHello
') == '\n\n##### Hello\n\n' 149 | assert md('
\n\nHello\n\n
') == '\n\n##### Hello\n\n' 150 | assert md('
\n\nHello \n\n
') == '\n\n##### Hello\n\n' 151 | 152 | 153 | def test_h1(): 154 | assert md('

Hello

') == '\n\nHello\n=====\n\n' 155 | 156 | 157 | def test_h2(): 158 | assert md('

Hello

') == '\n\nHello\n-----\n\n' 159 | 160 | 161 | def test_hn(): 162 | assert md('

Hello

') == '\n\n### Hello\n\n' 163 | assert md('

Hello

') == '\n\n#### Hello\n\n' 164 | assert md('
Hello
') == '\n\n##### Hello\n\n' 165 | assert md('
Hello
') == '\n\n###### Hello\n\n' 166 | assert md('Hello') == md('
Hello
') 167 | assert md('Hello') == md('

Hello

') 168 | assert md('Hello') == md('Hello') 169 | 170 | 171 | def test_hn_chained(): 172 | assert md('

First

\n

Second

\n

Third

', heading_style=ATX) == '\n\n# First\n\n## Second\n\n### Third\n\n' 173 | assert md('X

First

', heading_style=ATX) == 'X\n\n# First\n\n' 174 | assert md('X

First

', heading_style=ATX_CLOSED) == 'X\n\n# First #\n\n' 175 | assert md('X

First

') == 'X\n\nFirst\n=====\n\n' 176 | 177 | 178 | def test_hn_nested_tag_heading_style(): 179 | assert md('

A

P

C

', heading_style=ATX_CLOSED) == '\n\n# A P C #\n\n' 180 | assert md('

A

P

C

', heading_style=ATX) == '\n\n# A P C\n\n' 181 | 182 | 183 | def test_hn_nested_simple_tag(): 184 | tag_to_markdown = [ 185 | ("strong", "**strong**"), 186 | ("b", "**b**"), 187 | ("em", "*em*"), 188 | ("i", "*i*"), 189 | ("p", "p"), 190 | ("a", "a"), 191 | ("div", "div"), 192 | ("blockquote", "blockquote"), 193 | ] 194 | 195 | for tag, markdown in tag_to_markdown: 196 | assert md('

A <' + tag + '>' + tag + ' B

') == '\n\n### A ' + markdown + ' B\n\n' 197 | 198 | assert md('

A
B

', heading_style=ATX) == '\n\n### A B\n\n' 199 | 200 | # Nested lists not supported 201 | # assert md('

A
  • li1
  • l2

', heading_style=ATX) == '\n### A li1 li2 B\n\n' 202 | 203 | 204 | def test_hn_nested_img(): 205 | image_attributes_to_markdown = [ 206 | ("", "", ""), 207 | ("alt='Alt Text'", "Alt Text", ""), 208 | ("alt='Alt Text' title='Optional title'", "Alt Text", " \"Optional title\""), 209 | ] 210 | for image_attributes, markdown, title in image_attributes_to_markdown: 211 | assert md('

A B

') == '\n\n### A' + (' ' + markdown + ' ' if markdown else ' ') + 'B\n\n' 212 | assert md('

A B

', keep_inline_images_in=['h3']) == '\n\n### A ![' + markdown + '](/path/to/img.jpg' + title + ') B\n\n' 213 | 214 | 215 | def test_hn_atx_headings(): 216 | assert md('

Hello

', heading_style=ATX) == '\n\n# Hello\n\n' 217 | assert md('

Hello

', heading_style=ATX) == '\n\n## Hello\n\n' 218 | 219 | 220 | def test_hn_atx_closed_headings(): 221 | assert md('

Hello

', heading_style=ATX_CLOSED) == '\n\n# Hello #\n\n' 222 | assert md('

Hello

', heading_style=ATX_CLOSED) == '\n\n## Hello ##\n\n' 223 | 224 | 225 | def test_hn_newlines(): 226 | assert md("

H1-1

TEXT

H2-2

TEXT

H1-2

TEXT", heading_style=ATX) == '\n\n# H1-1\n\nTEXT\n\n## H2-2\n\nTEXT\n\n# H1-2\n\nTEXT' 227 | assert md('

H1-1

\n

TEXT

\n

H2-2

\n

TEXT

\n

H1-2

\n

TEXT

', heading_style=ATX) == '\n\n# H1-1\n\nTEXT\n\n## H2-2\n\nTEXT\n\n# H1-2\n\nTEXT\n\n' 228 | 229 | 230 | def test_head(): 231 | assert md('head') == 'head' 232 | 233 | 234 | def test_hr(): 235 | assert md('Hello
World') == 'Hello\n\n---\n\nWorld' 236 | assert md('Hello
World') == 'Hello\n\n---\n\nWorld' 237 | assert md('

Hello

\n
\n

World

') == '\n\nHello\n\n---\n\nWorld\n\n' 238 | 239 | 240 | def test_i(): 241 | assert md('Hello') == '*Hello*' 242 | 243 | 244 | def test_img(): 245 | assert md('Alt text') == '![Alt text](/path/to/img.jpg "Optional title")' 246 | assert md('Alt text') == '![Alt text](/path/to/img.jpg)' 247 | 248 | 249 | def test_video(): 250 | assert md('') == '[![text](/path/to/img.jpg)](/path/to/video.mp4)' 251 | assert md('') == '[text](/path/to/video.mp4)' 252 | assert md('') == '[text](/path/to/video.mp4)' 253 | assert md('') == '![text](/path/to/img.jpg)' 254 | assert md('') == 'text' 255 | 256 | 257 | def test_kbd(): 258 | inline_tests('kbd', '`') 259 | 260 | 261 | def test_p(): 262 | assert md('

hello

') == '\n\nhello\n\n' 263 | assert md("

hello

") == "\n\nhello\n\n" 264 | assert md('

123456789 123456789

') == '\n\n123456789 123456789\n\n' 265 | assert md('

123456789\n\n\n123456789

') == '\n\n123456789\n123456789\n\n' 266 | assert md('

123456789\n\n\n123456789

', wrap=True, wrap_width=80) == '\n\n123456789 123456789\n\n' 267 | assert md('

123456789\n\n\n123456789

', wrap=True, wrap_width=None) == '\n\n123456789 123456789\n\n' 268 | assert md('

123456789 123456789

', wrap=True, wrap_width=10) == '\n\n123456789\n123456789\n\n' 269 | assert md('

Some long link

', wrap=True, wrap_width=10) == '\n\n[Some long\nlink](https://example.com)\n\n' 270 | assert md('

12345
67890

', wrap=True, wrap_width=10, newline_style=BACKSLASH) == '\n\n12345\\\n67890\n\n' 271 | assert md('

12345
67890

', wrap=True, wrap_width=50, newline_style=BACKSLASH) == '\n\n12345\\\n67890\n\n' 272 | assert md('

12345
67890

', wrap=True, wrap_width=10, newline_style=SPACES) == '\n\n12345 \n67890\n\n' 273 | assert md('

12345
67890

', wrap=True, wrap_width=50, newline_style=SPACES) == '\n\n12345 \n67890\n\n' 274 | assert md('

12345678901
12345

', wrap=True, wrap_width=10, newline_style=BACKSLASH) == '\n\n12345678901\\\n12345\n\n' 275 | assert md('

12345678901
12345

', wrap=True, wrap_width=50, newline_style=BACKSLASH) == '\n\n12345678901\\\n12345\n\n' 276 | assert md('

12345678901
12345

', wrap=True, wrap_width=10, newline_style=SPACES) == '\n\n12345678901 \n12345\n\n' 277 | assert md('

12345678901
12345

', wrap=True, wrap_width=50, newline_style=SPACES) == '\n\n12345678901 \n12345\n\n' 278 | assert md('

1234 5678 9012
67890

', wrap=True, wrap_width=10, newline_style=BACKSLASH) == '\n\n1234 5678\n9012\\\n67890\n\n' 279 | assert md('

1234 5678 9012
67890

', wrap=True, wrap_width=10, newline_style=SPACES) == '\n\n1234 5678\n9012 \n67890\n\n' 280 | assert md('First

Second

Third

Fourth') == 'First\n\nSecond\n\nThird\n\nFourth' 281 | assert md('

 x y

', wrap=True, wrap_width=80) == '\n\n\u00a0x y\n\n' 282 | 283 | 284 | def test_pre(): 285 | assert md('
test\n    foo\nbar
') == '\n\n```\ntest\n foo\nbar\n```\n\n' 286 | assert md('
test\n    foo\nbar
') == '\n\n```\ntest\n foo\nbar\n```\n\n' 287 | assert md('
*this_should_not_escape*
') == '\n\n```\n*this_should_not_escape*\n```\n\n' 288 | assert md('
*this_should_not_escape*
') == '\n\n```\n*this_should_not_escape*\n```\n\n' 289 | assert md('
\t\tthis  should\t\tnot  normalize
') == '\n\n```\n\t\tthis should\t\tnot normalize\n```\n\n' 290 | assert md('
\t\tthis  should\t\tnot  normalize
') == '\n\n```\n\t\tthis should\t\tnot normalize\n```\n\n' 291 | assert md('
foo\nbar\nbaz
') == '\n\n```\nfoo\nbar\nbaz\n```\n\n' 292 | assert md('
foo\nbar\nbaz
') == '\n\n```\nfoo\nbar\nbaz\n```\n\n' 293 | assert md('
foo\nbar\nbaz
') == '\n\n```\nfoo\nbar\nbaz\n```\n\n' 294 | assert md('
foo\nbaz
') == '\n\n```\nfoo\nbaz\n```\n\n' 295 | assert md('
foo\nbar\nbaz
') == '\n\n```\nfoo\nbar\nbaz\n```\n\n' 296 | assert md('
foo\nbar\nbaz
') == '\n\n```\nfoo\nbar\nbaz\n```\n\n' 297 | assert md('
foo\nbar\nbaz
') == '\n\n```\nfoo\nbar\nbaz\n```\n\n' 298 | assert md('
foo\nbar\nbaz
') == '\n\n```\nfoo\nbar\nbaz\n```\n\n' 299 | assert md('
foo\nbar\nbaz
') == '\n\n```\nfoo\nbar\nbaz\n```\n\n' 300 | assert md('
foo\nbar\nbaz
', sup_symbol='^') == '\n\n```\nfoo\nbar\nbaz\n```\n\n' 301 | assert md('
foo\nbar\nbaz
', sub_symbol='^') == '\n\n```\nfoo\nbar\nbaz\n```\n\n' 302 | assert md('
foo\nbar\nbaz
', sub_symbol='^') == '\n\n```\nfoo\nbar\nbaz\n```\n\n' 303 | 304 | assert md('foo
bar
baz', sub_symbol='^') == 'foo\n\n```\nbar\n```\n\nbaz' 305 | assert md("

foo

\n
bar
\n

baz

", sub_symbol="^") == "\n\nfoo\n\n```\nbar\n```\n\nbaz" 306 | 307 | 308 | def test_q(): 309 | assert md('foo quote bar') == 'foo "quote" bar' 310 | assert md('foo quote bar') == 'foo "quote" bar' 311 | 312 | 313 | def test_script(): 314 | assert md('foo bar') == 'foo bar' 315 | 316 | 317 | def test_style(): 318 | assert md('foo bar') == 'foo bar' 319 | 320 | 321 | def test_s(): 322 | inline_tests('s', '~~') 323 | 324 | 325 | def test_samp(): 326 | inline_tests('samp', '`') 327 | 328 | 329 | def test_strong(): 330 | assert md('Hello') == '**Hello**' 331 | 332 | 333 | def test_strong_em_symbol(): 334 | assert md('Hello', strong_em_symbol=UNDERSCORE) == '__Hello__' 335 | assert md('Hello', strong_em_symbol=UNDERSCORE) == '__Hello__' 336 | assert md('Hello', strong_em_symbol=UNDERSCORE) == '_Hello_' 337 | assert md('Hello', strong_em_symbol=UNDERSCORE) == '_Hello_' 338 | 339 | 340 | def test_sub(): 341 | assert md('foo') == 'foo' 342 | assert md('foo', sub_symbol='~') == '~foo~' 343 | assert md('foo', sub_symbol='') == 'foo' 344 | 345 | 346 | def test_sup(): 347 | assert md('foo') == 'foo' 348 | assert md('foo', sup_symbol='^') == '^foo^' 349 | assert md('foo', sup_symbol='') == 'foo' 350 | 351 | 352 | def test_lang(): 353 | assert md('
test\n    foo\nbar
', code_language='python') == '\n\n```python\ntest\n foo\nbar\n```\n\n' 354 | assert md('
test\n    foo\nbar
', code_language='javascript') == '\n\n```javascript\ntest\n foo\nbar\n```\n\n' 355 | 356 | 357 | def test_lang_callback(): 358 | def callback(el): 359 | return el['class'][0] if el.has_attr('class') else None 360 | 361 | assert md('
test\n    foo\nbar
', code_language_callback=callback) == '\n\n```python\ntest\n foo\nbar\n```\n\n' 362 | assert md('
test\n    foo\nbar
', code_language_callback=callback) == '\n\n```javascript\ntest\n foo\nbar\n```\n\n' 363 | assert md('
test\n    foo\nbar
', code_language_callback=callback) == '\n\n```javascript\ntest\n foo\nbar\n```\n\n' 364 | 365 | 366 | def test_spaces(): 367 | assert md('

a b

c d

') == '\n\na b\n\nc d\n\n' 368 | assert md('

a

') == '\n\n*a*\n\n' 369 | assert md('test

again

') == 'test\n\nagain\n\n' 370 | assert md('test
text
after') == 'test\n> text\n\nafter' 371 | assert md('
  1. x
  2. y
') == '\n\n1. x\n2. y\n' 372 | assert md('
  • x
  • y
  • ') == '\n\n* x\n* y\n' 373 | assert md('test
     foo 
    bar') == 'test\n\n```\n foo \n```\n\nbar' 374 | -------------------------------------------------------------------------------- /tests/test_custom_converter.py: -------------------------------------------------------------------------------- 1 | from markdownify import MarkdownConverter 2 | from bs4 import BeautifulSoup 3 | 4 | 5 | class UnitTestConverter(MarkdownConverter): 6 | """ 7 | Create a custom MarkdownConverter for unit tests 8 | """ 9 | def convert_img(self, el, text, parent_tags): 10 | """Add two newlines after an image""" 11 | return super().convert_img(el, text, parent_tags) + '\n\n' 12 | 13 | def convert_custom_tag(self, el, text, parent_tags): 14 | """Ensure conversion function is found for tags with special characters in name""" 15 | return "convert_custom_tag(): %s" % text 16 | 17 | def convert_h1(self, el, text, parent_tags): 18 | """Ensure explicit heading conversion function is used""" 19 | return "convert_h1: %s" % (text) 20 | 21 | def convert_hN(self, n, el, text, parent_tags): 22 | """Ensure general heading conversion function is used""" 23 | return "convert_hN(%d): %s" % (n, text) 24 | 25 | 26 | def test_custom_conversion_functions(): 27 | # Create shorthand method for conversion 28 | def md(html, **options): 29 | return UnitTestConverter(**options).convert(html) 30 | 31 | assert md('Alt texttext') == '![Alt text](/path/to/img.jpg "Optional title")\n\ntext' 32 | assert md('Alt texttext') == '![Alt text](/path/to/img.jpg)\n\ntext' 33 | 34 | assert md("text") == "convert_custom_tag(): text" 35 | 36 | assert md("

    text

    ") == "convert_h1: text" 37 | 38 | assert md("

    text

    ") == "convert_hN(3): text" 39 | 40 | 41 | def test_soup(): 42 | html = 'test' 43 | soup = BeautifulSoup(html, 'html.parser') 44 | assert MarkdownConverter().convert_soup(soup) == '**test**' 45 | -------------------------------------------------------------------------------- /tests/test_escaping.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | from bs4 import MarkupResemblesLocatorWarning 3 | from .utils import md 4 | 5 | 6 | def test_asterisks(): 7 | assert md('*hey*dude*') == r'\*hey\*dude\*' 8 | assert md('*hey*dude*', escape_asterisks=False) == r'*hey*dude*' 9 | 10 | 11 | def test_underscore(): 12 | assert md('_hey_dude_') == r'\_hey\_dude\_' 13 | assert md('_hey_dude_', escape_underscores=False) == r'_hey_dude_' 14 | 15 | 16 | def test_xml_entities(): 17 | assert md('&', escape_misc=True) == r'\&' 18 | 19 | 20 | def test_named_entities(): 21 | assert md('»') == u'\xbb' 22 | 23 | 24 | def test_hexadecimal_entities(): 25 | # This looks to be a bug in BeautifulSoup (fixed in bs4) that we have to work around. 
26 | assert md(''') == '\x27' 27 | 28 | 29 | def test_single_escaping_entities(): 30 | assert md('&amp;', escape_misc=True) == r'\&' 31 | 32 | 33 | def test_misc(): 34 | # ignore the bs4 warning that "1.2" or "*" looks like a filename 35 | warnings.filterwarnings("ignore", category=MarkupResemblesLocatorWarning) 36 | 37 | assert md('\\*', escape_misc=True) == r'\\\*' 38 | assert md('<foo>', escape_misc=True) == r'\' 39 | assert md('# foo', escape_misc=True) == r'\# foo' 40 | assert md('#5', escape_misc=True) == r'#5' 41 | assert md('5#', escape_misc=True) == '5#' 42 | assert md('####### foo', escape_misc=True) == r'####### foo' 43 | assert md('> foo', escape_misc=True) == r'\> foo' 44 | assert md('~~foo~~', escape_misc=True) == r'\~\~foo\~\~' 45 | assert md('foo\n===\n', escape_misc=True) == 'foo\n\\=\\=\\=\n' 46 | assert md('---\n', escape_misc=True) == '\\---\n' 47 | assert md('- test', escape_misc=True) == r'\- test' 48 | assert md('x - y', escape_misc=True) == r'x \- y' 49 | assert md('test-case', escape_misc=True) == 'test-case' 50 | assert md('x-', escape_misc=True) == 'x-' 51 | assert md('-y', escape_misc=True) == '-y' 52 | assert md('+ x\n+ y\n', escape_misc=True) == '\\+ x\n\\+ y\n' 53 | assert md('`x`', escape_misc=True) == r'\`x\`' 54 | assert md('[text](notalink)', escape_misc=True) == r'\[text\](notalink)' 55 | assert md('text]', escape_misc=True) == r'[text\]](link)' 56 | assert md('[text]', escape_misc=True) == r'[\[text\]](link)' 57 | assert md('1. x', escape_misc=True) == r'1\. x' 58 | # assert md('1. x', escape_misc=True) == r'1\. x' 59 | assert md('1. x', escape_misc=True) == r'1\. x' 60 | assert md(' 1. x', escape_misc=True) == r' 1\. x' 61 | assert md('123456789. x', escape_misc=True) == r'123456789\. x' 62 | assert md('1234567890. x', escape_misc=True) == r'1234567890. x' 63 | assert md('A1. x', escape_misc=True) == r'A1. x' 64 | assert md('1.2', escape_misc=True) == r'1.2' 65 | assert md('not a number. x', escape_misc=True) == r'not a number. 
x' 66 | assert md('1) x', escape_misc=True) == r'1\) x' 67 | # assert md('1) x', escape_misc=True) == r'1\) x' 68 | assert md('1) x', escape_misc=True) == r'1\) x' 69 | assert md(' 1) x', escape_misc=True) == r' 1\) x' 70 | assert md('123456789) x', escape_misc=True) == r'123456789\) x' 71 | assert md('1234567890) x', escape_misc=True) == r'1234567890) x' 72 | assert md('(1) x', escape_misc=True) == r'(1) x' 73 | assert md('A1) x', escape_misc=True) == r'A1) x' 74 | assert md('1)x', escape_misc=True) == r'1)x' 75 | assert md('not a number) x', escape_misc=True) == r'not a number) x' 76 | assert md('|not table|', escape_misc=True) == r'\|not table\|' 77 | assert md(r'\ <foo> &amp; | ` `', escape_misc=False) == r'\ & | ` `' 78 | -------------------------------------------------------------------------------- /tests/test_lists.py: -------------------------------------------------------------------------------- 1 | from .utils import md 2 | 3 | 4 | nested_uls = """ 5 |
      6 |
    • 1 7 |
        8 |
      • a 9 |
          10 |
        • I
        • 11 |
        • II
        • 12 |
        • III
        • 13 |
        14 |
      • 15 |
      • b
      • 16 |
      • c
      • 17 |
      18 |
    • 19 |
    • 2
    • 20 |
    • 3
    • 21 |
    """ 22 | 23 | nested_ols = """ 24 |
      25 |
    1. 1 26 |
        27 |
      1. a 28 |
          29 |
        1. I
        2. 30 |
        3. II
        4. 31 |
        5. III
        6. 32 |
        33 |
      2. 34 |
      3. b
      4. 35 |
      5. c
      6. 36 |
      37 |
    2. 38 |
    3. 2
    4. 39 |
    5. 3
    6. 40 |
""" 41 | 42 | 43 | def test_ol(): 44 | assert md('
  1. a
  2. b
') == '\n\n1. a\n2. b\n' 45 | assert md('
  1. a
  2. b
') == '\n\n1. a\n2. b\n' 46 | assert md('
  1. a
  2. b
') == '\n\n3. a\n4. b\n' 47 | assert md('foo
  1. a
  2. b
bar') == 'foo\n\n3. a\n4. b\n\nbar' 48 | assert md('
  1. a
  2. b
') == '\n\n1. a\n2. b\n' 49 | assert md('
  1. a
  2. b
') == '\n\n1. a\n2. b\n' 50 | assert md('
  1. a
  2. b
') == '\n\n1. a\n2. b\n' 51 | assert md('
  1. first para

    second para

  2. third para

    fourth para

') == '\n\n1234. first para\n\n second para\n1235. third para\n\n fourth para\n' 52 | 53 | 54 | def test_nested_ols(): 55 | assert md(nested_ols) == '\n\n1. 1\n 1. a\n 1. I\n 2. II\n 3. III\n 2. b\n 3. c\n2. 2\n3. 3\n' 56 | 57 | 58 | def test_ul(): 59 | assert md('
  • a
  • b
') == '\n\n* a\n* b\n' 60 | assert md("""
    61 |
  • 62 | a 63 |
  • 64 |
  • b
  • 65 |
  • c 66 |
  • 67 |
""") == '\n\n* a\n* b\n* c\n' 68 | assert md('
  • first para

    second para

  • third para

    fourth para

') == '\n\n* first para\n\n second para\n* third para\n\n fourth para\n' 69 | 70 | 71 | def test_inline_ul(): 72 | assert md('

foo

  • a
  • b

bar

') == '\n\nfoo\n\n* a\n* b\n\nbar\n\n' 73 | assert md('foo
  • bar
baz') == 'foo\n\n* bar\n\nbaz' 74 | 75 | 76 | def test_nested_uls(): 77 | """ 78 | Nested ULs should alternate bullet characters. 79 | 80 | """ 81 | assert md(nested_uls) == '\n\n* 1\n + a\n - I\n - II\n - III\n + b\n + c\n* 2\n* 3\n' 82 | 83 | 84 | def test_bullets(): 85 | assert md(nested_uls, bullets='-') == '\n\n- 1\n - a\n - I\n - II\n - III\n - b\n - c\n- 2\n- 3\n' 86 | 87 | 88 | def test_li_text(): 89 | assert md('
  • foo bar
  • foo bar
  • foo bar space.
') == '\n\n* foo [bar](#)\n* foo bar\n* foo **bar** *space*.\n' 90 | -------------------------------------------------------------------------------- /tests/test_tables.py: -------------------------------------------------------------------------------- 1 | from .utils import md 2 | 3 | 4 | table = """ 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 |
FirstnameLastnameAge
JillSmith50
EveJackson94
""" 21 | 22 | 23 | table_with_html_content = """ 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 |
FirstnameLastnameAge
JillSmith50
EveJackson94
""" 40 | 41 | 42 | table_with_paragraphs = """ 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 |
Firstname

Lastname

Age

Jill

Smith

50

EveJackson94
""" 59 | 60 | table_with_linebreaks = """ 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 70 | 71 | 72 | 73 | 74 | 76 | 77 | 78 |
FirstnameLastnameAge
JillSmith 69 | Jackson50
EveJackson 75 | Smith94
""" 79 | 80 | 81 | table_with_header_column = """ 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 |
FirstnameLastnameAge
JillSmith50
EveJackson94
""" 98 | 99 | 100 | table_head_body = """ 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 |
FirstnameLastnameAge
JillSmith50
EveJackson94
""" 121 | 122 | table_head_body_missing_head = """ 123 | 124 | 125 | 126 | 127 | 128 | 129 | 130 | 131 | 132 | 133 | 134 | 135 | 136 | 137 | 138 | 139 | 140 | 141 | 142 |
FirstnameLastnameAge
JillSmith50
EveJackson94
""" 143 | 144 | table_head_body_multiple_head = """ 145 | 146 | 147 | 148 | 149 | 150 | 151 | 152 | 153 | 154 | 155 | 156 | 157 | 158 | 159 | 160 | 161 | 162 | 163 | 164 | 165 | 166 | 167 | 168 | 169 |
CreatorEditorServer
OperatorManagerEngineer
BobOliverTom
ThomasLucasEthan
""" 170 | 171 | table_missing_text = """ 172 | 173 | 174 | 175 | 176 | 177 | 178 | 179 | 180 | 181 | 182 | 183 | 184 | 185 | 186 | 187 | 188 | 189 | 190 | 191 |
LastnameAge
Jill50
EveJackson94
""" 192 | 193 | table_missing_head = """ 194 | 195 | 196 | 197 | 198 | 199 | 200 | 201 | 202 | 203 | 204 | 205 | 206 | 207 | 208 | 209 |
FirstnameLastnameAge
JillSmith50
EveJackson94
""" 210 | 211 | table_body = """ 212 | 213 | 214 | 215 | 216 | 217 | 218 | 219 | 220 | 221 | 222 | 223 | 224 | 225 | 226 | 227 | 228 | 229 |
FirstnameLastnameAge
JillSmith50
EveJackson94
""" 230 | 231 | table_with_caption = """TEXT 232 | 235 | 236 | 237 | 238 | 239 | 240 |
233 | Caption 234 |
FirstnameLastnameAge
""" 241 | 242 | table_with_colspan = """ 243 | 244 | 245 | 246 | 247 | 248 | 249 | 250 | 251 | 252 | 253 | 254 | 255 | 256 | 257 |
NameAge
JillSmith50
EveJackson94
""" 258 | 259 | table_with_undefined_colspan = """ 260 | 261 | 262 | 263 | 264 | 265 | 266 | 267 | 268 |
NameAge
JillSmith
""" 269 | 270 | table_with_colspan_missing_head = """ 271 | 272 | 273 | 274 | 275 | 276 | 277 | 278 | 279 | 280 | 281 | 282 | 283 | 284 | 285 |
NameAge
JillSmith50
EveJackson94
""" 286 | 287 | 288 | def test_table(): 289 | assert md(table) == '\n\n| Firstname | Lastname | Age |\n| --- | --- | --- |\n| Jill | Smith | 50 |\n| Eve | Jackson | 94 |\n\n' 290 | assert md(table_with_html_content) == '\n\n| Firstname | Lastname | Age |\n| --- | --- | --- |\n| **Jill** | *Smith* | [50](#) |\n| Eve | Jackson | 94 |\n\n' 291 | assert md(table_with_paragraphs) == '\n\n| Firstname | Lastname | Age |\n| --- | --- | --- |\n| Jill | Smith | 50 |\n| Eve | Jackson | 94 |\n\n' 292 | assert md(table_with_linebreaks) == '\n\n| Firstname | Lastname | Age |\n| --- | --- | --- |\n| Jill | Smith Jackson | 50 |\n| Eve | Jackson Smith | 94 |\n\n' 293 | assert md(table_with_header_column) == '\n\n| Firstname | Lastname | Age |\n| --- | --- | --- |\n| Jill | Smith | 50 |\n| Eve | Jackson | 94 |\n\n' 294 | assert md(table_head_body) == '\n\n| Firstname | Lastname | Age |\n| --- | --- | --- |\n| Jill | Smith | 50 |\n| Eve | Jackson | 94 |\n\n' 295 | assert md(table_head_body_multiple_head) == '\n\n| | | |\n| --- | --- | --- |\n| Creator | Editor | Server |\n| Operator | Manager | Engineer |\n| Bob | Oliver | Tom |\n| Thomas | Lucas | Ethan |\n\n' 296 | assert md(table_head_body_missing_head) == '\n\n| Firstname | Lastname | Age |\n| --- | --- | --- |\n| Jill | Smith | 50 |\n| Eve | Jackson | 94 |\n\n' 297 | assert md(table_missing_text) == '\n\n| | Lastname | Age |\n| --- | --- | --- |\n| Jill | | 50 |\n| Eve | Jackson | 94 |\n\n' 298 | assert md(table_missing_head) == '\n\n| | | |\n| --- | --- | --- |\n| Firstname | Lastname | Age |\n| Jill | Smith | 50 |\n| Eve | Jackson | 94 |\n\n' 299 | assert md(table_body) == '\n\n| | | |\n| --- | --- | --- |\n| Firstname | Lastname | Age |\n| Jill | Smith | 50 |\n| Eve | Jackson | 94 |\n\n' 300 | assert md(table_with_caption) == 'TEXT\n\nCaption\n\n| | | |\n| --- | --- | --- |\n| Firstname | Lastname | Age |\n\n' 301 | assert md(table_with_colspan) == '\n\n| Name | | Age |\n| --- | --- | --- |\n| Jill | Smith | 50 |\n| Eve | 
Jackson | 94 |\n\n' 302 | assert md(table_with_undefined_colspan) == '\n\n| Name | Age |\n| --- | --- |\n| Jill | Smith |\n\n' 303 | assert md(table_with_colspan_missing_head) == '\n\n| | | |\n| --- | --- | --- |\n| Name | | Age |\n| Jill | Smith | 50 |\n| Eve | Jackson | 94 |\n\n' 304 | 305 | 306 | def test_table_infer_header(): 307 | assert md(table, table_infer_header=True) == '\n\n| Firstname | Lastname | Age |\n| --- | --- | --- |\n| Jill | Smith | 50 |\n| Eve | Jackson | 94 |\n\n' 308 | assert md(table_with_html_content, table_infer_header=True) == '\n\n| Firstname | Lastname | Age |\n| --- | --- | --- |\n| **Jill** | *Smith* | [50](#) |\n| Eve | Jackson | 94 |\n\n' 309 | assert md(table_with_paragraphs, table_infer_header=True) == '\n\n| Firstname | Lastname | Age |\n| --- | --- | --- |\n| Jill | Smith | 50 |\n| Eve | Jackson | 94 |\n\n' 310 | assert md(table_with_linebreaks, table_infer_header=True) == '\n\n| Firstname | Lastname | Age |\n| --- | --- | --- |\n| Jill | Smith Jackson | 50 |\n| Eve | Jackson Smith | 94 |\n\n' 311 | assert md(table_with_header_column, table_infer_header=True) == '\n\n| Firstname | Lastname | Age |\n| --- | --- | --- |\n| Jill | Smith | 50 |\n| Eve | Jackson | 94 |\n\n' 312 | assert md(table_head_body, table_infer_header=True) == '\n\n| Firstname | Lastname | Age |\n| --- | --- | --- |\n| Jill | Smith | 50 |\n| Eve | Jackson | 94 |\n\n' 313 | assert md(table_head_body_multiple_head, table_infer_header=True) == '\n\n| Creator | Editor | Server |\n| --- | --- | --- |\n| Operator | Manager | Engineer |\n| Bob | Oliver | Tom |\n| Thomas | Lucas | Ethan |\n\n' 314 | assert md(table_head_body_missing_head, table_infer_header=True) == '\n\n| Firstname | Lastname | Age |\n| --- | --- | --- |\n| Jill | Smith | 50 |\n| Eve | Jackson | 94 |\n\n' 315 | assert md(table_missing_text, table_infer_header=True) == '\n\n| | Lastname | Age |\n| --- | --- | --- |\n| Jill | | 50 |\n| Eve | Jackson | 94 |\n\n' 316 | assert md(table_missing_head, 
table_infer_header=True) == '\n\n| Firstname | Lastname | Age |\n| --- | --- | --- |\n| Jill | Smith | 50 |\n| Eve | Jackson | 94 |\n\n' 317 | assert md(table_body, table_infer_header=True) == '\n\n| Firstname | Lastname | Age |\n| --- | --- | --- |\n| Jill | Smith | 50 |\n| Eve | Jackson | 94 |\n\n' 318 | assert md(table_with_caption, table_infer_header=True) == 'TEXT\n\nCaption\n\n| Firstname | Lastname | Age |\n| --- | --- | --- |\n\n' 319 | assert md(table_with_colspan, table_infer_header=True) == '\n\n| Name | | Age |\n| --- | --- | --- |\n| Jill | Smith | 50 |\n| Eve | Jackson | 94 |\n\n' 320 | assert md(table_with_undefined_colspan, table_infer_header=True) == '\n\n| Name | Age |\n| --- | --- |\n| Jill | Smith |\n\n' 321 | assert md(table_with_colspan_missing_head, table_infer_header=True) == '\n\n| Name | | Age |\n| --- | --- | --- |\n| Jill | Smith | 50 |\n| Eve | Jackson | 94 |\n\n' 322 | -------------------------------------------------------------------------------- /tests/utils.py: -------------------------------------------------------------------------------- 1 | from markdownify import MarkdownConverter 2 | 3 | 4 | # for unit testing, disable document-level stripping by default so that 5 | # separation newlines are included in testing 6 | def md(html, **options): 7 | options = {"strip_document": None, **options} 8 | 9 | return MarkdownConverter(**options).convert(html) 10 | -------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | [tox] 2 | envlist = py38 3 | 4 | [testenv] 5 | passenv = PYTHONPATH 6 | deps = 7 | pytest==8 8 | flake8 9 | restructuredtext_lint 10 | Pygments 11 | commands = 12 | pytest 13 | flake8 --ignore=E501,W503 markdownify tests 14 | restructuredtext-lint README.rst 15 | 16 | --------------------------------------------------------------------------------