├── myfile.txt ├── .gitignore ├── README.md ├── README.html ├── advanced_text_editor.py ├── markdown2-oop.py └── markdown2.py /myfile.txt: -------------------------------------------------------------------------------- 1 | This is a text editor made by Luke Carlson (github.com/jLukeC). 2 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.py[cod] 2 | *.pyc 3 | *~ 4 | myfile* 5 | 6 | .gitignore~ 7 | dependencies/ 8 | index.html 9 | advanced_text_editor_html.py 10 | practice.py 11 | 12 | # C extensions 13 | *.so 14 | 15 | # Packages 16 | *.egg 17 | *.egg-info 18 | dist 19 | build 20 | eggs 21 | parts 22 | bin 23 | var 24 | sdist 25 | develop-eggs 26 | .installed.cfg 27 | lib 28 | lib64 29 | 30 | # Installer logs 31 | pip-log.txt 32 | 33 | # Unit test / coverage reports 34 | .coverage 35 | .tox 36 | nosetests.xml 37 | 38 | # Translations 39 | *.mo 40 | 41 | # Mr Developer 42 | .mr.developer.cfg 43 | .project 44 | .pydevproject 45 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | advanced-text-editor 2 | ==================== 3 | 4 | a text editor made in python. Can handle multiple windows and the major mac shortcuts. 5 | It is the advanced version of [my other editors](https://github.com/jLukeC/mega-project-list/blob/master/python/text-editor/basic_text_editor.py) 6 | 7 | 8 | Hardest parts 9 | -------------- 10 | **Creating live html viewer** now when an html file is opened and Command+M is pressed, a window will render the html. 11 | 12 | **Allowing multiple windows** took a while especially since file commands did not work if I had trouble... (look below) 13 | 14 | **Determining text widget in focus** without this, multiple windows couldn't work nor file commands. 
The solution ended up being trivial but it took a while to reach, especially since root.focus_get() returns a decimal number when printed out 15 | 16 | **Generally Keeping the Code Clean** still a struggle but as I learn more about Tkinter I have made sure to make my code my straightforward and less of a mess 17 | 18 | 19 | PS this readme was written using my editor 20 | 21 | 22 | -------------------------------------------------------------------------------- /README.html: -------------------------------------------------------------------------------- 1 |
a text editor made in python. Can handle multiple windows and the major mac shortcuts. 4 | It is the advanced version of my other editors
5 | 6 |Creating live html viewer now when an html file is opened and Command+M is pressed, a window will render the html.
9 | 10 |Allowing multiple windows took a while especially since file commands did not work if I had trouble... (look below)
11 | 12 |Determining text widget in focus without this, multiple windows couldn't work nor file commands. The solution ended up being trivial but it took a while to reach, especially since root.focus_get() returns a decimal number when printed out
13 | 14 |Generally Keeping the Code Clean still a struggle, but as I learn more about Tkinter I have made sure to make my code more straightforward and less of a mess
15 | 16 |PS this readme was written using my editor
17 | -------------------------------------------------------------------------------- /advanced_text_editor.py: -------------------------------------------------------------------------------- 1 | from Tkinter import * 2 | import tkFileDialog 3 | import os, sys, inspect 4 | import markdown2 5 | main_path= os.path.realpath(os.path.abspath(os.path.split(inspect.getfile( inspect.currentframe() ))[0])) 6 | dependencies_path = os.path.join(main_path, 'dependencies') 7 | sys.path.append(dependencies_path) 8 | from tkhtml import * 9 | from tkutil import unbind_destroy 10 | 11 | class Window(): 12 | def __init__(self, parent): 13 | self.filename ='' 14 | self.window = Toplevel(parent) 15 | self.text_box = Text(self.window, background="black", foreground="firebrick", insertbackground="white") 16 | self.text_box.pack(expand = 1, fill= BOTH) 17 | self.text_box.focus_set() 18 | 19 | class Editor: 20 | def __init__(self, master): 21 | self.file_name = "" 22 | self.html_window="" 23 | #self.html_viewer="" 24 | 25 | initial_text_box = Text(root, background="black", foreground="firebrick", insertbackground="white") 26 | initial_text_box.pack(expand = 1, fill= BOTH) 27 | initial_text_box.focus_set() 28 | initial_text_box.insert(END, """This is a text editor made by Luke Carlson (github.com/jLukeC).""") 29 | 30 | self.file_opt = options = {} 31 | 32 | # options for opening files 33 | options['defaultextension'] = '.txt' 34 | options['filetypes'] = [('all files', '.*'), ('text files', '.txt'), ('markdown', '.md'), ('html', '.html')] 35 | options['initialdir'] = os.path 36 | options['initialfile'] = 'myfile.txt' 37 | options['parent'] = root 38 | options['title'] = 'This is a title' 39 | 40 | 41 | # defining options for opening a directory 42 | self.dir_opt = options = {} 43 | options['initialdir'] = os.path 44 | options['mustexist'] = False 45 | options['parent'] = root 46 | options['title'] = 'This is a title' 47 | 48 | 49 | 50 | 51 | def find_focus(): 52 | focus= root.focus_get() 
53 | print focus 54 | print focus.get(1.0, END) 55 | print focus.master 56 | focus.master.wm_title("focused") 57 | 58 | 59 | menubar = Menu(root) 60 | menubar.add_command(label="Hello!", command=find_focus) 61 | menubar.add_command(label="fds!", command=find_focus) 62 | 63 | filemenu = Menu(menubar, tearoff=0) 64 | filemenu.add_command(label="New", command=self.new_window, accelerator="Command+N") 65 | filemenu.add_command(label="Open", command=self.open_file, accelerator="Command+O") 66 | filemenu.add_command(label="Save", command=self.save_file, accelerator="Command+S") 67 | filemenu.add_command(label="Save as...", command=self.save_as_file) 68 | filemenu.add_separator() 69 | filemenu.add_command(label="Close Window", command=self.destroy, accelerator="Command+W") 70 | filemenu.add_command(label="Exit", command=self.quit_project, accelerator="Command+Q") 71 | menubar.add_cascade(label="File", menu=filemenu) 72 | 73 | 74 | editmenu = Menu(menubar, tearoff=0) 75 | editmenu.add_command(label="Undo", command=find_focus) 76 | editmenu.add_separator() 77 | editmenu.add_command(label="Cut", command=self.cut, accelerator="Command+X") 78 | editmenu.add_command(label="Copy", command=self.copy, accelerator="Command+C") 79 | editmenu.add_command(label="Paste", command=self.paste, accelerator="Command+V") 80 | editmenu.add_command(label="Select All", command=self.select_all, accelerator="Command+A") 81 | editmenu.add_command(label="Delete", command=self.delete_selection) 82 | menubar.add_cascade(label="Edit", menu=editmenu) 83 | 84 | 85 | helpmenu = Menu(menubar, tearoff=0) 86 | helpmenu.add_command(label="Find Focus", command=find_focus) 87 | helpmenu.add_command(label="About", command=self.about) 88 | menubar.add_cascade(label="Help", menu=helpmenu) 89 | 90 | root.config(menu=menubar) 91 | 92 | 93 | root.bind_all(" tags.
1449 | """
1450 | yield 0, ""
1451 | for tup in inner:
1452 | yield tup
1453 | yield 0, ""
1454 |
1455 | def wrap(self, source, outfile):
1456 | """Return the source with a code, pre, and div."""
1457 | return self._wrap_div(self._wrap_pre(self._wrap_code(source)))
1458 |
1459 | formatter_opts.setdefault("cssclass", "codehilite")
1460 | formatter = HtmlCodeFormatter(**formatter_opts)
1461 | return pygments.highlight(codeblock, lexer, formatter)
1462 |
1463 | def _code_block_sub(self, match, is_fenced_code_block=False):
1464 | lexer_name = None
1465 | if is_fenced_code_block:
1466 | lexer_name = match.group(1)
1467 | if lexer_name:
1468 | formatter_opts = self.extras['fenced-code-blocks'] or {}
1469 | codeblock = match.group(2)
1470 | codeblock = codeblock[:-1] # drop one trailing newline
1471 | else:
1472 | codeblock = match.group(1)
1473 | codeblock = self._outdent(codeblock)
1474 | codeblock = self._detab(codeblock)
1475 | codeblock = codeblock.lstrip('\n') # trim leading newlines
1476 | codeblock = codeblock.rstrip() # trim trailing whitespace
1477 |
1478 | # Note: "code-color" extra is DEPRECATED.
1479 | if "code-color" in self.extras and codeblock.startswith(":::"):
1480 | lexer_name, rest = codeblock.split('\n', 1)
1481 | lexer_name = lexer_name[3:].strip()
1482 | codeblock = rest.lstrip("\n") # Remove lexer declaration line.
1483 | formatter_opts = self.extras['code-color'] or {}
1484 |
1485 | if lexer_name:
1486 | lexer = self._get_pygments_lexer(lexer_name)
1487 | if lexer:
1488 | colored = self._color_with_pygments(codeblock, lexer,
1489 | **formatter_opts)
1490 | return "\n\n%s\n\n" % colored
1491 |
1492 | codeblock = self._encode_code(codeblock)
1493 | pre_class_str = self._html_class_str_from_tag("pre")
1494 | code_class_str = self._html_class_str_from_tag("code")
1495 | return "\n\n%s\n
\n\n" % (
1496 | pre_class_str, code_class_str, codeblock)
1497 |
1498 | def _html_class_str_from_tag(self, tag):
1499 | """Get the appropriate ' class="..."' string (note the leading
1500 | space), if any, for the given tag.
1501 | """
1502 | if "html-classes" not in self.extras:
1503 | return ""
1504 | try:
1505 | html_classes_from_tag = self.extras["html-classes"]
1506 | except TypeError:
1507 | return ""
1508 | else:
1509 | if tag in html_classes_from_tag:
1510 | return ' class="%s"' % html_classes_from_tag[tag]
1511 | return ""
1512 |
    def _do_code_blocks(self, text):
        """Process Markdown `<pre><code>` blocks (tab/4-space indented)."""
        # The regex is rebuilt on each call because it depends on the
        # instance's configured tab width.
        code_block_re = re.compile(r'''
            (?:\n\n|\A\n?)
            ( # $1 = the code block -- one or more lines, starting with a space/tab
              (?:
                (?:[ ]{%d} | \t) # Lines must start with a tab or a tab-width of spaces
                .*\n+
              )+
            )
            ((?=^[ ]{0,%d}\S)|\Z) # Lookahead for non-space at line-start, or end of doc
            ''' % (self.tab_width, self.tab_width),
            re.M | re.X)
        return code_block_re.sub(self._code_block_sub, text)
1527 |
    # Matches a ```-fenced, unindented code block ('fenced-code-blocks'
    # extra): group 1 is the optional language name, group 2 the content.
    _fenced_code_block_re = re.compile(r'''
        (?:\n\n|\A\n?)
        ^```([\w+-]+)?[ \t]*\n # opening fence, $1 = optional lang
        (.*?) # $2 = code block content
        ^```[ \t]*\n # closing fence
        ''', re.M | re.X | re.S)
1534 |
1535 | def _fenced_code_block_sub(self, match):
1536 | return self._code_block_sub(match, is_fenced_code_block=True);
1537 |
1538 | def _do_fenced_code_blocks(self, text):
1539 | """Process ```-fenced unindented code blocks ('fenced-code-blocks' extra)."""
1540 | return self._fenced_code_block_re.sub(self._fenced_code_block_sub, text)
1541 |
1542 | # Rules for a code span:
1543 | # - backslash escapes are not interpreted in a code span
1544 | # - to include one or or a run of more backticks the delimiters must
1545 | # be a longer run of backticks
1546 | # - cannot start or end a code span with a backtick; pad with a
1547 | # space and that space will be removed in the emitted HTML
1548 | # See `test/tm-cases/escapes.text` for a number of edge-case
1549 | # examples.
1550 | _code_span_re = re.compile(r'''
1551 | (?%s" % c
1564 |
    def _do_code_spans(self, text):
        """Convert backtick-delimited spans to `<code>` spans."""
        #   *   Backtick quotes are used for <code></code> spans.
        #
        #   *   You can use multiple backticks as the delimiters if you want to
        #       include literal backticks in the code span. So, this input:
        #
        #         Just type ``foo `bar` baz`` at the prompt.
        #
        #       Will translate to:
        #
        #         <p>Just type <code>foo `bar` baz</code> at the prompt.</p>
        #
        #       There's no arbitrary limit to the number of backticks you
        #       can use as delimters. If you need three consecutive backticks
        #       in your code, use four for delimiters, etc.
        #
        #   *   You can use spaces to get literal backticks at the edges:
        #
        #         ... type `` `bar` `` ...
        #
        #       Turns to:
        #
        #         ... type <code>`bar`</code> ...
        return self._code_span_re.sub(self._code_span_sub, text)
1589 |
1590 | def _encode_code(self, text):
1591 | """Encode/escape certain characters inside Markdown code runs.
1592 | The point is that in code, these characters are literals,
1593 | and lose their special Markdown meanings.
1594 | """
1595 | replacements = [
1596 | # Encode all ampersands; HTML entities are not
1597 | # entities within a Markdown code span.
1598 | ('&', '&'),
1599 | # Do the angle bracket song and dance:
1600 | ('<', '<'),
1601 | ('>', '>'),
1602 | ]
1603 | for before, after in replacements:
1604 | text = text.replace(before, after)
1605 | hashed = _hash_text(text)
1606 | self._escape_table[text] = hashed
1607 | return hashed
1608 |
1609 | _strong_re = re.compile(r"(\*\*|__)(?=\S)(.+?[*_]*)(?<=\S)\1", re.S)
1610 | _em_re = re.compile(r"(\*|_)(?=\S)(.+?)(?<=\S)\1", re.S)
1611 | _code_friendly_strong_re = re.compile(r"\*\*(?=\S)(.+?[*_]*)(?<=\S)\*\*", re.S)
1612 | _code_friendly_em_re = re.compile(r"\*(?=\S)(.+?)(?<=\S)\*", re.S)
1613 | def _do_italics_and_bold(self, text):
1614 | # must go first:
1615 | if "code-friendly" in self.extras:
1616 | text = self._code_friendly_strong_re.sub(r"\1", text)
1617 | text = self._code_friendly_em_re.sub(r"\1", text)
1618 | else:
1619 | text = self._strong_re.sub(r"\2", text)
1620 | text = self._em_re.sub(r"\2", text)
1621 | return text
1622 |
1623 | # "smarty-pants" extra: Very liberal in interpreting a single prime as an
1624 | # apostrophe; e.g. ignores the fact that "round", "bout", "twer", and
1625 | # "twixt" can be written without an initial apostrophe. This is fine because
1626 | # using scare quotes (single quotation marks) is rare.
1627 | _apostrophe_year_re = re.compile(r"'(\d\d)(?=(\s|,|;|\.|\?|!|$))")
1628 | _contractions = ["tis", "twas", "twer", "neath", "o", "n",
1629 | "round", "bout", "twixt", "nuff", "fraid", "sup"]
1630 | def _do_smart_contractions(self, text):
1631 | text = self._apostrophe_year_re.sub(r"’\1", text)
1632 | for c in self._contractions:
1633 | text = text.replace("'%s" % c, "’%s" % c)
1634 | text = text.replace("'%s" % c.capitalize(),
1635 | "’%s" % c.capitalize())
1636 | return text
1637 |
1638 | # Substitute double-quotes before single-quotes.
1639 | _opening_single_quote_re = re.compile(r"(?
1648 | See "test/tm-cases/smarty_pants.text" for a full discussion of the
1649 | support here and
1650 | for a
1651 | discussion of some diversion from the original SmartyPants.
1652 | """
1653 | if "'" in text: # guard for perf
1654 | text = self._do_smart_contractions(text)
1655 | text = self._opening_single_quote_re.sub("‘", text)
1656 | text = self._closing_single_quote_re.sub("’", text)
1657 |
1658 | if '"' in text: # guard for perf
1659 | text = self._opening_double_quote_re.sub("“", text)
1660 | text = self._closing_double_quote_re.sub("”", text)
1661 |
1662 | text = text.replace("---", "—")
1663 | text = text.replace("--", "–")
1664 | text = text.replace("...", "…")
1665 | text = text.replace(" . . . ", "…")
1666 | text = text.replace(". . .", "…")
1667 | return text
1668 |
    # Matches a run of consecutive '>'-quoted lines (plus trailing blanks).
    _block_quote_re = re.compile(r'''
        ( # Wrap whole match in \1
          (
            ^[ \t]*>[ \t]? # '>' at the start of a line
              .+\n # rest of the first line
            (.+\n)* # subsequent consecutive lines
            \n* # blanks
          )+
        )
        ''', re.M | re.X)
    # Strips a single leading '>' (and one optional space/tab) per line.
    _bq_one_level_re = re.compile('^[ \t]*>[ \t]?', re.M);
1680 |
1681 | _html_pre_block_re = re.compile(r'(\s*.+?
)', re.S)
1682 | def _dedent_two_spaces_sub(self, match):
1683 | return re.sub(r'(?m)^ ', '', match.group(1))
1684 |
1685 | def _block_quote_sub(self, match):
1686 | bq = match.group(1)
1687 | bq = self._bq_one_level_re.sub('', bq) # trim one level of quoting
1688 | bq = self._ws_only_line_re.sub('', bq) # trim whitespace-only lines
1689 | bq = self._run_block_gamut(bq) # recurse
1690 |
1691 | bq = re.sub('(?m)^', ' ', bq)
1692 | # These leading spaces screw with content, so we need to fix that:
1693 | bq = self._html_pre_block_re.sub(self._dedent_two_spaces_sub, bq)
1694 |
1695 | return "\n%s\n
\n\n" % bq
1696 |
1697 | def _do_block_quotes(self, text):
1698 | if '>' not in text:
1699 | return text
1700 | return self._block_quote_re.sub(self._block_quote_sub, text)
1701 |
1702 | def _form_paragraphs(self, text):
1703 | # Strip leading and trailing lines:
1704 | text = text.strip('\n')
1705 |
1706 | # Wrap tags.
1707 | grafs = []
1708 | for i, graf in enumerate(re.split(r"\n{2,}", text)):
1709 | if graf in self.html_blocks:
1710 | # Unhashify HTML blocks
1711 | grafs.append(self.html_blocks[graf])
1712 | else:
1713 | cuddled_list = None
1714 | if "cuddled-lists" in self.extras:
1715 | # Need to put back trailing '\n' for `_list_item_re`
1716 | # match at the end of the paragraph.
1717 | li = self._list_item_re.search(graf + '\n')
1718 | # Two of the same list marker in this paragraph: a likely
1719 | # candidate for a list cuddled to preceding paragraph
1720 | # text (issue 33). Note the `[-1]` is a quick way to
1721 | # consider numeric bullets (e.g. "1." and "2.") to be
1722 | # equal.
1723 | if (li and len(li.group(2)) <= 3 and li.group("next_marker")
1724 | and li.group("marker")[-1] == li.group("next_marker")[-1]):
1725 | start = li.start()
1726 | cuddled_list = self._do_lists(graf[start:]).rstrip("\n")
1727 | assert cuddled_list.startswith("
") or cuddled_list.startswith("")
1728 | graf = graf[:start]
1729 |
1730 | # Wrap tags.
1731 | graf = self._run_span_gamut(graf)
1732 | grafs.append("
" + graf.lstrip(" \t") + "
")
1733 |
1734 | if cuddled_list:
1735 | grafs.append(cuddled_list)
1736 |
1737 | return "\n\n".join(grafs)
1738 |
1739 | def _add_footnotes(self, text):
1740 | if self.footnotes:
1741 | footer = [
1742 | '',
1743 | '
',
1745 | ]
1746 | for i, id in enumerate(self.footnote_ids):
1747 | if i != 0:
1748 | footer.append('')
1749 | footer.append('- ' % id)
1750 | footer.append(self._run_block_gamut(self.footnotes[id]))
1751 | backlink = (''
1754 | '↩' % (id, i+1))
1755 | if footer[-1].endswith(""):
1756 | footer[-1] = footer[-1][:-len("")] \
1757 | + ' ' + backlink + ""
1758 | else:
1759 | footer.append("\n
%s
" % backlink)
1760 | footer.append(' ')
1761 | footer.append('')
1762 | footer.append('')
1763 | return text + '\n\n' + '\n'.join(footer)
1764 | else:
1765 | return text
1766 |
1767 | # Ampersand-encoding based entirely on Nat Irons's Amputator MT plugin:
1768 | # http://bumppo.net/projects/amputator/
1769 | _ampersand_re = re.compile(r'&(?!#?[xX]?(?:[0-9a-fA-F]+|\w+);)')
1770 | _naked_lt_re = re.compile(r'<(?![a-z/?\$!])', re.I)
1771 | _naked_gt_re = re.compile(r'''(?''', re.I)
1772 |
1773 | def _encode_amps_and_angles(self, text):
1774 | # Smart processing for ampersands and angle brackets that need
1775 | # to be encoded.
1776 | text = self._ampersand_re.sub('&', text)
1777 |
1778 | # Encode naked <'s
1779 | text = self._naked_lt_re.sub('<', text)
1780 |
1781 | # Encode naked >'s
1782 | # Note: Other markdown implementations (e.g. Markdown.pl, PHP
1783 | # Markdown) don't do this.
1784 | text = self._naked_gt_re.sub('>', text)
1785 | return text
1786 |
1787 | def _encode_backslash_escapes(self, text):
1788 | for ch, escape in list(self._escape_table.items()):
1789 | text = text.replace("\\"+ch, escape)
1790 | return text
1791 |
1792 | _auto_link_re = re.compile(r'<((https?|ftp):[^\'">\s]+)>', re.I)
1793 | def _auto_link_sub(self, match):
1794 | g1 = match.group(1)
1795 | return '%s' % (g1, g1)
1796 |
    # Matches <addr@example.com> or <mailto:addr@example.com>; group 1 is
    # the bare address.
    _auto_email_link_re = re.compile(r"""
          <
           (?:mailto:)?
          (
              [-.\w]+
              \@
              [-\w]+(\.[-\w]+)*\.[a-z]+
          )
          >
        """, re.I | re.X | re.U)
    def _auto_email_link_sub(self, match):
        # Unescape first so the obfuscator sees the literal address.
        return self._encode_email_address(
            self._unescape_special_chars(match.group(1)))
1810 |
1811 | def _do_auto_links(self, text):
1812 | text = self._auto_link_re.sub(self._auto_link_sub, text)
1813 | text = self._auto_email_link_re.sub(self._auto_email_link_sub, text)
1814 | return text
1815 |
1816 | def _encode_email_address(self, addr):
1817 | # Input: an email address, e.g. "foo@example.com"
1818 | #
1819 | # Output: the email address as a mailto link, with each character
1820 | # of the address encoded as either a decimal or hex entity, in
1821 | # the hopes of foiling most address harvesting spam bots. E.g.:
1822 | #
1823 | # foo
1825 | # @example.com
1826 | #
1827 | # Based on a filter by Matthew Wickline, posted to the BBEdit-Talk
1828 | # mailing list:
1829 | chars = [_xml_encode_email_char_at_random(ch)
1830 | for ch in "mailto:" + addr]
1831 | # Strip the mailto: from the visible part.
1832 | addr = '%s' \
1833 | % (''.join(chars), ''.join(chars[7:]))
1834 | return addr
1835 |
1836 | def _do_link_patterns(self, text):
1837 | """Caveat emptor: there isn't much guarding against link
1838 | patterns being formed inside other standard Markdown links, e.g.
1839 | inside a [link def][like this].
1840 |
1841 | Dev Notes: *Could* consider prefixing regexes with a negative
1842 | lookbehind assertion to attempt to guard against this.
1843 | """
1844 | link_from_hash = {}
1845 | for regex, repl in self.link_patterns:
1846 | replacements = []
1847 | for match in regex.finditer(text):
1848 | if hasattr(repl, "__call__"):
1849 | href = repl(match)
1850 | else:
1851 | href = match.expand(repl)
1852 | replacements.append((match.span(), href))
1853 | for (start, end), href in reversed(replacements):
1854 | escaped_href = (
1855 | href.replace('"', '"') # b/c of attr quote
1856 | # To avoid markdown and :
1857 | .replace('*', self._escape_table['*'])
1858 | .replace('_', self._escape_table['_']))
1859 | link = '%s' % (escaped_href, text[start:end])
1860 | hash = _hash_text(link)
1861 | link_from_hash[hash] = link
1862 | text = text[:start] + hash + text[end:]
1863 | for hash, link in list(link_from_hash.items()):
1864 | text = text.replace(hash, link)
1865 | return text
1866 |
1867 | def _unescape_special_chars(self, text):
1868 | # Swap back in all the special characters we've hidden.
1869 | for ch, hash in list(self._escape_table.items()):
1870 | text = text.replace(hash, ch)
1871 | return text
1872 |
1873 | def _outdent(self, text):
1874 | # Remove one level of line-leading tabs or spaces
1875 | return self._outdent_re.sub('', text)
1876 |
1877 |
class MarkdownWithExtras(Markdown):
    """A markdowner class that enables most extras:

    - footnotes
    - code-color (only has effect if 'pygments' Python module on path)

    These are not included:
    - pyshell (specific to Python-related documenting)
    - code-friendly (because it *disables* part of the syntax)
    - link-patterns (because you need to specify some actual
      link-patterns anyway)
    """
    # Overrides the base class's default extras list.
    extras = ["footnotes", "code-color"]
1891 |
1892 |
1893 | #---- internal support functions
1894 |
class UnicodeWithAttrs(unicode):
    """A subclass of unicode used for the return value of conversion to
    possibly attach some attributes. E.g. the "toc_html" attribute when
    the "toc" extra is used.
    """
    # Metadata dict when the "metadata" extra is used; otherwise None.
    metadata = None
    # List of (level, id, name) tuples set by the "toc" extra.
    _toc = None
    def toc_html(self):
        """Return the HTML for the current TOC.

        This expects the `_toc` attribute to have been set on this instance.

        NOTE(review): the <ul>/<li>/<a> literals below were stripped in
        the dump this file was recovered from; restored.
        """
        if self._toc is None:
            return None

        def indent():
            return '  ' * (len(h_stack) - 1)
        lines = []
        h_stack = [0]   # stack of header-level numbers
        for level, id, name in self._toc:
            if level > h_stack[-1]:
                lines.append("%s<ul>" % indent())
                h_stack.append(level)
            elif level == h_stack[-1]:
                lines[-1] += "</li>"
            else:
                while level < h_stack[-1]:
                    h_stack.pop()
                    if not lines[-1].endswith("</li>"):
                        lines[-1] += "</li>"
                    lines.append("%s</ul></li>" % indent())
            lines.append('%s<li><a href="#%s">%s</a>' % (
                indent(), id, name))
        while len(h_stack) > 1:
            h_stack.pop()
            if not lines[-1].endswith("</li>"):
                lines[-1] += "</li>"
            lines.append("%s</ul>" % indent())
        return '\n'.join(lines) + '\n'
    toc_html = property(toc_html)
1935 |
1936 | ## {{{ http://code.activestate.com/recipes/577257/ (r1)
1937 | _slugify_strip_re = re.compile(r'[^\w\s-]')
1938 | _slugify_hyphenate_re = re.compile(r'[-\s]+')
1939 | def _slugify(value):
1940 | """
1941 | Normalizes string, converts to lowercase, removes non-alpha characters,
1942 | and converts spaces to hyphens.
1943 |
1944 | From Django's "django/template/defaultfilters.py".
1945 | """
1946 | import unicodedata
1947 | value = unicodedata.normalize('NFKD', value).encode('ascii', 'ignore').decode()
1948 | value = _slugify_strip_re.sub('', value).strip().lower()
1949 | return _slugify_hyphenate_re.sub('-', value)
1950 | ## end of http://code.activestate.com/recipes/577257/ }}}
1951 |
1952 |
1953 | # From http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/52549
1954 | def _curry(*args, **kwargs):
1955 | function, args = args[0], args[1:]
1956 | def result(*rest, **kwrest):
1957 | combined = kwargs.copy()
1958 | combined.update(kwrest)
1959 | return function(*args + rest, **combined)
1960 | return result
1961 |
1962 | # Recipe: regex_from_encoded_pattern (1.0)
1963 | def _regex_from_encoded_pattern(s):
1964 | """'foo' -> re.compile(re.escape('foo'))
1965 | '/foo/' -> re.compile('foo')
1966 | '/foo/i' -> re.compile('foo', re.I)
1967 | """
1968 | if s.startswith('/') and s.rfind('/') != 0:
1969 | # Parse it: /PATTERN/FLAGS
1970 | idx = s.rfind('/')
1971 | pattern, flags_str = s[1:idx], s[idx+1:]
1972 | flag_from_char = {
1973 | "i": re.IGNORECASE,
1974 | "l": re.LOCALE,
1975 | "s": re.DOTALL,
1976 | "m": re.MULTILINE,
1977 | "u": re.UNICODE,
1978 | }
1979 | flags = 0
1980 | for char in flags_str:
1981 | try:
1982 | flags |= flag_from_char[char]
1983 | except KeyError:
1984 | raise ValueError("unsupported regex flag: '%s' in '%s' "
1985 | "(must be one of '%s')"
1986 | % (char, s, ''.join(list(flag_from_char.keys()))))
1987 | return re.compile(s[1:idx], flags)
1988 | else: # not an encoded regex
1989 | return re.compile(re.escape(s))
1990 |
1991 | # Recipe: dedent (0.1.2)
# Recipe: dedent (0.1.2)
def _dedentlines(lines, tabsize=8, skip_first_line=False):
    """_dedentlines(lines, tabsize=8, skip_first_line=False) -> dedented lines

        "lines" is a list of lines to dedent.
        "tabsize" is the tab width to use for indent width calculations.
        "skip_first_line" is a boolean indicating if the first line should
            be skipped for calculating the indent width and for dedenting.
            This is sometimes useful for docstrings and similar.

    Same as dedent() except operates on a sequence of lines. Note: the
    lines list is modified **in-place**.
    """
    DEBUG = False
    if DEBUG:
        print("dedent: dedent(..., tabsize=%d, skip_first_line=%r)"\
              % (tabsize, skip_first_line))
    # NOTE(review): `indents` is never used below — looks like leftover
    # scaffolding; confirm before removing.
    indents = []
    margin = None
    # First pass: find the common leading-whitespace margin (tabs counted
    # at `tabsize`), ignoring all-whitespace lines.
    for i, line in enumerate(lines):
        if i == 0 and skip_first_line: continue
        indent = 0
        for ch in line:
            if ch == ' ':
                indent += 1
            elif ch == '\t':
                indent += tabsize - (indent % tabsize)
            elif ch in '\r\n':
                continue # skip all-whitespace lines
            else:
                break
        else:
            continue # skip all-whitespace lines
        if DEBUG: print("dedent: indent=%d: %r" % (indent, line))
        if margin is None:
            margin = indent
        else:
            margin = min(margin, indent)
    if DEBUG: print("dedent: margin=%r" % margin)

    # Second pass: strip `margin` columns of whitespace from each line,
    # mutating `lines` in place.
    if margin is not None and margin > 0:
        for i, line in enumerate(lines):
            if i == 0 and skip_first_line: continue
            removed = 0
            for j, ch in enumerate(line):
                if ch == ' ':
                    removed += 1
                elif ch == '\t':
                    removed += tabsize - (removed % tabsize)
                elif ch in '\r\n':
                    if DEBUG: print("dedent: %r: EOL -> strip up to EOL" % line)
                    lines[i] = lines[i][j:]
                    break
                else:
                    raise ValueError("unexpected non-whitespace char %r in "
                                     "line %r while removing %d-space margin"
                                     % (ch, line, margin))
                if DEBUG:
                    print("dedent: %r: %r -> removed %d/%d"\
                          % (line, ch, removed, margin))
                if removed == margin:
                    lines[i] = lines[i][j+1:]
                    break
                elif removed > margin:
                    # A tab overshot the margin: pad back the difference.
                    lines[i] = ' '*(removed-margin) + lines[i][j+1:]
                    break
            else:
                if removed:
                    lines[i] = lines[i][removed:]
    return lines
2061 |
def _dedent(text, tabsize=8, skip_first_line=False):
    """_dedent(text, tabsize=8, skip_first_line=False) -> dedented text

        "text" is the text to dedent.
        "tabsize" is the tab width to use for indent width calculations.
        "skip_first_line" is a boolean indicating if the first line should
            be skipped for calculating the indent width and for dedenting.
            This is sometimes useful for docstrings and similar.

    textwrap.dedent(s), but don't expand tabs to spaces
    """
    # Keep line endings so the join below reassembles the text exactly.
    lines = text.splitlines(True)
    _dedentlines(lines, tabsize=tabsize, skip_first_line=skip_first_line)
    return ''.join(lines)
2076 |
2077 |
2078 | class _memoized(object):
2079 | """Decorator that caches a function's return value each time it is called.
2080 | If called later with the same arguments, the cached value is returned, and
2081 | not re-evaluated.
2082 |
2083 | http://wiki.python.org/moin/PythonDecoratorLibrary
2084 | """
2085 | def __init__(self, func):
2086 | self.func = func
2087 | self.cache = {}
2088 | def __call__(self, *args):
2089 | try:
2090 | return self.cache[args]
2091 | except KeyError:
2092 | self.cache[args] = value = self.func(*args)
2093 | return value
2094 | except TypeError:
2095 | # uncachable -- for instance, passing a list as an argument.
2096 | # Better to not cache than to blow up entirely.
2097 | return self.func(*args)
2098 | def __repr__(self):
2099 | """Return the function's docstring."""
2100 | return self.func.__doc__
2101 |
2102 |
def _xml_oneliner_re_from_tab_width(tab_width):
    """Standalone XML processing instruction regex."""
    return re.compile(r"""
        (?:
            (?<=\n\n) # Starting after a blank line
            | # or
            \A\n? # the beginning of the doc
        )
        ( # save in $1
            [ ]{0,%d}
            (?:
                <\?\w+\b\s+.*?\?> # XML processing instruction
                |
                <\w+:\w+\b\s+.*?/> # namespaced single tag
            )
            [ \t]*
            (?=\n{2,}|\Z) # followed by a blank line or end of document
        )
        """ % (tab_width - 1), re.X)
# Rebind through _memoized so the regex is compiled once per tab width.
_xml_oneliner_re_from_tab_width = _memoized(_xml_oneliner_re_from_tab_width)
2123 |
def _hr_tag_re_from_tab_width(tab_width):
    """Regex matching a standalone <hr> tag line, for the given tab width."""
    return re.compile(r"""
        (?:
            (?<=\n\n) # Starting after a blank line
            | # or
            \A\n? # the beginning of the doc
        )
        ( # save in \1
            [ ]{0,%d}
            <(hr) # start tag = \2
            \b # word break
            ([^<>])*? #
            /?> # the matching end tag
            [ \t]*
            (?=\n{2,}|\Z) # followed by a blank line or end of document
        )
        """ % (tab_width - 1), re.X)
# Rebind through _memoized so the regex is compiled once per tab width.
_hr_tag_re_from_tab_width = _memoized(_hr_tag_re_from_tab_width)
2142 |
2143 |
2144 | def _xml_escape_attr(attr, skip_single_quote=True):
2145 | """Escape the given string for use in an HTML/XML tag attribute.
2146 |
2147 | By default this doesn't bother with escaping `'` to `'`, presuming that
2148 | the tag attribute is surrounded by double quotes.
2149 | """
2150 | escaped = (attr
2151 | .replace('&', '&')
2152 | .replace('"', '"')
2153 | .replace('<', '<')
2154 | .replace('>', '>'))
2155 | if not skip_single_quote:
2156 | escaped = escaped.replace("'", "'")
2157 | return escaped
2158 |
2159 |
2160 | def _xml_encode_email_char_at_random(ch):
2161 | r = random()
2162 | # Roughly 10% raw, 45% hex, 45% dec.
2163 | # '@' *must* be encoded. I [John Gruber] insist.
2164 | # Issue 26: '_' must be encoded.
2165 | if r > 0.9 and ch not in "@_":
2166 | return ch
2167 | elif r < 0.45:
2168 | # The [1:] is to drop leading '0': 0x63 -> x63
2169 | return '%s;' % hex(ord(ch))[1:]
2170 | else:
2171 | return '%s;' % ord(ch)
2172 |
2173 |
2174 |
2175 | #---- mainline
2176 |
2177 | class _NoReflowFormatter(optparse.IndentedHelpFormatter):
2178 | """An optparse formatter that does NOT reflow the description."""
2179 | def format_description(self, description):
2180 | return description or ""
2181 |
2182 | def _test():
2183 | import doctest
2184 | doctest.testmod()
2185 |
2186 | def main(argv=None):
# Command-line entry point: parse options, read each input path (or
# stdin for '-'), convert with markdown(), and write the HTML to stdout.
# With --compare, also runs Markdown.pl and diffs the normalized output.
2187 | if argv is None:
2188 | argv = sys.argv
2189 | if not logging.root.handlers:
2190 | logging.basicConfig()
2191 |
# Build the option parser. _NoReflowFormatter keeps cmdln_desc's
# hand-formatted text intact.
2192 | usage = "usage: %prog [PATHS...]"
2193 | version = "%prog "+__version__
2194 | parser = optparse.OptionParser(prog="markdown2", usage=usage,
2195 | version=version, description=cmdln_desc,
2196 | formatter=_NoReflowFormatter())
2197 | parser.add_option("-v", "--verbose", dest="log_level",
2198 | action="store_const", const=logging.DEBUG,
2199 | help="more verbose output")
2200 | parser.add_option("--encoding",
2201 | help="specify encoding of text content")
2202 | parser.add_option("--html4tags", action="store_true", default=False,
2203 | help="use HTML 4 style for empty element tags")
2204 | parser.add_option("-s", "--safe", metavar="MODE", dest="safe_mode",
2205 | help="sanitize literal HTML: 'escape' escapes "
2206 | "HTML meta chars, 'replace' replaces with an "
2207 | "[HTML_REMOVED] note")
2208 | parser.add_option("-x", "--extras", action="append",
2209 | help="Turn on specific extra features (not part of "
2210 | "the core Markdown spec). See above.")
# NOTE(review): the help string below ends in "" -- the referenced URL
# appears to have been lost in transit; restore it from upstream.
2211 | parser.add_option("--use-file-vars",
2212 | help="Look for and use Emacs-style 'markdown-extras' "
2213 | "file var to turn on extras. See "
2214 | "")
2215 | parser.add_option("--link-patterns-file",
2216 | help="path to a link pattern file")
2217 | parser.add_option("--self-test", action="store_true",
2218 | help="run internal self-tests (some doctests)")
2219 | parser.add_option("--compare", action="store_true",
2220 | help="run against Markdown.pl as well (for testing)")
2221 | parser.set_defaults(log_level=logging.INFO, compare=False,
2222 | encoding="utf-8", safe_mode=None, use_file_vars=False)
2223 | opts, paths = parser.parse_args()
2224 | log.setLevel(opts.log_level)
2225 |
2226 | if opts.self_test:
2227 | return _test()
2228 |
# Parse "-x a,b,c" / "-x name=value" occurrences into an extras dict;
# values default to None, integer-looking values are coerced to int.
2229 | if opts.extras:
2230 | extras = {}
2231 | for s in opts.extras:
2232 | splitter = re.compile("[,;: ]+")
2233 | for e in splitter.split(s):
2234 | if '=' in e:
2235 | ename, earg = e.split('=', 1)
2236 | try:
2237 | earg = int(earg)
2238 | except ValueError:
2239 | pass
2240 | else:
2241 | ename, earg = e, None
2242 | extras[ename] = earg
2243 | else:
2244 | extras = None
2245 |
# Load "<regex> <href>" pairs (blank lines and '#' comments skipped)
# for the "link-patterns" extra.
2246 | if opts.link_patterns_file:
2247 | link_patterns = []
2248 | f = open(opts.link_patterns_file)
2249 | try:
2250 | for i, line in enumerate(f.readlines()):
2251 | if not line.strip(): continue
2252 | if line.lstrip().startswith("#"): continue
2253 | try:
2254 | pat, href = line.rstrip().rsplit(None, 1)
2255 | except ValueError:
2256 | raise MarkdownError("%s:%d: invalid link pattern line: %r"
2257 | % (opts.link_patterns_file, i+1, line))
2258 | link_patterns.append(
2259 | (_regex_from_encoded_pattern(pat), href))
2260 | finally:
2261 | f.close()
2262 | else:
2263 | link_patterns = None
2264 |
2265 | from os.path import join, dirname, abspath, exists
2266 | markdown_pl = join(dirname(dirname(abspath(__file__))), "test",
2267 | "Markdown.pl")
# Convert each path; '-' (the default) means stdin.
2268 | if not paths:
2269 | paths = ['-']
2270 | for path in paths:
2271 | if path == '-':
2272 | text = sys.stdin.read()
2273 | else:
2274 | fp = codecs.open(path, 'r', opts.encoding)
2275 | text = fp.read()
2276 | fp.close()
2277 | if opts.compare:
2278 | from subprocess import Popen, PIPE
2279 | print("==== Markdown.pl ====")
2280 | p = Popen('perl %s' % markdown_pl, shell=True, stdin=PIPE, stdout=PIPE, close_fds=True)
2281 | p.stdin.write(text.encode('utf-8'))
2282 | p.stdin.close()
2283 | perl_html = p.stdout.read().decode('utf-8')
2284 | if py3:
2285 | sys.stdout.write(perl_html)
2286 | else:
2287 | sys.stdout.write(perl_html.encode(
2288 | sys.stdout.encoding or "utf-8", 'xmlcharrefreplace'))
2289 | print("==== markdown2.py ====")
2290 | html = markdown(text,
2291 | html4tags=opts.html4tags,
2292 | safe_mode=opts.safe_mode,
2293 | extras=extras, link_patterns=link_patterns,
2294 | use_file_vars=opts.use_file_vars)
2295 | if py3:
2296 | sys.stdout.write(html)
2297 | else:
2298 | sys.stdout.write(html.encode(
2299 | sys.stdout.encoding or "utf-8", 'xmlcharrefreplace'))
2300 | if extras and "toc" in extras:
2301 | log.debug("toc_html: " +
2302 | html.toc_html.encode(sys.stdout.encoding or "utf-8", 'xmlcharrefreplace'))
# In --compare mode, normalize both outputs (via the test helper when
# available) before checking whether they match.
2303 | if opts.compare:
2304 | test_dir = join(dirname(dirname(abspath(__file__))), "test")
2305 | if exists(join(test_dir, "test_markdown2.py")):
2306 | sys.path.insert(0, test_dir)
2307 | from test_markdown2 import norm_html_from_html
2308 | norm_html = norm_html_from_html(html)
2309 | norm_perl_html = norm_html_from_html(perl_html)
2310 | else:
2311 | norm_html = html
2312 | norm_perl_html = perl_html
2313 | print("==== match? %r ====" % (norm_perl_html == norm_html))
2314 |
2315 |
if __name__ == "__main__":
    # Delegate to main() and propagate its return value as the exit status.
    sys.exit(main(sys.argv))
2318 |
--------------------------------------------------------------------------------
/markdown2.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # Copyright (c) 2012 Trent Mick.
3 | # Copyright (c) 2007-2008 ActiveState Corp.
4 | # License: MIT (http://www.opensource.org/licenses/mit-license.php)
5 |
6 | from __future__ import generators
7 |
8 | r"""A fast and complete Python implementation of Markdown.
9 |
10 | [from http://daringfireball.net/projects/markdown/]
11 | > Markdown is a text-to-HTML filter; it translates an easy-to-read /
12 | > easy-to-write structured text format into HTML. Markdown's text
13 | > format is most similar to that of plain text email, and supports
14 | > features such as headers, *emphasis*, code blocks, blockquotes, and
15 | > links.
16 | >
17 | > Markdown's syntax is designed not as a generic markup language, but
18 | > specifically to serve as a front-end to (X)HTML. You can use span-level
19 | > HTML tags anywhere in a Markdown document, and you can use block level
20 | > HTML tags (like and `
898 | # Must come after _do_links(), because you can use < and >
899 | # delimiters in inline links like [this]().
900 | text = self._do_auto_links(text)
901 |
902 | if "link-patterns" in self.extras:
903 | text = self._do_link_patterns(text)
904 |
905 | text = self._encode_amps_and_angles(text)
906 |
907 | text = self._do_italics_and_bold(text)
908 |
909 | if "smarty-pants" in self.extras:
910 | text = self._do_smart_punctuation(text)
911 |
912 | # Do hard breaks:
913 | text = re.sub(r" {2,}\n", "
925 | |
926 | # auto-link (e.g., )
927 | <\w+[^>]*>
928 | |
929 | # comment
930 | |
931 | <\?.*?\?> # processing instruction
932 | )
933 | """, re.X)
934 |
def _escape_special_chars(self, text):
    """Escape Markdown-special characters, alternating between HTML
    markup tokens and plain text.

    Inside tags/HTML-comments/auto-links, '*' and '_' are swapped for
    their escape-table hashes so they can't be mistaken for emphasis
    markers; outside markup, backslash escapes are encoded.

    Python markdown note: the HTML tokenization here differs from that
    in Markdown.pl (the tokenizer here isn't susceptible to unmatched
    '<' and '>' in HTML tags), so subtle cases can differ. Note that
    '>' is not allowed in an auto-link URL here.
    """
    pieces = []
    in_markup = False
    # _sorta_html_tokenize_re.split() alternates text / markup chunks.
    for chunk in self._sorta_html_tokenize_re.split(text):
        if in_markup:
            pieces.append(chunk.replace('*', self._escape_table['*'])
                               .replace('_', self._escape_table['_']))
        else:
            pieces.append(self._encode_backslash_escapes(chunk))
        in_markup = not in_markup
    return ''.join(pieces)
958 |
def _hash_html_spans(self, text):
    """Replace raw HTML spans with hash keys (used for safe_mode).

    Auto-links and auto-email-links are trusted markup and are left
    untouched; everything else is sanitized and stashed in
    self.html_spans under its hash key.
    """

    def _is_auto_link(s):
        # Cheap substring guards before the regex checks.
        if ':' in s and self._auto_link_re.match(s):
            return True
        elif '@' in s and self._auto_email_link_re.match(s):
            return True
        return False

    pieces = []
    in_markup = False
    for chunk in self._sorta_html_tokenize_re.split(text):
        if in_markup and not _is_auto_link(chunk):
            clean = self._sanitize_html(chunk)
            key = _hash_text(clean)
            self.html_spans[key] = clean
            pieces.append(key)
        else:
            pieces.append(chunk)
        in_markup = not in_markup
    return ''.join(pieces)
981 |
982 | def _unhash_html_spans(self, text):
983 | for key, sanitized in list(self.html_spans.items()):
984 | text = text.replace(key, sanitized)
985 | return text
986 |
987 | def _sanitize_html(self, s):
988 | if self.safe_mode == "replace":
989 | return self.html_removed_text
990 | elif self.safe_mode == "escape":
991 | replacements = [
992 | ('&', '&'),
993 | ('<', '<'),
994 | ('>', '>'),
995 | ]
996 | for before, after in replacements:
997 | s = s.replace(before, after)
998 | return s
999 | else:
1000 | raise MarkdownError("invalid value for 'safe_mode': %r (must be "
1001 | "'escape' or 'replace')" % self.safe_mode)
1002 |
1003 | _tail_of_inline_link_re = re.compile(r'''
1004 | # Match tail of: [text](/url/) or [text](/url/ "title")
1005 | \( # literal paren
1006 | [ \t]*
1007 | (?P # \1
1008 | <.*?>
1009 | |
1010 | .*?
1011 | )
1012 | [ \t]*
1013 | ( # \2
1014 | (['"]) # quote char = \3
1015 | (?P.*?)
1016 | \3 # matching quote
1017 | )? # title is optional
1018 | \)
1019 | ''', re.X | re.S)
1020 | _tail_of_reference_link_re = re.compile(r'''
1021 | # Match tail of: [text][id]
1022 | [ ]? # one optional space
1023 | (?:\n[ ]*)? # one optional newline followed by spaces
1024 | \[
1025 | (?P.*?)
1026 | \]
1027 | ''', re.X | re.S)
1028 |
1029 | def _do_links(self, text):
1030 | """Turn Markdown link shortcuts into XHTML and
tags.
1031 |
1032 | This is a combination of Markdown.pl's _DoAnchors() and
1033 | _DoImages(). They are done together because that simplified the
1034 | approach. It was necessary to use a different approach than
1035 | Markdown.pl because of the lack of atomic matching support in
1036 | Python's regex engine used in $g_nested_brackets.
1037 | """
1038 | MAX_LINK_TEXT_SENTINEL = 3000 # markdown2 issue 24
1039 |
1040 | # `anchor_allowed_pos` is used to support img links inside
1041 | # anchors, but not anchors inside anchors. An anchor's start
1042 | # pos must be `>= anchor_allowed_pos`.
1043 | anchor_allowed_pos = 0
1044 |
1045 | curr_pos = 0
1046 | while True: # Handle the next link.
1047 | # The next '[' is the start of:
1048 | # - an inline anchor: [text](url "title")
1049 | # - a reference anchor: [text][id]
1050 | # - an inline img: 
1051 | # - a reference img: ![text][id]
1052 | # - a footnote ref: [^id]
1053 | # (Only if 'footnotes' extra enabled)
1054 | # - a footnote defn: [^id]: ...
1055 | # (Only if 'footnotes' extra enabled) These have already
1056 | # been stripped in _strip_footnote_definitions() so no
1057 | # need to watch for them.
1058 | # - a link definition: [id]: url "title"
1059 | # These have already been stripped in
1060 | # _strip_link_definitions() so no need to watch for them.
1061 | # - not markup: [...anything else...
1062 | try:
1063 | start_idx = text.index('[', curr_pos)
1064 | except ValueError:
1065 | break
1066 | text_length = len(text)
1067 |
1068 | # Find the matching closing ']'.
1069 | # Markdown.pl allows *matching* brackets in link text so we
1070 | # will here too. Markdown.pl *doesn't* currently allow
1071 | # matching brackets in img alt text -- we'll differ in that
1072 | # regard.
1073 | bracket_depth = 0
1074 | for p in range(start_idx+1, min(start_idx+MAX_LINK_TEXT_SENTINEL,
1075 | text_length)):
1076 | ch = text[p]
1077 | if ch == ']':
1078 | bracket_depth -= 1
1079 | if bracket_depth < 0:
1080 | break
1081 | elif ch == '[':
1082 | bracket_depth += 1
1083 | else:
1084 | # Closing bracket not found within sentinel length.
1085 | # This isn't markup.
1086 | curr_pos = start_idx + 1
1087 | continue
1088 | link_text = text[start_idx+1:p]
1089 |
1090 | # Possibly a footnote ref?
1091 | if "footnotes" in self.extras and link_text.startswith("^"):
1092 | normed_id = re.sub(r'\W', '-', link_text[1:])
1093 | if normed_id in self.footnotes:
1094 | self.footnote_ids.append(normed_id)
1095 | result = '' \
1096 | '%s' \
1097 | % (normed_id, normed_id, len(self.footnote_ids))
1098 | text = text[:start_idx] + result + text[p+1:]
1099 | else:
1100 | # This id isn't defined, leave the markup alone.
1101 | curr_pos = p+1
1102 | continue
1103 |
1104 | # Now determine what this is by the remainder.
1105 | p += 1
1106 | if p == text_length:
1107 | return text
1108 |
1109 | # Inline anchor or img?
1110 | if text[p] == '(': # attempt at perf improvement
1111 | match = self._tail_of_inline_link_re.match(text, p)
1112 | if match:
1113 | # Handle an inline anchor or img.
1114 | is_img = start_idx > 0 and text[start_idx-1] == "!"
1115 | if is_img:
1116 | start_idx -= 1
1117 |
1118 | url, title = match.group("url"), match.group("title")
1119 | if url and url[0] == '<':
1120 | url = url[1:-1] # '' -> 'url'
1121 | # We've got to encode these to avoid conflicting
1122 | # with italics/bold.
1123 | url = url.replace('*', self._escape_table['*']) \
1124 | .replace('_', self._escape_table['_'])
1125 | if title:
1126 | title_str = ' title="%s"' % (
1127 | _xml_escape_attr(title)
1128 | .replace('*', self._escape_table['*'])
1129 | .replace('_', self._escape_table['_']))
1130 | else:
1131 | title_str = ''
1132 | if is_img:
1133 | result = '
= anchor_allowed_pos:
1142 | result_head = '' % (url, title_str)
1143 | result = '%s%s' % (result_head, link_text)
1144 | if "smarty-pants" in self.extras:
1145 | result = result.replace('"', self._escape_table['"'])
1146 | #
allowed from curr_pos on, from
1147 | # anchor_allowed_pos on.
1148 | curr_pos = start_idx + len(result_head)
1149 | anchor_allowed_pos = start_idx + len(result)
1150 | text = text[:start_idx] + result + text[match.end():]
1151 | else:
1152 | # Anchor not allowed here.
1153 | curr_pos = start_idx + 1
1154 | continue
1155 |
1156 | # Reference anchor or img?
1157 | else:
1158 | match = self._tail_of_reference_link_re.match(text, p)
1159 | if match:
1160 | # Handle a reference-style anchor or img.
1161 | is_img = start_idx > 0 and text[start_idx-1] == "!"
1162 | if is_img:
1163 | start_idx -= 1
1164 | link_id = match.group("id").lower()
1165 | if not link_id:
1166 | link_id = link_text.lower() # for links like [this][]
1167 | if link_id in self.urls:
1168 | url = self.urls[link_id]
1169 | # We've got to encode these to avoid conflicting
1170 | # with italics/bold.
1171 | url = url.replace('*', self._escape_table['*']) \
1172 | .replace('_', self._escape_table['_'])
1173 | title = self.titles.get(link_id)
1174 | if title:
1175 | before = title
1176 | title = _xml_escape_attr(title) \
1177 | .replace('*', self._escape_table['*']) \
1178 | .replace('_', self._escape_table['_'])
1179 | title_str = ' title="%s"' % title
1180 | else:
1181 | title_str = ''
1182 | if is_img:
1183 | result = '
= anchor_allowed_pos:
1192 | result = '%s' \
1193 | % (url, title_str, link_text)
1194 | result_head = '' % (url, title_str)
1195 | result = '%s%s' % (result_head, link_text)
1196 | if "smarty-pants" in self.extras:
1197 | result = result.replace('"', self._escape_table['"'])
1198 | #
allowed from curr_pos on, from
1199 | # anchor_allowed_pos on.
1200 | curr_pos = start_idx + len(result_head)
1201 | anchor_allowed_pos = start_idx + len(result)
1202 | text = text[:start_idx] + result + text[match.end():]
1203 | else:
1204 | # Anchor not allowed here.
1205 | curr_pos = start_idx + 1
1206 | else:
1207 | # This id isn't defined, leave the markup alone.
1208 | curr_pos = match.end()
1209 | continue
1210 |
1211 | # Otherwise, it isn't markup.
1212 | curr_pos = start_idx + 1
1213 |
1214 | return text
1215 |
def header_id_from_text(self, text, prefix, n):
    """Generate a header id attribute value from the given header
    HTML content.

    This is only called if the "header-ids" extra is enabled.
    Subclasses may override this for different header ids.

    @param text {str} The text of the header tag
    @param prefix {str} The requested prefix for header ids. This is the
        value of the "header-ids" extra key, if any. Otherwise, None.
    @param n {int} The tag level (1 for the topmost header tag).
    @returns {str} The value for the header tag's "id" attribute. Return
        None to not have an id attribute and to exclude this header from
        the TOC (if the "toc" extra is specified).
    """
    slug = _slugify(text)
    if prefix and isinstance(prefix, base_string_type):
        slug = prefix + '-' + slug
    counts = self._count_from_header_id
    if slug in counts:
        # Repeated header text: disambiguate with a numeric suffix.
        counts[slug] += 1
        slug += '-%s' % counts[slug]
    else:
        counts[slug] = 1
    return slug
1240 |
# Running table-of-contents accumulator; stays None until the first entry.
_toc = None
def _toc_add_entry(self, level, id, name):
    """Append one (level, id, name) entry to the TOC, unescaping the
    display name first."""
    if self._toc is None:
        self._toc = []
    entry = (level, id, self._unescape_special_chars(name))
    self._toc.append(entry)
1246 |
_setext_h_re = re.compile(r'^(.+)[ \t]*\n(=+|-+)[ \t]*\n+', re.M)
def _setext_h_sub(self, match):
    """Replace one setext-style header match with an h1/h2 tag.

    Bug fix: the return format string had lost its HTML tags (it read
    '"%s \\n\\n"' with four arguments -- a guaranteed TypeError);
    restored the '<h%d%s>%s</h%d>' form.
    """
    # '=' underline -> h1, '-' underline -> h2.
    n = {"=": 1, "-": 2}[match.group(2)[0]]
    demote_headers = self.extras.get("demote-headers")
    if demote_headers:
        n = min(n + demote_headers, 6)
    header_id_attr = ""
    header_id = None  # robustness: defined even when "header-ids" is off
    if "header-ids" in self.extras:
        header_id = self.header_id_from_text(match.group(1),
            self.extras["header-ids"], n)
        if header_id:
            header_id_attr = ' id="%s"' % header_id
    html = self._run_span_gamut(match.group(1))
    if "toc" in self.extras and header_id:
        self._toc_add_entry(n, header_id, html)
    return "<h%d%s>%s</h%d>\n\n" % (n, header_id_attr, html, n)
1263 |
1264 | _atx_h_re = re.compile(r'''
1265 | ^(\#{1,6}) # \1 = string of #'s
1266 | [ \t]+
1267 | (.+?) # \2 = Header text
1268 | [ \t]*
1269 | (?%s\n\n" % (n, header_id_attr, html, n)
1288 |
def _do_headers(self, text):
    """Convert both Markdown header syntaxes to header tags.

    Setext style:               atx style:
        Header 1                    # Header 1
        ========                    ## Header 2
        Header 2                    ## Header 2 with closing hashes ##
        --------                    ###### Header 6
    """
    text = self._setext_h_re.sub(self._setext_h_sub, text)
    return self._atx_h_re.sub(self._atx_h_sub, text)
1307 |
1308 |
1309 | _marker_ul_chars = '*+-'
1310 | _marker_any = r'(?:[%s]|\d+\.)' % _marker_ul_chars
1311 | _marker_ul = '(?:[%s])' % _marker_ul_chars
1312 | _marker_ol = r'(?:\d+\.)'
1313 |
1314 | def _list_sub(self, match):
1315 | lst = match.group(1)
1316 | lst_type = match.group(3) in self._marker_ul_chars and "ul" or "ol"
1317 | result = self._process_list_items(lst)
1318 | if self.list_level:
1319 | return "<%s>\n%s%s>\n" % (lst_type, result, lst_type)
1320 | else:
1321 | return "<%s>\n%s%s>\n\n" % (lst_type, result, lst_type)
1322 |
1323 | def _do_lists(self, text):
1324 | # Form HTML ordered (numbered) and unordered (bulleted) lists.
1325 |
1326 | # Iterate over each *non-overlapping* list match.
1327 | pos = 0
1328 | while True:
1329 | # Find the *first* hit for either list style (ul or ol). We
1330 | # match ul and ol separately to avoid adjacent lists of different
1331 | # types running into each other (see issue #16).
1332 | hits = []
1333 | for marker_pat in (self._marker_ul, self._marker_ol):
1334 | less_than_tab = self.tab_width - 1
# The whole-list pattern is rebuilt per marker type; group 1 is the
# entire list, group 3 the first item's marker.
1335 | whole_list = r'''
1336 | ( # \1 = whole list
1337 | ( # \2
1338 | [ ]{0,%d}
1339 | (%s) # \3 = first list item marker
1340 | [ \t]+
1341 | (?!\ *\3\ ) # '- - - ...' isn't a list. See 'not_quite_a_list' test case.
1342 | )
1343 | (?:.+?)
1344 | ( # \4
1345 | \Z
1346 | |
1347 | \n{2,}
1348 | (?=\S)
1349 | (?! # Negative lookahead for another list item marker
1350 | [ \t]*
1351 | %s[ \t]+
1352 | )
1353 | )
1354 | )
1355 | ''' % (less_than_tab, marker_pat, marker_pat)
# Sub-lists may start anywhere on a line; top-level lists must follow
# a blank line (or start the document).
1356 | if self.list_level: # sub-list
1357 | list_re = re.compile("^"+whole_list, re.X | re.M | re.S)
1358 | else:
1359 | list_re = re.compile(r"(?:(?<=\n\n)|\A\n?)"+whole_list,
1360 | re.X | re.M | re.S)
1361 | match = list_re.search(text, pos)
1362 | if match:
1363 | hits.append((match.start(), match))
1364 | if not hits:
1365 | break
# Process the earliest hit, splice in its HTML, and continue after it.
1366 | hits.sort()
1367 | match = hits[0][1]
1368 | start, end = match.span()
1369 | text = text[:start] + self._list_sub(match) + text[end:]
1370 | pos = end
1371 |
1372 | return text
1373 |
# Bug fix: the pattern's named groups had degenerated to '(?P%s)', which
# is not valid regex syntax; restored (?P<marker>...) / (?P<next_marker>...).
_list_item_re = re.compile(r'''
    (\n)?                   # leading line = \1
    (^[ \t]*)               # leading whitespace = \2
    (?P<marker>%s) [ \t]+   # list marker = \3
    ((?:.+?)                # list item text = \4
    (\n{1,2}))              # eols = \5
    (?= \n* (\Z | \2 (?P<next_marker>%s) [ \t]+))
    ''' % (_marker_any, _marker_any),
    re.M | re.X | re.S)

# Tracks whether the previous item ended with a blank line, which forces
# the next item to be block-formatted as well.
_last_li_endswith_two_eols = False
def _list_item_sub(self, match):
    """Replace one list-item match with its li element.

    Bug fix: the return string had lost its tags ('"%s \\n"'); restored
    '<li>%s</li>\\n'.
    """
    item = match.group(4)
    leading_line = match.group(1)
    leading_space = match.group(2)
    if leading_line or "\n\n" in item or self._last_li_endswith_two_eols:
        # Loose item: run the full block gamut (wraps content in <p>).
        item = self._run_block_gamut(self._outdent(item))
    else:
        # Recursion for sub-lists:
        item = self._do_lists(self._outdent(item))
        if item.endswith('\n'):
            item = item[:-1]
        item = self._run_span_gamut(item)
    self._last_li_endswith_two_eols = (len(match.group(5)) == 2)
    return "<li>%s</li>\n" % item
1399 |
1400 | def _process_list_items(self, list_str):
# Returns the list body with each item rewritten by _list_item_sub.
1401 | # Process the contents of a single ordered or unordered list,
1402 | # splitting it into individual list items.
1403 |
1404 | # The $g_list_level global keeps track of when we're inside a list.
1405 | # Each time we enter a list, we increment it; when we leave a list,
1406 | # we decrement. If it's zero, we're not in a list anymore.
1407 | #
1408 | # We do this because when we're not inside a list, we want to treat
1409 | # something like this:
1410 | #
1411 | # I recommend upgrading to version
1412 | # 8. Oops, now this line is treated
1413 | # as a sub-list.
1414 | #
1415 | # As a single paragraph, despite the fact that the second line starts
1416 | # with a digit-period-space sequence.
1417 | #
1418 | # Whereas when we're inside a list (or sub-list), that line will be
1419 | # treated as the start of a sub-list. What a kludge, huh? This is
1420 | # an aspect of Markdown's syntax that's hard to parse perfectly
1421 | # without resorting to mind-reading. Perhaps the solution is to
1422 | # change the syntax rules such that sub-lists must start with a
1423 | # starting cardinal number; e.g. "1." or "a.".
# Enter the list scope, normalize the trailing newlines, rewrite each
# item, then leave the scope.
1424 | self.list_level += 1
1425 | self._last_li_endswith_two_eols = False
1426 | list_str = list_str.rstrip('\n') + '\n'
1427 | list_str = self._list_item_re.sub(self._list_item_sub, list_str)
1428 | self.list_level -= 1
1429 | return list_str
1430 |
1431 | def _get_pygments_lexer(self, lexer_name):
1432 | try:
1433 | from pygments import lexers, util
1434 | except ImportError:
1435 | return None
1436 | try:
1437 | return lexers.get_lexer_by_name(lexer_name)
1438 | except util.ClassNotFound:
1439 | return None
1440 |
1441 | def _color_with_pygments(self, codeblock, lexer, **formatter_opts):
# Render `codeblock` to syntax-highlighted HTML with Pygments. The
# imports are unconditional: callers only get here after
# _get_pygments_lexer() returned a lexer, i.e. Pygments is installed.
1442 | import pygments
1443 | import pygments.formatters
1444 |
1445 | class HtmlCodeFormatter(pygments.formatters.HtmlFormatter):
1446 | def _wrap_code(self, inner):
1447 | """A function for use in a Pygments Formatter which
1448 | wraps in tags.
1449 | """
# NOTE(review): the yielded open/close tag literals below are empty
# strings -- they appear to have been lost in transit (upstream yields
# the code element's open and close tags here); restore from upstream.
1450 | yield 0, ""
1451 | for tup in inner:
1452 | yield tup
1453 | yield 0, ""
1454 |
1455 | def wrap(self, source, outfile):
1456 | """Return the source with a code, pre, and div."""
1457 | return self._wrap_div(self._wrap_pre(self._wrap_code(source)))
1458 |
1459 | formatter_opts.setdefault("cssclass", "codehilite")
1460 | formatter = HtmlCodeFormatter(**formatter_opts)
1461 | return pygments.highlight(codeblock, lexer, formatter)
1462 |
def _code_block_sub(self, match, is_fenced_code_block=False):
    """Render one matched code block (indented or ```-fenced) to HTML,
    optionally syntax-highlighted via Pygments.

    Bug fix: the final return's format string had lost its pre/code tags
    (one '%s' vs three arguments -- a guaranteed TypeError); restored
    '<pre%s><code%s>%s\\n</code></pre>'.
    """
    lexer_name = None
    if is_fenced_code_block:
        lexer_name = match.group(1)
        if lexer_name:
            formatter_opts = self.extras['fenced-code-blocks'] or {}
        codeblock = match.group(2)
        codeblock = codeblock[:-1]  # drop one trailing newline
    else:
        codeblock = match.group(1)
        codeblock = self._outdent(codeblock)
        codeblock = self._detab(codeblock)
        codeblock = codeblock.lstrip('\n')  # trim leading newlines
        codeblock = codeblock.rstrip()      # trim trailing whitespace

    # Note: "code-color" extra is DEPRECATED.
    if "code-color" in self.extras and codeblock.startswith(":::"):
        lexer_name, rest = codeblock.split('\n', 1)
        lexer_name = lexer_name[3:].strip()
        codeblock = rest.lstrip("\n")  # Remove lexer declaration line.
        formatter_opts = self.extras['code-color'] or {}

    if lexer_name:
        lexer = self._get_pygments_lexer(lexer_name)
        if lexer:
            colored = self._color_with_pygments(codeblock, lexer,
                                                **formatter_opts)
            return "\n\n%s\n\n" % colored

    codeblock = self._encode_code(codeblock)
    pre_class_str = self._html_class_str_from_tag("pre")
    code_class_str = self._html_class_str_from_tag("code")
    return "\n\n<pre%s><code%s>%s\n</code></pre>\n\n" % (
        pre_class_str, code_class_str, codeblock)
1497 |
1498 | def _html_class_str_from_tag(self, tag):
1499 | """Get the appropriate ' class="..."' string (note the leading
1500 | space), if any, for the given tag.
1501 | """
1502 | if "html-classes" not in self.extras:
1503 | return ""
1504 | try:
1505 | html_classes_from_tag = self.extras["html-classes"]
1506 | except TypeError:
1507 | return ""
1508 | else:
1509 | if tag in html_classes_from_tag:
1510 | return ' class="%s"' % html_classes_from_tag[tag]
1511 | return ""
1512 |
1513 | def _do_code_blocks(self, text):
1514 | """Process Markdown `` blocks."""
1515 | code_block_re = re.compile(r'''
1516 | (?:\n\n|\A\n?)
1517 | ( # $1 = the code block -- one or more lines, starting with a space/tab
1518 | (?:
1519 | (?:[ ]{%d} | \t) # Lines must start with a tab or a tab-width of spaces
1520 | .*\n+
1521 | )+
1522 | )
1523 | ((?=^[ ]{0,%d}\S)|\Z) # Lookahead for non-space at line-start, or end of doc
1524 | ''' % (self.tab_width, self.tab_width),
1525 | re.M | re.X)
1526 | return code_block_re.sub(self._code_block_sub, text)
1527 |
1528 | _fenced_code_block_re = re.compile(r'''
1529 | (?:\n\n|\A\n?)
1530 | ^```([\w+-]+)?[ \t]*\n # opening fence, $1 = optional lang
1531 | (.*?) # $2 = code block content
1532 | ^```[ \t]*\n # closing fence
1533 | ''', re.M | re.X | re.S)
1534 |
1535 | def _fenced_code_block_sub(self, match):
1536 | return self._code_block_sub(match, is_fenced_code_block=True);
1537 |
1538 | def _do_fenced_code_blocks(self, text):
1539 | """Process ```-fenced unindented code blocks ('fenced-code-blocks' extra)."""
1540 | return self._fenced_code_block_re.sub(self._fenced_code_block_sub, text)
1541 |
1542 | # Rules for a code span:
1543 | # - backslash escapes are not interpreted in a code span
1544 | # - to include one or or a run of more backticks the delimiters must
1545 | # be a longer run of backticks
1546 | # - cannot start or end a code span with a backtick; pad with a
1547 | # space and that space will be removed in the emitted HTML
1548 | # See `test/tm-cases/escapes.text` for a number of edge-case
1549 | # examples.
1550 | _code_span_re = re.compile(r'''
1551 | (?%s" % c
1564 |
def _do_code_spans(self, text):
    """Convert backtick-delimited code spans.

    Multiple backticks may serve as the delimiter so literal backticks
    can appear inside the span (``foo `bar` baz``), and a single space
    just inside the delimiters is trimmed so a span can begin or end
    with a backtick (`` `bar` ``). There is no limit on delimiter
    length: for three consecutive backticks in code, delimit with four.
    """
    return self._code_span_re.sub(self._code_span_sub, text)
1589 |
def _encode_code(self, text):
    """Encode/escape certain characters inside Markdown code runs.
    The point is that in code, these characters are literals,
    and lose their special Markdown meanings.

    Bug fix: the replacement strings had degenerated to identity
    replacements (e.g. '&' -> '&'); restored the entity references,
    with '&' first so the others aren't double-escaped.
    """
    replacements = [
        # Encode all ampersands; HTML entities are not
        # entities within a Markdown code span.
        ('&', '&amp;'),
        # Do the angle bracket song and dance:
        ('<', '&lt;'),
        ('>', '&gt;'),
    ]
    for before, after in replacements:
        text = text.replace(before, after)
    # Hash the escaped run so later span transforms can't touch it;
    # it is restored from self._escape_table at the end of conversion.
    hashed = _hash_text(text)
    self._escape_table[text] = hashed
    return hashed
1608 |
1609 | _strong_re = re.compile(r"(\*\*|__)(?=\S)(.+?[*_]*)(?<=\S)\1", re.S)
1610 | _em_re = re.compile(r"(\*|_)(?=\S)(.+?)(?<=\S)\1", re.S)
1611 | _code_friendly_strong_re = re.compile(r"\*\*(?=\S)(.+?[*_]*)(?<=\S)\*\*", re.S)
1612 | _code_friendly_em_re = re.compile(r"\*(?=\S)(.+?)(?<=\S)\*", re.S)
1613 | def _do_italics_and_bold(self, text):
1614 | # must go first:
1615 | if "code-friendly" in self.extras:
1616 | text = self._code_friendly_strong_re.sub(r"\1", text)
1617 | text = self._code_friendly_em_re.sub(r"\1", text)
1618 | else:
1619 | text = self._strong_re.sub(r"\2", text)
1620 | text = self._em_re.sub(r"\2", text)
1621 | return text
1622 |
1623 | # "smarty-pants" extra: Very liberal in interpreting a single prime as an
1624 | # apostrophe; e.g. ignores the fact that "round", "bout", "twer", and
1625 | # "twixt" can be written without an initial apostrophe. This is fine because
1626 | # using scare quotes (single quotation marks) is rare.
1627 | _apostrophe_year_re = re.compile(r"'(\d\d)(?=(\s|,|;|\.|\?|!|$))")
1628 | _contractions = ["tis", "twas", "twer", "neath", "o", "n",
1629 | "round", "bout", "twixt", "nuff", "fraid", "sup"]
1630 | def _do_smart_contractions(self, text):
1631 | text = self._apostrophe_year_re.sub(r"’\1", text)
1632 | for c in self._contractions:
1633 | text = text.replace("'%s" % c, "’%s" % c)
1634 | text = text.replace("'%s" % c.capitalize(),
1635 | "’%s" % c.capitalize())
1636 | return text
1637 |
1638 | # Substitute double-quotes before single-quotes.
1639 | _opening_single_quote_re = re.compile(r"(?
1648 | See "test/tm-cases/smarty_pants.text" for a full discussion of the
1649 | support here and
1650 | for a
1651 | discussion of some diversion from the original SmartyPants.
1652 | """
1653 | if "'" in text: # guard for perf
1654 | text = self._do_smart_contractions(text)
1655 | text = self._opening_single_quote_re.sub("‘", text)
1656 | text = self._closing_single_quote_re.sub("’", text)
1657 |
1658 | if '"' in text: # guard for perf
1659 | text = self._opening_double_quote_re.sub("“", text)
1660 | text = self._closing_double_quote_re.sub("”", text)
1661 |
1662 | text = text.replace("---", "—")
1663 | text = text.replace("--", "–")
1664 | text = text.replace("...", "…")
1665 | text = text.replace(" . . . ", "…")
1666 | text = text.replace(". . .", "…")
1667 | return text
1668 |
1669 | _block_quote_re = re.compile(r'''
1670 | ( # Wrap whole match in \1
1671 | (
1672 | ^[ \t]*>[ \t]? # '>' at the start of a line
1673 | .+\n # rest of the first line
1674 | (.+\n)* # subsequent consecutive lines
1675 | \n* # blanks
1676 | )+
1677 | )
1678 | ''', re.M | re.X)
1679 | _bq_one_level_re = re.compile('^[ \t]*>[ \t]?', re.M);
1680 |
1681 | _html_pre_block_re = re.compile(r'(\s*.+?
)', re.S)
1682 | def _dedent_two_spaces_sub(self, match):
1683 | return re.sub(r'(?m)^ ', '', match.group(1))
1684 |
def _block_quote_sub(self, match):
    """Render one matched blockquote region to <blockquote>...</blockquote>.

    NOTE(review): the <blockquote> tags and the two-space indent string
    were mangled by an HTML extraction pass; restored.
    """
    bq = match.group(1)
    bq = self._bq_one_level_re.sub('', bq)  # trim one level of quoting
    bq = self._ws_only_line_re.sub('', bq)  # trim whitespace-only lines
    bq = self._run_block_gamut(bq)          # recurse

    # Indent the rendered content two spaces inside the blockquote.
    bq = re.sub('(?m)^', '  ', bq)
    # These leading spaces screw with <pre> content, so we need to fix that:
    bq = self._html_pre_block_re.sub(self._dedent_two_spaces_sub, bq)

    return "<blockquote>\n%s\n</blockquote>\n\n" % bq
1696 |
1697 | def _do_block_quotes(self, text):
1698 | if '>' not in text:
1699 | return text
1700 | return self._block_quote_re.sub(self._block_quote_sub, text)
1701 |
def _form_paragraphs(self, text):
    """Wrap the remaining text chunks in <p> tags, unhashing raw HTML
    blocks and (with the "cuddled-lists" extra) splitting off a list
    that is cuddled to the end of a paragraph.

    NOTE(review): the <p>/<ul>/<ol> literals below were stripped by an
    HTML extraction pass; restored.
    """
    # Strip leading and trailing lines:
    text = text.strip('\n')

    # Wrap <p> tags.
    grafs = []
    for i, graf in enumerate(re.split(r"\n{2,}", text)):
        if graf in self.html_blocks:
            # Unhashify HTML blocks
            grafs.append(self.html_blocks[graf])
        else:
            cuddled_list = None
            if "cuddled-lists" in self.extras:
                # Need to put back trailing '\n' for `_list_item_re`
                # match at the end of the paragraph.
                li = self._list_item_re.search(graf + '\n')
                # Two of the same list marker in this paragraph: a likely
                # candidate for a list cuddled to preceding paragraph
                # text (issue 33). Note the `[-1]` is a quick way to
                # consider numeric bullets (e.g. "1." and "2.") to be
                # equal.
                if (li and len(li.group(2)) <= 3 and li.group("next_marker")
                    and li.group("marker")[-1] == li.group("next_marker")[-1]):
                    start = li.start()
                    cuddled_list = self._do_lists(graf[start:]).rstrip("\n")
                    assert cuddled_list.startswith("<ul>") or cuddled_list.startswith("<ol>")
                    graf = graf[:start]

            # Wrap <p> tags.
            graf = self._run_span_gamut(graf)
            grafs.append("<p>" + graf.lstrip(" \t") + "</p>")

            if cuddled_list:
                grafs.append(cuddled_list)

    return "\n\n".join(grafs)
1738 |
1739 | def _add_footnotes(self, text):
1740 | if self.footnotes:
1741 | footer = [
1742 | '',
1743 | '
',
1745 | ]
1746 | for i, id in enumerate(self.footnote_ids):
1747 | if i != 0:
1748 | footer.append('')
1749 | footer.append('- ' % id)
1750 | footer.append(self._run_block_gamut(self.footnotes[id]))
1751 | backlink = (''
1754 | '↩' % (id, i+1))
1755 | if footer[-1].endswith(""):
1756 | footer[-1] = footer[-1][:-len("")] \
1757 | + ' ' + backlink + ""
1758 | else:
1759 | footer.append("\n
%s
" % backlink)
1760 | footer.append(' ')
1761 | footer.append('')
1762 | footer.append('')
1763 | return text + '\n\n' + '\n'.join(footer)
1764 | else:
1765 | return text
1766 |
1767 | # Ampersand-encoding based entirely on Nat Irons's Amputator MT plugin:
1768 | # http://bumppo.net/projects/amputator/
1769 | _ampersand_re = re.compile(r'&(?!#?[xX]?(?:[0-9a-fA-F]+|\w+);)')
1770 | _naked_lt_re = re.compile(r'<(?![a-z/?\$!])', re.I)
1771 | _naked_gt_re = re.compile(r'''(?''', re.I)
1772 |
1773 | def _encode_amps_and_angles(self, text):
1774 | # Smart processing for ampersands and angle brackets that need
1775 | # to be encoded.
1776 | text = self._ampersand_re.sub('&', text)
1777 |
1778 | # Encode naked <'s
1779 | text = self._naked_lt_re.sub('<', text)
1780 |
1781 | # Encode naked >'s
1782 | # Note: Other markdown implementations (e.g. Markdown.pl, PHP
1783 | # Markdown) don't do this.
1784 | text = self._naked_gt_re.sub('>', text)
1785 | return text
1786 |
1787 | def _encode_backslash_escapes(self, text):
1788 | for ch, escape in list(self._escape_table.items()):
1789 | text = text.replace("\\"+ch, escape)
1790 | return text
1791 |
1792 | _auto_link_re = re.compile(r'<((https?|ftp):[^\'">\s]+)>', re.I)
1793 | def _auto_link_sub(self, match):
1794 | g1 = match.group(1)
1795 | return '%s' % (g1, g1)
1796 |
1797 | _auto_email_link_re = re.compile(r"""
1798 | <
1799 | (?:mailto:)?
1800 | (
1801 | [-.\w]+
1802 | \@
1803 | [-\w]+(\.[-\w]+)*\.[a-z]+
1804 | )
1805 | >
1806 | """, re.I | re.X | re.U)
1807 | def _auto_email_link_sub(self, match):
1808 | return self._encode_email_address(
1809 | self._unescape_special_chars(match.group(1)))
1810 |
1811 | def _do_auto_links(self, text):
1812 | text = self._auto_link_re.sub(self._auto_link_sub, text)
1813 | text = self._auto_email_link_re.sub(self._auto_email_link_sub, text)
1814 | return text
1815 |
def _encode_email_address(self, addr):
    """Return `addr` (e.g. "foo@example.com") as an entity-obfuscated
    mailto link.

    Each character of "mailto:" + addr is encoded as either a decimal or
    hex entity, in the hopes of foiling most address-harvesting spam
    bots. Based on a filter by Matthew Wickline, posted to the
    BBEdit-Talk mailing list.

    NOTE(review): the '<a href=...>' template (and the example that was
    in this comment) were stripped by an HTML extraction pass; restored.
    """
    chars = [_xml_encode_email_char_at_random(ch)
             for ch in "mailto:" + addr]
    # Strip the mailto: from the visible part.
    addr = '<a href="%s">%s</a>' \
           % (''.join(chars), ''.join(chars[7:]))
    return addr
1835 |
def _do_link_patterns(self, text):
    """Caveat emptor: there isn't much guarding against link
    patterns being formed inside other standard Markdown links, e.g.
    inside a [link def][like this].

    Dev Notes: *Could* consider prefixing regexes with a negative
    lookbehind assertion to attempt to guard against this.

    NOTE(review): the '&quot;' escape and the '<a href=...>' template
    were mangled by an HTML extraction pass; restored.
    """
    link_from_hash = {}
    for regex, repl in self.link_patterns:
        replacements = []
        for match in regex.finditer(text):
            if hasattr(repl, "__call__"):
                href = repl(match)
            else:
                href = match.expand(repl)
            replacements.append((match.span(), href))
        # Apply right-to-left so earlier spans stay valid.
        for (start, end), href in reversed(replacements):
            escaped_href = (
                href.replace('"', '&quot;')  # b/c of attr quote
                    # To avoid markdown <em> and <strong>:
                    .replace('*', self._escape_table['*'])
                    .replace('_', self._escape_table['_']))
            link = '<a href="%s">%s</a>' % (escaped_href, text[start:end])
            hash = _hash_text(link)
            link_from_hash[hash] = link
            text = text[:start] + hash + text[end:]
    for hash, link in list(link_from_hash.items()):
        text = text.replace(hash, link)
    return text
1866 |
1867 | def _unescape_special_chars(self, text):
1868 | # Swap back in all the special characters we've hidden.
1869 | for ch, hash in list(self._escape_table.items()):
1870 | text = text.replace(hash, ch)
1871 | return text
1872 |
1873 | def _outdent(self, text):
1874 | # Remove one level of line-leading tabs or spaces
1875 | return self._outdent_re.sub('', text)
1876 |
1877 |
class MarkdownWithExtras(Markdown):
    """A markdowner class that enables most extras:

    - footnotes
    - code-color (only has effect if 'pygments' Python module on path)

    These are not included:
    - pyshell (specific to Python-related documenting)
    - code-friendly (because it *disables* part of the syntax)
    - link-patterns (because you need to specify some actual
      link-patterns anyway)
    """
    # Class-level default: overrides the base Markdown.extras value.
    extras = ["footnotes", "code-color"]
1891 |
1892 |
1893 | #---- internal support functions
1894 |
class UnicodeWithAttrs(unicode):
    """A subclass of unicode used for the return value of conversion to
    possibly attach some attributes. E.g. the "toc_html" attribute when
    the "toc" extra is used.

    NOTE(review): the <ul>/<li>/<a> literals in `toc_html` and the
    two-space indent unit were stripped/collapsed by an HTML extraction
    pass; restored.
    """
    metadata = None  # set when the "metadata" extra is used
    _toc = None      # list of (level, id, name) tuples set by the converter
    def toc_html(self):
        """Return the HTML for the current TOC.

        This expects the `_toc` attribute to have been set on this instance.
        """
        if self._toc is None:
            return None

        def indent():
            return '  ' * (len(h_stack) - 1)
        lines = []
        h_stack = [0]   # stack of header-level numbers
        for level, id, name in self._toc:
            if level > h_stack[-1]:
                lines.append("%s<ul>" % indent())
                h_stack.append(level)
            elif level == h_stack[-1]:
                lines[-1] += "</li>"
            else:
                while level < h_stack[-1]:
                    h_stack.pop()
                    if not lines[-1].endswith("</li>"):
                        lines[-1] += "</li>"
                    lines.append("%s</ul></li>" % indent())
            lines.append('%s<li><a href="#%s">%s</a>' % (
                indent(), id, name))
        while len(h_stack) > 1:
            h_stack.pop()
            if not lines[-1].endswith("</li>"):
                lines[-1] += "</li>"
            lines.append("%s</ul>" % indent())
        return '\n'.join(lines) + '\n'
    toc_html = property(toc_html)
1935 |
1936 | ## {{{ http://code.activestate.com/recipes/577257/ (r1)
1937 | _slugify_strip_re = re.compile(r'[^\w\s-]')
1938 | _slugify_hyphenate_re = re.compile(r'[-\s]+')
1939 | def _slugify(value):
1940 | """
1941 | Normalizes string, converts to lowercase, removes non-alpha characters,
1942 | and converts spaces to hyphens.
1943 |
1944 | From Django's "django/template/defaultfilters.py".
1945 | """
1946 | import unicodedata
1947 | value = unicodedata.normalize('NFKD', value).encode('ascii', 'ignore').decode()
1948 | value = _slugify_strip_re.sub('', value).strip().lower()
1949 | return _slugify_hyphenate_re.sub('-', value)
1950 | ## end of http://code.activestate.com/recipes/577257/ }}}
1951 |
1952 |
1953 | # From http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/52549
1954 | def _curry(*args, **kwargs):
1955 | function, args = args[0], args[1:]
1956 | def result(*rest, **kwrest):
1957 | combined = kwargs.copy()
1958 | combined.update(kwrest)
1959 | return function(*args + rest, **combined)
1960 | return result
1961 |
1962 | # Recipe: regex_from_encoded_pattern (1.0)
1963 | def _regex_from_encoded_pattern(s):
1964 | """'foo' -> re.compile(re.escape('foo'))
1965 | '/foo/' -> re.compile('foo')
1966 | '/foo/i' -> re.compile('foo', re.I)
1967 | """
1968 | if s.startswith('/') and s.rfind('/') != 0:
1969 | # Parse it: /PATTERN/FLAGS
1970 | idx = s.rfind('/')
1971 | pattern, flags_str = s[1:idx], s[idx+1:]
1972 | flag_from_char = {
1973 | "i": re.IGNORECASE,
1974 | "l": re.LOCALE,
1975 | "s": re.DOTALL,
1976 | "m": re.MULTILINE,
1977 | "u": re.UNICODE,
1978 | }
1979 | flags = 0
1980 | for char in flags_str:
1981 | try:
1982 | flags |= flag_from_char[char]
1983 | except KeyError:
1984 | raise ValueError("unsupported regex flag: '%s' in '%s' "
1985 | "(must be one of '%s')"
1986 | % (char, s, ''.join(list(flag_from_char.keys()))))
1987 | return re.compile(s[1:idx], flags)
1988 | else: # not an encoded regex
1989 | return re.compile(re.escape(s))
1990 |
# Recipe: dedent (0.1.2)
def _dedentlines(lines, tabsize=8, skip_first_line=False):
    """_dedentlines(lines, tabsize=8, skip_first_line=False) -> dedented lines

        "lines" is a list of lines to dedent.
        "tabsize" is the tab width to use for indent width calculations.
        "skip_first_line" is a boolean indicating if the first line should
            be skipped for calculating the indent width and for dedenting.
            This is sometimes useful for docstrings and similar.

    Same as dedent() except operates on a sequence of lines. Note: the
    lines list is modified **in-place**.
    """
    DEBUG = False
    if DEBUG:
        print("dedent: dedent(..., tabsize=%d, skip_first_line=%r)"\
              % (tabsize, skip_first_line))
    indents = []
    margin = None
    # First pass: find the smallest indent (the "margin") over all
    # non-whitespace-only lines, expanding tabs with `tabsize`.
    for i, line in enumerate(lines):
        if i == 0 and skip_first_line: continue
        indent = 0
        for ch in line:
            if ch == ' ':
                indent += 1
            elif ch == '\t':
                indent += tabsize - (indent % tabsize)
            elif ch in '\r\n':
                continue # skip all-whitespace lines
            else:
                break
        else:
            continue # skip all-whitespace lines
        if DEBUG: print("dedent: indent=%d: %r" % (indent, line))
        if margin is None:
            margin = indent
        else:
            margin = min(margin, indent)
    if DEBUG: print("dedent: margin=%r" % margin)

    # Second pass: strip `margin` columns of leading whitespace from each
    # line in place.
    if margin is not None and margin > 0:
        for i, line in enumerate(lines):
            if i == 0 and skip_first_line: continue
            removed = 0
            for j, ch in enumerate(line):
                if ch == ' ':
                    removed += 1
                elif ch == '\t':
                    removed += tabsize - (removed % tabsize)
                elif ch in '\r\n':
                    if DEBUG: print("dedent: %r: EOL -> strip up to EOL" % line)
                    lines[i] = lines[i][j:]
                    break
                else:
                    raise ValueError("unexpected non-whitespace char %r in "
                                     "line %r while removing %d-space margin"
                                     % (ch, line, margin))
                if DEBUG:
                    print("dedent: %r: %r -> removed %d/%d"\
                          % (line, ch, removed, margin))
                if removed == margin:
                    lines[i] = lines[i][j+1:]
                    break
                elif removed > margin:
                    # A tab expanded past the margin: keep the overshoot
                    # as spaces.
                    lines[i] = ' '*(removed-margin) + lines[i][j+1:]
                    break
            else:
                if removed:
                    lines[i] = lines[i][removed:]
    return lines
2061 |
def _dedent(text, tabsize=8, skip_first_line=False):
    """_dedent(text, tabsize=8, skip_first_line=False) -> dedented text

        "text" is the text to dedent.
        "tabsize" is the tab width to use for indent width calculations.
        "skip_first_line" is a boolean indicating if the first line should
            be skipped for calculating the indent width and for dedenting.
            This is sometimes useful for docstrings and similar.

    textwrap.dedent(s), but don't expand tabs to spaces
    """
    split_lines = text.splitlines(1)
    _dedentlines(split_lines, tabsize=tabsize, skip_first_line=skip_first_line)
    return ''.join(split_lines)
2076 |
2077 |
2078 | class _memoized(object):
2079 | """Decorator that caches a function's return value each time it is called.
2080 | If called later with the same arguments, the cached value is returned, and
2081 | not re-evaluated.
2082 |
2083 | http://wiki.python.org/moin/PythonDecoratorLibrary
2084 | """
2085 | def __init__(self, func):
2086 | self.func = func
2087 | self.cache = {}
2088 | def __call__(self, *args):
2089 | try:
2090 | return self.cache[args]
2091 | except KeyError:
2092 | self.cache[args] = value = self.func(*args)
2093 | return value
2094 | except TypeError:
2095 | # uncachable -- for instance, passing a list as an argument.
2096 | # Better to not cache than to blow up entirely.
2097 | return self.func(*args)
2098 | def __repr__(self):
2099 | """Return the function's docstring."""
2100 | return self.func.__doc__
2101 |
2102 |
def _xml_oneliner_re_from_tab_width(tab_width):
    """Standalone XML processing instruction regex.

    Matches a one-line XML processing instruction or namespaced
    self-closing tag that sits alone between blank lines, indented by
    fewer than `tab_width` spaces.
    """
    return re.compile(r"""
        (?:
            (?<=\n\n)       # Starting after a blank line
            |               # or
            \A\n?           # the beginning of the doc
        )
        (                           # save in $1
            [ ]{0,%d}
            (?:
                <\?\w+\b\s+.*?\?>   # XML processing instruction
                |
                <\w+:\w+\b\s+.*?/>  # namespaced single tag
            )
            [ \t]*
            (?=\n{2,}|\Z)       # followed by a blank line or end of document
        )
        """ % (tab_width - 1), re.X)
# Memoize: the compiled regex depends only on tab_width, so cache one
# instance per width.
_xml_oneliner_re_from_tab_width = _memoized(_xml_oneliner_re_from_tab_width)
2123 |
def _hr_tag_re_from_tab_width(tab_width):
    """Return a regex matching a standalone <hr> tag on its own line,
    indented by fewer than `tab_width` spaces.
    """
    return re.compile(r"""
        (?:
            (?<=\n\n)       # Starting after a blank line
            |               # or
            \A\n?           # the beginning of the doc
        )
        (                       # save in \1
            [ ]{0,%d}
            <(hr)               # start tag = \2
            \b                  # word break
            ([^<>])*?           #
            /?>                 # the matching end tag
            [ \t]*
            (?=\n{2,}|\Z)       # followed by a blank line or end of document
        )
        """ % (tab_width - 1), re.X)
# Memoize: one compiled regex per tab width.
_hr_tag_re_from_tab_width = _memoized(_hr_tag_re_from_tab_width)
2142 |
2143 |
2144 | def _xml_escape_attr(attr, skip_single_quote=True):
2145 | """Escape the given string for use in an HTML/XML tag attribute.
2146 |
2147 | By default this doesn't bother with escaping `'` to `'`, presuming that
2148 | the tag attribute is surrounded by double quotes.
2149 | """
2150 | escaped = (attr
2151 | .replace('&', '&')
2152 | .replace('"', '"')
2153 | .replace('<', '<')
2154 | .replace('>', '>'))
2155 | if not skip_single_quote:
2156 | escaped = escaped.replace("'", "'")
2157 | return escaped
2158 |
2159 |
2160 | def _xml_encode_email_char_at_random(ch):
2161 | r = random()
2162 | # Roughly 10% raw, 45% hex, 45% dec.
2163 | # '@' *must* be encoded. I [John Gruber] insist.
2164 | # Issue 26: '_' must be encoded.
2165 | if r > 0.9 and ch not in "@_":
2166 | return ch
2167 | elif r < 0.45:
2168 | # The [1:] is to drop leading '0': 0x63 -> x63
2169 | return '%s;' % hex(ord(ch))[1:]
2170 | else:
2171 | return '%s;' % ord(ch)
2172 |
2173 |
2174 |
2175 | #---- mainline
2176 |
2177 | class _NoReflowFormatter(optparse.IndentedHelpFormatter):
2178 | """An optparse formatter that does NOT reflow the description."""
2179 | def format_description(self, description):
2180 | return description or ""
2181 |
def _test():
    """Run this module's doctests (used by the --self-test CLI option)."""
    import doctest
    doctest.testmod()
2185 |
def main(argv=None):
    """Command-line entry point: convert the given paths (or stdin when
    no paths / '-' is given) from Markdown to HTML and write the result
    to stdout. With --compare, also run Markdown.pl and report whether
    the normalized outputs match.
    """
    if argv is None:
        argv = sys.argv
    if not logging.root.handlers:
        logging.basicConfig()

    usage = "usage: %prog [PATHS...]"
    version = "%prog "+__version__
    parser = optparse.OptionParser(prog="markdown2", usage=usage,
        version=version, description=cmdln_desc,
        formatter=_NoReflowFormatter())
    parser.add_option("-v", "--verbose", dest="log_level",
                      action="store_const", const=logging.DEBUG,
                      help="more verbose output")
    parser.add_option("--encoding",
                      help="specify encoding of text content")
    parser.add_option("--html4tags", action="store_true", default=False,
                      help="use HTML 4 style for empty element tags")
    parser.add_option("-s", "--safe", metavar="MODE", dest="safe_mode",
                      help="sanitize literal HTML: 'escape' escapes "
                           "HTML meta chars, 'replace' replaces with an "
                           "[HTML_REMOVED] note")
    parser.add_option("-x", "--extras", action="append",
                      help="Turn on specific extra features (not part of "
                           "the core Markdown spec). See above.")
    parser.add_option("--use-file-vars",
                      help="Look for and use Emacs-style 'markdown-extras' "
                           "file var to turn on extras. See "
                           # NOTE(review): the URL that belonged here (it was
                           # in <...> brackets) appears to have been stripped
                           # by an HTML extraction pass; the empty string is
                           # kept to preserve the original (broken) text.
                           "")
    parser.add_option("--link-patterns-file",
                      help="path to a link pattern file")
    parser.add_option("--self-test", action="store_true",
                      help="run internal self-tests (some doctests)")
    parser.add_option("--compare", action="store_true",
                      help="run against Markdown.pl as well (for testing)")
    parser.set_defaults(log_level=logging.INFO, compare=False,
                        encoding="utf-8", safe_mode=None, use_file_vars=False)
    opts, paths = parser.parse_args()
    log.setLevel(opts.log_level)

    if opts.self_test:
        return _test()

    # Parse "-x name[=intarg]" occurrences (comma/semicolon/space separated)
    # into an extras dict; values without '=' get None.
    if opts.extras:
        extras = {}
        for s in opts.extras:
            splitter = re.compile("[,;: ]+")
            for e in splitter.split(s):
                if '=' in e:
                    ename, earg = e.split('=', 1)
                    try:
                        earg = int(earg)
                    except ValueError:
                        pass
                else:
                    ename, earg = e, None
                extras[ename] = earg
    else:
        extras = None

    # Load "PATTERN HREF" pairs (one per line, '#' comments allowed) into
    # (compiled_regex, href) tuples.
    if opts.link_patterns_file:
        link_patterns = []
        f = open(opts.link_patterns_file)
        try:
            for i, line in enumerate(f.readlines()):
                if not line.strip(): continue
                if line.lstrip().startswith("#"): continue
                try:
                    pat, href = line.rstrip().rsplit(None, 1)
                except ValueError:
                    raise MarkdownError("%s:%d: invalid link pattern line: %r"
                                        % (opts.link_patterns_file, i+1, line))
                link_patterns.append(
                    (_regex_from_encoded_pattern(pat), href))
        finally:
            f.close()
    else:
        link_patterns = None

    from os.path import join, dirname, abspath, exists
    markdown_pl = join(dirname(dirname(abspath(__file__))), "test",
                       "Markdown.pl")
    if not paths:
        paths = ['-']
    for path in paths:
        if path == '-':
            text = sys.stdin.read()
        else:
            fp = codecs.open(path, 'r', opts.encoding)
            text = fp.read()
            fp.close()
        if opts.compare:
            # Run the reference Perl implementation on the same input.
            from subprocess import Popen, PIPE
            print("==== Markdown.pl ====")
            p = Popen('perl %s' % markdown_pl, shell=True, stdin=PIPE, stdout=PIPE, close_fds=True)
            p.stdin.write(text.encode('utf-8'))
            p.stdin.close()
            perl_html = p.stdout.read().decode('utf-8')
            if py3:
                sys.stdout.write(perl_html)
            else:
                sys.stdout.write(perl_html.encode(
                    sys.stdout.encoding or "utf-8", 'xmlcharrefreplace'))
            print("==== markdown2.py ====")
        html = markdown(text,
            html4tags=opts.html4tags,
            safe_mode=opts.safe_mode,
            extras=extras, link_patterns=link_patterns,
            use_file_vars=opts.use_file_vars)
        if py3:
            sys.stdout.write(html)
        else:
            sys.stdout.write(html.encode(
                sys.stdout.encoding or "utf-8", 'xmlcharrefreplace'))
        if extras and "toc" in extras:
            log.debug("toc_html: " +
                html.toc_html.encode(sys.stdout.encoding or "utf-8", 'xmlcharrefreplace'))
        if opts.compare:
            # Normalize both outputs (if the test helpers are available)
            # before comparing.
            test_dir = join(dirname(dirname(abspath(__file__))), "test")
            if exists(join(test_dir, "test_markdown2.py")):
                sys.path.insert(0, test_dir)
                from test_markdown2 import norm_html_from_html
                norm_html = norm_html_from_html(html)
                norm_perl_html = norm_html_from_html(perl_html)
            else:
                norm_html = html
                norm_perl_html = perl_html
            print("==== match? %r ====" % (norm_perl_html == norm_html))
2314 |
2315 |
2316 | if __name__ == "__main__":
2317 | sys.exit( main(sys.argv) )
2318 |
--------------------------------------------------------------------------------